udfLevenshteinDistance1
udfLevenshteinDistance2
int udfLevenshteinDistance1 (str, str)
int udfLevenshteinDistance2 (str, str)
;------------------------------------------------------------------------------------------------------------------------------------------
#DefineFunction udfLevenshteinDistance1 (strString1, strString2) ; The matrix version.
intLen1 = StrLen (strString1)
intLen2 = StrLen (strString2)
If strString1 == strString2 Then Return 0 ; Strings are identical.
If !intLen1 Then Return intLen2
If !intLen2 Then Return intLen1

If intLen1 > 512 Then Return -1 ; See Note at end of function.
If intLen2 > 512 Then Return -1 ; See Note at end of function.

; Initialize matrix.
arrM = ArrDimension (intLen1 + 1, intLen2 + 1)
For intRow = 0 To intLen1
   arrM[intRow, 0] = intRow
Next
For intCol = 0 To intLen2
   arrM[0, intCol] = intCol
Next

; Do the matrix work.
For intRow = 1 To intLen1
   For intCol = 1 To intLen2
      intCost = StrSub (strString1, intRow, 1) != StrSub (strString2, intCol, 1)
      arrM[intRow, intCol] = Min (1 + arrM[intRow - 1, intCol], 1 + arrM[intRow, intCol - 1], intCost + arrM[intRow - 1, intCol - 1])
   Next
Next
Return arrM[intLen1, intLen2]
;..........................................................................................................................................
; This UDF "udfLevenshteinDistance" returns an integer number which indicates the Levenshtein-Distance between
; the two argument strings or -1, if one of the argument strings is longer than the limit of 255 characters
;
; The maximal string size of 512 characters should be more than enough for name or dictionary comparison.
; An array matrix of 512 * 512 elements has about 256k elements!
;
; The Levenshtein distance is defined as the minimal number of characters you have to replace,
; insert or delete to transform strString1 into strString2.
;
; The greater the Levenshtein-Distance, the more different the strings are.
;
; The Levenshtein-Distance is named after the Russian scientist Vladimir Levenshtein,
; who devised the algorithm in 1965.
; In its simplest form the function will take only the two strings as parameter and will calculate
; just the number of insert, replace and delete operations needed to transform strString1 into strString2.
;
; If you cannot spell or pronounce Levenshtein, the metric is also sometimes called 'edit distance'.
; The Levenshtein distance algorithm has been used in:
; - Spell checking, - Speech recognition, - DNA analysis, - Plagiarism detection .
;
; Reference: http://www.merriampark.com/ld.htm
;..........................................................................................................................................
; Detlev Dalitz.20020805.20030212.20100207.20100914.
;..........................................................................................................................................
#EndFunction
;------------------------------------------------------------------------------------------------------------------------------------------



;------------------------------------------------------------------------------------------------------------------------------------------
#DefineFunction udfLevenshteinDistance2 (strString1, strString2) ; The vector version.
If strString1 == strString2 Then Return 0 ; Strings are identical.

arrS1 = ArrayFromStr (strString1)
arrS2 = ArrayFromStr (strString2)
intLen1 = ArrInfo (arrS1, 1)
intLen2 = ArrInfo (arrS2, 1)
If !intLen1 Then Return intLen2
If !intLen2 Then Return intLen1

If intLen1 > intLen2 ; ... then swap string arrays (only their names) and length variable content.
   arrS0 = arrS1
   arrS1 = arrS2
   arrS2 = arrS0
   intLen0 = intLen1
   intLen1 = intLen2
   intLen2 = intLen0
EndIf

; Initialize vectors.
intLenMax = Max (intLen1, intLen2)
arrV1 = ArrDimension (intLenMax + 1)
arrV2 = ArrDimension (intLenMax + 1)
ArrInitialize (arrV2, 0)
For intElem = 0 To intLenMax
   arrV1[intElem] = intElem
Next

; Do the vector work.
intLast1 = intLen1 - 1
intLast2 = intLen2 - 1
For intRow = 0 To intLast1
   arrV2[0] = intRow
   For intCol = 0 To intLast2
      intCost = arrS1[intRow] != arrS2[intCol]
      arrV2[intCol + 1] = Min (1 + arrV2[intCol], 1 + arrV1[intCol + 1], intCost + arrV1[intCol])
   Next
   arrV0 = arrV1 ; Swap vectors (only their names).
   arrV1 = arrV2
   arrV2 = arrV0
Next
Return arrV1[intLenMax]
;..........................................................................................................................................
; This UDF "udfLevenshteinDistance" returns an integer number which indicates the Levenshtein-Distance
; between the two argument strings.
;
; The Levenshtein distance is defined as the minimal number of characters you have to replace,
; insert or delete to transform strString1 into strString2.
;
; The greater the Levenshtein-Distance, the more different the strings are.
;
; The Levenshtein-Distance is named after the Russian scientist Vladimir Levenshtein,
; who devised the algorithm in 1965.
; In its simplest form the function will take only the two strings as parameter and will calculate
; just the number of insert, replace and delete operations needed to transform strString1 into strString2.
;
; If you cannot spell or pronounce Levenshtein, the metric is also sometimes called 'edit distance'.
; The Levenshtein distance algorithm has been used in:
; - Spell checking, - Speech recognition, - DNA analysis, - Plagiarism detection .
;
; Reference: http://www.merriampark.com/ld.htm
;..........................................................................................................................................
; This WinBatch UDF "udfLevenshteinDistance" algorithm has been adapted from article
; "Fast, memory efficient Levenshtein algorithm", Sten Hjelmqvist, 2006-03-27.
; http://www.codeproject.com/KB/recipes/Levenshtein.aspx
;..........................................................................................................................................
; The original Levenshtein distance algorithm creates an array matrix, where the size is StrLen1 * StrLen2.
; If both strings are 1000 characters long, the resulting matrix is 1M elements;
; if the strings are 10000 chars, the matrix will be 100M elements.
; This version of the algorithm uses only 2 * StrLen elements, which is much more memory efficient!
;..........................................................................................................................................
; (c)Detlev Dalitz.20100914.
;..........................................................................................................................................
#EndFunction
;------------------------------------------------------------------------------------------------------------------------------------------



; Test.

; Prepare Test Data.
strCSV = '"String1","String2","Distance","Comment"'
strCSV = strCSV : @CRLF : '"","",0,"identical"'
strCSV = strCSV : @CRLF : '"GLAVIN","GLAVIN",0,"Identical."'
strCSV = strCSV : @CRLF : '"GLAVINE","GLAWYN",3,""'
strCSV = strCSV : @CRLF : '"GLAWYN","GLAVINE",3,""'
strCSV = strCSV : @CRLF : '"Darlitz","Dalitz",1,"High similarity."'
strCSV = strCSV : @CRLF : '"Dahlicz","Dalitz",2,""'
strCSV = strCSV : @CRLF : '"Daley","Dalitz",3,""'
strCSV = strCSV : @CRLF : '"Dallwitz","Dalitz",2,""'
strCSV = strCSV : @CRLF : '"Duhlitz","Dalitz",2,""'
strCSV = strCSV : @CRLF : '"Talfritz","Dalitz",3,""'
strCSV = strCSV : @CRLF : '"FORTRAN","BASIC",7,"Very low similarity."'
strCSV = strCSV : @CRLF : '"Alex","Alexander",5,""'
strCSV = strCSV : @CRLF : '"GLADIOL","GALAKOL",3,""'
strCSV = strCSV : @CRLF : '"GUMBO' : StrFill ("test", 200) : '","GAMBOL' : StrFill ("test", 200) : '",2,""'


; Write test data to temp file.
strFileTemp = FileCreateTemp ("WBT")
intBytesWritten = FilePut (strFileTemp, strCSV)
Drop (strCSV)

; Read test data into array.
arrTest = ArrayFileGetCSV (strFileTemp, 0)
intTestLast = ArrInfo (arrTest, 1) - 1
intTestFirst = 1 ; Skip header line.


; Do the test loop.
intTicks1 = 0

For intT = intTestFirst To intTestLast

   intTicks = GetTickCount ()
   intResult = udfLevenshteinDistance1 (arrTest[intT, 0], arrTest[intT, 1])
   intTicks1 = GetTickCount () - intTicks

   ; Prepare message output.
   If arrTest[intT, 3] != ""
      strComment = arrTest[intT, 3] ; Use comment from CSV file if any.
   Else ; Compare UDF result with given result value from CSV file.
      strComment = "UDF result is invalid."
      If intResult == arrTest[intT, 2] Then strComment = "UDF result is valid."
   EndIf

   strMsgTitle = "Test " : intT : " of " : intTestLast : "|udfLevenshteinDistance1 (strString1, strString2)"
   strMsgText = "String1: " : @LF : '"' : arrTest[intT, 0] : '"'
   strMsgText = strMsgText : @LF : @LF : "String2: " : @LF : '"' : arrTest[intT, 1] : '"'
   strMsgText = strMsgText : @LF : @LF : "Levenshtein Distance: " : @LF : intResult
   strMsgText = strMsgText : @LF : @LF : "Comment: " : @LF : strComment
   Pause (strMsgTitle, strMsgText)

Next


; Do the test loop.
intTicks2 = 0

For intT = intTestFirst To intTestLast

   intTicks = GetTickCount ()
   intResult = udfLevenshteinDistance2 (arrTest[intT, 0], arrTest[intT, 1])
   intTicks2 = GetTickCount () - intTicks

   ; Prepare message output.
   If arrTest[intT, 3] != ""
      strComment = arrTest[intT, 3] ; Use comment from CSV file if any.
   Else ; Compare UDF result with given result value from CSV file.
      strComment = "UDF result is invalid."
      If intResult == arrTest[intT, 2] Then strComment = "UDF result is valid."
   EndIf

   strMsgTitle = "Test " : intT : " of " : intTestLast : "|udfLevenshteinDistance2 (strString1, strString2)"
   strMsgText = "String1: " : @LF : '"' : arrTest[intT, 0] : '"'
   strMsgText = strMsgText : @LF : @LF : "String2: " : @LF : '"' : arrTest[intT, 1] : '"'
   strMsgText = strMsgText : @LF : @LF : "Levenshtein Distance: " : @LF : intResult
   strMsgText = strMsgText : @LF : @LF : "Comment: " : @LF : strComment
   Pause (strMsgTitle, strMsgText)

Next


; Performance comparison.
Decimals (2)
intTicksSum = intTicks1 + intTicks2
intTicksPct1 = 100.0 * intTicks1 / intTicksSum
intTicksPct2 = 100.0 * intTicks2 / intTicksSum

strMsgTitle = "Test Result|udfLevenshteinDistance1 - udfLevenshteinDistance2"
strMsgText = "udfLevenshteinDistance1 Ticks: " : intTicks1
strMsgText = strMsgText : @LF : "udfLevenshteinDistance2 Ticks: " : intTicks2
strMsgText = strMsgText : @LF : @LF : "udfLevenshteinDistance1 Pct: " : intTicksPct1
strMsgText = strMsgText : @LF : "udfLevenshteinDistance2 Pct: " : intTicksPct2
ClipPut (StrReplace (strMsgTitle : @LF : @LF : strMsgText, @LF, @CRLF))
Pause (strMsgTitle, strMsgText)


:CANCEL
If FileExist (strFileTemp) == 1 Then blnResult = FileDelete (strFileTemp)
Exit
;------------------------------------------------------------------------------------------------------------------------------------------
;   Test Result|udfLevenshteinDistance1 - udfLevenshteinDistance2
;
;   udfLevenshteinDistance1 Ticks: 32735
;   udfLevenshteinDistance2 Ticks: 18125
;
;   udfLevenshteinDistance1 Pct: 64.36
;   udfLevenshteinDistance2 Pct: 35.64
;------------------------------------------------------------------------------------------------------------------------------------------