;------------------------------------------------------------------------------------------------------------------------------------------ #DefineFunction udfLevenshteinDistance1 (strString1, strString2) ; The matrix version. intLen1 = StrLen (strString1) intLen2 = StrLen (strString2) If strString1 == strString2 Then Return 0 ; Strings are identical. If !intLen1 Then Return intLen2 If !intLen2 Then Return intLen1 If intLen1 > 512 Then Return -1 ; See Note at end of function. If intLen2 > 512 Then Return -1 ; See Note at end of function. ; Initialize matrix. arrM = ArrDimension (intLen1 + 1, intLen2 + 1) For intRow = 0 To intLen1 arrM[intRow, 0] = intRow Next For intCol = 0 To intLen2 arrM[0, intCol] = intCol Next ; Do the matrix work. For intRow = 1 To intLen1 For intCol = 1 To intLen2 intCost = StrSub (strString1, intRow, 1) != StrSub (strString2, intCol, 1) arrM[intRow, intCol] = Min (1 + arrM[intRow - 1, intCol], 1 + arrM[intRow, intCol - 1], intCost + arrM[intRow - 1, intCol - 1]) Next Next Return arrM[intLen1, intLen2] ;.......................................................................................................................................... ; This UDF "udfLevenshteinDistance" returns an integer number which indicates the Levenshtein-Distance between ; the two argument strings or -1, if one of the argument strings is longer than the limit of 255 characters ; ; The maximal string size of 512 characters should be more than enough for name or dictionary comparison. ; An array matrix of 512 * 512 elements has about 256k elements! ; ; The Levenshtein distance is defined as the minimal number of characters you have to replace, ; insert or delete to transform strString1 into strString2. ; ; The greater the Levenshtein-Distance, the more different the strings are. ; ; The Levenshtein-Distance is named after the Russian scientist Vladimir Levenshtein, ; who devised the algorithm in 1965. ; In its simplest form the function will take only the two strings as parameter and will calculate ; just the number of insert, replace and delete operations needed to transform strString1 into strString2. ; ; If you cannot spell or pronounce Levenshtein, the metric is also sometimes called 'edit distance'. ; The Levenshtein distance algorithm has been used in: ; - Spell checking, - Speech recognition, - DNA analysis, - Plagiarism detection . ; ; Reference: http://www.merriampark.com/ld.htm ;.......................................................................................................................................... ; Detlev Dalitz.20020805.20030212.20100207.20100914. ;.......................................................................................................................................... #EndFunction ;------------------------------------------------------------------------------------------------------------------------------------------ ;------------------------------------------------------------------------------------------------------------------------------------------ #DefineFunction udfLevenshteinDistance2 (strString1, strString2) ; The vector version. If strString1 == strString2 Then Return 0 ; Strings are identical. arrS1 = ArrayFromStr (strString1) arrS2 = ArrayFromStr (strString2) intLen1 = ArrInfo (arrS1, 1) intLen2 = ArrInfo (arrS2, 1) If !intLen1 Then Return intLen2 If !intLen2 Then Return intLen1 If intLen1 > intLen2 ; ... then swap string arrays (only their names) and length variable content. arrS0 = arrS1 arrS1 = arrS2 arrS2 = arrS0 intLen0 = intLen1 intLen1 = intLen2 intLen2 = intLen0 EndIf ; Initialize vectors. intLenMax = Max (intLen1, intLen2) arrV1 = ArrDimension (intLenMax + 1) arrV2 = ArrDimension (intLenMax + 1) ArrInitialize (arrV2, 0) For intElem = 0 To intLenMax arrV1[intElem] = intElem Next ; Do the vector work. intLast1 = intLen1 - 1 intLast2 = intLen2 - 1 For intRow = 0 To intLast1 arrV2[0] = intRow For intCol = 0 To intLast2 intCost = arrS1[intRow] != arrS2[intCol] arrV2[intCol + 1] = Min (1 + arrV2[intCol], 1 + arrV1[intCol + 1], intCost + arrV1[intCol]) Next arrV0 = arrV1 ; Swap vectors (only their names). arrV1 = arrV2 arrV2 = arrV0 Next Return arrV1[intLenMax] ;.......................................................................................................................................... ; This UDF "udfLevenshteinDistance" returns an integer number which indicates the Levenshtein-Distance ; between the two argument strings. ; ; The Levenshtein distance is defined as the minimal number of characters you have to replace, ; insert or delete to transform strString1 into strString2. ; ; The greater the Levenshtein-Distance, the more different the strings are. ; ; The Levenshtein-Distance is named after the Russian scientist Vladimir Levenshtein, ; who devised the algorithm in 1965. ; In its simplest form the function will take only the two strings as parameter and will calculate ; just the number of insert, replace and delete operations needed to transform strString1 into strString2. ; ; If you cannot spell or pronounce Levenshtein, the metric is also sometimes called 'edit distance'. ; The Levenshtein distance algorithm has been used in: ; - Spell checking, - Speech recognition, - DNA analysis, - Plagiarism detection . ; ; Reference: http://www.merriampark.com/ld.htm ;.......................................................................................................................................... ; This WinBatch UDF "udfLevenshteinDistance" algorithm has been adapted from article ; "Fast, memory efficient Levenshtein algorithm", Sten Hjelmqvist, 2006-03-27. ; http://www.codeproject.com/KB/recipes/Levenshtein.aspx ;.......................................................................................................................................... ; The original Levenshtein distance algorithm creates an array matrix, where the size is StrLen1 * StrLen2. ; If both strings are 1000 characters long, the resulting matrix is 1M elements; ; if the strings are 10000 chars, the matrix will be 100M elements. ; This version of the algorithm uses only 2 * StrLen elements, which is much more memory efficient! ;.......................................................................................................................................... ; (c)Detlev Dalitz.20100914. ;.......................................................................................................................................... #EndFunction ;------------------------------------------------------------------------------------------------------------------------------------------ ; Test. ; Prepare Test Data. strCSV = '"String1","String2","Distance","Comment"' strCSV = strCSV : @CRLF : '"","",0,"identical"' strCSV = strCSV : @CRLF : '"GLAVIN","GLAVIN",0,"Identical."' strCSV = strCSV : @CRLF : '"GLAVINE","GLAWYN",3,""' strCSV = strCSV : @CRLF : '"GLAWYN","GLAVINE",3,""' strCSV = strCSV : @CRLF : '"Darlitz","Dalitz",1,"High similarity."' strCSV = strCSV : @CRLF : '"Dahlicz","Dalitz",2,""' strCSV = strCSV : @CRLF : '"Daley","Dalitz",3,""' strCSV = strCSV : @CRLF : '"Dallwitz","Dalitz",2,""' strCSV = strCSV : @CRLF : '"Duhlitz","Dalitz",2,""' strCSV = strCSV : @CRLF : '"Talfritz","Dalitz",3,""' strCSV = strCSV : @CRLF : '"FORTRAN","BASIC",7,"Very low similarity."' strCSV = strCSV : @CRLF : '"Alex","Alexander",5,""' strCSV = strCSV : @CRLF : '"GLADIOL","GALAKOL",3,""' strCSV = strCSV : @CRLF : '"GUMBO' : StrFill ("test", 200) : '","GAMBOL' : StrFill ("test", 200) : '",2,""' ; Write test data to temp file. strFileTemp = FileCreateTemp ("WBT") intBytesWritten = FilePut (strFileTemp, strCSV) Drop (strCSV) ; Read test data into array. arrTest = ArrayFileGetCSV (strFileTemp, 0) intTestLast = ArrInfo (arrTest, 1) - 1 intTestFirst = 1 ; Skip header line. ; Do the test loop. intTicks1 = 0 For intT = intTestFirst To intTestLast intTicks = GetTickCount () intResult = udfLevenshteinDistance1 (arrTest[intT, 0], arrTest[intT, 1]) intTicks1 = GetTickCount () - intTicks ; Prepare message output. If arrTest[intT, 3] != "" strComment = arrTest[intT, 3] ; Use comment from CSV file if any. Else ; Compare UDF result with given result value from CSV file. strComment = "UDF result is invalid." If intResult == arrTest[intT, 2] Then strComment = "UDF result is valid." EndIf strMsgTitle = "Test " : intT : " of " : intTestLast : "|udfLevenshteinDistance1 (strString1, strString2)" strMsgText = "String1: " : @LF : '"' : arrTest[intT, 0] : '"' strMsgText = strMsgText : @LF : @LF : "String2: " : @LF : '"' : arrTest[intT, 1] : '"' strMsgText = strMsgText : @LF : @LF : "Levenshtein Distance: " : @LF : intResult strMsgText = strMsgText : @LF : @LF : "Comment: " : @LF : strComment Pause (strMsgTitle, strMsgText) Next ; Do the test loop. intTicks2 = 0 For intT = intTestFirst To intTestLast intTicks = GetTickCount () intResult = udfLevenshteinDistance2 (arrTest[intT, 0], arrTest[intT, 1]) intTicks2 = GetTickCount () - intTicks ; Prepare message output. If arrTest[intT, 3] != "" strComment = arrTest[intT, 3] ; Use comment from CSV file if any. Else ; Compare UDF result with given result value from CSV file. strComment = "UDF result is invalid." If intResult == arrTest[intT, 2] Then strComment = "UDF result is valid." EndIf strMsgTitle = "Test " : intT : " of " : intTestLast : "|udfLevenshteinDistance2 (strString1, strString2)" strMsgText = "String1: " : @LF : '"' : arrTest[intT, 0] : '"' strMsgText = strMsgText : @LF : @LF : "String2: " : @LF : '"' : arrTest[intT, 1] : '"' strMsgText = strMsgText : @LF : @LF : "Levenshtein Distance: " : @LF : intResult strMsgText = strMsgText : @LF : @LF : "Comment: " : @LF : strComment Pause (strMsgTitle, strMsgText) Next ; Performance comparison. Decimals (2) intTicksSum = intTicks1 + intTicks2 intTicksPct1 = 100.0 * intTicks1 / intTicksSum intTicksPct2 = 100.0 * intTicks2 / intTicksSum strMsgTitle = "Test Result|udfLevenshteinDistance1 - udfLevenshteinDistance2" strMsgText = "udfLevenshteinDistance1 Ticks: " : intTicks1 strMsgText = strMsgText : @LF : "udfLevenshteinDistance2 Ticks: " : intTicks2 strMsgText = strMsgText : @LF : @LF : "udfLevenshteinDistance1 Pct: " : intTicksPct1 strMsgText = strMsgText : @LF : "udfLevenshteinDistance2 Pct: " : intTicksPct2 ClipPut (StrReplace (strMsgTitle : @LF : @LF : strMsgText, @LF, @CRLF)) Pause (strMsgTitle, strMsgText) :CANCEL If FileExist (strFileTemp) == 1 Then blnResult = FileDelete (strFileTemp) Exit ;------------------------------------------------------------------------------------------------------------------------------------------ ; Test Result|udfLevenshteinDistance1 - udfLevenshteinDistance2 ; ; udfLevenshteinDistance1 Ticks: 32735 ; udfLevenshteinDistance2 Ticks: 18125 ; ; udfLevenshteinDistance1 Pct: 64.36 ; udfLevenshteinDistance2 Pct: 35.64 ;------------------------------------------------------------------------------------------------------------------------------------------