;------------------------------------------------------------------------------------------------------------------------------------------ #DefineFunction udfStrPhonex (strString) If strString == "" Then Return "" ; Make all characters uppercase. strString = StrUpper (strString) ; Special pre-processing for german language. ;strString = StrReplace (strString, "SCH", "S") ; German special "SCH". ;strString = StrReplace (strString, "ß", "S") ; German special sharp-s "ß". ; Remove all occurrences of non alpha chars. strString = StrClean (strString, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "", @TRUE, 2) If strString == "" Then Return "" ; Load a WinBatch Binary Buffer. intLen = StrLen (strString) hdlBB = BinaryAlloc (intLen + 1) ; Plus one byte for our EndOfString zero byte. BinaryPokeStr (hdlBB, 0, strString) ; Remove all trailing 'S' characters at the end of the name. While 83 == BinaryPeek (hdlBB, intLen - 1) ; S intLen = intLen - 1 EndWhile BinaryPoke (hdlBB, intLen, 0) ; Set EndOfString using zero byte. ; Convert leading letter-pairs as follows: KN to N ; WR to R ; PH to F. intPos = 0 Switch BinaryPeek2 (hdlBB, intPos) Case 20043 ; KN ; 20043 = Char2Num("K") + 256*Char2Num("N") = 75 + 256*78. BinaryPoke (hdlBB, 0, 78) ; N intPos = 1 Break Case 21079 ; WR ; 21079 = Char2Num("W") + 256*Char2Num("R") = 87 + 256*82. BinaryPoke (hdlBB, 0, 82) ; R intPos = 1 Break Case 18512 ; PH ; 18512 = Char2Num("P") + 256*Char2Num("H") = 80 + 256*72. BinaryPoke (hdlBB, 0, 70) ; F intPos = 1 Break EndSwitch If intPos == 0 ; Ignore H first letter. If 72 == BinaryPeek (hdlBB, intPos) Then intPos = 1 ; H ; Convert leading single letters: (A), E, I, O, U, Y to A ; K, Q to C ; P to B ; J to G ; V to F ; Z to S. intChar = BinaryPeek (hdlBB, intPos) Switch intChar Case 65 ; A Case 69 ; E Case 73 ; I Case 79 ; O Case 86 ; U Case 89 ; Y BinaryPoke (hdlBB, 0, 65) ; A Break Case 75 ; K Case 81 ; Q BinaryPoke (hdlBB, 0, 67) ; C Break Case 80 ; P BinaryPoke (hdlBB, 0, 66) ; B Break Case 74 ; J BinaryPoke (hdlBB, 0, 71) ; G Break Case 86 ; V BinaryPoke (hdlBB, 0, 70) ; F Break Case 90 ; Z BinaryPoke (hdlBB, 0, 83) ; S Break EndSwitch EndIf ; Do the coding. intCodePos = 1 While @TRUE intPos = intPos + 1 If intPos >= intLen Then Break If intCodePos > 3 Then Break Switch BinaryPeek (hdlBB, intPos) Case 66 ; B Case 70 ; F Case 80 ; P Case 86 ; V If 49 != BinaryPeek (hdlBB, intCodePos - 1) BinaryPoke (hdlBB, intCodePos, 49) ; Code 1. intCodePos = intCodePos + 1 EndIf Break Case 67 ; C Case 71 ; G Case 74 ; J Case 75 ; K Case 81 ; Q Case 83 ; S Case 88 ; X Case 90 ; Z If 50 != BinaryPeek (hdlBB, intCodePos - 1) BinaryPoke (hdlBB, intCodePos, 50) ; Code 2. intCodePos = intCodePos + 1 EndIf Break Case 68 ; D Case 84 ; T If 67 != BinaryPeek (hdlBB, intPos + 1) ; C If 51 != BinaryPeek (hdlBB, intCodePos - 1) BinaryPoke (hdlBB, intCodePos, 51) ; Code 3. intCodePos = intCodePos + 1 EndIf EndIf Break Case 76 ; L Switch BinaryPeek (hdlBB, intPos + 1) Case 0 ; End of String. Case 65 ; A Case 69 ; E Case 73 ; I Case 79 ; O Case 85 ; U Case 89 ; Y If 52 != BinaryPeek (hdlBB, intCodePos - 1) BinaryPoke (hdlBB, intCodePos, 52) ; Code 4. intCodePos = intCodePos + 1 EndIf intPos = intPos + 1 Break EndSwitch Break Case 77 ; M Case 78 ; N Switch BinaryPeek (hdlBB, intPos + 1) Case 68 ; D Case 71 ; G intPos = intPos + 1 Break EndSwitch If 53 != BinaryPeek (hdlBB, intCodePos - 1) BinaryPoke (hdlBB, intCodePos, 53) ; Code 5. intCodePos = intCodePos + 1 EndIf Break Case 82 ; R Switch BinaryPeek (hdlBB, intPos + 1) Case 0 ; End of String Case 65 ; A Case 69 ; E Case 73 ; I Case 79 ; O Case 85 ; U Case 89 ; Y If 54 != BinaryPeek (hdlBB, intCodePos - 1) BinaryPoke (hdlBB, intCodePos, 54) ; Code 6. intCodePos = intCodePos + 1 EndIf intPos = intPos + 1 Break EndSwitch Break EndSwitch EndWhile strPhonex = StrFix (BinaryPeekStr (hdlBB, 0, intCodePos), "0", 4) hdlBB = BinaryFree (hdlBB) Return strPhonex ;.......................................................................................................................................... ; Abstract: ; ; The Phonex name-matching algorithm is a compromise between generality and specificity, ; and achieves a comparatively good overall performance when applied to names in the English language. ; Current namematching methods, including the Phonex algorithm, fall into one of two categories, ; those which consider the phonetic structure of names, ; and those which consider the names on a character-by-character basis. ; ; Both of these approaches have advantages and disadvantages that make ; them better or worse when applied to a specific task. ; ; The phonetic-based approaches (such as Phonex) are somewhat more specific to a ; particular language but implement a better appreciation of names that sound similar. ;.......................................................................................................................................... ; The overall accuracy of the Phonex method is marginally lower ; (approximately 0.2%) than that of the Soundex method. ; The percentage of true matches determined by the Phonex method is ; approximately 44% higher than that of the Soundex method. ;.......................................................................................................................................... ; ; The Phonex Algorithm (See page p21 in "Phonex" PDF documentation): ; ; The algorithm converts each name to a four-character code, ; which can be used to identify equivalent names, ; and is structured as follows: ; ; Pre-process the name according to the following rules: ; 1. Remove all trailing 'S' characters at the end of the name. ; 2. Convert leading letter-pairs as follows: ; KN -> N ; WR -> R ; PH -> F ; 3. Convert leading single letters as follows: ; H -> Remove ; E, I, O, U, Y -> A ; K, Q -> C ; P -> B ; J -> G ; V -> F ; Z -> S ; ; Code the pre-processed name according to the following rules: ; 1. Retain the first letter of the name, and drop all occurrences ; of A, E, H, I, O, U, W, Y in other positions. ; 2. Assign the following numbers to the remaining letters after the first: ; B, F, P, V -> 1 ; C, G, J, K, Q, S, X, Z -> 2 ; D, T -> 3 ... If not followed by C. ; L -> 4 ... If not followed by vowel or end of name. ; M, N -> 5 ... Ignore next letter if either D or G. ; R -> 6 ... If not followed by vowel or end of name. ; 3. Ignore the current letter if it has the same code digit as the last character of the code. ; 4. Convert to the form letter, digit, digit, digit by adding trailing zeros ; (if there are less than three digits), or by dropping rightmost digits it there are more than three. ; Although the resulting four-character code is identical in format to that produced by the Soundex coding ; algorithm, these two forms are not compatible. ;.......................................................................................................................................... ; ; Note: ; ; There is a heavy, almost deadly, difference between the descriptions on page p21 and page p27 ; in the in "Phonex" PDF documentation! ; See yourself: Compare the descriptions of cases 'L' and 'R'. ; ; My implementation of the "Phonex" algorithm in WinBatch code relies on the following formal description. ; ; On page p27 in the "Phonex" PDF document a part of the algorithm is described as follows: ; ; The format of the code is the same as for the /soundex/ method (i.e. letter, digit, digit, digit) ; but the manner in which these code characters are determined is slightly different. ; ; The majority of the original character equivalents are retained, ; using the following translations of letters in to code digits ; (the first letter of the name is treated separately and is added directly to the code as a letter not a digit): ; b, f, p, v -> 1 ; c, g, j, k, q, s, x, z -> 2 ; d, t -> 3 ; l -> 4 ; m, n -> 5 ; r -> 6 ; ; The Phonex method does, however, slightly modify this coding technique in that the letters ; D, T, L, M, N, and R are subject to further processing before determining what code digit to add: ; ; - If a D or T is followed by a C, the D or T is not coded since it is considered that, ; as in TCH and CH, the omission of the D or T will enable more true matches to be identified. ; ; - An L is only coded if it is followed by a vowel, or it is the last character of a name. ; ; - If an M or N is followed by a D or G the following letter is overwritten with a duplicate of the ; current (which will then be ignored), since these letter combinations are considered to phonetically equivalent. ; ; - An R is only coded if it is followed by a vowel, or it is the last character of a name. ;.......................................................................................................................................... ; ; Reference: ; ; Phonex documentation in "NameMatching.pdf", Created: 16.04.2001 15:26:54. ; "An Assessment of Name Matching Algorithms" ; A. J. Lait and B. Randell ; Department of Computing Science ; University of Newcastle upon Tyne ; Contact author: Brian.Randell@newcastle.ac.uk ; http://www.cs.ncl.ac.uk/~brian.randell/home.informal/Genealogy/NameMatching.pdf ;.......................................................................................................................................... ; Detlev Dalitz.20020729.20100207. ;.......................................................................................................................................... #EndFunction ;------------------------------------------------------------------------------------------------------------------------------------------ ; Test. strNames = "" strNames = strNames : "123 testsss " ; T230 = 123 testsss strNames = strNames : ",24/7 test " ; T230 = 24/7 test strNames = strNames : ",ZYX test " ; S232 = ZYX test strNames = strNames : ",A " ; A000 = A strNames = strNames : ",Lee " ; L000 = Lee strNames = strNames : ",Kuhne " ; C500 = Kuhne strNames = strNames : ",Meyer-Lansky" ; M452 = Meyer-Lansky strNames = strNames : ",Oepping " ; A150 = Oepping strNames = strNames : ",Daley " ; D400 = Daley strNames = strNames : ",Dalitz " ; D432 = Dalitz strNames = strNames : ",Duhlitz " ; D432 = Duhlitz strNames = strNames : ",De Ledes " ; D430 = De Ledes strNames = strNames : ",Schüßler " ; S246 = Schüßler strNames = strNames : ",Schmidt " ; S253 = Schmidt strNames = strNames : ",Sandemann " ; S500 = Sandemann strNames = strNames : ",Sinatra " ; S536 = Sinatra strNames = strNames : ",Heinrich " ; A562 = Heinrich strNames = strNames : ",Hammerschlag" ; A524 = Hammerschlag strNames = strNames : ",Williams " ; W450 = Williams strNames = strNames : ",Wilms " ; W500 = Wilms strNames = strNames : ",Wilson " ; W250 = Wilson strNames = strNames : ",Worms " ; W500 = Worms strNames = strNames : ",Zedlitz " ; S343 = Zedlitz strNames = strNames : ",Zotteldecke " ; S320 = Zotteldecke strNames = strNames : ",Scherman " ; S250 = Scherman strNames = strNames : ",Schurman " ; S250 = Schurman strNames = strNames : ",Sherman " ; S500 = Sherman strNames = strNames : ",Shireman " ; S650 = Shireman strNames = strNames : ",Shurman " ; S500 = Shurman strNames = strNames : ",Euler " ; A460 = Euler strNames = strNames : ",Ellery " ; A460 = Ellery strNames = strNames : ",Hilbert " ; A130 = Hilbert strNames = strNames : ",Heilbronn " ; A165 = Heilbronn strNames = strNames : ",Gauss " ; G000 = Gauss strNames = strNames : ",Ghosh " ; G200 = Ghosh strNames = strNames : ",Knuth " ; N300 = Knuth strNames = strNames : ",Kant " ; C530 = Kant strNames = strNames : ",Lloyd " ; L430 = Lloyd strNames = strNames : ",Ladd " ; L300 = Ladd strNames = strNames : ",Lukasiewicz " ; L200 = Lukasiewicz strNames = strNames : ",Lissajous " ; L200 = Lissajous strNames = strNames : ",Ashcraft " ; A261 = Ashcraft strNames = strNames : ",Philip " ; F410 = Philip strNames = strNames : ",Fripp " ; F610 = Fripp strNames = strNames : ",Czarkowska " ; C200 = Czarkowska strNames = strNames : ",Hornblower " ; A514 = Hornblower strNames = strNames : ",Looser " ; L260 = Looser intCountNames = ItemCount (strNames, ",") ClipPut ("") strOut = "" For intI = 1 To intCountNames strName = ItemExtract (intI, strNames, ",") strOut = strOut : udfStrPhonex (strName) : " = " : strName : @LF Next IntControl (63, 200, 100, 800, 900) IntControl (28, 1, 0, 0, 0) AskItemlist ("Demo udfStrPhonex (strString)", strOut, @LF, @UNSORTED, @SINGLE) ClipAppend (StrReplace (strOut, @LF, @CRLF) : @CRLF) :CANCEL Exit ;------------------------------------------------------------------------------------------------------------------------------------------ ; Phonex ; ; T230 = 123 testsss ; T230 = 24/7 test ; S232 = ZYX test ; A000 = A ; L000 = Lee ; C500 = Kuhne ; M452 = Meyer-Lansky ; A150 = Oepping ; D400 = Daley ; D432 = Dalitz ; D432 = Duhlitz ; D430 = De Ledes ; S246 = Schüßler ; S253 = Schmidt ; S500 = Sandemann ; S536 = Sinatra ; A562 = Heinrich ; A524 = Hammerschlag ; W450 = Williams ; W500 = Wilms ; W250 = Wilson ; W500 = Worms ; S343 = Zedlitz ; S320 = Zotteldecke ; S250 = Scherman ; S250 = Schurman ; S500 = Sherman ; S650 = Shireman ; S500 = Shurman ; A460 = Euler ; A460 = Ellery ; A130 = Hilbert ; A165 = Heilbronn ; G000 = Gauss ; G200 = Ghosh ; N300 = Knuth ; C530 = Kant ; L430 = Lloyd ; L300 = Ladd ; L200 = Lukasiewicz ; L200 = Lissajous ; A261 = Ashcraft ; F410 = Philip ; F610 = Fripp ; C200 = Czarkowska ; A514 = Hornblower ; L260 = Looser ;------------------------------------------------------------------------------------------------------------------------------------------