udfStrPhonex
str udfStrPhonex (str)
;------------------------------------------------------------------------------------------------------------------------------------------
#DefineFunction udfStrPhonex (strString)
If strString == "" Then Return ""

; Make all characters uppercase.
strString = StrUpper (strString)

; Special pre-processing for german language.
;strString = StrReplace (strString, "SCH", "S") ; German special "SCH".
;strString = StrReplace (strString, "ß", "S")   ; German special sharp-s "ß".

; Remove all occurrences of non alpha chars.
strString = StrClean (strString, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "", @TRUE, 2)
If strString == "" Then Return ""

; Load a WinBatch Binary Buffer.
intLen = StrLen (strString)
hdlBB = BinaryAlloc (intLen + 1) ; Plus one byte for our EndOfString zero byte.
BinaryPokeStr (hdlBB, 0, strString)

; Remove all trailing 'S' characters at the end of the name.
While 83 == BinaryPeek (hdlBB, intLen - 1) ; S
   intLen = intLen - 1
EndWhile
BinaryPoke (hdlBB, intLen, 0) ; Set EndOfString using zero byte.

; Convert leading letter-pairs as follows: KN to N ; WR to R ; PH to F.
intPos = 0
Switch BinaryPeek2 (hdlBB, intPos)
Case 20043 ; KN ; 20043 = Char2Num("K") + 256*Char2Num("N") = 75 + 256*78.
   BinaryPoke (hdlBB, 0, 78) ; N
   intPos = 1
   Break
Case 21079 ; WR ; 21079 = Char2Num("W") + 256*Char2Num("R") = 87 + 256*82.
   BinaryPoke (hdlBB, 0, 82) ; R
   intPos = 1
   Break
Case 18512 ; PH ; 18512 = Char2Num("P") + 256*Char2Num("H") = 80 + 256*72.
   BinaryPoke (hdlBB, 0, 70) ; F
   intPos = 1
   Break
EndSwitch

If intPos == 0
   ; Ignore H first letter.
   If 72 == BinaryPeek (hdlBB, intPos) Then intPos = 1 ; H

   ; Convert leading single letters: (A), E, I, O, U, Y to A ; K, Q to C ; P to B ; J to G ; V to F ; Z to S.
   intChar = BinaryPeek (hdlBB, intPos)
   Switch intChar
   Case 65 ; A
   Case 69 ; E
   Case 73 ; I
   Case 79 ; O
   Case 86 ; U
   Case 89 ; Y
      BinaryPoke (hdlBB, 0, 65)  ; A
      Break
   Case 75 ; K
   Case 81 ; Q
      BinaryPoke (hdlBB, 0, 67)  ; C
      Break
   Case 80 ; P
      BinaryPoke (hdlBB, 0, 66)  ; B
      Break
   Case 74 ; J
      BinaryPoke (hdlBB, 0, 71)  ; G
      Break
   Case 86 ; V
      BinaryPoke (hdlBB, 0, 70)  ; F
      Break
   Case 90 ; Z
      BinaryPoke (hdlBB, 0, 83)  ; S
      Break
   EndSwitch
EndIf

; Do the coding.
intCodePos = 1
While @TRUE
   intPos = intPos + 1
   If intPos >= intLen Then Break
   If intCodePos > 3 Then Break

   Switch BinaryPeek (hdlBB, intPos)
   Case 66 ; B
   Case 70 ; F
   Case 80 ; P
   Case 86 ; V
      If 49 != BinaryPeek (hdlBB, intCodePos - 1)
         BinaryPoke (hdlBB, intCodePos, 49)        ; Code 1.
         intCodePos = intCodePos + 1
      EndIf
      Break
   Case 67 ; C
   Case 71 ; G
   Case 74 ; J
   Case 75 ; K
   Case 81 ; Q
   Case 83 ; S
   Case 88 ; X
   Case 90 ; Z
      If 50 != BinaryPeek (hdlBB, intCodePos - 1)
         BinaryPoke (hdlBB, intCodePos, 50)        ; Code 2.
         intCodePos = intCodePos + 1
      EndIf
      Break
   Case 68 ; D
   Case 84 ; T
      If 67 != BinaryPeek (hdlBB, intPos + 1) ; C
         If 51 != BinaryPeek (hdlBB, intCodePos - 1)
            BinaryPoke (hdlBB, intCodePos, 51)     ; Code 3.
            intCodePos = intCodePos + 1
         EndIf
      EndIf
      Break
   Case 76 ; L
      Switch BinaryPeek (hdlBB, intPos + 1)
      Case 0  ; End of String.
      Case 65 ; A
      Case 69 ; E
      Case 73 ; I
      Case 79 ; O
      Case 85 ; U
      Case 89 ; Y
         If 52 != BinaryPeek (hdlBB, intCodePos - 1)
            BinaryPoke (hdlBB, intCodePos, 52)     ; Code 4.
            intCodePos = intCodePos + 1
         EndIf
         intPos = intPos + 1
         Break
      EndSwitch
      Break
   Case 77 ; M
   Case 78 ; N
      Switch BinaryPeek (hdlBB, intPos + 1)
      Case 68 ; D
      Case 71 ; G
         intPos = intPos + 1
         Break
      EndSwitch
      If 53 != BinaryPeek (hdlBB, intCodePos - 1)
         BinaryPoke (hdlBB, intCodePos, 53)        ; Code 5.
         intCodePos = intCodePos + 1
      EndIf
      Break
   Case 82 ; R
      Switch BinaryPeek (hdlBB, intPos + 1)
      Case 0  ; End of String
      Case 65 ; A
      Case 69 ; E
      Case 73 ; I
      Case 79 ; O
      Case 85 ; U
      Case 89 ; Y
         If 54 != BinaryPeek (hdlBB, intCodePos - 1)
            BinaryPoke (hdlBB, intCodePos, 54)     ; Code 6.
            intCodePos = intCodePos + 1
         EndIf
         intPos = intPos + 1
         Break
      EndSwitch
      Break
   EndSwitch
EndWhile

strPhonex = StrFix (BinaryPeekStr (hdlBB, 0, intCodePos), "0", 4)

hdlBB = BinaryFree (hdlBB)

Return strPhonex
;..........................................................................................................................................
; Abstract:
;
;   The Phonex name-matching algorithm is a compromise between generality and specificity,
;   and achieves a comparatively good overall performance when applied to names in the English language.
;   Current namematching methods, including the Phonex algorithm, fall into one of two categories,
;   those which consider the phonetic structure of names,
;   and those which consider the names on a character-by-character basis.
;
;   Both of these approaches have advantages and disadvantages that make
;   them better or worse when applied to a specific task.
;
;   The phonetic-based approaches (such as Phonex) are somewhat more specific to a
;   particular language but implement a better appreciation of names that sound similar.
;..........................................................................................................................................
;   The overall accuracy of the Phonex method is marginally lower
;   (approximately 0.2%) than that of the Soundex method.
;   The percentage of true matches determined by the Phonex method is
;   approximately 44% higher than that of the Soundex method.
;..........................................................................................................................................
;
; The Phonex Algorithm  (See page p21 in "Phonex" PDF documentation):
;
;   The algorithm converts each name to a four-character code,
;   which can be used to identify equivalent names,
;   and is structured as follows:
;
;   Pre-process the name according to the following rules:
;   1. Remove all trailing 'S' characters at the end of the name.
;   2. Convert leading letter-pairs as follows:
;      KN -> N
;      WR -> R
;      PH -> F
;   3. Convert leading single letters as follows:
;      H -> Remove
;      E, I, O, U, Y -> A
;      K, Q -> C
;      P -> B
;      J -> G
;      V -> F
;      Z -> S
;
;   Code the pre-processed name according to the following rules:
;   1. Retain the first letter of the name, and drop all occurrences
;      of A, E, H, I, O, U, W, Y in other positions.
;   2. Assign the following numbers to the remaining letters after the first:
;      B, F, P, V -> 1
;      C, G, J, K, Q, S, X, Z -> 2
;      D, T -> 3 ... If not followed by C.
;      L    -> 4 ... If not followed by vowel or end of name.
;      M, N -> 5 ... Ignore next letter if either D or G.
;      R    -> 6 ... If not followed by vowel or end of name.
;   3. Ignore the current letter if it has the same code digit as the last character of the code.
;   4. Convert to the form ‘letter, digit, digit, digit’ by adding trailing zeros
;      (if there are less than three digits), or by dropping rightmost digits it there are more than three.
;      Although the resulting four-character code is identical in format to that produced by the Soundex coding
;      algorithm, these two forms are not compatible.
;..........................................................................................................................................
;
; Note:
;
;   There is a heavy, almost deadly, difference between the descriptions on page p21 and page p27
;   in the in "Phonex" PDF documentation!
;   See yourself: Compare the descriptions of cases 'L' and 'R'.
;
;   My implementation of the "Phonex" algorithm in WinBatch code relies on the following formal description.
;
;   On page p27 in the "Phonex" PDF document a part of the algorithm is described as follows:
;
;   The format of the code is the same as for the /soundex/ method (i.e. letter, digit, digit, digit)
;   but the manner in which these code characters are determined is slightly different.
;
;   The majority of the original character equivalents are retained,
;   using the following translations of letters in to code digits
;   (the first letter of the name is treated separately and is added directly to the code as a letter not a digit):
;   b, f, p, v -> 1
;   c, g, j, k, q, s, x, z -> 2
;   d, t -> 3
;   l    -> 4
;   m, n -> 5
;   r    -> 6
;
;   The Phonex method does, however, slightly modify this coding technique in that the letters
;   ’D’, ‘T’, ‘L’, ‘M’, ‘N’, and ‘R’ are subject to further processing before determining what code digit to add:
;
;   - If a ‘D’ or ‘T’ is followed by a ‘C’, the ‘D’ or ‘T’ is not coded since it is considered that,
;     as in ‘TCH’ and ‘CH’, the omission of the ‘D’ or ‘T’ will enable more true matches to be identified.
;
;   - An ‘L’ is only coded if it is followed by a vowel, or it is the last character of a name.
;
;   - If an ‘M’ or ‘N’ is followed by a ‘D’ or ‘G’ the following letter is overwritten with a duplicate of the
;     current (which will then be ignored), since these letter combinations are considered to phonetically equivalent.
;
;   - An ‘R’ is only coded if it is followed by a vowel, or it is the last character of a name.
;..........................................................................................................................................
;
; Reference:
;
;   Phonex documentation in "NameMatching.pdf", Created: 16.04.2001 15:26:54.
;   "An Assessment of Name Matching Algorithms"
;   A. J. Lait and B. Randell
;   Department of Computing Science
;   University of Newcastle upon Tyne
;   Contact author: Brian.Randell@newcastle.ac.uk
;   http://www.cs.ncl.ac.uk/~brian.randell/home.informal/Genealogy/NameMatching.pdf
;..........................................................................................................................................
; Detlev Dalitz.20020729.20100207.
;..........................................................................................................................................
#EndFunction
;------------------------------------------------------------------------------------------------------------------------------------------


; Test.

strNames = ""
strNames = strNames : "123 testsss  " ; T230 = 123 testsss
strNames = strNames : ",24/7 test   " ; T230 = 24/7 test
strNames = strNames : ",ZYX test    " ; S232 = ZYX test
strNames = strNames : ",A           " ; A000 = A
strNames = strNames : ",Lee         " ; L000 = Lee
strNames = strNames : ",Kuhne       " ; C500 = Kuhne
strNames = strNames : ",Meyer-Lansky" ; M452 = Meyer-Lansky
strNames = strNames : ",Oepping     " ; A150 = Oepping
strNames = strNames : ",Daley       " ; D400 = Daley
strNames = strNames : ",Dalitz      " ; D432 = Dalitz
strNames = strNames : ",Duhlitz     " ; D432 = Duhlitz
strNames = strNames : ",De Ledes    " ; D430 = De Ledes
strNames = strNames : ",Schüßler    " ; S246 = Schüßler
strNames = strNames : ",Schmidt     " ; S253 = Schmidt
strNames = strNames : ",Sandemann   " ; S500 = Sandemann
strNames = strNames : ",Sinatra     " ; S536 = Sinatra
strNames = strNames : ",Heinrich    " ; A562 = Heinrich
strNames = strNames : ",Hammerschlag" ; A524 = Hammerschlag
strNames = strNames : ",Williams    " ; W450 = Williams
strNames = strNames : ",Wilms       " ; W500 = Wilms
strNames = strNames : ",Wilson      " ; W250 = Wilson
strNames = strNames : ",Worms       " ; W500 = Worms
strNames = strNames : ",Zedlitz     " ; S343 = Zedlitz
strNames = strNames : ",Zotteldecke " ; S320 = Zotteldecke
strNames = strNames : ",Scherman    " ; S250 = Scherman
strNames = strNames : ",Schurman    " ; S250 = Schurman
strNames = strNames : ",Sherman     " ; S500 = Sherman
strNames = strNames : ",Shireman    " ; S650 = Shireman
strNames = strNames : ",Shurman     " ; S500 = Shurman
strNames = strNames : ",Euler       " ; A460 = Euler
strNames = strNames : ",Ellery      " ; A460 = Ellery
strNames = strNames : ",Hilbert     " ; A130 = Hilbert
strNames = strNames : ",Heilbronn   " ; A165 = Heilbronn
strNames = strNames : ",Gauss       " ; G000 = Gauss
strNames = strNames : ",Ghosh       " ; G200 = Ghosh
strNames = strNames : ",Knuth       " ; N300 = Knuth
strNames = strNames : ",Kant        " ; C530 = Kant
strNames = strNames : ",Lloyd       " ; L430 = Lloyd
strNames = strNames : ",Ladd        " ; L300 = Ladd
strNames = strNames : ",Lukasiewicz " ; L200 = Lukasiewicz
strNames = strNames : ",Lissajous   " ; L200 = Lissajous
strNames = strNames : ",Ashcraft    " ; A261 = Ashcraft
strNames = strNames : ",Philip      " ; F410 = Philip
strNames = strNames : ",Fripp       " ; F610 = Fripp
strNames = strNames : ",Czarkowska  " ; C200 = Czarkowska
strNames = strNames : ",Hornblower  " ; A514 = Hornblower
strNames = strNames : ",Looser      " ; L260 = Looser

intCountNames = ItemCount (strNames, ",")

ClipPut ("")

strOut = ""
For intI = 1 To intCountNames
   strName = ItemExtract (intI, strNames, ",")
   strOut = strOut : udfStrPhonex (strName) : " = " : strName : @LF
Next
IntControl (63, 200, 100, 800, 900)
IntControl (28, 1, 0, 0, 0)
AskItemlist ("Demo udfStrPhonex (strString)", strOut, @LF, @UNSORTED, @SINGLE)
ClipAppend (StrReplace (strOut, @LF, @CRLF) : @CRLF)

:CANCEL
Exit
;------------------------------------------------------------------------------------------------------------------------------------------
;   Phonex
;
;   T230 = 123 testsss
;   T230 = 24/7 test
;   S232 = ZYX test
;   A000 = A
;   L000 = Lee
;   C500 = Kuhne
;   M452 = Meyer-Lansky
;   A150 = Oepping
;   D400 = Daley
;   D432 = Dalitz
;   D432 = Duhlitz
;   D430 = De Ledes
;   S246 = Schüßler
;   S253 = Schmidt
;   S500 = Sandemann
;   S536 = Sinatra
;   A562 = Heinrich
;   A524 = Hammerschlag
;   W450 = Williams
;   W500 = Wilms
;   W250 = Wilson
;   W500 = Worms
;   S343 = Zedlitz
;   S320 = Zotteldecke
;   S250 = Scherman
;   S250 = Schurman
;   S500 = Sherman
;   S650 = Shireman
;   S500 = Shurman
;   A460 = Euler
;   A460 = Ellery
;   A130 = Hilbert
;   A165 = Heilbronn
;   G000 = Gauss
;   G200 = Ghosh
;   N300 = Knuth
;   C530 = Kant
;   L430 = Lloyd
;   L300 = Ladd
;   L200 = Lukasiewicz
;   L200 = Lissajous
;   A261 = Ashcraft
;   F410 = Philip
;   F610 = Fripp
;   C200 = Czarkowska
;   A514 = Hornblower
;   L260 = Looser
;------------------------------------------------------------------------------------------------------------------------------------------