4 people like it.

Soundex Algorithm

Algorithms for generating US Census and Daitch-Mokotoff soundex string(s) based on a text input. Soundex is a phonetic algorithm for indexing names by sound, as pronounced in English. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling.

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
165: 
166: 
167: 
168: 
169: 
170: 
171: 
172: 
173: 
174: 
175: 
176: 
177: 
178: 
179: 
180: 
181: 
182: 
183: 
184: 
185: 
186: 
187: 
188: 
189: 
190: 
191: 
192: 
193: 
194: 
195: 
open System
open System.Linq
open System.Text

module Soundex =
    /// The Soundex code for a name consists of a letter followed by three numerical digits: the letter 
    /// is the first letter of the name, and the digits encode the remaining consonants. 
    /// The National Archives and Records Administration (NARA) maintains the current rule set for the 
    /// official implementation of Soundex used by the U.S. Government. The American Soundex is a variant 
    /// of the original Russell Soundex algorithm.
    /// Reference: http://www.archives.gov/research/census/soundex.html
    let American(text : string) =
        let chooser c = 
            match Char.ToLowerInvariant(c) with
            | 'b' | 'f' | 'p' | 'v' -> '1'
            | 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' -> '2'
            | 'd' | 't' -> '3'
            | 'l' -> '4'
            | 'm' | 'n' -> '5'
            | 'r' -> '6'
            | 'h' | 'w' -> '-'                        
            | _ -> if Char.IsDigit(c) then c else '.'

        let folder (state : char list) (c) =
            match chooser(Seq.head state), chooser(c) with
            | p, i when (p <> i && i <> '-') || i = '0' -> i :: state 
            | _, _ -> state                    

        let value = text.Trim() + "000"
        let soundex = Seq.toList (value.Substring(1))
                      |> List.fold folder [ Char.ToUpperInvariant(value.[0]) ]
                      |> List.filter (fun c -> c <> '.')
                      |> List.rev
                      |> Seq.truncate 4
                      |> Seq.toArray        
        String(soundex)

    /// Daitch–Mokotoff Soundex (D–M Soundex) was developed in 1985 by genealogist Gary Mokotoff and later 
    /// improved by genealogist Randy Daitch because of problems they encountered while trying to apply the 
    /// Russell Soundex to Jews with Germanic or Slavic surnames.
    /// References:
    /// <list type="bullet">
    /// <description>http://www.avotaynu.com/soundex.htm</description>
    /// <description>http://www.jewishgen.org/InfoFiles/soundex.html</description>
    /// </list>
    let DaitchMokotoff(text : string) =

        let isVowel(value : string) =
            let first = if String.IsNullOrEmpty(value) then String.Empty else value.Substring(0, 1)
            match first with
            | "A" | "E" | "I" | "O" | "U" | "Y" -> true
            | _ -> false
        
        let (|Match|_|) (values : seq<string>) (codes : seq<string>) (s : string) = 
            let result = Seq.tryFind (fun value -> s.StartsWith(value)) values
            if result.IsNone then None else Some( codes, isVowel(result.Value), s.Substring(result.Value.Length)) 
            
        let (|Group|_|) (s : string) = 
            match s with
            | Match ["AI";"AJ";"AY"]        ["0"; "1"; ""]      result -> Some(result)
            | Match ["AU"]                  ["0"; "7"; ""]      result -> Some(result)
            | Match ["A"]                   ["0"; ""; ""]       result -> Some(result)
            | Match ["B"]                   ["7"; "7"; "7"]     result -> Some(result)
            | Match ["CHS"]                 ["5"; "54"; "54"]   result -> Some(result)
            | Match ["CH"]                  ["KH"; "TCH"]       result -> Some(result)
            | Match ["CK"]                  ["K"; "TSK"]        result -> Some(result)
            | Match ["CZS";"CZ";"CSZ";"CS"] ["4"; "4"; "4"]     result -> Some(result)
            | Match ["C"]                   ["K"; "TZ"]         result -> Some(result)
            | Match ["DRZ";"DRS"]           ["4"; "4"; "4"]     result -> Some(result)
            | Match ["DSZ";"DSH";"DS"]      ["4"; "4"; "4"]     result -> Some(result)
            | Match ["DZH";"DZS";"DZ"]      ["4"; "4"; "4"]     result -> Some(result)
            | Match ["DT"]                  ["3"; "3"; "3"]     result -> Some(result)
            | Match ["EI";"EJ";"EY"]        ["0"; "1"; ""]      result -> Some(result)
            | Match ["EU"]                  ["1"; "1"; ""]      result -> Some(result)
            | Match ["E"]                   ["0"; ""; ""]       result -> Some(result)
            | Match ["FB"; "F"]             ["7"; "7"; "7"]     result -> Some(result)
            | Match ["G"]                   ["5"; "5"; "5"]     result -> Some(result)
            | Match ["H"]                   ["5"; "5"; ""]      result -> Some(result)
            | Match ["IA";"IE";"IO";"IU"]   ["1"; ""; ""]       result -> Some(result)
            | Match ["I"]                   ["0"; ""; ""]       result -> Some(result)
            | Match ["J2Y"]                 ["1"; "1"; "1"]     result -> Some(result)
            | Match ["J"]                   ["J2Y"; "DZH"]      result -> Some(result)
            | Match ["KS"]                  ["5"; "54"; "54"]   result -> Some(result)
            | Match ["KH"; "K"]             ["5"; "5"; "5"]     result -> Some(result)
            | Match ["L"]                   ["8"; "8"; "8"]     result -> Some(result)
            | Match ["MN";"NM"]             ["66"; "66"; "66"]  result -> Some(result)
            | Match ["M"; "N"]              ["6"; "6"; "6"]     result -> Some(result)
            | Match ["OI";"OJ";"OY"]        ["0"; "1"; ""]      result -> Some(result)
            | Match ["O"]                   ["0"; ""; ""]       result -> Some(result)
            | Match ["PF";"PH"; "P"]        ["7"; "7"; "7"]     result -> Some(result)
            | Match ["RTZ"]                 ["94"; "94"; "94"]  result -> Some(result)
            | Match ["RS";"RZ"]             ["RTZ"; "ZH"]       result -> Some(result)
            | Match ["R"]                   ["9"; "9"; "9"]     result -> Some(result)
            | Match ["SCHTSCH";"SCHTSH";"SCHTCH"] ["2"; "4"; "4"] result -> Some(result)
            | Match ["SCH"]                 ["4"; "4"; "4"]     result -> Some(result)
            | Match ["SHTCH";"SHCH";"SHTSH"] ["2"; "4"; "4"]    result -> Some(result)
            | Match ["SHT";"SCHT";"SCHD"]   ["2"; "43"; "43"]   result -> Some(result)
            | Match ["SH"]                  ["4"; "4"; "4"]     result -> Some(result)
            | Match ["STCH";"STSCH";"SC"]   ["2"; "4"; "4"]     result -> Some(result)
            | Match ["STRZ";"STRS";"STSH"]  ["2"; "4"; "4"]     result -> Some(result)
            | Match ["ST"]                  ["2"; "43"; "43"]   result -> Some(result)
            | Match ["SZCZ";"SZCS"]         ["2"; "4"; "4"]     result -> Some(result)
            | Match ["SZT";"SHD";"SZD";"SD"] ["2"; "43"; "43"]  result -> Some(result)
            | Match ["SZ";"S"]              ["4"; "4"; "4"]     result -> Some(result)
            | Match ["TCH";"TTCH";"TTSCH";"THS"] ["4";"4";"4"]  result -> Some(result)
            | Match ["TH"]                  ["3";"3";"3"]       result -> Some(result)
            | Match ["TRZ";"TRS"]           ["4";"4";"4"]       result -> Some(result)
            | Match ["TSCH";"TSH"]          ["4";"4";"4"]       result -> Some(result)
            | Match ["TSK"]                 ["45";"45";"45"]    result -> Some(result)
            | Match ["TTSZ";"TTS";"TC"]     ["4";"4";"4"]       result -> Some(result)
            | Match ["TZS";"TTZ";"TZ";"TSZ";"TS"] ["4";"4";"4"] result -> Some(result)
            | Match ["T"]                   ["3";"3";"3"]       result -> Some(result)
            | Match ["UI";"UJ";"UY"]        ["0";"1";""]        result -> Some(result)
            | Match ["UE";"U"]              ["0";"";""]         result -> Some(result)
            | Match ["V"]                   ["7";"7";"7"]       result -> Some(result)
            | Match ["W"]                   ["7";"7";"7"]       result -> Some(result)
            | Match ["X"]                   ["5";"54";"54"]     result -> Some(result)
            | Match ["Y"]                   ["1";"";""]         result -> Some(result)
            | Match ["ZHDZH";"ZDZH";"ZDZ"]  ["2";"4";"4"]       result -> Some(result)
            | Match ["ZD";"ZHD"]            ["2";"43";"43"]     result -> Some(result)
            | Match ["ZSCH";"ZSH";"ZH";"ZS"] ["4";"4";"4"]      result -> Some(result)
            | Match ["Z"]                   ["4";"4";"4"]       result -> Some(result)
            | _ -> None

        let search(value : string) =
            match value with            
            | Group result -> (*printfn "%s -> %A" value result;*) result
            | _ -> seq[ "" ], false, value.Substring(1)

        let rec decompose (value : string) = 
            let head, isVowel, next = search(value)
            if String.IsNullOrEmpty(next) then 
                seq [ head, isVowel ] 
            else 
                Seq.append [ head, isVowel ] (decompose(next))
        
        let rec encode (codes : seq<string>, start : bool, beforeVowel : bool) =
            let first = Seq.nth 0 codes
            if String.IsNullOrEmpty(first) then
                [ ]
            else
                if String.IsNullOrEmpty(first) || Char.IsDigit(Seq.nth 0 first) then
                    if start then
                        [ (Seq.nth 0 codes) ]
                    else if beforeVowel then
                        [ (Seq.nth 1 codes) ]
                    else
                        [ (Seq.nth 2 codes) ]
                else            
                    let head, isVowel, next = search(first)
                    if Seq.length codes = 1 then 
                        encode(head, start, beforeVowel) 
                    else 
                        List.append (encode(head, start, beforeVowel)) (encode(Seq.skip 1 codes, start, beforeVowel))
                
        let reduce (results : string list) (values : string list) = 
            let appender(value : string) = 
                fun (s : string) -> 
                    let result = (s + value)
                    if result.Length >= 6 then result.Substring(0, 6) else result
            seq { for value in values do yield List.map (appender(value)) results }
            |> Seq.concat
            |> Seq.toList

        let first, second = decompose ( text.ToUpperInvariant() ) 
                            |> Seq.pairwise
                            |> Seq.filter (fun (a, b) -> a <> b)
                            |> Seq.toList
                            |> List.unzip                                    
        let items, vowels = List.unzip (List.head first :: second)

        List.zip items ( List.append (List.tail vowels) [ false ] )
        |> List.mapi (fun (i) (codes, isVowel) -> encode(codes, i = 0, isVowel))
        |> List.filter (fun c -> not(List.isEmpty c) && (List.head c) <> "")
        |> List.reduce reduce 
        |> List.toSeq
        |> Seq.distinct
        |> Seq.map (fun s -> if s.Length >= 6 then s else String.Concat(s, String('0', 6 - s.Length)))

(*
Soundex.American("Ashcraft") // "A261"
Soundex.American("jackson")  // "J250"
Soundex.American("miller")	 // "M460"
Soundex.American("Wilson")	 // "W425"
Soundex.American("Schmit") 	 // "S530"
Soundex.American("Lloyd")	 // "L300"

Soundex.DaitchMokotoff("Peters")	 // [ "739400"; "734000" ];
Soundex.DaitchMokotoff("Peterson")	 // [ "739460"; "734600" ];
Soundex.DaitchMokotoff("Moskowitz")	 // [ "645740" ];
Soundex.DaitchMokotoff("Moskovitz")	 // [ "645740" ];
Soundex.DaitchMokotoff("Auerbach")	 // [ "097500"; "097400" ];
Soundex.DaitchMokotoff("Uhrbach")	 // [ "097500"; "097400" ];
Soundex.DaitchMokotoff("Jackson")	 // [ "154600"; "454600"; "145460"; "445460" ];
*)
namespace System
namespace System.Linq
namespace System.Text
module Soundex

from Script
val American : text:string -> String

Full name: Script.Soundex.American


 The Soundex code for a name consists of a letter followed by three numerical digits: the letter
 is the first letter of the name, and the digits encode the remaining consonants.
 The National Archives and Records Administration (NARA) maintains the current rule set for the
 official implementation of Soundex used by the U.S. Government. The American Soundex is a variant
 of the original Russell Soundex algorithm.
 Reference: http://www.archives.gov/research/census/soundex.html
val text : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
val chooser : (char -> char)
val c : char
type Char =
  struct
    member CompareTo : value:obj -> int + 1 overload
    member Equals : obj:obj -> bool + 1 overload
    member GetHashCode : unit -> int
    member GetTypeCode : unit -> TypeCode
    member ToString : unit -> string + 1 overload
    static val MaxValue : char
    static val MinValue : char
    static member ConvertFromUtf32 : utf32:int -> string
    static member ConvertToUtf32 : highSurrogate:char * lowSurrogate:char -> int + 1 overload
    static member GetNumericValue : c:char -> float + 1 overload
    ...
  end

Full name: System.Char
Char.ToLowerInvariant(c: char) : char
Char.IsDigit(c: char) : bool
Char.IsDigit(s: string, index: int) : bool
val folder : (char list -> char -> char list)
val state : char list
Multiple items
val char : value:'T -> char (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.char

--------------------
type char = Char

Full name: Microsoft.FSharp.Core.char
type 'T list = List<'T>

Full name: Microsoft.FSharp.Collections.list<_>
module Seq

from Microsoft.FSharp.Collections
val head : source:seq<'T> -> 'T

Full name: Microsoft.FSharp.Collections.Seq.head
val p : char
val i : char
val value : string
String.Trim() : string
String.Trim([<ParamArray>] trimChars: char []) : string
val soundex : char []
val toList : source:seq<'T> -> 'T list

Full name: Microsoft.FSharp.Collections.Seq.toList
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State

Full name: Microsoft.FSharp.Collections.List.fold
Char.ToUpperInvariant(c: char) : char
val filter : predicate:('T -> bool) -> list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.filter
val rev : list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.rev
val truncate : count:int -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.truncate
val toArray : source:seq<'T> -> 'T []

Full name: Microsoft.FSharp.Collections.Seq.toArray
Multiple items
type String =
  new : value:char -> string + 7 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 2 overloads
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  member GetHashCode : unit -> int
  ...

Full name: System.String

--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Encoding) : unit
val DaitchMokotoff : text:string -> seq<string>

Full name: Script.Soundex.DaitchMokotoff


 Daitch–Mokotoff Soundex (D–M Soundex) was developed in 1985 by genealogist Gary Mokotoff and later
 improved by genealogist Randy Daitch because of problems they encountered while trying to apply the
 Russell Soundex to Jews with Germanic or Slavic surnames.
 References:
 <list type="bullet">
 <description>http://www.avotaynu.com/soundex.htm</description>
 <description>http://www.jewishgen.org/InfoFiles/soundex.html</description>
 </list>
val isVowel : (string -> bool)
val first : string
String.IsNullOrEmpty(value: string) : bool
field string.Empty
val values : seq<string>
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Core.Operators.seq

--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>

Full name: Microsoft.FSharp.Collections.seq<_>
val codes : seq<string>
val s : string
val result : string option
val tryFind : predicate:('T -> bool) -> source:seq<'T> -> 'T option

Full name: Microsoft.FSharp.Collections.Seq.tryFind
String.StartsWith(value: string) : bool
String.StartsWith(value: string, comparisonType: StringComparison) : bool
String.StartsWith(value: string, ignoreCase: bool, culture: Globalization.CultureInfo) : bool
property Option.IsNone: bool
union case Option.None: Option<'T>
union case Option.Some: Value: 'T -> Option<'T>
property Option.Value: string
property String.Length: int
active recognizer Match: seq<string> -> seq<string> -> string -> (seq<string> * bool * string) option
val result : seq<string> * bool * string
val search : (string -> seq<string> * bool * string)
active recognizer Group: string -> (seq<string> * bool * string) option
val decompose : (string -> seq<seq<string> * bool>)
val head : seq<string>
val isVowel : bool
val next : string
val append : source1:seq<'T> -> source2:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.append
val encode : (seq<string> * bool * bool -> string list)
val start : bool
type bool = Boolean

Full name: Microsoft.FSharp.Core.bool
val beforeVowel : bool
val nth : index:int -> source:seq<'T> -> 'T

Full name: Microsoft.FSharp.Collections.Seq.nth
val length : source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.length
val append : list1:'T list -> list2:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.append
val skip : count:int -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.skip
val reduce : (string list -> string list -> string list)
val results : string list
val values : string list
val appender : (string -> string -> string)
val result : string
val map : mapping:('T -> 'U) -> list:'T list -> 'U list

Full name: Microsoft.FSharp.Collections.List.map
val concat : sources:seq<#seq<'T>> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.concat
val first : (seq<string> * bool) list
val second : (seq<string> * bool) list
String.ToUpperInvariant() : string
val pairwise : source:seq<'T> -> seq<'T * 'T>

Full name: Microsoft.FSharp.Collections.Seq.pairwise
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.filter
val a : seq<string> * bool
val b : seq<string> * bool
val unzip : list:('T1 * 'T2) list -> 'T1 list * 'T2 list

Full name: Microsoft.FSharp.Collections.List.unzip
val items : seq<string> list
val vowels : bool list
val head : list:'T list -> 'T

Full name: Microsoft.FSharp.Collections.List.head
val zip : list1:'T1 list -> list2:'T2 list -> ('T1 * 'T2) list

Full name: Microsoft.FSharp.Collections.List.zip
val tail : list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.tail
val mapi : mapping:(int -> 'T -> 'U) -> list:'T list -> 'U list

Full name: Microsoft.FSharp.Collections.List.mapi
val i : int
val c : string list
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
val isEmpty : list:'T list -> bool

Full name: Microsoft.FSharp.Collections.List.isEmpty
val reduce : reduction:('T -> 'T -> 'T) -> list:'T list -> 'T

Full name: Microsoft.FSharp.Collections.List.reduce
val toSeq : list:'T list -> seq<'T>

Full name: Microsoft.FSharp.Collections.List.toSeq
val distinct : source:seq<'T> -> seq<'T> (requires equality)

Full name: Microsoft.FSharp.Collections.Seq.distinct
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
String.Concat([<ParamArray>] values: string []) : string
   (+0 other overloads)
String.Concat(values: Collections.Generic.IEnumerable<string>) : string
   (+0 other overloads)
String.Concat<'T>(values: Collections.Generic.IEnumerable<'T>) : string
   (+0 other overloads)
String.Concat([<ParamArray>] args: obj []) : string
   (+0 other overloads)
String.Concat(arg0: obj) : string
   (+0 other overloads)
String.Concat(str0: string, str1: string) : string
   (+0 other overloads)
String.Concat(arg0: obj, arg1: obj) : string
   (+0 other overloads)
String.Concat(str0: string, str1: string, str2: string) : string
   (+0 other overloads)
String.Concat(arg0: obj, arg1: obj, arg2: obj) : string
   (+0 other overloads)
String.Concat(str0: string, str1: string, str2: string, str3: string) : string
   (+0 other overloads)
Raw view Test code New version

More information

Link:http://fssnip.net/6G
Posted:12 years ago
Author:Matt Wilson
Tags: sequences , algorithms