LDP/LDP/guide/docbook/abs-guide/soundex.sh

158 lines
4.3 KiB
Bash

#!/bin/bash
# soundex.sh: Calculate "soundex" code for names
# =======================================================
# Soundex script
# by
# Mendel Cooper
# thegrendel.abs@gmail.com
# reldate: 23 January, 2002
#
# Placed in the Public Domain.
#
# A slightly different version of this script appeared in
#+ Ed Schaefer's July, 2002 "Shell Corner" column
#+ in "Unix Review" on-line,
#+ http://www.unixreview.com/documents/uni1026336632258/
# =======================================================
ARGCOUNT=1 # Need name as argument.
E_WRONGARGS=90
if [ $# -ne "$ARGCOUNT" ]
then
echo "Usage: `basename $0` name"
exit $E_WRONGARGS
fi
assign_value () # Assigns numerical value
{ #+ to letters of name.
val1=bfpv # 'b,f,p,v' = 1
val2=cgjkqsxz # 'c,g,j,k,q,s,x,z' = 2
val3=dt # etc.
val4=l
val5=mn
val6=r
# Exceptionally clever use of 'tr' follows.
# Try to figure out what is going on here.
value=$( echo "$1" \
| tr -d wh \
| tr $val1 1 | tr $val2 2 | tr $val3 3 \
| tr $val4 4 | tr $val5 5 | tr $val6 6 \
| tr -s 123456 \
| tr -d aeiouy )
# Assign letter values.
# Remove duplicate numbers, except when separated by vowels.
# Ignore vowels, except as separators, so delete them last.
# Ignore 'w' and 'h', even as separators, so delete them first.
#
# The above command substitution lays more pipe than a plumber <g>.
}
input_name="$1"
echo
echo "Name = $input_name"
# Change all characters of name input to lowercase.
# ------------------------------------------------
name=$( echo $input_name | tr A-Z a-z )
# ------------------------------------------------
# Just in case argument to script is mixed case.
# Prefix of soundex code: first letter of name.
# --------------------------------------------
char_pos=0 # Initialize character position.
prefix0=${name:$char_pos:1}
prefix=`echo $prefix0 | tr a-z A-Z`
# Uppercase 1st letter of soundex.
let "char_pos += 1" # Bump character position to 2nd letter of name.
name1=${name:$char_pos}
# ++++++++++++++++++++++++++ Exception Patch ++++++++++++++++++++++++++++++
# Now, we run both the input name and the name shifted one char
#+ to the right through the value-assigning function.
# If we get the same value out, that means that the first two characters
#+ of the name have the same value assigned, and that one should cancel.
# However, we also need to test whether the first letter of the name
#+ is a vowel or 'w' or 'h', because otherwise this would bollix things up.
char1=`echo $prefix | tr A-Z a-z` # First letter of name, lowercased.
assign_value $name
s1=$value
assign_value $name1
s2=$value
assign_value $char1
s3=$value
s3=9$s3 # If first letter of name is a vowel
#+ or 'w' or 'h',
#+ then its "value" will be null (unset).
#+ Therefore, set it to 9, an otherwise
#+ unused value, which can be tested for.
if [[ "$s1" -ne "$s2" || "$s3" -eq 9 ]]
then
suffix=$s2
else
suffix=${s2:$char_pos}
fi
# ++++++++++++++++++++++ end Exception Patch ++++++++++++++++++++++++++++++
padding=000 # Use at most 3 zeroes to pad.
soun=$prefix$suffix$padding # Pad with zeroes.
MAXLEN=4 # Truncate to maximum of 4 chars.
soundex=${soun:0:$MAXLEN}
echo "Soundex = $soundex"
echo
# The soundex code is a method of indexing and classifying names
#+ by grouping together the ones that sound alike.
# The soundex code for a given name is the first letter of the name,
#+ followed by a calculated three-number code.
# Similar sounding names should have almost the same soundex codes.
# Examples:
# Smith and Smythe both have a "S-530" soundex.
# Harrison = H-625
# Hargison = H-622
# Harriman = H-655
# This works out fairly well in practice, but there are numerous anomalies.
#
#
# The U.S. Census and certain other governmental agencies use soundex,
# as do genealogical researchers.
#
# For more information,
#+ see the "National Archives and Records Administration home page",
#+ http://www.nara.gov/genealogy/soundex/soundex.html
# Exercise:
# --------
# Simplify the "Exception Patch" section of this script.
exit 0