#!/bin/bash
# wf.sh: Crude word frequency analysis on a text file.
# This is a more efficient version of the "wf2.sh" script.
# Check for input file on command-line.
ARGS=1          # Number of command-line arguments the script expects.
E_BADARGS=85    # Exit status used when the argument count is wrong.
E_NOFILE=86     # Exit status used when the named file does not exist.
readonly ARGS E_BADARGS E_NOFILE   # Constants: guard against reassignment.
if [ "$#" -ne "$ARGS" ]   # Correct number of arguments passed to script?
then
  #  ${0##*/} strips everything up to the last "/" from the script path,
  #+ so no external 'basename' call (and no unquoted $0) is needed.
  #  The usage message is a diagnostic, so it goes to stderr.
  printf 'Usage: %s filename\n' "${0##*/}" >&2
  exit "$E_BADARGS"
fi
if [ ! -f "$1" ]          # Is the argument an existing regular file?
then
  #  printf with a fixed format prints the file name verbatim,
  #+ and the error message goes to stderr rather than stdout.
  printf 'File "%s" does not exist.\n' "$1" >&2
  exit "$E_NOFILE"
fi
########################################################
# main ()
# word_frequency FILE
#  Print every distinct word of FILE, prefixed by its occurrence count,
#+ most frequent word first.
#
#  Filter out periods and commas, and
#+ change space between words to linefeed,
#+ then shift characters to lowercase, and
#+ finally prefix occurrence count and sort numerically.
word_frequency ()
{
  #  sed:  the two 's' commands delete periods and commas;
  #+       'y/ /\n/' turns each space into a linefeed (one word per line)
  #+       without needing a multi-line 's' replacement.
  #  tr:   character classes instead of 'A-Z' 'a-z' ranges,
  #+       so the case shift does not depend on the locale's collation.
  sed -e 's/\.//g' -e 's/,//g' -e 'y/ /\n/' -- "$1" |
    tr '[:upper:]' '[:lower:]' |
    sort |
    uniq -c |
    sort -nr
  #  ^^^^^^^^^^^^^^^^^^^^^^^^
  #  Frequency of occurrence:
  #+ 'sort | uniq -c' counts the words, 'sort -nr' ranks them.
}

word_frequency "$1"
# Arun Giridhar suggests modifying the above to:
# . . . | sort | uniq -c | sort +1 [-f] | sort +0 -nr
# This adds a secondary sort key, so instances of
#+ equal occurrence are sorted alphabetically.
# As he explains it:
# "This is effectively a radix sort, first on the
#+ least significant column
#+ (word or string, optionally case-insensitive)
#+ and last on the most significant column (frequency)."
#
# As Frank Wang explains, the above is equivalent to
#+ . . . | sort | uniq -c | sort +0 -nr
#+ and the following also works:
#+ . . . | sort | uniq -c | sort -k1nr -k2
########################################################
exit 0   # Normal termination: hand status 0 (success) back to the caller.
# Exercises:
# ---------
# 1) Add 'sed' commands to filter out other punctuation,
#+ such as semicolons.
# 2) Modify the script to also filter out multiple spaces and
#+ other whitespace.