LDP/LDP/guide/docbook/abs-guide/wf.sh

#!/bin/bash
# wf.sh: Crude word frequency analysis on a text file.
# This is a more efficient version of the "wf2.sh" script.


#  Validate the command line: the script takes exactly one argument,
#+ and that argument must name an existing regular file.
readonly ARGS=1          # Number of arguments the script expects.
readonly E_BADARGS=85    # Exit status: wrong number of arguments.
readonly E_NOFILE=86     # Exit status: argument is not a regular file.

if [ "$#" -ne "$ARGS" ]  # Correct number of arguments passed to script?
then
  #  ${0##*/} strips the directory part of $0 inside the shell, so the
  #+ name survives intact even if the script path contains whitespace.
  #  Diagnostics go to stderr, keeping stdout clean for the word list.
  printf 'Usage: %s filename\n' "${0##*/}" >&2
  exit "$E_BADARGS"
fi

if [ ! -f "$1" ]         # Does the argument name an existing regular file?
then
  #  printf with a fixed format prints the name verbatim, whatever
  #+ characters it contains.
  printf 'File "%s" does not exist.\n' "$1" >&2
  exit "$E_NOFILE"
fi


########################################################
# main ()
#
#  word_frequency FILE
#  Print one line per distinct word of FILE in "uniq -c" format
#+ (occurrence count, then the word), highest count first.
#  A "word" is whatever lies between single spaces or line ends once
#+ periods and commas have been deleted and letters lowercased,
#+ so repeated spaces and blank lines still show up as an empty word.
word_frequency ()
{
  #  Filter out periods and commas, and
  #+ change space between words to linefeed,
  #+ then shift characters to lowercase, and
  #+ finally prefix occurrence count and sort numerically.
  #
  #  The "--" stops sed from treating a filename that begins
  #+ with "-" as an option.
  #  The comma is left unescaped because "\," has no defined meaning
  #+ in a POSIX basic regular expression.
  #  The second line of the sed script must start in column 1:
  #+ it is the literal newline that replaces each space.
  sed -e 's/\.//g' -e 's/,//g' -e 's/ /\
/g' -- "$1" |
    tr '[:upper:]' '[:lower:]' |
    sort | uniq -c | sort -nr
  # =========================
  #  Frequency of occurrence
}

word_frequency "$1"

#  Arun Giridhar suggests modifying the above to:
#  . . . | sort | uniq -c | sort +1 [-f] | sort +0 -nr
#  This adds a secondary sort key, so instances of
#+ equal occurrence are sorted alphabetically.
#  As he explains it:
#  "This is effectively a radix sort, first on the
#+ least significant column
#+ (word or string, optionally case-insensitive)
#+ and last on the most significant column (frequency)."
#
#  As Frank Wang explains, the above is equivalent to
#+       . . . | sort | uniq -c | sort +0 -nr
#+ and the following also works:
#+       . . . | sort | uniq -c | sort -k1nr -k2
#
#  NOTE(review): the last key was cut off in the original comment
#+ ("-k" with no field number is rejected by sort); "-k2", the word
#+ column, is presumed.
#  The "+POS" key syntax quoted above predates "sort -k" and is no
#+ longer part of POSIX; prefer the "-k" forms.
########################################################

#  Reaching this point always yields exit status 0, whatever status
#+ the pipeline above returned. Nothing below this line is executed.
exit 0

# Exercises:
# ---------
# 1) Add 'sed' commands to filter out other punctuation,
#+   such as semicolons.
# 2) Modify the script to also filter out multiple spaces and
#+   other whitespace.
new entries for abs-guide 2002-01-07 15:25:25 +00:00			`#!/bin/bash`
			`# wf.sh: Crude word frequency analysis on a text file.`
updated 2004-01-05 13:20:57 +00:00			`# This is a more efficient version of the "wf2.sh" script.`
new entries for abs-guide 2002-01-07 15:25:25 +00:00

updated 2008-11-23 22:43:47 +00:00			`# Check for input file on command-line.`
new entries for abs-guide 2002-01-07 15:25:25 +00:00			`ARGS=1`
updated 2008-11-23 22:43:47 +00:00			`E_BADARGS=85`
			`E_NOFILE=86`
new entries for abs-guide 2002-01-07 15:25:25 +00:00
updated 2002-04-01 16:04:17 +00:00			`if [ $# -ne "$ARGS" ] # Correct number of arguments passed to script?`
new entries for abs-guide 2002-01-07 15:25:25 +00:00			`then`
			echo "Usage: `basename $0` filename"
			`exit $E_BADARGS`
			`fi`

updated 2002-04-01 16:04:17 +00:00			`if [ ! -f "$1" ] # Check if file exists.`
new entries for abs-guide 2002-01-07 15:25:25 +00:00			`then`
			`echo "File \"$1\" does not exist."`
			`exit $E_NOFILE`
			`fi`



			`########################################################`
updated 2002-04-01 16:04:17 +00:00			`# main ()`
updated 2004-01-05 13:20:57 +00:00			`sed -e 's/\.//g' -e 's/\,//g' -e 's/ /\`
new entries for abs-guide 2002-01-07 15:25:25 +00:00			`/g' "$1" \| tr 'A-Z' 'a-z' \| sort \| uniq -c \| sort -nr`
			`# =========================`
			`# Frequency of occurrence`

updated 2004-01-05 13:20:57 +00:00			`# Filter out periods and commas, and`
new entries for abs-guide 2002-01-07 15:25:25 +00:00			`#+ change space between words to linefeed,`
			`#+ then shift characters to lowercase, and`
			`#+ finally prefix occurrence count and sort numerically.`
updated 2004-04-28 12:08:07 +00:00
			`# Arun Giridhar suggests modifying the above to:`
			`# . . . \| sort \| uniq -c \| sort +1 [-f] \| sort +0 -nr`
			`# This adds a secondary sort key, so instances of`
			`#+ equal occurrence are sorted alphabetically.`
			`# As he explains it:`
			`# "This is effectively a radix sort, first on the`
			`#+ least significant column`
			`#+ (word or string, optionally case-insensitive)`
			`#+ and last on the most significant column (frequency)."`
updated 2005-10-21 13:31:28 +00:00			`#`
			`# As Frank Wang explains, the above is equivalent to`
			`#+ . . . \| sort \| uniq -c \| sort +0 -nr`
			`#+ and the following also works:`
			`#+ . . . \| sort \| uniq -c \| sort -k1nr -k`
new entries for abs-guide 2002-01-07 15:25:25 +00:00			`########################################################`

updated 2004-04-28 12:08:07 +00:00			`exit 0`

updated 2002-04-01 16:04:17 +00:00			`# Exercises:`
			`# ---------`
updated 2004-01-05 13:20:57 +00:00			`# 1) Add 'sed' commands to filter out other punctuation,`
			`#+ such as semicolons.`
updated 2005-05-08 20:09:31 +00:00			`# 2) Modify the script to also filter out multiple spaces and`
updated 2006-10-11 16:39:10 +00:00			`#+ other whitespace.`