#!/bin/bash
# wf.sh: Crude word frequency analysis on a text file.
# This is a more efficient version of the "wf2.sh" script.


# Check for input file on command-line.
ARGS=1
E_BADARGS=85
E_NOFILE=86

if [ $# -ne "$ARGS" ] # Correct number of arguments passed to script?
then
  echo "Usage: `basename $0` filename"
  exit $E_BADARGS
fi
if [ ! -f "$1" ] # Check if file exists.
then
  echo "File \"$1\" does not exist."
  exit $E_NOFILE
fi


########################################################
# main ()
sed -e 's/\.//g' -e 's/\,//g' -e 's/ /\
/g' "$1" | tr 'A-Z' 'a-z' | sort | uniq -c | sort -nr
#                           =========================
#                            Frequency of occurrence

#  Filter out periods and commas, and
#+ change space between words to linefeed,
#+ then shift characters to lowercase, and
#+ finally prefix occurrence count and sort numerically.
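#
#  For illustration, a typical invocation (the filename here
#+ is hypothetical):
#+       bash wf.sh mybook.txt
#  Each line of output is an occurrence count followed by a word,
#+ most frequent word first.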
#  Arun Giridhar suggests modifying the above to:
#  . . . | sort | uniq -c | sort +1 [-f] | sort +0 -nr
#  This adds a secondary sort key, so instances of
#+ equal occurrence are sorted alphabetically.
#  As he explains it:
#  "This is effectively a radix sort, first on the
#+  least significant column
#+ (word or string, optionally case-insensitive)
#+ and last on the most significant column (frequency)."
#
#  As Frank Wang explains, the above is equivalent to
#+       . . . | sort | uniq -c | sort +0 -nr
#+ and the following also works:
#+       . . . | sort | uniq -c | sort -k1nr -k2
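#
#  Note that the obsolete "+N" field syntax above is rejected
#+ by current GNU 'sort'.  A sketch of the same radix-sort idea
#+ in modern "-k" syntax (assuming GNU sort, for the -s option):
#+       . . . | sort | uniq -c | sort -k2 | sort -k1,1nr -s
#+ The stable (-s) final pass keeps words with equal counts
#+ in alphabetical order.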
########################################################

exit 0

# Exercises:
# ---------
# 1) Add 'sed' commands to filter out other punctuation,
#+   such as semicolons.
# 2) Modify the script to also filter out multiple spaces and
#+   other whitespace.
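#
#  A possible starting point for both (one sketch among many):
#+ replace the 'sed' expressions with a 'tr' pipeline, e.g.
#+       tr -d '.,;:!?' < "$1" | tr -s '[:space:]' '\n' | . . .
#+ where 'tr -d' deletes the listed punctuation and 'tr -s'
#+ squeezes runs of whitespace into single newlines.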