LDP/LDP/guide/docbook/abs-guide/wstrings.sh

75 lines
2.5 KiB
Bash

#!/bin/bash
# wstrings.sh: "word-strings" (enhanced "strings" command)
#
# This script filters the output of "strings" by checking it
#+ against a standard word list file.
# This effectively eliminates gibberish and noise,
#+ and outputs only recognized words.
# ===========================================================
# Standard Check for Script Argument(s)
#
# The script takes exactly one argument: the name of the file to scan.
# Diagnostics are written to stderr, so that they never mix with
#+ the list of recognized words the script prints on stdout.
ARGS=1          # Number of arguments expected.
E_BADARGS=85    # Exit status: wrong number of arguments.
E_NOFILE=86     # Exit status: argument is not an existing regular file.

if [ "$#" -ne "$ARGS" ]
then
  # ${0##*/} removes everything up to the last slash of $0.
  # Unlike an unquoted `basename $0`, it also copes with blanks in the path.
  printf 'Usage: %s filename\n' "${0##*/}" >&2
  exit "$E_BADARGS"
fi

if [ ! -f "$1" ]   # Succeeds unless $1 names an existing regular file.
then
  printf 'File "%s" does not exist.\n' "$1" >&2
  exit "$E_NOFILE"
fi
# ===========================================================
MINSTRLEN=3   # Minimum string length.

# Dictionary file, one word per line.
# Defaults to /usr/share/dict/linux.words; a different word list may be
#+ selected by setting WORDFILE in the environment before invoking the script.
# For example, the "yawl" word-list package,
# http://bash.deta.in/yawl-0.3.2.tar.gz
WORDFILE=${WORDFILE:-/usr/share/dict/linux.words}

# normalize_words: filter, stdin -> stdout.
# Lowercases the ASCII letters, then replaces every run of bytes
#+ that are not a-z with a single space.
# The result is one line of lowercase words separated by single spaces.
# LC_ALL=C makes 'tr' work on plain bytes,
#+ so that 'A-Z' and 'a-z' mean exactly the ASCII letters in every locale.
normalize_words ()
{
  LC_ALL=C tr 'A-Z' 'a-z' |   # Convert to lowercase.
    LC_ALL=C tr -cs 'a-z' ' ' # -c: act on everything that is NOT a-z,
                              # -s: squeeze each run down to one space.
}

# Feed the output of 'strings' through the filter above.
# The "--" ends option parsing,
#+ so a file whose name begins with '-' is not mistaken for an option.
wlist=$(strings -- "$1" | normalize_words)

# ***********************************************************************
# Note the technique of feeding/piping the output of 'tr' back to itself,
#+ but with different arguments and/or options on each successive pass.
# The spaces produced by the last pass
#+ will be seen as word separators in the loop below.
# ***********************************************************************
# Look up every candidate word in the dictionary
#+ and print the ones that are found.

E_NOWORDFILE=87   # Exit status: dictionary file missing or unreadable.

# Fail once, with a clear message, if the word list cannot be read.
# Otherwise grep would print the same error for every single word.
if [ ! -r "$WORDFILE" ]
then
  printf 'Word list "%s" not found or not readable.\n' "$WORDFILE" >&2
  exit "$E_NOWORDFILE"
fi

for word in $wlist   # Important:
                     # $wlist must not be quoted here.
                     # Quoted, "$wlist" would be a single word,
                     #+ and the loop body would run only once,
                     #+ with the entire list in $word.
do
  if [ "${#word}" -lt "$MINSTRLEN" ]   # Skip over short strings.
  then
    continue
  fi

  # -F  the word is a fixed string, not a regular expression.
  # -x  the word must match an entire dictionary line.
  #     With -w, "isn" would print "isn't"
  #+    and "well" would print "well-known".
  # --  end of options; the pattern is passed as one quoted argument.
  grep -Fx -- "$word" "$WORDFILE"
done

# A word missing from the dictionary is the normal case, not an error.
# Exit 0 rather than passing on the status of the last grep.
exit 0