@stephenhouser · Last active August 18, 2022 01:47
Python Word Frequency Analyzer

Simple Python Word Frequency Analyzer

Reads a comma-separated value (CSV) file and computes the frequency of the words that appear in a specific column. We use this program to extract word-usage data from database exports of letters between the Howard brothers at Bowdoin College.
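For illustration, an input CSV might look like the following (hypothetical column names and data; any key column and text column work, selected with -k and -t):

ID,Date,LetterBody
OOL001,1862-05-04,"My dear brother, I pray this letter finds you well."
OOL002,1862-06-11,"The soldiers marched past the church at dawn."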

usage: word-counter.py [-h] [-l [LOGLEVEL]] [-k KEY_COLUMN] [-t TEXT_COLUMN]
                       [-n NUMBER] [-w WORDS]
                       [input] [output]
  • KEY_COLUMN is the unique identifier column of your CSV file. This parameter can either be the name of the column or the zero-based index of the column.

  • TEXT_COLUMN is the column that contains the text to analyze. This parameter can either be the name of the column or the zero-based index of the column.

  • WORDS is an optional file containing keywords to compute frequencies for. If this is not specified, the program computes frequencies for the most common words used in all the TEXT_COLUMN fields across the entire CSV.

  • NUMBER is the optional number of words to include in the output CSV: the NUMBER most-used words (default 50).

  • input and output default to stdin and stdout if not specified.

Example: Compute the 100 most frequently used words in the LetterBody field of OOLetters.csv and save them into top100.csv

word-counter.py -n 100 -t LetterBody OOLetters.csv top100.csv

Example: Compute the frequency of the words in the keywords.txt file as used in the LetterBody field of OOLetters.csv and save the results into keyword_freq.csv

word-counter.py -w keywords.txt -t LetterBody OOLetters.csv keyword_freq.csv
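
In either case, the output CSV has one column per (stemmed) word, a 'Total use' grand-total row first, and then one row of counts per input record. A hypothetical keyword run with -k ID might produce:

ID,pray,church,soldier
Total use,42,17,9
OOL001,3,1,0
OOL002,0,1,2
...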

NOTE: The code does 'word stemming' to count root words. This can cause some problems (see the sketch after this list). Here are some sample translations that happen as a result:

  • ['lord', 'lords', 'lordship', "lord's"] becomes ['lord', 'lords', 'lordship', 'lords']
  • ['christ', 'christian', 'christmas'] becomes ['christ', 'christian', 'christma']
  • ['pray', 'praying', 'prayer', 'prayed'] becomes ['pray', 'pray', 'prayer', 'pray']
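
To preview how the stemmer will fold words together before committing to a keyword list, here is a minimal sketch using the same NLTK Snowball stemmer the program uses (assumes nltk is installed):

#!/usr/bin/env python
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
for word in ['praying', 'prayed', 'prayer', 'christmas', 'lordship']:
    # Print each word next to its stemmed form
    print(word, '->', stemmer.stem(word))
# praying -> pray, prayed -> pray, prayer -> prayer,
# christmas -> christma, lordship -> lordship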
Sample keyword list (e.g. the keywords.txt used for the Howard letters analysis, one word per line):

lord
god
christ
pray
church
bible
religion
brother
sister
mother
father
son
daughter
family
freedmen
freedman
slave
africa
negro
colored
abolition
anti-slavery
peace
fight
battle
conflict
soldier
rebel
confederate
word-counter.py:
#!/usr/bin/env python
import sys
import csv
import nltk
import re
import argparse
import logging

ENCODING = 'mac_roman'
CSV_DIALECT = 'excel'
LOG_FORMAT = '%(message)s'

stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.snowball.SnowballStemmer('english')

# Overall word frequencies (all words across all records)
wordfreq_all = nltk.FreqDist()
# Per-record word frequencies, keyed by the record's primary key
wordfreq_by_id = {}


def count_words_in_file(file_handle, key_column_name, text_column_name):
    global wordfreq_all, wordfreq_by_id
    csreader = csv.reader(file_handle, dialect=CSV_DIALECT)
    # First row is the header; resolve column names (or indexes) to indexes
    columns = next(csreader)
    if text_column_name in columns:
        text_col = columns.index(text_column_name)
    else:
        text_col = int(text_column_name)

    if key_column_name in columns:
        key_col = columns.index(key_column_name)
    else:
        key_col = int(key_column_name)

    logging.info("Analyzing column #{}...".format(text_col))

    # Create an NLTK tokenizer that removes punctuation
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    for row in csreader:
        row_id = row[key_col]
        # Get content to evaluate, convert all to lower case
        body = row[text_col].lower()
        # Wack-a-doodle for Unicode: collapse any non-alphanumeric run to '*'
        body = re.sub('[^0-9a-zA-Z]+', '*', body)
        # Convert content to word list (tokenize)
        tokens = tokenizer.tokenize(body)
        # Remove single-character tokens (mostly punctuation)
        tokens = [w for w in tokens if len(w) > 1]
        # Remove numbers
        tokens = [w for w in tokens if not w.isnumeric()]
        # Stemming words sometimes makes matters worse
        tokens = [stemmer.stem(w) for w in tokens]
        # Remove stop-words from list of tokens based on English stopwords
        tokens = [w for w in tokens if w not in stopwords]
        # Generate word frequency list for this content
        wordfreq_by_id[row_id] = nltk.FreqDist(tokens)
        # Add to global freq dist of words
        for word in tokens:
            wordfreq_all[word] += 1


def write_word_frequencies(file_handle, key_column_name, wordlist):
    cswriter = csv.writer(file_handle, dialect=CSV_DIALECT)
    # Write out header line
    header_row = [key_column_name] + wordlist
    cswriter.writerow(header_row)
    # Write out Grand Totals as first row
    cswriter.writerow(['Total use'] + [wordfreq_all[word] for word in wordlist])
    # Write out each row of word counts
    for (r_id, wordfreq) in wordfreq_by_id.items():
        row = [r_id] + [wordfreq[word] for word in wordlist]
        cswriter.writerow(row)


def valid_int(int_str):
    try:
        int(int_str)
        return True
    except ValueError:
        return False


def setup_logging(loglevel):
    # If `-l` is given without a value, fall through to INFO as the default
    log_level = 0  # Not set
    if loglevel:
        if valid_int(loglevel):
            log_level = int(loglevel)
        else:
            log_level = getattr(logging, loglevel.upper(), None)
            if not isinstance(log_level, int):
                raise ValueError("Invalid log level: {}".format(loglevel))
    else:
        log_level = 20  # INFO level
    logging.basicConfig(format=LOG_FORMAT, level=log_level)


def main():
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-l", "--log", nargs='?',
                         dest='loglevel',
                         default='WARNING',
                         help="show diagnostic messages (DEBUG, INFO, WARNING, ERROR, CRITICAL)")
    aparser.add_argument("-k", "--key", nargs=1,
                         dest='key_column', default=[0],
                         help="Name or index of primary key column")
    aparser.add_argument("-t", "--text", nargs=1,
                         dest='text_column', default=[1],
                         help="Name or index of text column to analyze")
    aparser.add_argument("-n", "--number", nargs=1,
                         type=int,
                         default=[50],
                         help="Number of word frequencies to output")
    aparser.add_argument("-w", "--words", nargs=1,
                         type=argparse.FileType('r', encoding=ENCODING),
                         help="Wordlist to compute frequencies for (one word per line)")
    aparser.add_argument("input", nargs='?',
                         type=argparse.FileType('r', encoding=ENCODING),
                         default=sys.stdin,
                         help="CSV input file(s)")
    aparser.add_argument("output", nargs='?',
                         type=argparse.FileType('w', encoding=ENCODING),
                         default=sys.stdout,
                         help="CSV output file")
    args = aparser.parse_args()

    setup_logging(args.loglevel)
    output_words = args.number[0]
    key_column = args.key_column[0]
    text_column = args.text_column[0]

    # Read in and count all input files...
    logging.info("Reading {}...".format(args.input.name))
    count_words_in_file(args.input, key_column, text_column)

    # Extract most frequently used words across all rows
    if args.words:
        logging.info("Using keywords for frequencies...")
        wordlist = args.words[0].read().replace('\n', ' ').split()
        wordlist = [stemmer.stem(w) for w in wordlist]
    else:
        logging.info("Computing most frequent words...")
        wordlist = [wc[0] for wc in wordfreq_all.most_common(output_words)]

    logging.info("Writing results to {}...".format(args.output.name))
    write_word_frequencies(args.output, key_column, wordlist)


if __name__ == "__main__":
    main()
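
Note: the script expects NLTK's English stopword corpus to be available locally. If nltk.corpus.stopwords raises a LookupError on first run, download the corpus once:

import nltk
nltk.download('stopwords')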
@Saritalna

Hello, when I run your code I get this error: TypeError: argument of type 'NoneType' is not iterable, raised at: if text_column_name in columns: text_col = columns.index()
I wonder if I need to define the key/text column names in count_words_in_file() from the start? I have, though, in main():
output_words = args.number[0]
key_column = args.category_id[0]
text_column = args.document[0]

thanks,

Sarita
