Skip to content

Instantly share code, notes, and snippets.

@lernisto
Created April 30, 2019 03:57
Show Gist options
  • Save lernisto/0eab510e7bfe6c29d555e69006f739e5 to your computer and use it in GitHub Desktop.
Save lernisto/0eab510e7bfe6c29d555e69006f739e5 to your computer and use it in GitHub Desktop.
import re
from collections import defaultdict
def wordfreq(words):
    """Count how many times each item occurs in *words*.

    Iterates *words* once and returns a ``defaultdict(int)`` mapping each
    distinct item to its number of occurrences.
    """
    counts = defaultdict(int)
    for token in words:
        # Missing keys start at 0 thanks to the int default factory.
        counts[token] = counts[token] + 1
    return counts
def report(freq):
    """Print a frequency table followed by the grand total.

    *freq* is a mapping of word -> count. One ``count<TAB>word`` line is
    printed per entry, ordered by descending count and then by word, and a
    final ``total:<TAB>N`` line gives the sum of all counts.
    """
    # Negating the count sorts the highest counts first; the word breaks ties.
    ranked = sorted(freq.items(), key=lambda pair: (-pair[1], pair[0]))
    for word, count in ranked:
        print(count, word, sep="\t")
    total = sum(count for _, count in ranked)
    print(f"total:\t{total}")
def readwords(*files):
    """Yield every word found in each of *files*, in order.

    Each argument is either a path (``str``), which is opened for reading in
    text mode with the platform default encoding, or an object with a
    ``read()`` method returning text. A word is a maximal run of ASCII
    letters, digits, ``-`` and ``_``.

    Files opened here from a path are closed as soon as their contents have
    been read; file objects supplied by the caller are left open.
    """
    for f in files:
        if isinstance(f, str):
            # We opened this file, so we are responsible for closing it.
            with open(f) as handle:
                text = handle.read()
        else:
            text = f.read()
        # TODO: replace this simple regex with nltk tokenization
        for m in re.finditer(r"([-_a-zA-Z0-9]+)", text):
            yield m.group(0)
if __name__ == "__main__":
    import sys

    # With no command-line arguments, count words from standard input;
    # otherwise treat every argument as a path to read.
    if len(sys.argv) == 1:
        files = [sys.stdin]
    else:
        files = sys.argv[1:]
    report(wordfreq(readwords(*files)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment