Skip to content

Instantly share code, notes, and snippets.

@cesschneider
Created September 11, 2020 10:30
Show Gist options
  • Save cesschneider/857c8d319e2b14c09609fb78ddf9e019 to your computer and use it in GitHub Desktop.
Save cesschneider/857c8d319e2b14c09609fb78ddf9e019 to your computer and use it in GitHub Desktop.
count words from text
# Improvements for a production environment:
# 1. Optimize filtering of undesirable chars using regex
# 2. Distribute filter/counting tasks between multiple threads
# 3. Create unit test cases with multiple text contents
import re
from collections import Counter
INPUT_PATH = 'data.txt'
OUTPUT_PATH = 'results.txt'

# Punctuation deleted from every line before it is split into words.
# Newlines need no entry here: str.split() with no argument already
# treats them (and tabs) as word separators.
REMOVE_CHARS = '-.,"()[]:;'
_STRIP_TABLE = str.maketrans('', '', REMOVE_CHARS)

# Standalone runs of digits (delimited by word boundaries).
_NUMBER_RE = re.compile(r"\b\d+\b")


def count_words(lines):
    """Count case-insensitive word occurrences in an iterable of text lines.

    Standalone numbers and the characters in REMOVE_CHARS are stripped from
    each line, the remainder is split on any whitespace, and every resulting
    word is lower-cased before being tallied.

    Args:
        lines: Iterable of strings, e.g. an open text file.

    Returns:
        A collections.Counter mapping each lower-cased word to its count.
    """
    counts = Counter()
    for line in lines:
        # Numbers go first so that digits glued to punctuation, such as
        # "(2020)", are removed before the punctuation itself disappears.
        cleaned = _NUMBER_RE.sub("", line).translate(_STRIP_TABLE)
        # split() without arguments splits on any whitespace and never
        # yields empty strings, so no explicit empty-word check is needed.
        counts.update(word.lower() for word in cleaned.split())
    return counts


def format_results(counts):
    """Render word counts as "word (count)\\n" lines, most frequent first.

    Args:
        counts: The collections.Counter produced by count_words().

    Returns:
        A list of formatted lines. Words with equal counts keep the order
        in which they were first encountered.
    """
    return ["{} ({})\n".format(word, total)
            for word, total in counts.most_common()]


def main():
    """Read INPUT_PATH, print the word counts and write them to OUTPUT_PATH."""
    # The with-statement closes both files even if processing raises.
    with open(INPUT_PATH, 'r') as source, open(OUTPUT_PATH, 'w') as sink:
        for result in format_results(count_words(source)):
            print(result, end='')
            sink.write(result)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment