Created
September 11, 2020 10:30
-
-
Save cesschneider/857c8d319e2b14c09609fb78ddf9e019 to your computer and use it in GitHub Desktop.
count words from text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Improvements for production environment:
# 1. Optimize filtering of undesirable chars using regex
# 2. Distribute filter/counting tasks between multiple threads
# 3. Create unit test cases with multiple text contents
import re
from collections import Counter
# Default locations read and written when the file is run as a script.
INPUT_PATH = 'data.txt'
OUTPUT_PATH = 'results.txt'

# Characters deleted from every line before it is split into words.
# Note that '-' is deleted (not replaced by a space), so "well-known"
# is counted as the single word "wellknown".
REMOVE_CHARS = ['\n', '-', '.', ',', '"', '(', ')', '[', ']', ':', ';']

# Built once at import time so the per-line work is a single C-level pass.
_STRIP_TABLE = str.maketrans('', '', ''.join(REMOVE_CHARS))

# Matches stand-alone numbers only; digits embedded in a word ("abc123")
# are kept because there is no word boundary before them.
_NUMBER_RE = re.compile(r"\b\d+\b")


def count_words(lines):
    """Count case-insensitive word occurrences across *lines*.

    Stand-alone numbers and the characters in ``REMOVE_CHARS`` are removed
    from each line, the remainder is lower-cased and split on any run of
    whitespace (spaces, tabs, ...), and every resulting word is tallied.

    Args:
        lines: Any iterable of strings, e.g. an open text file.

    Returns:
        A ``collections.Counter`` mapping each word to its count. Words are
        inserted in first-seen order, which determines tie ordering later.
    """
    counts = Counter()
    for line in lines:
        # Numbers are removed first, while the surrounding punctuation
        # still provides the word boundaries the pattern relies on.
        line = _NUMBER_RE.sub("", line)
        line = line.translate(_STRIP_TABLE)
        # split() with no argument discards empty strings and treats tabs
        # and repeated spaces as separators.
        counts.update(line.lower().split())
    return counts


def format_results(counts):
    """Return one ``"word (count)\\n"`` line per word, most frequent first.

    Words with equal counts keep the order in which they were first seen,
    because ``Counter.most_common()`` performs a stable sort.
    """
    return ["{} ({})\n".format(word, count)
            for word, count in counts.most_common()]


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Count the words in *input_path*, print the report and save it.

    The input is read and closed before the output file is opened, so a
    failure while reading never leaves a truncated results file behind.
    Both files are handled with ``with`` so they are closed on any error.

    Args:
        input_path: Text file to analyse (read as UTF-8).
        output_path: File that receives the report (written as UTF-8).
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        counts = count_words(source)

    report = format_results(counts)

    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.writelines(report)

    print(''.join(report), end='')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment