Last active
December 1, 2018 12:16
-
-
Save dubirajara/3b507b1b0b3988d8ca5529e52abcb057 to your computer and use it in GitHub Desktop.
count words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import string
from collections import Counter
from itertools import chain
from operator import itemgetter

import requests
# Plain-text plays whose words are analysed.
urls = [
    'https://storage.googleapis.com/apache-beam-samples/shakespeare/kinglear.txt',
    'https://storage.googleapis.com/apache-beam-samples/shakespeare/othello.txt',
    'https://storage.googleapis.com/apache-beam-samples/shakespeare/romeoandjuliet.txt',
]
# Whitespace-separated list of words treated as irrelevant.
stop_words_url = 'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'

# Seconds to wait for a server before giving up; without a timeout
# requests.get() may block indefinitely.
REQUEST_TIMEOUT = 30


def _fetch_text(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never mistaken for real data.
        requests.Timeout: if the server does not answer within
            REQUEST_TIMEOUT seconds.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


# One flat list with the lower-cased, whitespace-separated tokens of all texts.
data_url = list(
    chain.from_iterable(_fetch_text(url).lower().split() for url in urls)
)

# Stop words used to decide which tokens are relevant.
stop_words = _fetch_text(stop_words_url).split()
def clean_words(data, ignore=None):
    """Strip ASCII punctuation from every word and drop the irrelevant ones.

    Args:
        data: Iterable of word strings. Words are compared to the ignore
            list exactly as given (after punctuation removal), so callers
            should normalise case beforehand.
        ignore: Optional collection of words to leave out. When omitted,
            the module-level ``stop_words`` is used.

    Returns:
        A list with the cleaned words in their original order, excluding
        ignored words and tokens that are empty once punctuation is removed
        (for example ``"--"``).
    """
    if ignore is None:
        ignore = stop_words
    # A set gives O(1) membership tests instead of scanning a list per word.
    ignored = set(ignore)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = (word.translate(strip_punctuation) for word in data)
    # "word and ..." discards tokens that consisted only of punctuation.
    return [word for word in cleaned if word and word not in ignored]
def word_frequency_counter(words):
    """Count how many times each word occurs.

    Args:
        words: Iterable of hashable words.

    Returns:
        A dict mapping each distinct word to its number of occurrences,
        ordered from most to least frequent. Words with the same count keep
        the order in which they first appeared in *words*.
    """
    # Counter tallies in a single pass; most_common() sorts by count,
    # descending, and is stable for ties.
    return dict(Counter(words).most_common())
# Word -> number of occurrences for every relevant word, most frequent first.
dic = word_frequency_counter(clean_words(data_url))

# Opening part of the HTML report: styles, Bootstrap stylesheet, heading and
# the table header. It opens three <div> elements plus <table> and <tbody>;
# the data rows and the closing tags are appended in the __main__ section.
# Literal CSS braces are doubled because this is an f-string.
table_base = f"""<style>
.i-am-centered {{ margin: auto; max-width: 800px;}}
table .alto {{background-color:gray;}}
</style>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css">
<br>
<div class="i-am-centered">
<div class="row">
<div class="col-lg-8 col-md-8 col-sm-8 col-xs-8">
<h2>Shakespeare's Research</h2>
<hr>
<table class="table table-hover table-bordered">
<caption>{len(dic)} Relevant words in three Shakespeare's literary masterpieces.</caption>
<thead><tr><th class="alto" scope="col">Words</th><th class="alto" scope="col">Frequency</th></tr></thead><tbody>"""
if __name__ == '__main__':
    # Create a HTML Table Listing Report: one row per word with its count.
    # The rows are joined in one pass rather than growing the document by
    # repeated string concatenation.
    table_rows = ''.join(
        f'<tr><th> {word} </th><td> {count} </td></tr>'
        for word, count in dic.items()
    )
    # The template opens three <div> elements, so exactly three are closed.
    table_base = f'{table_base}{table_rows}</tbody></table></div></div></div>'
    # Non-ASCII characters are written as numeric character references, so
    # the file never fails to encode and renders the same whatever charset
    # the browser assumes.
    with open("word_frequencies_report.html", "w",
              encoding="ascii", errors="xmlcharrefreplace") as report:
        report.write(table_base)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment