Skip to content

Instantly share code, notes, and snippets.

@mhbeals
Last active November 10, 2018 20:48
Show Gist options
  • Save mhbeals/91930a5c83dfe7f7d40e11e5e673f96f to your computer and use it in GitHub Desktop.
Save mhbeals/91930a5c83dfe7f7d40e11e5e673f96f to your computer and use it in GitHub Desktop.
A simple Python script (based on work for the Programming for the Humanities and Social Sciences workshop series at Loughborough University) that displays a barcode of the n most common words in any Project Gutenberg eBook.
# import libraries
import re
import requests
from matplotlib import pyplot as plt
# Stop words: very common function words that should not count as a text's
# "most common words". Contractions are spelled without apostrophes (e.g.
# "dont", "theyre") because punctuation is stripped from the text before the
# words are counted.
stop = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "arent", "as", "at", "be", "because", "been", "before", "being",
    "below", "between", "both", "but", "by", "cant", "cannot", "could", "couldnt",
    "did", "didnt", "do", "does", "doesnt", "doing", "dont", "down", "during",
    "each", "few", "for", "from", "further", "had", "hadnt", "has", "hasnt",
    "have", "havent", "having", "he", "hed", "hell", "hes", "her", "here",
    "heres", "hers", "herself", "him", "himself", "his", "how", "hows", "i", "id",
    "ill", "im", "ive", "if", "in", "into", "is", "isnt", "it", "its", "itself",
    "lets", "me", "more", "most", "mustnt", "my", "myself", "no", "nor", "not",
    "of", "off", "on", "once", "only", "or", "other", "ought", "our",
    # "ours" and "ourselves" are separate entries; fused into one string they
    # could never match a single word.
    "ours", "ourselves",
    "out", "over", "own", "same", "shant", "she", "shed", "shell", "shes",
    "should", "shouldnt", "so", "some", "such", "than", "that", "thats", "the",
    "their", "theirs", "them", "themselves", "then", "there", "theres", "these",
    "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through",
    "to", "too", "under", "until", "up", "very", "was", "wasnt", "we", "wed",
    "well", "were", "weve", "werent", "what", "whats", "when", "whens", "where",
    "wheres", "which", "while", "who", "whos", "whom", "why", "whys", "with",
    "wont", "would", "wouldnt", "you", "youd", "youll", "youre", "youve", "your",
    "yours", "yourself", "yourselves",
]
# Ask the user which book to analyse and how many words to show.
# Stray spaces around the book number would otherwise end up inside the URL.
text_to_download = input('Provide a Gutenberg Book Number: ').strip()
number_of_words = int(input('How many words would you like to display? '))
# download the text; the timeout stops the script hanging forever on a dead connection
response = requests.get("http://www.gutenberg.org/files/" + text_to_download + "/" + text_to_download + ".txt", timeout=30)
# stop on an HTTP error (e.g. 404 for an unknown book number) instead of
# analysing the server's error page as if it were the book
response.raise_for_status()
# format the data as plain text
downloaded_string = response.text
# capture the title (before normalising the text)
# the script expects this line to follow the title line directly
title_end = downloaded_string.find('\nThis eBook is for the use of anyone')
if title_end != -1:
    title = downloaded_string[:title_end]
else:
    # find() returns -1 when the marker is absent; slicing with -1 would turn
    # almost the whole book into the "title", so use the first non-blank line
    title = downloaded_string.lstrip().partition('\n')[0]
# collapse runs of whitespace (including line breaks) into single spaces
title = re.sub(r'\s+', ' ', title).strip()
# lower-case only a leading "The" (as in "The Project Gutenberg ...") so the
# phrase reads naturally after "Now processing"; a "The" inside the book's own
# title is left untouched. \W* tolerates a byte-order mark before the word.
title = re.sub(r'^(\W*)The\b', r'\1the', title)
title = title.replace('EBook', 'eBook')
# Let the user know you are processing the right book
print("\nNow processing " + title)
# remove punctuation: delete every run of characters that are neither word
# characters (letters, digits, underscore) nor whitespace
my_text = re.sub(r'[^\w\s]+', '', downloaded_string)
# make lower case
my_text = my_text.lower()
# find the end of the header marker; accept both the "this" and "the" wording
header = re.search(r'start of (?:this|the) project gutenberg ebook', my_text)
# if the marker is missing, keep the text from the beginning rather than
# slicing at an arbitrary offset
start = header.end() if header else 0
# find the start of the footer marker, looking only after the header
footer = re.compile(r'end of (?:this|the) project gutenberg ebook').search(my_text, start)
# if the marker is missing, keep the text through to the end
end = footer.start() if footer else len(my_text)
# trim off the header and footer from the text
my_text = my_text[start:end]
# split text into a word list and count the words
my_words = my_text.split()
# one more than the number of words, because word positions are numbered from 1
wordcount = len(my_words) + 1
# Build an index of the text: each distinct word maps to the list of
# positions (word numbers, counted from 1) at which it occurs.
dictionary = {}
for position, word in enumerate(my_words, start=1):
    # setdefault creates the empty list the first time a word is seen, so
    # first and repeat occurrences are handled by the same line
    dictionary.setdefault(word, []).append(position)
# Pick the most frequent words that are not stop words.
# A set makes each stop-word membership test O(1).
stop_words = set(stop)
# Rank the remaining words by how often they occur, most frequent first.
# sorted() is stable, so words with equal counts keep the order in which
# they first appear in the text.
ranked = sorted(
    ((word, positions) for word, positions in dictionary.items()
     if word not in stop_words),
    key=lambda item: len(item[1]),
    reverse=True,
)
# Keep the requested number of words (none if the request is zero or
# negative), still mapping each word to its list of positions.
top_words = dict(ranked[:max(number_of_words, 0)])
# open a wide, short figure for the chart
plt.figure(figsize=(20,5))
# draw one set of bars per top word: a full-height bar at every position
# where that word occurs in the text
for term, positions in top_words.items():
    # the legend entry shows the word and how many times it occurs
    occurrences = len(positions)
    plt.bar(positions, 1, width=10, label="".join([term, ": ", str(occurrences)]))
    print("Completed visualisation of " + term)
# the x-axis spans the whole text; the y-axis only needs the bar height
plt.xlim(0, wordcount)
plt.ylim(0, 1)
# hide the tick marks on both axes
plt.xticks([])
plt.yticks([])
# title the chart with the requested word count and the book title
chart_title = "".join([str(number_of_words), " Most Common Words in ", title])
plt.title(chart_title)
# place the legend in the top-right corner
plt.legend(loc="upper right")
# display the chart, then release the figure
plt.show()
plt.close()
#You may need to run twice if importing libraries for the first time
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment