# Last active November 10, 2018 20:48
# A simple Python script (based on work for the Programming for the Humanities and Social Sciences workshop series at Loughborough University) that displays a barcode of the n most common words in any Project Gutenberg eBook.
# import libraries
import re
import requests
from matplotlib import pyplot as plt
# Stop words: common English words that are ignored when ranking word
# frequencies. Contractions are spelled without apostrophes ("dont", "hell",
# "well", ...) because punctuation is stripped from the text before the words
# are compared against this list.
stop = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "arent", "as", "at", "be", "because", "been",
    "before", "being", "below", "between", "both", "but", "by", "cant",
    "cannot", "could", "couldnt", "did", "didnt", "do", "does", "doesnt",
    "doing", "dont", "down", "during", "each", "few", "for", "from",
    "further", "had", "hadnt", "has", "hasnt", "have", "havent", "having",
    "he", "hed", "hell", "hes", "her", "here", "heres", "hers", "herself",
    "him", "himself", "his", "how", "hows", "i", "id", "ill", "im", "ive",
    "if", "in", "into", "is", "isnt", "it", "its", "itself", "lets", "me",
    "more", "most", "mustnt", "my", "myself", "no", "nor", "not", "of",
    "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own", "same", "shant", "she", "shed",
    "shell", "shes", "should", "shouldnt", "so", "some", "such", "than",
    "that", "thats", "the", "their", "theirs", "them", "themselves", "then",
    "there", "theres", "these", "they", "theyd", "theyll", "theyre",
    "theyve", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "wasnt", "we", "wed", "well", "were", "weve",
    "werent", "what", "whats", "when", "whens", "where", "wheres", "which",
    "while", "who", "whos", "whom", "why", "whys", "with", "wont", "would",
    "wouldnt", "you", "youd", "youll", "youre", "youve", "your", "yours",
    "yourself", "yourselves",
]
# Root of Project Gutenberg's per-book file directories. A book's plain text is
# requested as <root>/<number>/<number>.txt, falling back to <number>-0.txt.
GUTENBERG_FILES_URL = "https://www.gutenberg.org/files/"

# Line that follows the title block in the Gutenberg header. Matched without
# regard to case so both "This eBook ..." and "This ebook ..." are recognised.
_TITLE_END = re.compile(r'\nThis eBook is for the use of anyone', re.IGNORECASE)
# Anything that is neither a word character nor whitespace counts as punctuation.
_PUNCTUATION = re.compile(r'[^\w\s]+')
# Header/footer markers as they look AFTER punctuation removal and lower-casing.
# Both the "this" and the "the" wording are accepted; the start pattern also
# swallows the rest of the marker line (the repeated book title).
_BODY_START = re.compile(r'start of (?:this|the) project gutenberg ebook[^\n]*')
_BODY_END = re.compile(r'end of (?:this|the) project gutenberg ebook')


def fetch_book_text(book_number):
    """Download the plain text of a Project Gutenberg book.

    book_number -- the Gutenberg book number (string or int).

    Returns the response body as text. Raises requests.HTTPError when none of
    the candidate files can be fetched, instead of silently processing an
    error page as if it were the book.
    """
    book_number = str(book_number).strip()
    response = None
    for suffix in ("", "-0"):
        url = (GUTENBERG_FILES_URL + book_number + "/"
               + book_number + suffix + ".txt")
        response = requests.get(url, timeout=30)
        if response.ok:
            break
    response.raise_for_status()
    return response.text


def extract_title(downloaded_string):
    """Return the one-line title found at the top of the raw book text.

    The title is everything before the "This eBook is for the use of anyone"
    line, with runs of whitespace collapsed to single spaces. When that line
    is missing, the first non-blank line is used, so the whole book is never
    mistaken for the title.
    """
    # a leading byte-order mark is not whitespace, so remove it explicitly
    text = downloaded_string.lstrip('\ufeff')
    marker = _TITLE_END.search(text)
    if marker:
        header = text[:marker.start()]
    else:
        stripped = text.strip()
        header = stripped.splitlines()[0] if stripped else ''
    title = re.sub(r'\s+', ' ', header).strip()
    # lower-case only the leading article so the title reads naturally after
    # "Now processing"; a "The" inside the book's own title is left alone
    title = re.sub(r'^The\b', 'the', title)
    title = re.sub('EBook', 'eBook', title)
    return title


def extract_body_words(downloaded_string):
    """Return the lower-cased words of the book body as a list.

    Punctuation is removed (so "don't" becomes "dont"), the text is
    lower-cased, and the Gutenberg header and footer are trimmed using the
    start/end marker lines. A missing start marker keeps the text from its
    beginning; a missing end marker keeps it through to its end.
    """
    my_text = _PUNCTUATION.sub('', downloaded_string).lower()
    start_match = _BODY_START.search(my_text)
    body_start = start_match.end() if start_match else 0
    end_match = _BODY_END.search(my_text, body_start)
    body_end = end_match.start() if end_match else len(my_text)
    return my_text[body_start:body_end].split()


def index_word_positions(words):
    """Map each word to the list of 1-based positions at which it occurs."""
    positions = {}
    for number, word in enumerate(words, start=1):
        positions.setdefault(word, []).append(number)
    return positions


def pick_top_words(dictionary, number_of_words, stop_words):
    """Return the most frequent words that are not stop words.

    dictionary      -- word -> list of positions (see index_word_positions).
    number_of_words -- maximum number of words to return; zero or a negative
                       number yields an empty result.
    stop_words      -- iterable of words to leave out.

    The result is a dict ordered from most to least frequent. Words with the
    same count keep the order in which they first appeared in the text,
    because the sort is stable and dicts preserve insertion order.
    """
    excluded = set(stop_words)
    ranked = sorted(dictionary.items(),
                    key=lambda item: len(item[1]), reverse=True)
    top_words = {}
    for word, positions in ranked:
        if len(top_words) >= number_of_words:
            break
        if word not in excluded:
            top_words[word] = positions
    return top_words


def draw_barcode(top_words, wordcount, number_of_words, title):
    """Draw one series of unit-height bars per word at its positions.

    top_words       -- word -> list of positions, as from pick_top_words.
    wordcount       -- upper limit of the x-axis.
    number_of_words -- the number shown in the chart title.
    title           -- book title shown in the chart title.
    """
    # instantiate the figure
    plt.figure()
    for word, numbers in top_words.items():
        # create the legend text
        legend_label = word + ": " + str(len(numbers))
        # create the bars
        plt.bar(numbers, 1, label=legend_label, width=10)
        print("Completed visualisation of " + word)
    # set the limits of the x-axis
    plt.xlim(0, wordcount)
    # set the chart title
    plt.title(str(number_of_words) + " Most Common Words in " + title)
    # set the legend placement (there is nothing to list without any bars)
    if top_words:
        plt.legend(loc="upper right")
    # show the chart
    plt.show()
    # close the chart
    plt.close()


def main():
    """Ask for a book number and a word count, then plot the word barcode."""
    # Ask the user questions
    text_to_download = input('Provide a Gutenberg Book Number: ').strip()
    number_of_words = int(input('How many words would you like to display? '))
    # download the text
    downloaded_string = fetch_book_text(text_to_download)
    # capture the title (before normalising the text)
    title = extract_title(downloaded_string)
    # Let the user know you are processing the right book
    print("\nNow processing " + title)
    # split text into a word list and count the words
    my_words = extract_body_words(downloaded_string)
    wordcount = len(my_words) + 1
    # record where every word occurs, then keep the most frequent ones
    dictionary = index_word_positions(my_words)
    top_words = pick_top_words(dictionary, number_of_words, stop)
    draw_barcode(top_words, wordcount, number_of_words, title)


# You may need to run twice if importing libraries for the first time
if __name__ == "__main__":
    main()
