Skip to content

Instantly share code, notes, and snippets.

@robert-mcdermott
Created January 1, 2024 01:20
Show Gist options
  • Save robert-mcdermott/b6697557962a6ad6bb09bf69cee20eac to your computer and use it in GitHub Desktop.
Save robert-mcdermott/b6697557962a6ad6bb09bf69cee20eac to your computer and use it in GitHub Desktop.
Plot Word Frequencies minus Stop Words - Plotly and Matplotlib
import plotly.express as px
import matplotlib.pyplot as plt
from collections import Counter
import re
import sys
import nltk
from nltk.corpus import stopwords
def plot_word_frequencies_matplot(file_path, top):
    """Show a Matplotlib bar chart of the most frequent non-stop-words in a file.

    The file's text is lower-cased, split into runs of the ASCII letters
    ``a-z``, filtered against NLTK's English stop-word list, and counted.
    The ``top`` most frequent words are then drawn as a bar chart. If no
    words remain after filtering, an empty chart is shown.

    Args:
        file_path: Path of the text file to analyse.
        top: Number of most frequent words to plot.

    Raises:
        OSError: Propagated from ``open`` (e.g. ``FileNotFoundError``).
        ValueError: If none of the candidate encodings can decode the file.
    """
    # Read the input first so that a bad path fails before any NLTK work.
    # 'latin1' maps every possible byte to a character and so never fails;
    # it has to come last, otherwise 'Windows-1252' would never be attempted.
    for encoding in ('utf-8', 'Windows-1252', 'latin1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read().lower()
            break
        except UnicodeDecodeError:
            continue
    else:
        # Safeguard in case the encoding list is edited to drop the
        # catch-all fallback.
        raise ValueError(f"Failed to open file {file_path} with common encodings.")

    # Load stop words, downloading the corpus only if it is not installed.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # Keep purely alphabetic tokens and drop the stop words while counting.
    candidates = re.findall(r'\b[a-z]+\b', text)
    frequencies = Counter(word for word in candidates if word not in stop_words)

    # Highest counts first; ties keep first-seen order.
    top_words = frequencies.most_common(top)

    # zip(*[]) yields nothing to unpack, so handle "no words" explicitly.
    if top_words:
        words, counts = zip(*top_words)
    else:
        words, counts = [], []

    # Plot
    plt.figure(figsize=(10, 6))
    plt.bar(words, counts)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)
    plt.title(f'Top {top} Word Frequencies in Text (Excluding Stop Words)')
    plt.show()
def plot_word_frequencies_plotly(file_path, top):
    """Show an interactive Plotly bar chart of the most frequent non-stop-words.

    The file's text is lower-cased, split into runs of the ASCII letters
    ``a-z``, filtered against NLTK's English stop-word list, and counted.
    The ``top`` most frequent words are then drawn with ``plotly.express``.

    Args:
        file_path: Path of the text file to analyse.
        top: Number of most frequent words to plot.

    Raises:
        OSError: Propagated from ``open`` (e.g. ``FileNotFoundError``).
        ValueError: If none of the candidate encodings can decode the file.
    """
    # Read the input first so that a bad path fails before any NLTK work.
    # 'latin1' maps every possible byte to a character and so never fails;
    # it has to come last, otherwise 'Windows-1252' would never be attempted.
    for encoding in ('utf-8', 'Windows-1252', 'latin1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read().lower()
            break
        except UnicodeDecodeError:
            continue
    else:
        # Safeguard in case the encoding list is edited to drop the
        # catch-all fallback.
        raise ValueError(f"Failed to open file {file_path} with common encodings.")

    # Load stop words, downloading the corpus only if it is not installed.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # Keep purely alphabetic tokens and drop the stop words while counting.
    candidates = re.findall(r'\b[a-z]+\b', text)
    frequencies = Counter(word for word in candidates if word not in stop_words)

    # Highest counts first; ties keep first-seen order.
    top_words = frequencies.most_common(top)

    # Create DataFrame for Plotly (imported here, as in the original, so
    # pandas is only needed when this plotting back end is used).
    import pandas as pd
    df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])

    # Create interactive bar plot
    fig = px.bar(
        df,
        x='Word',
        y='Frequency',
        title=f'Top {top} Word Frequencies in Text (Excluding Stop Words)',
    )
    fig.update_layout(xaxis_title='Words', yaxis_title='Frequency')
    fig.show()
if __name__ == "__main__":
    # The script takes the path of the text file to analyse as its first
    # argument; exit with a usage hint instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <text-file>")
    input_path = sys.argv[1]
    # Draw the top-100 ranking with each plotting back end in turn.
    plot_word_frequencies_matplot(input_path, 100)
    plot_word_frequencies_plotly(input_path, 100)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment