Created
January 1, 2024 01:20
-
-
Save robert-mcdermott/b6697557962a6ad6bb09bf69cee20eac to your computer and use it in GitHub Desktop.
Plot Word Frequencies minus Stop Words - Plotly and Matplotlib
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import plotly.express as px | |
import matplotlib.pyplot as plt | |
from collections import Counter | |
import re | |
import sys | |
import nltk | |
from nltk.corpus import stopwords | |
def plot_word_frequencies_matplot(file_path, top):
    """Show a Matplotlib bar chart of the most frequent non-stop-words in a file.

    The file is read as text and lower-cased, words made only of the letters
    a-z are extracted, NLTK's English stop words are dropped, and the ``top``
    most frequent remaining words are plotted.

    Args:
        file_path: Path of the text file to analyse.
        top: Number of most frequent words to plot.

    Raises:
        ValueError: If the file cannot be decoded with any attempted encoding.
    """
    # Only download the corpus when it is actually missing, so repeated calls
    # (and offline runs with the corpus installed) skip the download step.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # Try encodings from strictest to most permissive. latin-1 accepts every
    # byte sequence, so it must come last or the encodings after it would
    # never be attempted.
    for encoding in ('utf-8', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read().lower()
            break
        except UnicodeDecodeError:
            continue
    else:
        # Defensive: kept in case the encoding list is ever changed to one
        # without a catch-all entry.
        raise ValueError(f"Failed to open file {file_path} with common encodings.")

    # Extract alphabetic words and drop the stop words.
    words = re.findall(r'\b[a-z]+\b', text)
    frequencies = Counter(word for word in words if word not in stop_words)

    # most_common orders by descending count, keeping first-seen order on ties.
    top_words = frequencies.most_common(top)

    # zip(*[]) yields nothing to unpack, so handle the "no words" case
    # explicitly and fall through to an empty chart instead of crashing.
    if top_words:
        labels, counts = zip(*top_words)
    else:
        labels, counts = (), ()

    # Plot
    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)
    plt.title(f'Top {top} Word Frequencies in Text (Excluding Stop Words)')
    plt.show()
def plot_word_frequencies_plotly(file_path, top):
    """Show an interactive Plotly bar chart of the most frequent non-stop-words.

    The file is read as text and lower-cased, words made only of the letters
    a-z are extracted, NLTK's English stop words are dropped, and the ``top``
    most frequent remaining words are plotted.

    Args:
        file_path: Path of the text file to analyse.
        top: Number of most frequent words to plot.

    Raises:
        ValueError: If the file cannot be decoded with any attempted encoding.
    """
    # pandas is only needed by this function, so it is imported locally.
    import pandas as pd

    # Only download the corpus when it is actually missing, so repeated calls
    # (and offline runs with the corpus installed) skip the download step.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # Try encodings from strictest to most permissive. latin-1 accepts every
    # byte sequence, so it must come last or the encodings after it would
    # never be attempted.
    for encoding in ('utf-8', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read().lower()
            break
        except UnicodeDecodeError:
            continue
    else:
        # Defensive: kept in case the encoding list is ever changed to one
        # without a catch-all entry.
        raise ValueError(f"Failed to open file {file_path} with common encodings.")

    # Extract alphabetic words and drop the stop words.
    words = re.findall(r'\b[a-z]+\b', text)
    frequencies = Counter(word for word in words if word not in stop_words)

    # most_common orders by descending count, keeping first-seen order on ties.
    top_words = frequencies.most_common(top)

    # Create DataFrame for Plotly
    df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])

    # Create interactive bar plot
    fig = px.bar(df, x='Word', y='Frequency', title=f'Top {top} Word Frequencies in Text (Excluding Stop Words)')
    fig.update_layout(xaxis_title='Words', yaxis_title='Frequency')
    fig.show()
if __name__ == "__main__":
    # The script takes one positional argument: the path of the text file.
    # Exit with a usage message instead of an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <text-file>")
    input_path = sys.argv[1]
    # Render the same top-100 chart with both plotting backends.
    plot_word_frequencies_matplot(input_path, 100)
    plot_word_frequencies_plotly(input_path, 100)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment