robert-mcdermott · January 1, 2024 01:20
diff --git a/word_freq_plot.py b/word_freq_plot.py
 import plotly.express as px
 import matplotlib.pyplot as plt
 from collections import Counter
 import re
 import sys
 import nltk
 from nltk.corpus import stopwords

 def plot_word_frequencies_matplot(file_path, top):
    """Show a matplotlib bar chart of the most frequent words in a text file.

    The file is read and lower-cased, split into runs of ASCII letters
    (``[a-z]+``), NLTK's English stop words are removed, and the ``top``
    most frequent remaining words are drawn with ``plt.show()``.

    Args:
        file_path: Path of the text file to analyse.
        top: Maximum number of words to plot; also used in the chart title.

    Raises:
        ValueError: If the file cannot be decoded with any attempted encoding.
    """
    # Fetch the NLTK stop-word corpus, then load the English list as a set
    # so the membership test below is O(1).
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    # Try utf-8 first. Windows-1252 must come before latin1: latin1 maps
    # every possible byte, so anything listed after it would never be tried.
    for encoding in ['utf-8', 'Windows-1252', 'latin1']:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read().lower()
            break
        except UnicodeDecodeError:
            continue
    else:
        # Defensive only: the latin1 fallback is not expected to fail.
        raise ValueError(f"Failed to open file {file_path} with common encodings.")

    # Keep purely alphabetic tokens and count those that are not stop words
    frequencies = Counter(
        word for word in re.findall(r'\b[a-z]+\b', text)
        if word not in stop_words
    )

    # most_common() yields (word, count) pairs, highest count first
    top_words = frequencies.most_common(top)

    # Build the two axes separately; unlike zip(*top_words) this also works
    # when no words survived filtering (empty file or only stop words).
    labels = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    # Plot
    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)
    plt.title(f'Top {top} Word Frequencies in Text (Excluding Stop Words)')
    plt.show()

 def plot_word_frequencies_plotly(file_path, top):
    """Show an interactive Plotly bar chart of the most frequent words.

    The file is read and lower-cased, split into runs of ASCII letters
    (``[a-z]+``), NLTK's English stop words are removed, and the ``top``
    most frequent remaining words are drawn with ``fig.show()``.

    Args:
        file_path: Path of the text file to analyse.
        top: Maximum number of words to plot; also used in the chart title.

    Raises:
        ValueError: If the file cannot be decoded with any attempted encoding.
    """
    # Local import: pandas is only needed to build the frame for Plotly.
    import pandas as pd

    # Fetch the NLTK stop-word corpus, then load the English list as a set
    # so the membership test below is O(1).
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    # Try utf-8 first. Windows-1252 must come before latin1: latin1 maps
    # every possible byte, so anything listed after it would never be tried.
    for encoding in ['utf-8', 'Windows-1252', 'latin1']:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read().lower()
            break
        except UnicodeDecodeError:
            continue
    else:
        # Defensive only: the latin1 fallback is not expected to fail.
        raise ValueError(f"Failed to open file {file_path} with common encodings.")

    # Keep purely alphabetic tokens and count those that are not stop words
    frequencies = Counter(
        word for word in re.findall(r'\b[a-z]+\b', text)
        if word not in stop_words
    )

    # most_common() yields (word, count) pairs, highest count first
    top_words = frequencies.most_common(top)

    # Create DataFrame for Plotly
    df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])

    # Create interactive bar plot
    fig = px.bar(df, x='Word', y='Frequency', title=f'Top {top} Word Frequencies in Text (Excluding Stop Words)')
    fig.update_layout(xaxis_title='Words', yaxis_title='Frequency')
    fig.show()


 if __name__ == "__main__":
    # Usage: word_freq_plot.py <text_file> [top_n]
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit(f"Usage: {sys.argv[0]} <text_file> [top_n]")
    input_path = sys.argv[1]
    # An optional second argument overrides the default of 100 words.
    top_count = int(sys.argv[2]) if len(sys.argv) > 2 else 100
    plot_word_frequencies_matplot(input_path, top_count)
    plot_word_frequencies_plotly(input_path, top_count)
	import plotly.express as px
	import matplotlib.pyplot as plt
	from collections import Counter
	import re
	import sys
	import nltk
	from nltk.corpus import stopwords

	def plot_word_frequencies_matplot(file_path, top):
	    """Show a matplotlib bar chart of the most frequent words in a text file.

	    The file is read and lower-cased, split into runs of ASCII letters
	    (``[a-z]+``), NLTK's English stop words are removed, and the ``top``
	    most frequent remaining words are drawn with ``plt.show()``.

	    Args:
	        file_path: Path of the text file to analyse.
	        top: Maximum number of words to plot; also used in the chart title.

	    Raises:
	        ValueError: If the file cannot be decoded with any attempted encoding.
	    """
	    # Fetch the NLTK stop-word corpus, then load the English list as a set
	    # so the membership test below is O(1).
	    nltk.download('stopwords')
	    stop_words = set(stopwords.words('english'))

	    # Try utf-8 first. Windows-1252 must come before latin1: latin1 maps
	    # every possible byte, so anything listed after it would never be tried.
	    for encoding in ['utf-8', 'Windows-1252', 'latin1']:
	        try:
	            with open(file_path, 'r', encoding=encoding) as file:
	                text = file.read().lower()
	            break
	        except UnicodeDecodeError:
	            continue
	    else:
	        # Defensive only: the latin1 fallback is not expected to fail.
	        raise ValueError(f"Failed to open file {file_path} with common encodings.")

	    # Keep purely alphabetic tokens and count those that are not stop words
	    frequencies = Counter(
	        word for word in re.findall(r'\b[a-z]+\b', text)
	        if word not in stop_words
	    )

	    # most_common() yields (word, count) pairs, highest count first
	    top_words = frequencies.most_common(top)

	    # Build the two axes separately; unlike zip(*top_words) this also works
	    # when no words survived filtering (empty file or only stop words).
	    labels = [word for word, _ in top_words]
	    counts = [count for _, count in top_words]

	    # Plot
	    plt.figure(figsize=(10, 6))
	    plt.bar(labels, counts)
	    plt.xlabel('Words')
	    plt.ylabel('Frequency')
	    plt.xticks(rotation=90)
	    plt.title(f'Top {top} Word Frequencies in Text (Excluding Stop Words)')
	    plt.show()

	def plot_word_frequencies_plotly(file_path, top):
	    """Show an interactive Plotly bar chart of the most frequent words.

	    The file is read and lower-cased, split into runs of ASCII letters
	    (``[a-z]+``), NLTK's English stop words are removed, and the ``top``
	    most frequent remaining words are drawn with ``fig.show()``.

	    Args:
	        file_path: Path of the text file to analyse.
	        top: Maximum number of words to plot; also used in the chart title.

	    Raises:
	        ValueError: If the file cannot be decoded with any attempted encoding.
	    """
	    # Local import: pandas is only needed to build the frame for Plotly.
	    import pandas as pd

	    # Fetch the NLTK stop-word corpus, then load the English list as a set
	    # so the membership test below is O(1).
	    nltk.download('stopwords')
	    stop_words = set(stopwords.words('english'))

	    # Try utf-8 first. Windows-1252 must come before latin1: latin1 maps
	    # every possible byte, so anything listed after it would never be tried.
	    for encoding in ['utf-8', 'Windows-1252', 'latin1']:
	        try:
	            with open(file_path, 'r', encoding=encoding) as file:
	                text = file.read().lower()
	            break
	        except UnicodeDecodeError:
	            continue
	    else:
	        # Defensive only: the latin1 fallback is not expected to fail.
	        raise ValueError(f"Failed to open file {file_path} with common encodings.")

	    # Keep purely alphabetic tokens and count those that are not stop words
	    frequencies = Counter(
	        word for word in re.findall(r'\b[a-z]+\b', text)
	        if word not in stop_words
	    )

	    # most_common() yields (word, count) pairs, highest count first
	    top_words = frequencies.most_common(top)

	    # Create DataFrame for Plotly
	    df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])

	    # Create interactive bar plot
	    fig = px.bar(df, x='Word', y='Frequency', title=f'Top {top} Word Frequencies in Text (Excluding Stop Words)')
	    fig.update_layout(xaxis_title='Words', yaxis_title='Frequency')
	    fig.show()


	if __name__ == "__main__":
	    # Usage: word_freq_plot.py <text_file> [top_n]
	    if len(sys.argv) < 2:
	        # Exit with a usage message instead of an IndexError traceback.
	        sys.exit(f"Usage: {sys.argv[0]} <text_file> [top_n]")
	    input_path = sys.argv[1]
	    # An optional second argument overrides the default of 100 words.
	    top_count = int(sys.argv[2]) if len(sys.argv) > 2 else 100
	    plot_word_frequencies_matplot(input_path, top_count)
	    plot_word_frequencies_plotly(input_path, top_count)