Created
April 17, 2024 01:52
-
-
Save caitlinhudon/b3a9afe580eb2432a5c9bd09412bab18 to your computer and use it in GitHub Desktop.
text_analysis.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
from collections import Counter

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
def generate_ngrams_and_plots(text_input):
    """Tokenize, stem, and visualize n-gram frequencies for a text corpus.

    For each n in 1..4, counts the frequency of stemmed n-grams across all
    input documents, then displays a bar chart of the 10 most frequent
    n-grams and a word cloud of all n-grams for that n.

    Parameters
    ----------
    text_input : iterable of str
        The raw documents to analyze.

    Returns
    -------
    None. Side effect: opens matplotlib figures via ``plt.show()``.

    Notes
    -----
    Requires the NLTK 'punkt' tokenizer data to be available
    (``nltk.download('punkt')``) — TODO confirm in the deployment env.
    """
    # Wrap the corpus in a DataFrame so per-document transforms are .apply calls.
    text_input_df = pd.DataFrame({'text': text_input})

    # Tokenize each document, then stem every token.
    stemmer = SnowballStemmer("english")
    text_input_df['tokens'] = text_input_df['text'].apply(word_tokenize)
    text_input_df['stemmed'] = text_input_df['tokens'].apply(
        lambda toks: [stemmer.stem(tok) for tok in toks])

    # Collect (n, "joined gram") pairs for n = 1..4 across all documents.
    # Tagging each gram with its n lets one Counter serve all four sizes.
    all_ngrams = []
    for n in range(1, 5):
        text_input_df[f'ngram_{n}'] = text_input_df['stemmed'].apply(
            lambda toks: list(ngrams(toks, n)))
        flattened = [gram for doc_grams in text_input_df[f'ngram_{n}']
                     for gram in doc_grams]
        all_ngrams.extend((n, ' '.join(gram)) for gram in flattened)

    ngrams_freq = Counter(all_ngrams)

    # Bar chart of the 10 most frequent n-grams for each n.
    for n in range(1, 5):
        # most_common() is already sorted by descending frequency.
        top_ngrams = [(gram, count) for gram, count in ngrams_freq.most_common()
                      if gram[0] == n][:10]
        if not top_ngrams:
            # No n-grams of this size (e.g. every document is shorter than
            # n tokens): zip(*[]) would raise ValueError, so skip the plot.
            continue
        labels = [gram[1] for gram, _ in top_ngrams]
        values = [count for _, count in top_ngrams]
        plt.figure(figsize=(10, 6))
        plt.bar(labels, values)
        plt.title(f'Most frequent {n}-grams')
        plt.xticks(rotation=45, ha="right")
        plt.xlabel('N-gram')
        plt.ylabel('Frequency')
        plt.show()

    # Word cloud over ALL n-grams (not just the top 10) for each n.
    for n in range(1, 5):
        freqs = {gram[1]: count for gram, count in ngrams_freq.items()
                 if gram[0] == n}
        if not freqs:
            # WordCloud raises ValueError on an empty frequency dict.
            continue
        wordcloud = WordCloud(width=800, height=400,
                              background_color='white').generate_from_frequencies(freqs)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f'Word Cloud for {n}-grams')
        plt.show()
# Example usage — guarded so that importing this module does not
# trigger tokenization or pop up matplotlib figures as a side effect.
if __name__ == "__main__":
    text_input = ["This is a sample text.", "Another example text.", "Yet another piece of text."]
    generate_ngrams_and_plots(text_input)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment