Created
April 17, 2024 01:52
-
-
Save caitlinhudon/b3a9afe580eb2432a5c9bd09412bab18 to your computer and use it in GitHub Desktop.
text_analysis.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
from collections import Counter

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
def generate_ngrams_and_plots(text_input):
    """Tokenize, stem, and visualize n-gram frequencies for a text corpus.

    For each n in 1..4, counts the frequency of stemmed n-grams across all
    input documents, then displays a bar chart of the 10 most frequent
    n-grams and a word cloud of all n-grams for that n.

    Parameters
    ----------
    text_input : iterable of str
        The raw documents to analyze.

    Returns
    -------
    None. Side effect: opens matplotlib figures via ``plt.show()``.

    Notes
    -----
    Requires the NLTK 'punkt' tokenizer data to be available
    (``nltk.download('punkt')``) — TODO confirm in the deployment env.
    """
    # Wrap the corpus in a DataFrame so per-document transforms are .apply calls.
    text_input_df = pd.DataFrame({'text': text_input})

    # Tokenize each document, then stem every token.
    stemmer = SnowballStemmer("english")
    text_input_df['tokens'] = text_input_df['text'].apply(word_tokenize)
    text_input_df['stemmed'] = text_input_df['tokens'].apply(
        lambda toks: [stemmer.stem(tok) for tok in toks])

    # Collect (n, "joined gram") pairs for n = 1..4 across all documents.
    # Tagging each gram with its n lets one Counter serve all four sizes.
    all_ngrams = []
    for n in range(1, 5):
        text_input_df[f'ngram_{n}'] = text_input_df['stemmed'].apply(
            lambda toks: list(ngrams(toks, n)))
        flattened = [gram for doc_grams in text_input_df[f'ngram_{n}']
                     for gram in doc_grams]
        all_ngrams.extend((n, ' '.join(gram)) for gram in flattened)

    ngrams_freq = Counter(all_ngrams)

    # Bar chart of the 10 most frequent n-grams for each n.
    for n in range(1, 5):
        # most_common() is already sorted by descending frequency.
        top_ngrams = [(gram, count) for gram, count in ngrams_freq.most_common()
                      if gram[0] == n][:10]
        if not top_ngrams:
            # No n-grams of this size (e.g. every document is shorter than
            # n tokens): zip(*[]) would raise ValueError, so skip the plot.
            continue
        labels = [gram[1] for gram, _ in top_ngrams]
        values = [count for _, count in top_ngrams]
        plt.figure(figsize=(10, 6))
        plt.bar(labels, values)
        plt.title(f'Most frequent {n}-grams')
        plt.xticks(rotation=45, ha="right")
        plt.xlabel('N-gram')
        plt.ylabel('Frequency')
        plt.show()

    # Word cloud over ALL n-grams (not just the top 10) for each n.
    for n in range(1, 5):
        freqs = {gram[1]: count for gram, count in ngrams_freq.items()
                 if gram[0] == n}
        if not freqs:
            # WordCloud raises ValueError on an empty frequency dict.
            continue
        wordcloud = WordCloud(width=800, height=400,
                              background_color='white').generate_from_frequencies(freqs)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f'Word Cloud for {n}-grams')
        plt.show()
# Example usage — guarded so that importing this module does not
# trigger tokenization or pop up matplotlib figures as a side effect.
if __name__ == "__main__":
    text_input = ["This is a sample text.", "Another example text.", "Yet another piece of text."]
    generate_ngrams_and_plots(text_input)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment