Last active
September 14, 2021 13:55
-
-
Save mmmayo13/86b6ce75a3acc6f8ba2ddadc0f7fecb2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
from string import punctuation | |
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words | |
import spacy | |
def count_words(tokens):
    """Count occurrences of each content word in *tokens*.

    Tokens are lower-cased before filtering and counting. The stop-word
    list is lower-case and ``score_sentences`` looks words up via
    ``word.lower()``, so counting case-sensitively let capitalized stop
    words (e.g. "The") slip through and made capitalized words impossible
    to match when scoring.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A dict mapping each kept lower-cased token to its number of
        occurrences. Stop words, punctuation and bare newline tokens are
        left out.
    """
    word_counts = {}
    for token in tokens:
        word = token.lower()
        # Compare the newline by value: the previous `is not '\n'` tested
        # object identity against a literal, which is implementation-
        # dependent and raises a SyntaxWarning on Python 3.8+.
        # NOTE: `punctuation` is a str, so `in` is a substring test.
        if word in stop_words or word in punctuation or word == '\n':
            continue
        word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts
def word_freq_distribution(word_counts):
    """Normalize raw word counts by the highest count.

    Args:
        word_counts: Mapping of word -> occurrence count.

    Returns:
        A dict mapping each word to ``count / max_count``, so the most
        frequent word gets 1.0. An empty mapping yields an empty dict
        (previously ``max()`` raised ``ValueError`` on empty input).
    """
    if not word_counts:
        return {}
    max_freq = max(word_counts.values())
    return {word: count / max_freq for word, count in word_counts.items()}
def score_sentences(sents, freq_dist, max_len=40):
    """Score sentences by summing the frequencies of their known words.

    Args:
        sents: Iterable of hashable sentence objects exposing ``.text``.
        freq_dist: Mapping of lower-case word -> frequency weight.
        max_len: Sentences whose space-split word count is not strictly
            below this value are never scored.

    Returns:
        A dict mapping each sentence that contains at least one word found
        in ``freq_dist`` to the sum of those words' weights. Sentences
        with no match, or that are too long, are absent.
    """
    sent_scores = {}
    for sent in sents:
        pieces = sent.text.split(' ')
        # The length limit depends only on the sentence, so test it once.
        if len(pieces) >= max_len:
            continue
        for piece in pieces:
            key = piece.lower()
            if key not in freq_dist:
                continue
            weight = freq_dist[key]
            if sent in sent_scores:
                sent_scores[sent] += weight
            else:
                sent_scores[sent] = weight
    return sent_scores
def summarize(sent_scores, k):
    """Build a summary from the *k* highest-scoring sentences.

    Args:
        sent_scores: Mapping of sentence object (exposing ``.text``) ->
            numeric score.
        k: Number of top sentences to keep, passed to
            ``Counter.most_common``.

    Returns:
        A ``(summary, scores)`` tuple. ``summary`` is the text of the
        selected sentences in descending score order, separated by single
        spaces. ``scores`` is a list of ``(score, sentence)`` pairs in the
        same order.
    """
    top = Counter(sent_scores).most_common(k)
    # Join with a space: plain concatenation ran sentences together,
    # because spaCy's `Span.text` does not include trailing whitespace.
    summary = ' '.join(sent.text for sent, _ in top)
    scores = [(score, sent) for sent, score in top]
    return summary, scores
# NOTE(review): `text` is not defined anywhere in the visible source; it is
# presumably assigned before this point (e.g. an earlier cell) - confirm.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Tokenize with spaCy: raw token strings plus the sentence spans
tokens = [tok.text for tok in doc]
sents = list(doc.sents)

# Get word counts
word_counts = count_words(tokens)

# Get word frequency distribution
freq_dist = word_freq_distribution(word_counts)

# Score sentences
sent_scores = score_sentences(sents, freq_dist)

# Summarize text using the 3 highest-scoring sentences
summary, summary_sent_scores = summarize(sent_scores, 3)
print(summary)

# Print summary sentence scores
print(summary_sent_scores)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment