Created
May 18, 2019 03:17
-
-
Save nathairtras/ee25476929ce6cadd77253fbbd1cb1df to your computer and use it in GitHub Desktop.
Extracting words and known phrases without NLTK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from collections import Counter
# Multi-word phrases that must be counted as single units. They are matched
# after the same cleanup that is applied to the text, so case, punctuation
# and repeated whitespace inside a phrase do not matter.
phrases = ["computer science", "lots of fun"]

# Text to parse
paper_text = """
This is a sentence that includes the phrase computer science. Computer science is fun.
Writing code is lots of fun.
"""


def _normalize(text):
    """Return *text* lowercased, without punctuation, single-spaced."""
    # Remove punctuation (anything that is neither a word character nor
    # whitespace).  NOTE: this also removes sentence boundaries, so a phrase
    # may still match across the end of one sentence and the start of the next.
    text = re.sub(r"[^\w\s]", "", text)
    # Collapse every whitespace run (including newlines) into one space
    text = re.sub(r"\s+", " ", text)
    # Strip leading and trailing whitespace, set to all lowercase
    return text.lower().strip()


def _tokenize(text, known_phrases):
    """Split already-normalized *text* into words, keeping phrases whole.

    Each returned token is either one of *known_phrases* (normalized) or a
    single whitespace-delimited word.
    """
    cleaned = [_normalize(p) for p in known_phrases]
    # Longest phrase first so that, where phrases overlap at the same
    # position, the most specific one wins.  Phrases that normalize to an
    # empty string are skipped because an empty alternative matches anywhere.
    alternatives = [
        re.escape(p) for p in sorted(cleaned, key=len, reverse=True) if p
    ]
    # Fallback alternative: any single word
    alternatives.append(r"\S+")
    # The lookarounds force every token to start and end on a whitespace
    # boundary, so a phrase can never match inside longer words
    # (e.g. "lots of fun" does not match in "slots of funny").
    token_re = re.compile(r"(?<!\S)(?:" + "|".join(alternatives) + r")(?!\S)")
    return token_re.findall(text)


def count_terms(text, known_phrases=()):
    """Count the words and known phrases occurring in *text*.

    text: raw text; punctuation, case and extra whitespace are ignored.
    known_phrases: iterable of multi-word phrases to count as single units.

    Returns a dict mapping each word or phrase to its number of occurrences,
    in order of first appearance.  Empty text gives an empty dict.
    """
    return dict(Counter(_tokenize(_normalize(text), known_phrases)))


## String cleanup
paper_text = _normalize(paper_text)

## Word/phrase extraction
# Split the text into words and whole phrases
split_text = _tokenize(paper_text, phrases)

# Get a count of words and phrases (plain dict so the printed form is unchanged)
counts = dict(Counter(split_text))

# Output
print(counts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment