nathairtras · May 18, 2019 03:17
diff --git a/non_ntlk_phrase_counts.py b/non_ntlk_phrase_counts.py
 import re

 # Phrases that must be counted as single units rather than as separate words
 phrases = ["computer science", "lots of fun"]

 # Text to parse
 paper_text = """
 This is a sentence that includes the phrase computer science.  Computer science is fun.
 Writing code is lots of fun.
 """


 def _normalise(text):
     """Return *text* lowercased, without punctuation, single-space separated."""
     # Remove punctuation (anything that is neither a word character nor whitespace)
     text = re.sub(r'[^\w\s]', '', text)
     # Collapse whitespace runs; the raw string avoids the invalid "\s" escape warning
     text = re.sub(r'\s+', ' ', text)
     # Strip leading and trailing whitespace, set to all lowercase
     return text.lower().strip()


 def count_words_and_phrases(text, phrase_list):
     """Count occurrences of each word and each listed phrase in *text*.

     Both *text* and the phrases are normalised the same way (punctuation
     removed, whitespace collapsed, lowercased) before matching, so matching
     is case-insensitive and ignores punctuation.

     Args:
         text: The raw text to analyse.
         phrase_list: Iterable of multi-word phrases to count as single tokens.

     Returns:
         A dict mapping each word or phrase to its number of occurrences, in
         order of first appearance. Empty text gives an empty dict.
     """
     cleaned = _normalise(text)

     # Normalise the phrases too and drop any that end up empty.
     normalised = [p for p in (_normalise(raw) for raw in phrase_list) if p]
     # Longest first, so that a longer phrase wins over a shorter one that
     # starts at the same position.
     normalised.sort(key=len, reverse=True)

     # A phrase only matches when delimited by whitespace or the text edges;
     # this keeps "computer sciences" from matching "computer science".
     alternatives = [r'(?<!\S)' + re.escape(p) + r'(?!\S)' for p in normalised]
     # Fallback: any single whitespace-delimited word. Matching tokens
     # directly (instead of temporarily encoding phrases with "_") leaves
     # words that really contain an underscore untouched.
     alternatives.append(r'\S+')
     tokeniser = re.compile('|'.join(alternatives))

     tally = {}
     for token in tokeniser.findall(cleaned):
         # Count increment
         tally[token] = tally.get(token, 0) + 1
     return tally


 ## Word/phrase extraction
 # Get a count of words and phrases
 counts = count_words_and_phrases(paper_text, phrases)

 # Output
 print(counts)
	import re

	# Phrases that must be counted as single units rather than as separate words
	phrases = ["computer science", "lots of fun"]

	# Text to parse
	paper_text = """
	This is a sentence that includes the phrase computer science. Computer science is fun.
	Writing code is lots of fun.
	"""


	def _normalise(text):
	    """Return *text* lowercased, without punctuation, single-space separated."""
	    # Remove punctuation (anything that is neither a word character nor whitespace)
	    text = re.sub(r'[^\w\s]', '', text)
	    # Collapse whitespace runs; the raw string avoids the invalid "\s" escape warning
	    text = re.sub(r'\s+', ' ', text)
	    # Strip leading and trailing whitespace, set to all lowercase
	    return text.lower().strip()


	def count_words_and_phrases(text, phrase_list):
	    """Count occurrences of each word and each listed phrase in *text*.

	    Both *text* and the phrases are normalised the same way (punctuation
	    removed, whitespace collapsed, lowercased) before matching, so matching
	    is case-insensitive and ignores punctuation.

	    Args:
	        text: The raw text to analyse.
	        phrase_list: Iterable of multi-word phrases to count as single tokens.

	    Returns:
	        A dict mapping each word or phrase to its number of occurrences, in
	        order of first appearance. Empty text gives an empty dict.
	    """
	    cleaned = _normalise(text)

	    # Normalise the phrases too and drop any that end up empty.
	    normalised = [p for p in (_normalise(raw) for raw in phrase_list) if p]
	    # Longest first, so that a longer phrase wins over a shorter one that
	    # starts at the same position.
	    normalised.sort(key=len, reverse=True)

	    # A phrase only matches when delimited by whitespace or the text edges;
	    # this keeps "computer sciences" from matching "computer science".
	    alternatives = [r'(?<!\S)' + re.escape(p) + r'(?!\S)' for p in normalised]
	    # Fallback: any single whitespace-delimited word. Matching tokens
	    # directly (instead of temporarily encoding phrases with "_") leaves
	    # words that really contain an underscore untouched.
	    alternatives.append(r'\S+')
	    tokeniser = re.compile('|'.join(alternatives))

	    tally = {}
	    for token in tokeniser.findall(cleaned):
	        # Count increment
	        tally[token] = tally.get(token, 0) + 1
	    return tally


	## Word/phrase extraction
	# Get a count of words and phrases
	counts = count_words_and_phrases(paper_text, phrases)

	# Output
	print(counts)