Skip to content

Instantly share code, notes, and snippets.

@gregjan
Created April 18, 2023 19:13
Show Gist options
  • Save gregjan/76f9a7f561aa554be34178540a915997 to your computer and use it in GitHub Desktop.
Save gregjan/76f9a7f561aa554be34178540a915997 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import spacy
import stanza
import re
# Build the Stanza English pipeline once, up front. The constituency processor
# is requested with the 'ptb3_bert' model package; its parse trees are what the
# rest of the script consumes.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,lemma,depparse,constituency',
    package={'constituency': 'ptb3_bert'},
)
# TODO: replace this single hard-coded file with a loop over wherever the
# documents are actually stored.
HTMLFile = open("AgentOrange.html", mode="r")
# Search term; change it to whatever string you are looking for.
x = "problems"  # FIXME: Not currently used..
# Captures a token made only of word characters that is followed directly by a
# closing parenthesis, i.e. a leaf word in a bracketed tree string such as
# "(NP (DT the) (NN cat))". Tokens containing any other character (punctuation,
# hyphens, apostrophes) do not match and are therefore skipped.
# Raw string so that \w and \) reach the regex engine instead of being parsed
# as (invalid) string escapes.
word_pattern = re.compile(r" (\w*)\)")
phrase_labels = ["VP", "NP", "S"]  # Whatever labels as appropriate
# See http://surdeanu.cs.arizona.edu/mihai/teaching/ista555-fall13/readings/PennTreebankConstituents.html
def generatePhrases(tree):
    """Yield the text of every phrase of interest found in a parse tree.

    The tree is walked depth-first. Whenever a node's own label is listed in
    ``phrase_labels``, the words under that node are extracted from the node's
    bracketed string form with ``word_pattern`` and yielded as a single
    space-joined string. The walk continues below matching nodes, so nested
    phrases are yielded as well and a word may appear in several results.

    Args:
        tree: A parse-tree node exposing ``label`` and ``children``; the
            script passes each Stanza sentence's ``constituency`` tree.

    Yields:
        str: The space-joined words of one matching phrase, outermost first.
    """
    children = list(tree.children)
    if not children:
        # A node without children is a bare word: nothing to report.
        return
    # Test the node's own label, not its first child's. Checking the child
    # reports the parent's span and never matches a phrase whose first child
    # is a plain word tag (e.g. NP -> DT NN).
    if str(tree.label) in phrase_labels:
        yield " ".join(word_pattern.findall(str(tree)))
    # NOTE: descending even after a match is what produces the nested
    # (overlapping) phrases; recurse only on non-matching nodes to avoid that.
    for child in children:
        yield from generatePhrases(child)
# Parse the HTML document. BeautifulSoup is handed the open file object, so
# the handle is closed as soon as the soup has been built instead of being
# left open for the rest of the run.
with HTMLFile:
    doc = BeautifulSoup(HTMLFile, 'html.parser')

# Run every <p> element through the Stanza pipeline and print, for each
# sentence, its constituency tree followed by the phrases extracted from it.
for pidx, para in enumerate(doc.find_all("p")):
    data = nlp(para.text)
    for sidx, sentence in enumerate(data.sentences):
        tree = sentence.constituency
        print(tree)
        for phrase in generatePhrases(tree):
            print("para %s sent %s phrase: %s" % (pidx, sidx, phrase))
    # Stop once the paragraph with index 7 has been processed.
    # NOTE(review): the limit is assumed to apply to the paragraph loop —
    # confirm against the original indentation.
    if pidx > 6:
        break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment