Skip to content

Instantly share code, notes, and snippets.

@navin-mohan
Created April 4, 2018 07:59
Show Gist options
  • Save navin-mohan/3491e8c38f507742c49768173eee5363 to your computer and use it in GitHub Desktop.
Save navin-mohan/3491e8c38f507742c49768173eee5363 to your computer and use it in GitHub Desktop.
import re
from html.parser import HTMLParser
class TextExtractionParser(HTMLParser):
    """HTML parser that collects the text nodes of a document.

    Feed markup with ``feed()``; every text node reported by the parser is
    appended, unmodified, to ``word_list``. ``iterwords()`` then tokenises
    the collected text into lower-cased words.
    """

    # A word is an ASCII letter followed by one or more word characters,
    # so single-character tokens are never matched and a token cannot
    # start with a digit or underscore.
    _WORD_RE = re.compile(r'([a-z])\w+')

    def __init__(self, *args, **kwargs) -> None:
        """Initialise the parser, forwarding all arguments to HTMLParser."""
        super().__init__(*args, **kwargs)
        # Raw text nodes in document order (one entry per handle_data call).
        self.word_list: list[str] = []

    def handle_data(self, data: str) -> None:
        """Record a text node; called by HTMLParser for each run of text."""
        self.word_list.append(data)

    def iterwords(self):
        """Yield each word found in the collected text, lower-cased.

        Text nodes are processed in the order they were collected. Each
        node is lower-cased before matching, so the yielded words are
        always lower-case strings of at least two characters.
        """
        for text in self.word_list:
            for match in self._WORD_RE.finditer(text.lower()):
                yield match.group(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment