Skip to content

Instantly share code, notes, and snippets.

@typesupply
Created March 3, 2021 16:38
Show Gist options
  • Save typesupply/bbf314457dcda0c433a35f49508ec1a8 to your computer and use it in GitHub Desktop.
Save typesupply/bbf314457dcda0c433a35f49508ec1a8 to your computer and use it in GitHub Desktop.
from contextlib import closing
from urllib.request import build_opener

import bs4
from nltk.tokenize import PunktSentenceTokenizer
# Pull some text from a Wikipedia entry.
# The text could also come from a string, file or whatever.
url = "https://en.wikipedia.org/wiki/Font,_Switzerland"

opener = build_opener()
# Wikipedia rejects urllib's default User-Agent, so present a browser-like one.
opener.addheaders = [("User-agent", "Mozilla/5.0")]
# closing() guarantees the response is released even if read() raises.
with closing(opener.open(url)) as request:
    html = request.read()

try:
    # Name the parser explicitly: bs4 warns when it is omitted, and the
    # choice would otherwise depend on which parsers happen to be installed.
    soup = bs4.BeautifulSoup(html, "html.parser")
except Exception:
    # The original printed an error and fell through, which then crashed
    # with a NameError on `soup`. Stop here instead.
    raise SystemExit("[ERROR] Couldn't parse: " + url)

# Tags whose text content is worth harvesting.
tags = ["p", "table", "td", "li", "dfn", "dd"]
collected = []  # renamed from `all`, which shadowed the builtin
for tag in tags:
    for element in soup.find_all(tag):
        text = element.text
        if tag == "table":
            # A table's .text packs many cells together; split per line.
            collected.extend(text.splitlines())
        else:
            collected.append(text)
text = "\n".join(collected)

# Extract sentences from the text.
sentences = PunktSentenceTokenizer().tokenize(text)
for sentence in sentences:
    print(sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment