Created
November 6, 2015 04:09
-
-
Save csvoss/c5b90daf5d4dfa6b300b to your computer and use it in GitHub Desktop.
Convert a URL to text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Parse out text, links, images, and more from an HTML file. | |
Modified from extract.py in https://github.com/fephsun/dialup. | |
For example: | |
import extract | |
e = extract.ParsedWebpage("http://en.wikipedia.org/wiki/Frog") | |
print e.title | |
print e.text | |
""" | |
import copy | |
import json | |
import re | |
from urlparse import urljoin | |
import bs4 | |
import requests | |
class ParsedWebpage(object):
    """Download a web page and expose its title and readable text.

    The page is fetched with ``requests`` and parsed with Beautiful Soup's
    ``html.parser`` backend. ``<script>``, ``<style>`` and ``<form>``
    elements, comments and the doctype are stripped, and every ``<img>`` is
    replaced by a short spoken-style description built from its ``alt``
    attribute.

    NOTE: this module targets Python 2 (it relies on the ``unicode``
    builtin to serialise the parse tree).

    Attributes:
        url: The URL that was requested.
        html: The response body text as returned by ``requests``.
        soup: The ``bs4.BeautifulSoup`` tree of the cleaned-up page.
        title: The string of the page's ``<title>`` element, or ``None``
            when the page has no ``<title>`` (or it has no single string).
        text: Every remaining text node, joined with single spaces.
    """

    def __init__(self, url, timeout=None):
        """Fetch ``url`` and populate the parsed attributes.

        Args:
            url: Address of the page to download.
            timeout: Optional timeout forwarded to ``requests.get``. The
                default ``None`` keeps the original behaviour of waiting
                without a limit.
        """
        self.url = url

        # Raw HTML
        response = requests.get(url, timeout=timeout)
        self.html = response.text
        self.soup = bs4.BeautifulSoup(self.html, "html.parser")

        # Delete <script>, <style> and <form> tags, comments, and <!DOCTYPE>.
        # The original author observed that doing this twice removes some
        # sticky cases that survive a single pass.
        for _ in range(2):
            for tag_name in ('script', 'style', 'form'):
                for tag in self.soup.find_all(tag_name):
                    tag.extract()
            comments = self.soup.find_all(
                text=lambda text: isinstance(text, bs4.Comment))
            for comment in comments:
                comment.extract()
            new_html = re.sub("<!--.*?-->", "", unicode(self.soup))
            new_html = re.sub("<!DOCTYPE[^>]*>", "", new_html)
            self.soup = bs4.BeautifulSoup(new_html, "html.parser")

        # This should be something acceptable to read to the user
        # as the webpage's title. Pages without a <title> yield None
        # instead of raising AttributeError.
        title_tag = self.soup.title
        self.title = title_tag.string if title_tag is not None else None

        # Replace images with descriptions of those images. Working on the
        # parse tree (rather than regex-substituting into the serialised
        # HTML) means the alt text is escaped on output and can never be
        # re-interpreted as markup.
        for img in self.soup.find_all('img'):
            img.replace_with(self._describe_image(img.get("alt")))

        # Re-parse so the inserted descriptions merge with neighbouring
        # text nodes, as they did when the HTML string was rebuilt.
        self.soup = bs4.BeautifulSoup(unicode(self.soup), "html.parser")

        # This should be the human-readable text of the page.
        texts = self.soup.find_all(text=True)
        self.text = ' '.join(texts)

    @staticmethod
    def _describe_image(alt):
        """Return the sentence that stands in for an image.

        Args:
            alt: The image's ``alt`` text; may be ``None`` or empty.

        Returns:
            ``" An image of <alt>. "`` when ``alt`` is non-empty, otherwise
            ``" An image. "``. The surrounding spaces keep the sentence
            separated from adjacent page text.
        """
        description = " An image"
        if alt:
            description += " of %s" % alt
        return description + '. '
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment