Python script to extract URLs from HTML documents.
#!/usr/bin/env python3
"""Extract URLs from HTML documents."""

import argparse
import re
import sys
import urllib.parse

import bs4
import requests

_TAG_ATTRS = {
    'a': {'href'},
    'applet': {'code', 'archive', 'codebase'},
    'area': {'href'},
    'audio': {'src'},
    'base': {'href'},
    'blockquote': {'cite'},
    'body': {'background'},
    'button': {'formaction'},
    'del': {'cite'},
    'embed': {'src'},
    'form': {'action'},
    'frame': {'longdesc', 'src'},
    'head': {'profile'},
    'html': {'manifest'},
    'iframe': {'longdesc', 'src'},
    'img': {'longdesc', 'src'},
    'input': {'formaction', 'src'},
    'ins': {'cite'},
    'link': {'href'},
    'menuitem': {'icon'},
    'object': {'archive', 'codebase', 'data'},
    'q': {'cite'},
    'script': {'src'},
    'source': {'src'},
    'video': {'src'},
}
"""HTML tags and attributes that can hold URLs. | |
According to W3Schools, only the listed attributes of the listed tags | |
(keys) can hold URLs. Both HTML 4.01 and HTML 5 are included. | |
The data are scraped from w3schools.com on 2015-05-09. | |
""" | |
# disguise as Microsoft Edge, because apparently some servers are not
# thrilled to see python-requests in the UA string
REQUEST_HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240"),
}

def urlgrep(pattern=None, content=None, filepath=None, url=None,
            selector=None, base=None, deduplicate=True, session=None):
"""Extract URLs matching a pattern from an HTML document. | |
The HTML document is either passed in full as a string (the | |
`content` parameter), or is read from a local file (the `filepath` | |
parameter), or is retrieved from a remote URL (the `url` | |
parameter). Only one of the these three -- the first one in | |
`content`, `filepath`, and `url` that is not ``None`` -- is | |
used. One of the three must be specified. | |
Scope of search can be refined with an optional CSS selector | |
(`selector` parameter). If specified, search is performed inside | |
*all* matched tags but no more. This should help greatly in noise | |
reduction, especially from web pages that try to offer suggestions | |
and whatnot at the bottom or in sidebars. | |
Sometimes we need a base URL to resolve relative URLs. The `url` | |
argument or its redirection (if any) is automatically used as base | |
when the document is retrieved from `url`. Otherwise, the `base` | |
parameter, if specified and not None, is used. If none of the above | |
applies, ``http://localhost`` is used as a fallback. Note that the | |
HTML ``<base>`` tag within the document may change the base URL, and | |
that is respected. | |
The `pattern` parameter supplies a regex pattern to match parsed | |
URLs. Only matching URLs are returned. If no pattern is specified, | |
``r"^(?!javascript:)"`` is used to exclude the "``javascript:`` | |
scheme". You may supply an empty string if you want to include | |
``javascript:``. | |
    Parameters
    ----------
    pattern : str, optional
        Regex pattern to match parsed URLs against. Default is
        ``r"^(?!javascript:)"``.
    content : bytes or str, optional
        An HTML document.
    filepath : str, optional
        Path to a local HTML document.
    url : str, optional
        URL pointing to an HTML document (note that the ``file``
        scheme is not supported by ``requests``). If no scheme is
        supplied, ``http://`` is assumed.
    selector : str, optional
        CSS selector recognized by BeautifulSoup. If specified, the
        search scope is limited to all matched tags but no more.
    base : str, optional
        Base URL used for `content` or `filepath`. If no scheme is
        supplied, ``http://`` is assumed (for a local path, use
        ``file://``).
    deduplicate : bool, optional
        Deduplicate matching URLs and only keep the first occurrence
        of each unique URL. Default is ``True``.
    session : requests.Session, optional
        If not ``None``, make HTTP requests within this session.
        Default is ``None``.

    Returns
    -------
    matching_urls : list
        List of parsed absolute URLs matching the given pattern.
    Raises
    ------
    ValueError
        If `content`, `filepath`, and `url` are all ``None``.
    OSError
        If the specified file cannot be opened.
    requests.exceptions.RequestException
        If ``requests`` fails to retrieve the specified URL.
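
    Examples
    --------
    A minimal doctest-style sketch of typical usage; the HTML snippet
    is made up for illustration, and ``example.com`` is a placeholder:

    >>> html = '<a href="/a">x</a> <a href="javascript:void(0)">y</a>'
    >>> urlgrep(content=html, base="http://example.com")
    ['http://example.com/a']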
""" | |
    # pylint: disable=too-many-arguments,too-many-locals,too-many-branches
    urlscheme = re.compile(r"^\w+://")
    base = "localhost" if base is None else base
    if not urlscheme.match(base):
        base = "http://%s" % base
    if content is not None:
        pass
    elif filepath is not None:
        with open(filepath, mode='rb') as fileobj:
            content = fileobj.read()
    elif url is not None:
        if not urlscheme.match(url):
            url = "http://%s" % url
        if session is None:
            response = requests.get(url, headers=REQUEST_HEADERS)
        else:
            # a caller-supplied session is assumed to carry its own headers
            response = session.get(url)
        content = response.content
        base = response.url  # reflects redirects, if any
    else:
        raise ValueError("content, filepath and url cannot all be None")
    regex = (re.compile(pattern) if pattern is not None
             else re.compile(r"^(?!javascript:)"))
    soup = bs4.BeautifulSoup(content, "html.parser")
    # base URL might be modified by the HTML <base> tag, which must
    # reside inside <head>
    if soup.head and soup.head.base and "href" in soup.head.base.attrs:
        base = soup.head.base["href"]
    # select part of the soup with the optional selector
    selections = [soup] if selector is None else soup.select(selector)
    matching_urls = []
    for selection in selections:
        # .descendants also yields NavigableStrings, whose .name is
        # None and therefore never matches a key in _TAG_ATTRS
        for tag in selection.descendants:
            if tag.name in _TAG_ATTRS:
                for attribute in _TAG_ATTRS[tag.name]:
                    if attribute in tag.attrs:
                        parsed_url = urllib.parse.urljoin(base, tag[attribute])
                        if regex.search(parsed_url):
                            matching_urls.append(parsed_url)
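    # order-preserving dedup: binding seen.add to a local name skips an
    # attribute lookup per URL (a common micro-optimization)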
    if deduplicate:
        seen = set()
        seen_add = seen.add
        matching_urls = [u for u in matching_urls
                         if not (u in seen or seen_add(u))]
    return matching_urls


def main():
    """CLI interface."""
    description = """Parse URLs from HTML documents. When invoked with
    no URLs or files, read from stdin."""
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-u", "--url", action="append",
                        help="""URL to an HTML document to be
                        parsed. This option can be specified multiple
                        times on the command line. "http://" is
                        automatically attached if the scheme is left
                        out.""")
    parser.add_argument("-s", "--selector",
                        help="""CSS selector for restricting search
                        scope. If specified, the search scope is
                        restricted to all selected tags, and no
                        more.""")
    parser.add_argument("-b", "--base",
                        help="""Base URL. Only used for files or
                        stdin. "http://" is automatically attached if
                        the scheme is left out.""")
    parser.add_argument("-p", "--pattern",
                        help="""Regexp to match URLs against.""")
    parser.add_argument("-d", "--preserve-duplicates", action="store_true",
                        help="""Do not deduplicate URLs within a document.""")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="""Print additional information to stderr.""")
    parser.add_argument("filepaths", metavar="FILE", nargs="*",
                        help="""Files to be parsed.""")
    args = parser.parse_args()
    urls = args.url if args.url is not None else []
    selector = args.selector
    filepaths = args.filepaths
    base = args.base
    pattern = args.pattern
    deduplicate = not args.preserve_duplicates
    # be verbose only if --verbose is given and there are at least two
    # sources (a single source needs no labeling)
    verbose = args.verbose and len(urls) + len(filepaths) >= 2
    returncode = 0
    if not urls and not filepaths:
        content = sys.stdin.read()
        matching_urls = urlgrep(pattern=pattern,
                                content=content,
                                selector=selector,
                                base=base,
                                deduplicate=deduplicate)
        print('\n'.join(matching_urls))
        sys.stdout.flush()
    else:
        for url in urls:
            try:
                matching_urls = urlgrep(pattern=pattern,
                                        url=url,
                                        selector=selector,
                                        deduplicate=deduplicate)
                if verbose and matching_urls:
                    sys.stderr.write("# from '%s':\n" % url)
                    sys.stderr.flush()
                print('\n'.join(matching_urls))
                sys.stdout.flush()
            except requests.exceptions.RequestException as err:
                sys.stderr.write("error: failed to get '%s'\n" % url)
                sys.stderr.write("error: %s\n" % str(err))
                sys.stderr.flush()
                returncode = 1
        for filepath in filepaths:
            try:
                matching_urls = urlgrep(pattern=pattern,
                                        filepath=filepath,
                                        selector=selector,
                                        base=base,
                                        deduplicate=deduplicate)
                if verbose and matching_urls:
                    sys.stderr.write("# from '%s':\n" % filepath)
                    sys.stderr.flush()
                print('\n'.join(matching_urls))
                sys.stdout.flush()
            except OSError as err:
                sys.stderr.write("error: failed to open '%s'\n" % filepath)
                sys.stderr.write("error: %s\n" % str(err))
                sys.stderr.flush()
                returncode = 1
    return returncode


if __name__ == '__main__':
    # main() returns a status code; propagate it as the exit status
    sys.exit(main())
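For reference, a minimal sketch of driving urlgrep() programmatically with a shared requests.Session; the module name `urlgrep` and the example.com URLs are assumptions for illustration, not part of the gist:

# Hypothetical driver script, assuming the file above is saved as
# urlgrep.py; the example.com URLs are placeholders.
import requests

from urlgrep import REQUEST_HEADERS, urlgrep

with requests.Session() as session:
    # the session path of urlgrep() does not set headers itself,
    # so install the Edge UA disguise on the session
    session.headers.update(REQUEST_HEADERS)
    for page in ("https://example.com/", "https://example.com/news"):
        for link in urlgrep(url=page, pattern=r"^https?://",
                            session=session):
            print(link)

Reusing one session keeps a single connection pool across pages, which is noticeably faster when grepping many URLs from the same host.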
A pretty old script I wrote in early 2015. Served me very well throughout the years.