Skip to content

Instantly share code, notes, and snippets.

Forked from zmwangx/urlgrep
Created March 11, 2024 17:05
Show Gist options
  • Save jkonowitch/636dfc2e256bf7d86005c31aa42f32a1 to your computer and use it in GitHub Desktop.
Save jkonowitch/636dfc2e256bf7d86005c31aa42f32a1 to your computer and use it in GitHub Desktop.
Python script to extract URLs from HTML documents.
#!/usr/bin/env python3
"""Extract URLs from HTML documents."""
import argparse
import re
import sys
import urllib.parse
import bs4
import requests
_TAG_ATTRS = {
    'a': {'href'},
    'applet': {'code', 'archive', 'codebase'},
    'area': {'href'},
    'audio': {'src'},
    'base': {'href'},
    'blockquote': {'cite'},
    'body': {'background'},
    'button': {'formaction'},
    'del': {'cite'},
    'embed': {'src'},
    'form': {'action'},
    'frame': {'longdesc', 'src'},
    'head': {'profile'},
    'html': {'manifest'},
    'iframe': {'longdesc', 'src'},
    'img': {'longdesc', 'src'},
    'input': {'formaction', 'src'},
    'ins': {'cite'},
    'link': {'href'},
    'menuitem': {'icon'},
    'object': {'archive', 'codebase', 'data'},
    'q': {'cite'},
    'script': {'src'},
    'source': {'src'},
    'video': {'src'},
}
"""HTML tags and attributes that can hold URLs.

Maps a tag name (key) to the set of attribute names on that tag whose
value is a URL. According to W3Schools, only the listed attributes of
the listed tags can hold URLs. Both HTML 4.01 and HTML 5 are included.

The data are scraped from the W3Schools HTML attribute reference on
2015-05-09.
"""
# Some servers are apparently not thrilled to see python-requests in the
# User-Agent string, so requests made without a caller-supplied session
# present themselves as Microsoft Edge instead.
REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240"
    ),
}
def urlgrep(pattern=None, content=None, filepath=None, url=None,
            selector=None, base=None, deduplicate=True, session=None):
    """Extract URLs matching a pattern from an HTML document.

    The HTML document is either passed in full as a string (the
    `content` parameter), or is read from a local file (the `filepath`
    parameter), or is retrieved from a remote URL (the `url`
    parameter). Only one of these three -- the first one in
    `content`, `filepath`, and `url` that is not ``None`` -- is
    used. One of the three must be specified.

    Scope of search can be refined with an optional CSS selector
    (`selector` parameter). If specified, search is performed inside
    *all* matched tags but no more. This should help greatly in noise
    reduction, especially from web pages that try to offer suggestions
    and whatnot at the bottom or in sidebars.

    Sometimes we need a base URL to resolve relative URLs. The `url`
    argument or its redirection (if any) is automatically used as base
    when the document is retrieved from `url`. Otherwise, the `base`
    parameter, if specified and not None, is used. If none of the above
    applies, ``http://localhost`` is used as a fallback. Note that the
    HTML ``<base>`` tag within the document may change the base URL, and
    that is respected.

    The `pattern` parameter supplies a regex pattern to match parsed
    URLs. Only matching URLs are returned. If no pattern is specified,
    ``r"^(?!javascript:)"`` is used to exclude the "``javascript:``
    scheme". You may supply an empty string if you want to include
    those as well.

    Parameters
    ----------
    pattern : str, optional
        Regex pattern to match parsed URLs against. Default is
        ``r"^(?!javascript:)"``.
    content : bytes or str, optional
        An HTML document.
    filepath : str, optional
        Path to a local HTML document.
    url : str, optional
        URL pointing to an HTML document (note that the ``file`` scheme
        is not supported by ``requests``). If no scheme is supplied in
        the URL, use ``http://``.
    selector : str, optional
        CSS selector recognized by BeautifulSoup. If specified, search
        scope is limited to all matched tags but no more.
    base : str, optional
        Base URL used for `content` or `filepath`. If no scheme is
        supplied in the URL, use ``http://`` (so for a local path, spell
        out the ``file://`` scheme explicitly).
    deduplicate : bool, optional
        Deduplicate matching URLs and only keep the first occurrence of
        each unique URL. Default is ``True``.
    session : requests.Session, optional
        If not ``None``, make HTTP requests within this session (the
        module-level ``REQUEST_HEADERS`` are then not applied). Default
        is ``None``.

    Returns
    -------
    matching_urls : list
        List of parsed absolute URLs matching the given pattern.

    Raises
    ------
    ValueError
        If content, filepath and url are all None.
    OSError
        If failed to open the specified file.
    requests.exceptions.RequestException
        If requests fail to retrieve the URL specified.

    """
    # pylint: disable=too-many-arguments,too-many-locals,too-many-branches
    urlscheme = re.compile(r"^\w+://")

    base = "localhost" if base is None else base
    if not urlscheme.match(base):
        base = "http://%s" % base

    # Pick the document source: content, then filepath, then url.
    if content is not None:
        pass  # the document was handed to us directly
    elif filepath is not None:
        # Read raw bytes and let the HTML parser detect the encoding.
        with open(filepath, mode='rb') as fileobj:
            content = fileobj.read()
    elif url is not None:
        if not urlscheme.match(url):
            url = "http://%s" % url
        if session is None:
            request = requests.get(url, headers=REQUEST_HEADERS)
        else:
            request = session.get(url)
        content = request.content
        # request.url is the final URL after any redirection.
        base = request.url
    else:
        raise ValueError("content, filepath and url cannot all be None")

    regex = (re.compile(pattern) if pattern is not None
             else re.compile(r"^(?!javascript:)"))

    soup = bs4.BeautifulSoup(content, "html.parser")

    # base URL might be modified by the HTML <base> tag, which must
    # reside inside <head>. The href may itself be relative, in which
    # case it is resolved against the base determined so far; an
    # absolute href simply replaces it.
    if soup.head and soup.head.base and "href" in soup.head.base.attrs:
        base = urllib.parse.urljoin(base, soup.head.base["href"])

    # select part of the soup with the optional selector
    selections = [soup] if selector is None else soup.select(selector)

    matching_urls = []
    for selection in selections:
        for tag in selection.descendants:
            # Text nodes have a name of None and are skipped here.
            if tag.name not in _TAG_ATTRS:
                continue
            for attribute in _TAG_ATTRS[tag.name]:
                if attribute not in tag.attrs:
                    continue
                parsed_url = urllib.parse.urljoin(base, tag[attribute])
                # grep-like: the pattern may match anywhere in the URL
                if regex.search(parsed_url):
                    matching_urls.append(parsed_url)

    if deduplicate:
        # dict keys keep insertion order, so this retains the first
        # occurrence of each URL in document order.
        matching_urls = list(dict.fromkeys(matching_urls))

    return matching_urls
def main():
    """CLI interface.

    Parse the command line, run `urlgrep` on stdin (when neither URLs
    nor files are given) or on every URL and file given, and print each
    matching URL on its own line to stdout.

    Returns
    -------
    returncode : int
        0 if every source was processed, 1 if at least one URL could
        not be retrieved or one file could not be opened.

    """
    description = """Parse URLs from HTML documents. When invoked with
    no URLs or files, read from stdin."""
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-u", "--url", action="append",
                        help="""URL to an HTML document to be
                        parsed. This option can be specified multiple
                        times on the command line. "http://" is
                        automatically attached if the scheme is left
                        out.""")
    parser.add_argument("-s", "--selector",
                        help="""CSS selector for restricting search
                        scope. If specified, the search scope is
                        restricted to all selected tags, and no
                        more.""")
    parser.add_argument("-b", "--base",
                        help="""Base URL. Only used for files or
                        stdin. "http://" is automatically attached if
                        the scheme is left out.""")
    parser.add_argument("-p", "--pattern",
                        help="""Regexp to match against.""")
    parser.add_argument("-d", "--preserve-duplicates", action="store_true",
                        help="""Do not deduplicate URLs within a document.""")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="""Print additional information to stderr.""")
    parser.add_argument("filepaths", metavar="FILE", nargs="*",
                        help="""Files to be parsed.""")
    args = parser.parse_args()

    urls = args.url if args.url is not None else []
    selector = args.selector
    filepaths = args.filepaths
    base = args.base
    pattern = args.pattern
    deduplicate = not args.preserve_duplicates
    # verbose if --verbose specified and sources more than one
    verbose = len(urls) + len(filepaths) >= 2 if args.verbose else False

    returncode = 0

    if not urls and not filepaths:
        # Read raw bytes, like the file path branch of urlgrep does, so
        # that the HTML parser can detect the encoding itself.
        content = sys.stdin.buffer.read()
        matching_urls = urlgrep(pattern=pattern, content=content,
                                selector=selector, base=base,
                                deduplicate=deduplicate)
        for matching_url in matching_urls:
            print(matching_url)

    for url in urls:
        try:
            # --base is only used for files or stdin; a retrieved
            # document uses its own (possibly redirected) URL as base.
            matching_urls = urlgrep(pattern=pattern, url=url,
                                    selector=selector,
                                    deduplicate=deduplicate)
            if verbose and matching_urls:
                sys.stderr.write("# from '%s':\n" % url)
            for matching_url in matching_urls:
                print(matching_url)
        except requests.exceptions.RequestException as err:
            # Report and carry on with the remaining sources.
            sys.stderr.write("error: failed to get '%s'\n" % url)
            sys.stderr.write("error: %s\n" % str(err))
            returncode = 1

    for filepath in filepaths:
        try:
            matching_urls = urlgrep(pattern=pattern, filepath=filepath,
                                    selector=selector, base=base,
                                    deduplicate=deduplicate)
            if verbose and matching_urls:
                sys.stderr.write("# from '%s':\n" % filepath)
            for matching_url in matching_urls:
                print(matching_url)
        except OSError as err:
            # Report and carry on with the remaining sources.
            sys.stderr.write("error: failed to open '%s'\n" % filepath)
            sys.stderr.write("error: %s\n" % str(err))
            returncode = 1

    return returncode
if __name__ == '__main__':
    # main() returns 0 or 1; hand that to the shell as the exit status.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment