A simple web crawler.
###############################################################################
# $Author: dsclose@gmail.com
# $Expects: Python version 3.4.3
###############################################################################
"""Crawls a given URL and presents a simple sitemap based upon which
webpages are found. Each page is given as an individual WebResource
with a URL, a title and a list of links within the page.
From the command line:
$ python crawl.py www.example.com
From within python:
>>> from sitemap import crawl
>>> for resource in crawl('www.example.com'):
... print(resource)
...
>>>
"""
###############################################################################
try:
    from argparse import ArgumentParser
    from urllib.parse import urlparse, urljoin
    from urllib.request import urlopen
    import re
except ImportError:
    print("Import Error: Expects Python 3.4.3")
    exit(1)
###############################################################################
class WebResource:
    """ Stores the title and links found at a crawled URL. """
    def __init__(self, url, title):
        """ A resource has a URL and a Title. Links are normally added
        via the add method.
        """
        self.url = url
        self.parsed_url = urlparse(url)
        self.title = title
        self.links = []
    def add(self, link_list):
        """ Takes a list of links, converts them to URLs based upon the
        WebResource's URL and appends them to the WebResource if they
        successfully pass the _filter method.
        """
        url_list = [urljoin(self.url, link) for link in link_list]
        self.links.extend(sorted(set(self._filter(url_list))))
    def _filter(self, url_list):
        """ Returns those URLs from 'url_list' that share the resource's
        hostname but have a different path. The effect is to filter out
        links within the same page and links to other hosts.
        Overload this method to modify what links are recorded against a
        WebResource (a sketch of such an overload follows this class).
        """
        non_page = lambda url: self.parsed_url.path != url.path
        local = lambda url: self.parsed_url.hostname == url.hostname
        suitable = lambda url: non_page(url) and local(url)
        return [url for url in url_list if suitable(urlparse(url))]
    def __str__(self):
        if not self.links:
            link_str = ''
        else:
            link_str = '\t Links: {}'.format('\n\t\t'.join(self.links))
        return '{}\n\t Title: {}\n{}'.format(self.url,
                                             self.title,
                                             link_str)
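##############################
# Illustrative sketch: one way to overload the _filter hook described
# above. 'ExternalLinkResource' is a hypothetical name; pass it to crawl()
# as 'resource_type' to record off-site links as well as local ones.
class ExternalLinkResource(WebResource):
    def _filter(self, url_list):
        """ Keep every link except those pointing back at this page,
        i.e. drop the same-host restriction of WebResource._filter.
        """
        return [url for url in url_list
                if urlparse(url).path != self.parsed_url.path]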
###############################################################################
class CrawlQueue:
    """ Queues uncrawled links. """
    def __init__(self, start_url):
        """ Prepares a queue for a crawl starting from 'start_url'. """
        self._queue = []    # uncrawled URLs
        self._all = []      # stored to prevent duplicates
        self.insert([start_url])
    def __iter__(self):
        return self
    def __next__(self):
        """ Iterate until the queue is empty. """
        try:
            return self._queue.pop(0)
        except IndexError:
            raise StopIteration()
    def insert(self, url_list):
        """ Adds unique URLs in the given URL list to the queue. """
        unique = [url for url in set(url_list) if url not in self._all]
        self._all.extend(unique)
        self._queue.extend(unique)
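##############################
# Usage sketch (illustrative; URLs are hypothetical):
#
#   >>> queue = CrawlQueue('http://www.example.com/')
#   >>> queue.insert(['http://www.example.com/a', 'http://www.example.com/'])
#   >>> [url for url in queue]     # the start URL is never re-queued
#   ['http://www.example.com/', 'http://www.example.com/a']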
###############################################################################
def _is_html(response):
    """ Returns True for both HTML and XHTML HTTP responses. """
    content = response.getheader('Content-Type', default='Unknown')
    return 'text/html' in content or 'application/xhtml+xml' in content
##############################
def _get_title_from(html):
    """ Returns the content of the first <title> tag in 'html'. """
    result = re.search(r'<title>(.*)</title>', html, re.IGNORECASE)
    if result:
        return result.group(1)
    else:
        return ''
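# Illustrative: _get_title_from('<title>Example</title>') returns 'Example'.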
##############################
def _get_links_from(html):
    """ Returns a list of the values of the href attribute from any
    anchor tags in 'html'.
    """
    return re.findall(r'<a[^>]+href=[\'"]?([^\'" >]+)', html, re.IGNORECASE)
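# Illustrative: _get_links_from('<a href="/about">About</a>') returns ['/about'].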
##############################
def _get_root_url(url):
    """ Returns 'url' ensuring that it has a scheme and a path. """
    if '//' not in url:             # ensure proper scheme or netloc parsing
        return _get_root_url('//' + url)
    full_url = urlparse(url, 'http')
    if full_url.path == '':         # prevent duplicate visits to /
        return full_url.geturl() + '/'
    else:
        return full_url.geturl()
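# Illustrative: _get_root_url('www.example.com') returns
# 'http://www.example.com/' (scheme defaulted, empty path replaced by '/').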
##############################
def crawl(root_url, resource_type=WebResource):
    """ Crawls the URL at 'root_url'. Yields a 'resource_type' for
    each resource crawled after adding any extracted links.
    Successfully added links are followed.
    """
    queue = CrawlQueue(_get_root_url(root_url))
    for url in queue:
        try:
            response = urlopen(url)
        except Exception:           # skip URLs that cannot be opened
            continue
        if response.status == 200 and _is_html(response):
            html = response.read().decode()
            resource = resource_type(url, _get_title_from(html))
            resource.add(_get_links_from(html))
            queue.insert(resource.links)
            yield resource
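##############################
# Illustrative: the 'resource_type' hook accepts any WebResource subclass,
# e.g. the ExternalLinkResource sketch above:
#
#   >>> for resource in crawl('www.example.com', ExternalLinkResource):
#   ...     print(resource)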
###############################################################################
if __name__ == '__main__':
    # only one argument, the URL
    parser = ArgumentParser(description='Crawls a URL to obtain a sitemap.')
    parser.add_argument('URL', help='The URL to crawl.')
    args = parser.parse_args()
    for resource in crawl(args.URL):
        print(resource)
# eof