A simple web crawler.
###############################################################################
# $Author: dsclose@gmail.com
# $Expects: Python version 3.4.3
###############################################################################
"""Crawls a given URL and presents a simple sitemap based upon which
webpages are found. Each page is given as an individual WebResource
with a URL, a title and a list of links within the page.
From the command line:
$ python crawl.py www.example.com
From within python:
>>> from sitemap import crawl
>>> for resource in crawl('www.example.com'):
... print(resource)
...
>>>
"""
###############################################################################
try:
    from argparse import ArgumentParser
    from urllib.parse import urlparse, urljoin
    from urllib.request import urlopen
    import re
except ImportError:
    print("Import Error: Expects Python 3.4.3")
    exit(1)
###############################################################################
class WebResource:
    """ Stores the title and links found at a crawled URL. """
    def __init__(self, url, title):
        """ A resource has a URL and a Title. Links are normally added
        via the add method.
        """
        self.url = url
        self.parsed_url = urlparse(url)
        self.title = title
        self.links = []
    def add(self, link_list):
        """ Takes a list of links, converts them to URLs based upon the
        WebResource's URL and appends them to the WebResource if they
        successfully pass the _filter method.
        """
        url_list = [urljoin(self.url, link) for link in link_list]
        self.links.extend(sorted(set(self._filter(url_list))))
    def _filter(self, url_list):
        """ Returns those URLs from 'url_list' that share the resource's
        hostname but have a different path. The effect is to filter out
        links within the same page and links to other hosts.
        Overload this method to modify what links are recorded against a
        WebResource (a sketch of such an overload follows this class).
        """
        non_page = lambda url: self.parsed_url.path != url.path
        local = lambda url: self.parsed_url.hostname == url.hostname
        suitable = lambda url: non_page(url) and local(url)
        return [url for url in url_list if suitable(urlparse(url))]
    def __str__(self):
        if not self.links:
            link_str = ''
        else:
            link_str = '\t Links: {}'.format('\n\t\t'.join(self.links))
        return '{}\n\t Title: {}\n{}'.format(self.url,
                                             self.title,
                                             link_str)
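##############################
# Illustrative sketch: one way to overload the _filter hook described
# above. 'ExternalLinkResource' is a hypothetical name; pass it to crawl()
# as 'resource_type' to record off-site links as well as local ones.
class ExternalLinkResource(WebResource):
    def _filter(self, url_list):
        """ Keep every link except those pointing back at this page,
        i.e. drop the same-host restriction of WebResource._filter.
        """
        return [url for url in url_list
                if urlparse(url).path != self.parsed_url.path]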
###############################################################################
class CrawlQueue:
    """ Queues uncrawled links. """
    def __init__(self, start_url):
        """ Prepares a queue for a crawl starting from 'start_url'. """
        self._queue = []    # uncrawled URLs
        self._all = []      # stored to prevent duplicates
        self.insert([start_url])
    def __iter__(self):
        return self
    def __next__(self):
        """ Iterate until the queue is empty. """
        try:
            return self._queue.pop(0)
        except IndexError:
            raise StopIteration()
    def insert(self, url_list):
        """ Adds unique URLs in the given URL list to the queue. """
        unique = [url for url in set(url_list) if url not in self._all]
        self._all.extend(unique)
        self._queue.extend(unique)
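##############################
# Usage sketch (illustrative; URLs are hypothetical):
#
#   >>> queue = CrawlQueue('http://www.example.com/')
#   >>> queue.insert(['http://www.example.com/a', 'http://www.example.com/'])
#   >>> [url for url in queue]     # the start URL is never re-queued
#   ['http://www.example.com/', 'http://www.example.com/a']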
###############################################################################
def _is_html(response):
    """ Returns True for both HTML and XHTML HTTP responses. """
    content = response.getheader('Content-Type', default='Unknown')
    return 'text/html' in content or 'application/xhtml+xml' in content
##############################
def _get_title_from(html):
    """ Returns the content of the first <title> tag in 'html'. """
    result = re.search(r'<title>(.*)</title>', html, re.IGNORECASE)
    if result:
        return result.group(1)
    else:
        return ''
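# Illustrative: _get_title_from('<title>Example</title>') returns 'Example'.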
##############################
def _get_links_from(html):
    """ Returns a list of the values of the href attribute from any
    anchor tags in 'html'.
    """
    return re.findall(r'<a[^>]+href=[\'"]?([^\'" >]+)', html, re.IGNORECASE)
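# Illustrative: _get_links_from('<a href="/about">About</a>') returns ['/about'].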
##############################
def _get_root_url(url):
    """ Returns 'url' ensuring that it has a scheme and a path. """
    if '//' not in url:             # ensure proper scheme or netloc parsing
        return _get_root_url('//' + url)
    full_url = urlparse(url, 'http')
    if full_url.path == '':         # prevent duplicate visits to /
        return full_url.geturl() + '/'
    else:
        return full_url.geturl()
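# Illustrative: _get_root_url('www.example.com') returns
# 'http://www.example.com/' (scheme defaulted, empty path replaced by '/').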
##############################
def crawl(root_url, resource_type=WebResource):
    """ Crawls the URL at 'root_url'. Yields a 'resource_type' for
    each resource crawled after adding any extracted links.
    Successfully added links are followed.
    """
    queue = CrawlQueue(_get_root_url(root_url))
    for url in queue:
        try:
            response = urlopen(url)
        except Exception:           # skip URLs that cannot be opened
            continue
        if response.status == 200 and _is_html(response):
            html = response.read().decode()
            resource = resource_type(url, _get_title_from(html))
            resource.add(_get_links_from(html))
            queue.insert(resource.links)
            yield resource
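##############################
# Illustrative: the 'resource_type' hook accepts any WebResource subclass,
# e.g. the ExternalLinkResource sketch above:
#
#   >>> for resource in crawl('www.example.com', ExternalLinkResource):
#   ...     print(resource)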
###############################################################################
if __name__ == '__main__':
    # only one argument, the URL
    parser = ArgumentParser(description='Crawls a URL to obtain a sitemap.')
    parser.add_argument('URL', help='The URL to crawl.')
    args = parser.parse_args()
    for resource in crawl(args.URL):
        print(resource)
# eof