Skip to content

Instantly share code, notes, and snippets.

@scoffey
Created May 4, 2011 01:22
Show Gist options
  • Save scoffey/954588 to your computer and use it in GitHub Desktop.
Save scoffey/954588 to your computer and use it in GitHub Desktop.
Python microformats parser that supports hCard, hCalendar, hResume and rel-tag and provides an extensible model for other microformats.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Model for microformat properties and parsers. A microformat parser can
parse an HTML element into a dictionary of properties, whose keys are
strings and whose values are strings or other dictionaries of properties.
As an example, the main program of this script parses an hResume from
a given URL.
"""
import pprint
import re
import sys
import urllib
import urllib2
import urlparse
import lxml.html
class BaseMicroformatProperty(object):
    """Base class for a microformat property.

    A property is located through ``node.find_class(root)`` and parsed
    into a one-item dictionary ``{key: value}``, or an empty dictionary
    when no value could be extracted.
    """

    WHITESPACE = re.compile(r'\s+')

    # Attributes that carry the property value for specific tags, in
    # lookup order. Any other tag, or a tag lacking all of the listed
    # attributes, falls back to its text content.
    VALUE_ATTRIBUTES = {
        'abbr': ('title',),
        'a': ('href', 'alt'),
        'img': ('src',),
        'object': ('data',),
    }

    def __init__(self, root, index=0):
        """Store the class name to look for and which match to select.

        root -- name passed to get_children() and used as the result key
        index -- position of the matching child to use, or None to
                 collect the values of all matching children in a list
        """
        self.root = root
        self.index = index  # None for multiple values, else selects one

    def parse(self, node):
        """Return ``{self.get_key(): value}`` for node, or ``{}``.

        With ``index=None`` the value is the list of all non-None child
        values (the property is omitted if that list is empty); otherwise
        it is the value of the child at position ``index`` (the property
        is omitted if there is no such child or its value is None).
        """
        children = self.get_children(node)
        if children is None:
            return {}
        if self.index is None:
            # Build a real list rather than using filter(): a list is
            # falsy when empty on every Python version, whereas a lazy
            # filter object is always truthy and would be reported as a
            # value even when nothing matched.
            candidates = [self.get_value(child) for child in children]
            value = [v for v in candidates if v is not None] or None
        else:
            try:
                child = children[self.index]
            except IndexError:
                value = None
            else:
                value = self.get_value(child)
        if value is None:
            return {}
        return {self.get_key(): value}

    def get_children(self, node):
        """Return the candidate elements for this property within node."""
        return node.find_class(self.root)

    def get_key(self):
        """Return the dictionary key under which the value is stored."""
        return self.root

    def get_value(self, node):
        """Extract the normalized value of a single matching element.

        The first attribute listed for the element's tag in
        VALUE_ATTRIBUTES that is present wins; otherwise the element's
        text content is used.
        """
        value = None
        for attr in self.VALUE_ATTRIBUTES.get(node.tag, ()):
            if attr in node.attrib:
                value = node.attrib[attr]
                break
        if value is None:
            value = node.text_content()
        return self.transform_value(value)

    def transform_value(self, value):
        """Strip value and collapse each whitespace run to one space."""
        return self.WHITESPACE.sub(' ', value.strip())
class NestingMicroformatProperty(BaseMicroformatProperty):
    """Base class for a microformat property that can have other
    properties nested inside it."""

    def __init__(self, root, index=0, subproperties=None):
        """Set up the property with an optional list of subproperties.

        subproperties -- property objects whose parse() results are
                         merged to form this property's value; a falsy
                         argument yields a fresh empty list
        """
        super(NestingMicroformatProperty, self).__init__(root, index)
        self.subproperties = subproperties if subproperties else []

    def get_value(self, node):
        """Merge the parse() results of all subproperties for node.

        Returns the merged dictionary, or None when it ends up empty.
        """
        merged = {}
        for subproperty in self.subproperties:
            parsed = subproperty.parse(node)
            if parsed is None:
                continue
            merged.update(parsed)
        return merged or None
class BaseMicroformatParser(NestingMicroformatProperty):
    """Base class for a microformat parser.

    Subclasses describe their properties in DEFINITION: a sequence whose
    items are either property names (strings) or dictionaries mapping a
    property name to the definition of its nested properties.
    """

    DEFINITION = ()

    def __init__(self, root, index=None):
        """Build the subproperties from get_definition() and initialize."""
        definition = self.get_definition()
        subproperties = self.get_subproperties(definition)
        super(BaseMicroformatParser, self).__init__(
            root, index=index, subproperties=subproperties)

    def get_subproperties(self, definition):
        """Translate a definition into a list of property objects.

        Strings become BaseMicroformatProperty instances; every item of
        a dictionary entry becomes a NestingMicroformatProperty whose
        subproperties are built recursively from the mapped definition.
        """
        properties = []
        for entry in definition:
            if not isinstance(entry, basestring):
                for name, nested in entry.iteritems():
                    properties.append(NestingMicroformatProperty(
                        name,
                        subproperties=self.get_subproperties(nested)))
                continue
            properties.append(BaseMicroformatProperty(entry))
        return properties

    def get_definition(self):
        """Return the property definition used by this parser."""
        return self.DEFINITION
class MultipleMicroformatParser(NestingMicroformatProperty):
    """Base class for a parser of multiple microformat properties.

    Combines the subproperties of several parser classes under a single
    root, so that one element can be read as more than one microformat.
    """

    def __init__(self, root, index=0, subindex=0, parsers=None):
        """Collect the subproperties of every parser class in parsers.

        root -- name shared by this property and the instantiated parsers
        index -- child selection for this property (None for all matches)
        subindex -- index passed to each instantiated parser class
        parsers -- iterable of parser classes, each called as
                   ``parser_class(root, index=subindex)``; None (the
                   default) means no parsers
        """
        super(MultipleMicroformatParser, self).__init__(root, index=index)
        # The default for parsers is None, which is not iterable; treat
        # it as "no parsers" instead of failing with a TypeError.
        for parser_class in parsers or ():
            parser = parser_class(root, index=subindex)
            self.subproperties.extend(parser.subproperties)
class HCardParser(BaseMicroformatParser):
    """hCard microformat parser; looks for the 'vcard' root by default."""

    DEFINITION = (
        'fn',
        {'n': (
            'family-name',
            'given-name',
            'additional-name',
            'honorific-prefix',
            'honorific-suffix',
        )},
        {'adr': (
            'post-office-box',
            'extended-address',
            'street-address',
            'locality',
            'region',
            'postal-code',
            'country-name',
            'type',
            'value',
        )},
        'agent',
        'bday',
        'category',
        'class',
        {'email': ('type', 'value')},
        {'geo': ('latitude', 'longitude')},
        'key',
        'label',
        'logo',
        'mailer',
        'nickname',
        'note',
        {'org': ('organization-name', 'organization-unit')},
        'photo',
        'rev',
        'role',
        'sort-string',
        'sound',
        {'tel': ('type', 'value')},
        'title',
        'tz',
        'uid',
        'url',
    )

    def __init__(self, root='vcard', index=None):
        """Create the parser for the given root name and index."""
        super(HCardParser, self).__init__(root, index=index)
class HCalendarParser(BaseMicroformatParser):
    """Implementation of an hCalendar microformat parser.

    Looks for the 'vevent' root by default.
    """

    DEFINITION = (
        'category',
        'class',
        'description',
        'dtend',
        'dtstart',
        'duration',
        'location',
        'status',
        'summary',
        'uid',
        'url',
        'last-modified',
        'rdate',
        'rrule',
        'attendee',
        'contact',
        # hCalendar names this property "organizer"; the "organiser"
        # spelling is kept so pages and callers relying on it still work.
        'organizer',
        'organiser',
        {'geo': ('latitude', 'longitude')},
    )

    def __init__(self, root='vevent', index=None):
        """Create the parser for the given root name and index."""
        super(HCalendarParser, self).__init__(root, index=index)
class HResumeParser(BaseMicroformatParser):
    """hResume microformat parser; looks for the 'hresume' root by
    default and delegates its sections to other parsers."""

    DEFINITION = ('summary', 'publications')

    def __init__(self, root='hresume', index=None):
        """Create the parser and register one subparser per section.

        After the plain properties from DEFINITION come: 'contact'
        (first hCard only), 'education' (hCalendar), 'experience' (all
        matches, each read with the hCard and hCalendar properties),
        'skill' (rel-tag) and 'affiliation' (hCard).
        """
        super(HResumeParser, self).__init__(root, index=index)
        experience = MultipleMicroformatParser(
            'experience',
            index=None,
            subindex=0,
            parsers=[HCardParser, HCalendarParser],
        )
        self.subproperties.extend([
            HCardParser('contact', index=0),
            HCalendarParser('education'),
            experience,
            RelTagParser('skill'),
            HCardParser('affiliation'),
        ])
class RelTagProperty(BaseMicroformatProperty):
    """Base class for a rel="tag" microformat property.

    Here ``root`` is the rel value passed to ``node.find_rel_links()``;
    the property value is the tag name taken from the link's href.
    """

    def __init__(self, root='tag', index=0):
        """Create the property for links whose rel is root."""
        super(RelTagProperty, self).__init__(root, index=index)

    def get_children(self, node):
        """Return the links within node whose rel matches self.root."""
        return node.find_rel_links(self.root)

    def get_value(self, node):
        """Return the tag name encoded in the link's href, or None.

        The tag is the last path segment of the URL, URL-unquoted (with
        '+' read as a space) and decoded as UTF-8. A link without an
        href yields None so that parse() leaves the property out.
        """
        href = node.attrib.get('href')
        if not href:
            # Nothing to extract; previously None was handed on to
            # unquote_plus(), which raised instead of skipping the link.
            return None
        path = urlparse.urlsplit(href)[2]
        # Ignore trailing slashes so ".../tag/foo/" yields "foo" and not
        # an empty tag name.
        segment = path.rstrip('/').split('/')[-1]
        if not isinstance(segment, bytes):
            # A unicode href (e.g. one containing non-ASCII characters)
            # must be byte-encoded before unquoting; otherwise the final
            # decode() triggers an implicit ASCII encode that can fail.
            segment = segment.encode('utf-8')
        return urllib.unquote_plus(segment).decode('utf-8')
class RelTagHrefUrlProperty(RelTagProperty):
    """rel-tag property that exposes the link's raw href as 'url'."""

    def get_key(self):
        """Return the fixed key 'url'."""
        return 'url'

    def get_value(self, node):
        """Return the href attribute of node unchanged (None if absent)."""
        attributes = node.attrib
        return attributes.get('href')
class RelTagTextContentProperty(RelTagProperty):
    """rel-tag property that exposes the link's text as 'text'."""

    def get_key(self):
        """Return the fixed key 'text'."""
        return 'text'

    def get_value(self, node):
        """Return the whitespace-normalized text content of node."""
        text = node.text_content()
        return self.transform_value(text)
class RelTagParser(BaseMicroformatParser):
    """rel="tag" microformat parser.

    Each matching link is parsed into its tag name, its 'url' and its
    'text'. ``root`` only names the resulting key; the links themselves
    are selected by ``rel``.
    """

    def __init__(self, root='tag', index=None, rel='tag'):
        """Create the parser and install the three rel-tag properties."""
        super(RelTagParser, self).__init__(root, index=index)
        self.rel = rel
        property_classes = (
            RelTagProperty,
            RelTagHrefUrlProperty,
            RelTagTextContentProperty,
        )
        # Replaces whatever the base class derived from DEFINITION.
        self.subproperties = [cls(rel) for cls in property_classes]

    def get_children(self, node):
        """Return the links within node whose rel matches self.rel."""
        return node.find_rel_links(self.rel)
# More microformats:
# hReview: http://microformats.org/wiki/hreview
# hAtom: http://microformats.org/wiki/hatom
# rel="license": http://microformats.org/wiki/rel-license
# rel="bookmark": http://microformats.org/wiki/rel-bookmark
# XFN: http://www.gmpg.org/xfn/intro
def main(program, url=None):
    """Main program: parse an hResume from the given URL and print it.

    program -- name of the script, used in the usage message
    url -- address of the page to parse; when missing or empty, a usage
           message is printed instead
    """
    if not url:
        # Parenthesized single-argument form: prints the same text as
        # the print statement and is also valid function-call syntax.
        print('Usage: python %s <URL>' % program)
        return
    from contextlib import closing

    parser = HResumeParser()
    # closing() guarantees the HTTP response is released even if read()
    # raises; the response object is not used as a context manager
    # directly.
    with closing(urllib2.urlopen(url)) as response:
        html = response.read()
    root = lxml.html.fromstring(html)
    data = parser.parse(root)
    pprint.pprint(data)


if __name__ == '__main__':
    main(*sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment