Skip to content

Instantly share code, notes, and snippets.

@iafisher
Last active March 9, 2019 22:40
Show Gist options
  • Save iafisher/31979b1f912f023b047101be40ea3145 to your computer and use it in GitHub Desktop.
Save iafisher/31979b1f912f023b047101be40ea3145 to your computer and use it in GitHub Desktop.
DEPRECATED (see my bookmarks_from_sql.py gist): A small Python utility to parse bookmarks exports from Firefox
"""
DEPRECATED: see my bookmarks_from_sql.py gist.

A short script to parse bookmark exports from Firefox so they can be
manipulated with Python.

Author: Ian Fisher (iafisher@protonmail.com)
Version: November 2018
"""
from collections import namedtuple
from html.parser import HTMLParser
# Immutable record for one bookmark, as populated by BookmarkParser:
#   title         -- text content of the <a> element
#   url           -- value of the element's href attribute
#   add_date      -- add_date attribute converted with int()
#                    (presumably a Unix timestamp in seconds -- TODO confirm)
#   last_modified -- last_modified attribute converted with int()
#                    (presumably a Unix timestamp in seconds -- TODO confirm)
#   section       -- text of the most recent <h3> seen before the bookmark,
#                    or None if no <h3> has been seen yet
Bookmark = namedtuple(
    'Bookmark', ['title', 'url', 'add_date', 'last_modified', 'section']
)
class BookmarkParser(HTMLParser):
    """Collect ``Bookmark`` records from a Firefox bookmarks HTML export.

    An incremental parser is less convenient to use than a DOM parser, but
    the standard library doesn't offer a DOM parser for HTML, and unfortunately
    Firefox's export format is not valid XML, so here we are.

    Pass the export text to ``feed()``. Every ``<a>`` element becomes one
    ``Bookmark`` appended to ``self.bookmarks`` in document order. The text
    of each ``<h3>`` element is remembered as the section for the bookmarks
    that follow it; the section is only ever replaced by a later ``<h3>``,
    it is not cleared when a folder's list ends.

    The parser is a small state machine: ``self.mode`` records what the next
    text node should be interpreted as.
    """

    # Values of ``self.mode``.
    EXPECTING_NOTHING = 0   # the next text node is ignored
    EXPECTING_HEADER = 1    # the next text node is a section (<h3>) name
    EXPECTING_BOOKMARK = 2  # the next text node is a bookmark (<a>) title

    def __init__(self):
        super().__init__()
        # Text of the most recent <h3>, or None before the first one.
        self.current_section = None
        self.mode = self.EXPECTING_NOTHING
        # Completed Bookmark records, in document order.
        self.bookmarks = []
        # Bookmark built from the attributes of the <a> being processed; its
        # title is filled in once the element's text (if any) is known.
        self.current_bookmark = None

    def handle_starttag(self, tag, attrs):
        """Start a bookmark on ``<a>``, a section name on ``<h3>``.

        Any other start tag resets the mode, so text that follows it is
        ignored.

        Raises:
            KeyError: If an ``<a>`` tag lacks ``href``, ``last_modified`` or
                ``add_date``.
            ValueError: If ``last_modified`` or ``add_date`` cannot be
                converted by ``int()``.
        """
        # HTMLParser lower-cases tag and attribute names before calling the
        # handlers, so Firefox's upper-case markup matches these comparisons.
        if tag == 'a':
            self.mode = self.EXPECTING_BOOKMARK
            self.current_bookmark = Bookmark(
                title='',
                url=get_attr(attrs, 'href'),
                last_modified=int(get_attr(attrs, 'last_modified')),
                add_date=int(get_attr(attrs, 'add_date')),
                section=self.current_section,
            )
        elif tag == 'h3':
            self.mode = self.EXPECTING_HEADER
        else:
            self.mode = self.EXPECTING_NOTHING

    def handle_endtag(self, tag):
        """Finalize an ``<a>`` or ``<h3>`` element that contained no text.

        When an element has text, ``handle_data`` has already consumed it and
        reset the mode, so neither branch fires. Without this handler an
        empty element would leave the mode set and the next unrelated text
        node (typically the whitespace between tags) would be taken as its
        title or section name.
        """
        if tag == 'a' and self.mode == self.EXPECTING_BOOKMARK:
            # Keep the bookmark; its title stays the '' set in handle_starttag.
            self.bookmarks.append(self.current_bookmark)
            self.mode = self.EXPECTING_NOTHING
        elif tag == 'h3' and self.mode == self.EXPECTING_HEADER:
            self.current_section = ''
            self.mode = self.EXPECTING_NOTHING

    def handle_data(self, data):
        """Interpret a text node according to the current mode.

        In header mode the text becomes the current section; in bookmark mode
        it becomes the title of the pending bookmark, which is then appended
        to ``self.bookmarks``. In every case the mode is reset afterwards so
        later text is ignored until the next ``<a>`` or ``<h3>``.
        """
        if self.mode == self.EXPECTING_HEADER:
            self.current_section = data
        elif self.mode == self.EXPECTING_BOOKMARK:
            # namedtuples are immutable, so build a copy carrying the title.
            self.current_bookmark = self.current_bookmark._replace(title=data)
            self.bookmarks.append(self.current_bookmark)
        self.mode = self.EXPECTING_NOTHING
def get_attr(attrs, key):
    """Return the value of the first attribute named *key*.

    Args:
        attrs: Iterable of ``(name, value)`` pairs, in the form
            ``HTMLParser.handle_starttag`` receives them.
        key: Attribute name to look for; compared to each name with ``==``.

    Returns:
        The value paired with the first matching name.

    Raises:
        KeyError: If no pair in *attrs* has the name *key*.
    """
    for akey, aval in attrs:
        if akey == key:
            return aval
    raise KeyError(key)
# Parse `bookmarks.html` from the current working directory.
parser = BookmarkParser()
# Decode explicitly as UTF-8 (the encoding Firefox declares in its export)
# instead of relying on the platform's locale default, which can mangle or
# reject non-ASCII titles.
with open('bookmarks.html', encoding='utf-8') as f:
    parser.feed(f.read())
# Bookmark objects are available as `parser.bookmarks`
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment