Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Created December 13, 2014 15:02
Show Gist options
  • Save mocobeta/6d13a4f4982f448c6b5f to your computer and use it in GitHub Desktop.
Save mocobeta/6d13a4f4982f448c6b5f to your computer and use it in GitHub Desktop.
Python script that imports Wikipedia XML dump data into PostgreSQL
# -*- coding: utf-8 -*-
import psycopg2
import xml.sax
from xml.sax.handler import ContentHandler
from dicttoxml import dicttoxml
# Parameterized statement: the values are handed to cursor.execute() separately
# so the driver quotes/escapes them. Page text routinely contains apostrophes,
# which would break (or inject into) a statement built by string formatting.
INSERT_STMT = "INSERT INTO pages (id, page) VALUES (%s, %s)"
# Number of inserted pages between two commits / progress messages.
COMMIT_WINDOW = 10000


class WikidataHandler(ContentHandler):
    """SAX handler that inserts every ``<page>`` element into table ``pages``.

    While the parser is inside a ``<page>`` element the handler rebuilds the
    element as nested dicts (``self.doc``), keyed by tag name, with the
    stripped text of leaf elements as values. When ``</page>`` is reached the
    dict is converted back to an XML string with ``dicttoxml`` and inserted
    together with the page id. Pages whose revision text starts with
    ``#REDIRECT`` are skipped, as are pages that have no id or cannot be
    serialized.

    Args:
        conn: DB connection object; ``cursor()`` and ``commit()`` are used.
        stop: optional maximum number of pages to insert. Once reached, an
            ``xml.sax.SAXException`` is raised to abort parsing.
    """

    def __init__(self, conn, stop=None):
        # Explicit base call (instead of super()) keeps working even where
        # ContentHandler is an old-style class.
        ContentHandler.__init__(self)
        self.in_page = False
        self.buff = ""  # raw character data collected since the last end tag
        self.doc = {}  # dict holding xml tag and data
        self.tags = []  # list holding xml path (root to leaf node)
        self.conn = conn
        self.stop = stop
        self.processed = 0  # number of pages inserted so far

    def startElement(self, name, attrs):
        """Enter page mode on ``<page>`` and push the tag onto the path."""
        if name == "page":
            self.in_page = True
        if self.in_page:
            # dig xml path hierarchy
            self.tags.append(name)

    def endElement(self, name):
        """Store the finished element; on ``</page>`` write the page to the DB.

        Raises:
            xml.sax.SAXException: when ``stop`` pages have been inserted.
        """
        if not self.in_page:
            return

        # Strip once per element, not per chunk, so whitespace inside the
        # text (newlines, spaces at chunk boundaries) is preserved.
        text = self.buff.strip()
        self.buff = ""

        # Walk (and create where missing) the dict path for the current
        # element. Only the last path entry is the element being closed, so
        # only that position may receive the text; an existing entry is never
        # overwritten (the first value seen for a path wins).
        node = self.doc.setdefault(self.tags[0], {})
        path = self.tags[1:]
        last = len(path) - 1
        for depth, tag in enumerate(path):
            if tag not in node:
                node[tag] = text if depth == last and text else {}
            node = node[tag]
            if not isinstance(node, dict):
                # A text value already occupies this path; cannot descend.
                break

        # up xml path hierarchy
        self.tags.pop()

        if name != "page":
            return

        # Page finished: take the document and clear status right away so
        # every exit path below leaves the handler ready for the next page.
        self.in_page = False
        doc = self.doc
        self.doc = {}
        self.tags = []
        page = doc["page"]

        if self._is_redirect(page):
            # skip redirect page
            return

        page_id = page.get("id")
        if not page_id or isinstance(page_id, dict):
            # no usable id (missing or empty element): nothing to key the row on
            return

        try:
            # dict to xml string
            xmlstr = dicttoxml(doc, root=False)
        except Exception:
            # if failed, skip this page
            return
        if isinstance(xmlstr, bytes):
            xmlstr = xmlstr.decode("utf-8")

        # insert to db; the with-block closes the cursor afterwards
        with self.conn.cursor() as cur:
            cur.execute(INSERT_STMT, (page_id, xmlstr))

        # Count first, then test, so the message reports pages actually done
        # and the first commit happens after a full window.
        self.processed += 1
        if self.processed % COMMIT_WINDOW == 0:
            print("%d pages were processed." % self.processed)
            self.conn.commit()

        if self.stop and self.processed >= self.stop:
            raise xml.sax.SAXException("%d docs already processed." % self.processed)

    def characters(self, content):
        """Collect character data of the current element while inside a page.

        The parser may deliver one text node in several chunks, so chunks are
        appended unmodified; stripping happens in ``endElement``.
        """
        if self.in_page:
            self.buff += content

    @staticmethod
    def _is_redirect(page):
        """Return True if the page's revision text starts with ``#REDIRECT``."""
        revision = page.get("revision")
        if not isinstance(revision, dict):
            return False
        text = revision.get("text")
        # An empty <text/> is stored as {} and is not a redirect.
        if not text or isinstance(text, dict):
            return False
        return text.startswith("#REDIRECT")
if __name__ == "__main__":
    # Usage: <script> DUMP_XML [MAX_PAGES]
    #   DUMP_XML  - path of the XML dump to import
    #   MAX_PAGES - optional number of pages after which the import stops
    import sys

    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: %s <dump.xml> [max-pages]" % sys.argv[0])

    fname = sys.argv[1]
    stop = int(sys.argv[2]) if len(sys.argv) > 2 else None
    conn = psycopg2.connect(host="localhost", port=5432, database="wikipedia")
    handler = WikidataHandler(conn, stop)
    try:
        xml.sax.parse(fname, handler)
    except xml.sax.SAXException as exc:
        # The handler aborts parsing with a SAXException once `stop` pages
        # were processed. That is the requested outcome, not a failure, so
        # report it and finish normally; anything else is re-raised.
        if not stop or handler.processed < stop:
            raise
        print(exc)
    finally:
        # Commit whatever was inserted since the last commit window, and
        # close the connection even if that commit itself fails.
        try:
            conn.commit()
        finally:
            conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment