Last active May 4, 2020 21:32
Script to generate dataset from DH Q&A archive (preliminary)
#!/usr/bin/env python
Script to parse data from the DH Q&A archive.
Install python dependencies:
pip install beautifulsoup4 feedparser
Clone DH Q&A archive repository:
Run this script in the top-level directory of the repository.
import csv
import datetime
import glob
import os
import re
from bs4 import BeautifulSoup, Comment
import feedparser
import requests
baseurl = ''
def get_post_info(div, topic_url, feed):
    '''Take a post container and return a dict of post info.

    :param div: bs4 element for the post container; its ``id`` is used
        as the permalink anchor and its first child ``div`` carries the
        ``position-#`` id
    :param topic_url: base url for the topic this post belongs to
    :param feed: rss feedparser object for the topic, or a falsy value
        when no feed is available
    :returns: dict with keys ``url``, ``order``, ``author url``,
        ``author``, ``content``, ``best answer``, and, when the
        information is found, ``reply to``, ``relative date``, ``date``
    '''
    info = {}
    # generate permalink from li id since in at least one
    # case the permalink isn't found
    info['url'] = '%s#%s' % (topic_url, div['id'])
    # first div id includes order information as position-#
    info['order'] = div.div['id'].split('-')[1]

    threadauthor = div.find('div', class_='threadauthor')
    author_url = threadauthor.a['href']
    # members have local profile urls
    if author_url.startswith('/'):
        author_url = '%s%s' % (baseurl, author_url)
    info['author url'] = author_url
    info['author'] = threadauthor.find('strong').get_text()

    # question is in first threadpost
    threadpost = div.find('div', class_='threadpost')
    # remove 'tweet this question' block and related comments
    social = threadpost.find('div', class_="social-it")
    if social:
        social.extract()
        html_comments = threadpost.find_all(
            string=lambda text: isinstance(text, Comment))
        for comment in html_comments:
            comment.extract()
    # get post content
    info['content'] = threadpost.div.prettify()

    # check if this is a reply to a specific post
    if threadpost.p and threadpost.p.get_text().startswith('Replying to'):
        # name could be a link, so get last link in the reply p
        reply_to_post = threadpost.p.find_all('a')[-1]['href']
        info['reply to'] = '%s%s' % (baseurl, reply_to_post)

    # check if marked as a best answer; look inside the container passed
    # in rather than relying on a module-level loop variable
    info['best answer'] = bool(div.find('div', class_='best_answer'))

    # post date
    poststuff = div.find('div', class_='poststuff')
    if poststuff:
        # text is of the form "Posted x years ago Permalink"
        relative_post_date = poststuff.text.replace('Posted ', '') \
            .replace(' Permalink', '')
        info['relative date'] = relative_post_date.strip()
    else:
        print('poststuff div not found for %(url)s' % info)

    # find RSS entry for this record if possible
    if feed:
        # NOTE(review): assumes the feed entry link matches the generated
        # permalink (topic url + #post-id) -- confirm against the feeds
        entry = next(
            (e for e in feed.entries if e.get('link') == info['url']), None)
        if entry is None:
            print('ERROR: not in feed %s' % info['url'])
        else:
            published = entry.get('published_parsed')
            if published:
                # convert parsed timestruct into isoformat
                info['date'] = datetime.datetime(*published[:6]).isoformat()
    return info
def wayback_machine_timestamp(url):
    '''Get timestamp for most recent capture of a url from wayback
    machine api.

    :param url: url to look up
    :returns: timestamp string of the closest archived snapshot, or
        None when the request fails, the response is not OK, or no
        snapshot is reported
    '''
    # NOTE(review): endpoint is the Wayback Machine availability API,
    # inferred from the response keys used below -- confirm
    try:
        response = requests.get('https://archive.org/wayback/available',
                                params={'url': url}, timeout=30)
    except requests.RequestException as err:
        # lookup is best-effort; callers treat None as "no snapshot"
        print('ERROR: wayback machine lookup failed for %s: %s' % (url, err))
        return None

    if response.status_code == requests.codes.ok:
        data = response.json()
        # if archived snapshots is not empty, return closest timestamp
        snapshots = data.get('archived_snapshots')
        if snapshots:
            return snapshots['closest']['timestamp']
    return None
# Main script: walk every archived topic page, collect one record per
# post, and write all records to dhqa_data.csv.
dhqa_posts = []
# NOTE(review): DictWriter rejects rows containing keys that are not
# listed here, so every key set by get_post_info and topic_data must
# appear; column order for the added names is a guess -- confirm
post_fieldnames = [
    'url',
    'topic url',
    'question',
    'tags',
    'order',
    'author',
    'author url',
    'content',
    'best answer',
    'date',
    'relative date',
    'snapshot date',
    'reply to',
]
# post containers are li elements with ids like post-123
post_id_re = re.compile(r'^post-\d+')

for path in glob.glob('topic/*/index.html'):
    # topic meta should include url for topic,
    # but is not completely reliable!
    # generate from filename instead
    topic_url = '%s/%s' % (baseurl, os.path.dirname(path))
    capture_date = wayback_machine_timestamp(topic_url)
    topic_data = {
        'topic url': topic_url,
        'snapshot date': capture_date or '',
    }

    with open(path) as topicdoc:
        soup = BeautifulSoup(topicdoc, 'html.parser')

    # page title is question (summary/brief)
    topic_data['question'] = soup.find('h2').get_text()
    tags = soup.find_all('a', rel='tag')
    # should tags apply to all posts or just question?
    topic_data['tags'] = ';'.join([t.get_text() for t in tags])

    # html doesn't have a proper date but RSS should
    # get rss filename from rss link
    rss = soup.find('a', class_="rss-link")['href'].lstrip('/')
    if os.path.exists(rss):
        feed = feedparser.parse(rss)
        if not feed.entries:
            print('ERROR: RSS file has no content: %s' % rss)
            feed = None
    else:
        # NOTE: missing 11 topic RSS feeds
        # may be able to get date from tag feeds
        print('ERROR: Missing RSS file: %s' % rss)
        feed = None

    for post in soup.find_all('li', id=post_id_re):
        post_data = get_post_info(post, topic_url, feed)
        # topic-level fields are repeated on every post row
        post_data.update(topic_data)
        dhqa_posts.append(post_data)

    # check for second page (few cases; nothing has more than 2 pages)
    next_link = soup.find('a', class_='next')
    if next_link:
        page_two = '%s/index.html' % next_link['href'].lstrip('/')
        with open(page_two) as page_two_doc:
            soup2 = BeautifulSoup(page_two_doc, 'html.parser')
        for post in soup2.find_all('li', id=post_id_re):
            # unlikely these will be in RSS feed...
            post_data = get_post_info(post, topic_url, feed)
            post_data.update(topic_data)
            dhqa_posts.append(post_data)

print('%d posts total' % len(dhqa_posts))

# newline='' lets the csv module control line endings itself
with open('dhqa_data.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=post_fieldnames)
    writer.writeheader()
    writer.writerows(dhqa_posts)
One small request is to have field headers with underscores rather than spaces. Otherwise I can't use tab completion with dot notation in pandas. Thanks 🙏

Copy link

Good to know! I go back and forth on whether field names should be more dev-friendly or more readable; it sounds like I should use underscores in future.

Copy link

That's a great question! Maybe it would help to clarify how this data will be used? I could see a more dev-friendly version of the dataset versus a more data-analysis-friendly version, if you think having both might be worthwhile.

