@rlskoeser
Last active May 4, 2020 21:32
Script to generate a dataset from the DH Q&A archive (preliminary)
#!/usr/bin/env python
'''
Script to parse data from the DH Q&A archive.

Install python dependencies:

    pip install beautifulsoup4 feedparser requests

Clone the DH Q&A archive repository:
https://github.com/achorg/DH-Answers-Archive

Run this script in the top-level directory of the repository.
'''
import csv
import datetime
import glob
import os
import re
from bs4 import BeautifulSoup, Comment
import feedparser
import requests
baseurl = 'http://digitalhumanities.org/answers'
def get_post_info(div, topic_url, feed):
    '''Take a post container and return a dict of post info.
    Takes a bs4 div, the base url for this topic, and an rss feedparser object.'''
    info = {}
    # generate permalink from li id since in at least one
    # case the permalink isn't found
    info['url'] = '%s#%s' % (topic_url, div['id'])
    # first div id includes order information as position-#
    info['order'] = div.div['id'].split('-')[1]
    threadauthor = div.find('div', class_='threadauthor')
    author_url = threadauthor.a['href']
    # members have local profile urls
    if author_url.startswith('/'):
        author_url = '%s%s' % (baseurl, author_url)
    info['author url'] = author_url
    info['author'] = threadauthor.find('strong').get_text()
    # question is in first threadpost
    threadpost = div.find('div', class_='threadpost')
    # remove 'tweet this question' block and related comments
    social = threadpost.find('div', class_='social-it')
    if social:
        social.extract()
    for comment in threadpost.findAll(
            text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # get post content
    info['content'] = threadpost.div.prettify()
    # check if this is a reply to a specific post
    if threadpost.p and threadpost.p.get_text().startswith('Replying to'):
        # name could be a link, so get the last link in the reply p
        reply_to_post = threadpost.p.find_all('a')[-1]['href']
        info['reply to'] = '%s%s' % (baseurl, reply_to_post)
    # check if marked as a best answer
    info['best answer'] = bool(div.find('div', class_='best_answer'))
    # post date
    poststuff = div.find('div', class_='poststuff')
    if poststuff:
        # relative dates look like "Posted x years ago"
        relative_post_date = poststuff.text \
            .replace('Posted ', '').replace(' Permalink', '')
        info['relative date'] = relative_post_date.strip()
    else:
        print('poststuff div not found for %(url)s' % info)
    # find the RSS entry for this record if possible
    if feed:
        entries = [e for e in feed.entries if e.link == info['url']]
        if entries:
            entry = entries[0]
            # convert parsed time struct into isoformat
            info['date'] = datetime.datetime(*entry.published_parsed[:6]) \
                .isoformat()
        else:
            print('ERROR: not in feed %s' % info['url'])
    return info
def wayback_machine_timestamp(url):
    '''Get the timestamp for the most recent capture of a url from the
    Wayback Machine availability API.'''
    response = requests.get('http://archive.org/wayback/available',
                            params={'url': url})
    if response.status_code == requests.codes.ok:
        data = response.json()
        # if archived snapshots is not empty, return the closest timestamp
        if data['archived_snapshots']:
            return data['archived_snapshots']['closest']['timestamp']
dhqa_posts = []

post_fieldnames = [
    'url',
    'topic url',
    'question',
    'tags',
    'author',
    'author url',
    'content',
    'best answer',
    'date',
    'relative date',
    'snapshot date',
    'order',
    'reply to',
]
for path in glob.glob('topic/*/index.html'):
    # topic meta should include the url for the topic,
    # but is not completely reliable! generate from the filename instead
    topic_url = '%s/%s' % (baseurl, os.path.dirname(path))
    capture_date = wayback_machine_timestamp(topic_url)
    topic_data = {
        'topic url': topic_url,
        'snapshot date': capture_date or ''
    }
    with open(path) as topicdoc:
        soup = BeautifulSoup(topicdoc, 'html.parser')
        # page title is the question (summary/brief)
        topic_data['question'] = soup.find('h2').get_text()
        tags = soup.find_all('a', rel='tag')
        topic_data['tags'] = ';'.join([t.get_text() for t in tags])
        # should tags apply to all posts or just the question?

        # html doesn't have a proper date but RSS should;
        # get the rss filename from the rss link
        rss = soup.find('a', class_='rss-link')['href'].lstrip('/')
        if os.path.exists(rss):
            feed = feedparser.parse(rss)
            if not feed.entries:
                print('ERROR: RSS file has no content: %s' % rss)
                feed = None
        else:
            print('ERROR: Missing RSS file: %s' % rss)
            feed = None

        posts = soup.findAll('li', id=re.compile(r'^post-\d+'))
        for post in posts:
            post_data = get_post_info(post, topic_url, feed)
            post_data.update(topic_data)
            dhqa_posts.append(post_data)

        # check for a second page (few cases; nothing has more than 2 pages)
        next_link = soup.find('a', class_='next')
        if next_link:
            page_two = '%s/index.html' % next_link['href'].lstrip('/')
            with open(page_two) as page_two_doc:
                soup2 = BeautifulSoup(page_two_doc, 'html.parser')
                posts = soup2.findAll('li', id=re.compile(r'^post-\d+'))
                for post in posts:
                    # unlikely these will be in the RSS feed...
                    post_data = get_post_info(post, topic_url, feed)
                    post_data.update(topic_data)
                    dhqa_posts.append(post_data)
# NOTE: missing 11 topic RSS feeds;
# may be able to get date from tag feeds
print('%d posts total' % len(dhqa_posts))

with open('dhqa_data.csv', 'w') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=post_fieldnames)
    writer.writeheader()
    writer.writerows(dhqa_posts)
@ZoeLeBlanc

One small request is to have field headers with underscores rather than spaces. Otherwise I can't use tab completion with dot notation in pandas. Thanks 🙏

@rlskoeser
Author

Good to know! I go back and forth on whether field names should be more dev-friendly or more readable; sounds like I should use underscores in the future.
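
In the meantime, a quick workaround on the pandas side might look something like this (just a sketch, assuming the dhqa_data.csv output from the script above):

```python
import pandas as pd

# load the generated CSV and swap spaces in column names for underscores,
# so the columns work with tab completion and dot notation
df = pd.read_csv('dhqa_data.csv')
df.columns = df.columns.str.replace(' ', '_')
```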

@ZoeLeBlanc

That's a great question! Maybe it's something to clarify based on how this data will be used? I could see a more dev-friendly version of the dataset versus a more data-analysis-friendly version, if you think having both might be worthwhile.
