Created
February 5, 2015 19:13
-
-
Save davclark/6b6ed503d6a1e5e48516 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
'''ungraded.py - extract answers to ungraded problems | |
Data is obtained from the `course_structure` and `courseware_studentmodule` | |
data, which are currently specified at the top of the file. | |
This was developed in response to a request from an instructor to obtain | |
individual student responses to ungraded questions *with 0 weight*. However, | |
upon inspection, I noted that all items had either 0 weight, or had weight | |
unspecified, so I extract answers for the (maybe?) more general class of *all* | |
ungraded problems. It's easy enough for an instructor to ignore a given file, | |
and once we've read the data in, that process is pretty fast - so this slightly | |
more general approach seems appropriate. | |
''' | |
from os import makedirs # Nicer than mkdir | |
import json | |
import pandas as pd | |
# Input file names. These could become function parameters; most folks will
# need to replace them with their own data!
structure_fname = 'BerkeleyX-GG101x-1T2014-course_structure-prod-analytics.json'
student_fname = 'BerkeleyX-GG101x-1T2014-courseware_studentmodule-prod-analytics.sql'

# A JSON file with information about course content. JSON is UTF-8 by
# specification, so decode it explicitly instead of relying on whatever the
# platform's default locale encoding happens to be.
with open(structure_fname, encoding='utf-8') as f:
    course_structure = json.load(f)
# A sql dump that's actually a TSV file.
# Reducing the amount of parsing & data retained here is likely the biggest
# optimization target.
# `sep` is passed by keyword: since pandas 2.0 every read_csv argument after
# the path is keyword-only, so the positional form raises TypeError there.
studentmodule = pd.read_csv(student_fname, sep='\t', na_values='na')
# Student answers are unfortunately heavily quoted JSON (so you see things like | |
# "The following is \\"Quoted\\""), as well as ASCII-escaped unicode characters. | |
# I don't want to process ALL student answers, so I only process selected rows | |
# below using this function | |
def extract_student_answers(s):
    '''Return the ``student_answers`` mapping stored in a raw ``state`` value.

    Helper for the export loop: it is applied to individual ``state`` cells
    of the studentmodule table.

    Parameters:
        s: the raw ``state`` cell. Normally a string holding JSON whose
           backslashes were escaped once more by the SQL dump. Anything that
           is not a non-blank string (e.g. the NaN pandas uses for a missing
           value) is treated as "no answers".

    Returns:
        The value stored under the ``student_answers`` key, or an empty dict
        when the input is missing/blank, the decoded JSON is not an object,
        or the key is absent.

    Raises:
        json.JSONDecodeError: if the un-escaped text is not valid JSON.
    '''
    # Missing cells are not strings, and json.loads() rejects blank text;
    # both simply mean there is nothing to extract.
    if not isinstance(s, str) or not s.strip():
        return {}

    # Undo the dump's extra layer of backslash escaping. 'unicode_escape'
    # reads raw bytes as Latin-1, so the text has to be encoded as Latin-1
    # too (not UTF-8) or literal non-ASCII characters come back as mojibake.
    # 'backslashreplace' turns characters outside Latin-1 into \uXXXX escapes,
    # which the decode step then restores.
    s = s.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    data = json.loads(s)

    # Valid JSON is not necessarily an object; only objects can hold answers.
    if not isinstance(data, dict):
        return {}

    # This appears to be where the student answers reliably occur. Of the data
    # looked at so far, this is a blob that also includes an escaped version
    # of the i4x index, which is deliberately left in place. It looks like:
    #   i4x-BerkeleyX-GG101x-problem-db71da27320a44bdb45df31d0d801e20_2_1
    # whereas the initial index looked like:
    #   i4x://BerkeleyX/GG101x/problem/db71da27320a44bdb45df31d0d801e20
    # Note the lack of the trailing _2_1
    return data.get('student_answers', {})
def _find_ungraded_problems(structure):
    '''Map each ungraded vertical's display name to its problems.

    Essentially implements an XPath-style query with for loops. The course
    structure stores children as id references rather than by containment, so
    every level is resolved through another lookup in ``structure``.

    Parameters:
        structure: dict mapping a module id to its description, where each
            description has 'category', 'metadata' and 'children' entries.

    Returns:
        dict mapping a vertical's display name to a list of
        ``(problem display name, problem id)`` tuples, in traversal order.
    '''
    found = {}
    for node in structure.values():
        # Based on exploration of the data, all top-level containers are
        # 'sequential'. Whether that is guaranteed is undocumented.
        if node['category'] != 'sequential':
            continue
        # Graded sequentials carry 'graded': True and ungraded ones usually
        # lack the key. Testing the value (rather than the key's presence)
        # also treats an explicit 'graded': False as ungraded.
        if node['metadata'].get('graded'):
            continue
        for vert_id in node['children']:
            vert = structure[vert_id]
            vert_name = vert['metadata']['display_name']
            for child_id in vert['children']:
                child = structure[child_id]
                # There are many other categories (e.g. video), but only
                # problems are collected here.
                if child['category'] != 'problem':
                    continue
                # If you want to debug, print stuff here:
                # print('\t', child['metadata']['display_name'])
                found.setdefault(vert_name, []).append(
                    (child['metadata']['display_name'], child_id))
    return found


ungraded = _find_ungraded_problems(course_structure)
# Create a hierarchy of directories and files corresponding to sections and
# student answers to the selected ungraded problems: one directory per
# section, one TSV file per problem.
for section, problems in ungraded.items():
    # Might not work on Windows (same with .to_csv() below)
    makedirs('ungraded_problems/' + section, exist_ok=True)
    for name, pid in problems:
        selected = studentmodule.loc[studentmodule.module_id == pid]
        # Only the rows for this problem are parsed - we don't want to do this
        # on all rows! .assign() returns a new frame, so the parsed column is
        # never written onto a possible view of `studentmodule` (which is what
        # used to trigger pandas' SettingWithCopyWarning).
        raw_records = selected.assign(
            student_answers=selected.state.apply(extract_student_answers))
        # NOTE: problems that share a display name within a section are
        # written to the same path, so the later one overwrites the earlier.
        outfname = 'ungraded_problems/{}/{}.tsv'.format(section, name)
        raw_records.to_csv(outfname, sep='\t')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment