Skip to content

Instantly share code, notes, and snippets.

@corajr
Last active June 18, 2019 19:24
Show Gist options
  • Save corajr/c9289e1a38b04614e6fdbc2bf820be0c to your computer and use it in GitHub Desktop.
Save corajr/c9289e1a38b04614e6fdbc2bf820be0c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Usage: ./learning_time_sample.py SESSION_GAP_IN_MINUTES
Requires: pip install intervaltree
This script sums up sessions from a BigQuery export, generated using the following query:
SELECT
* EXCEPT(row_number)
FROM (
SELECT
kaid,
content.content_id,
activity,
start_time,
end_time,
learning_time_ms,
ROW_NUMBER() OVER (PARTITION BY info.request_id) row_number
FROM
`khanacademy.org:deductive-jet-827.log_streams.learning_time_20190608`
WHERE
activity IN ("PRACTICING",
"WATCHING",
"READING")
AND MOD(FARM_FINGERPRINT(kaid), 10) = 1)
WHERE
row_number = 1
Export this table from BigQuery as `learning_time_sample.json.gz` and store it alongside this script.
"""
import collections
import datetime
import gzip
import intervaltree
import json
import sys
import tqdm
def parse_datetime(s):
    """Parse a timestamp string such as ``2019-06-08 12:34:56.789 UTC``.

    The layout with fractional seconds is attempted first; when that does
    not match, the whole-seconds layout is used instead.  A ``ValueError``
    raised by the second attempt propagates to the caller.
    """
    strptime = datetime.datetime.strptime
    try:
        parsed = strptime(s, '%Y-%m-%d %H:%M:%S.%f %Z')
    except ValueError:
        # No fractional part present: fall back to whole seconds.
        parsed = strptime(s, '%Y-%m-%d %H:%M:%S %Z')
    return parsed
def parse_sample(input_fname="learning_time_sample.json.gz", session_gap=15,
                 expected_lines=580559):
    """Load the exported events into per-user, per-content interval trees.

    Args:
        input_fname: Path to the gzip-compressed export, one JSON object
            per line.
        session_gap: Number of minutes added to every event's end time
            before it is stored, so that events separated by less than this
            gap overlap and can later be merged into a single session.
        expected_lines: Line count passed to tqdm as ``total``; it only
            affects the progress bar.  The default is the value this script
            previously hard-coded; pass ``None`` when the size of the
            export is unknown.

    Returns:
        A nested defaultdict ``data[kaid][content_id]`` whose values are
        ``intervaltree.IntervalTree`` objects.  Each stored interval runs
        from ``start_time`` to ``end_time + session_gap`` and carries the
        event's ``activity`` as its data.
    """
    # Loop-invariant: build the padding once instead of once per event.
    gap = datetime.timedelta(minutes=session_gap)
    data = collections.defaultdict(
        lambda: collections.defaultdict(intervaltree.IntervalTree))
    with gzip.open(input_fname, 'rb') as f:
        for line in tqdm.tqdm(f, total=expected_lines):
            event = json.loads(line)
            # Events lacking either field are ignored entirely.
            if 'learning_time_ms' not in event or 'content_id' not in event:
                continue
            start_time = parse_datetime(event.pop(u'start_time'))
            end_time = parse_datetime(event.pop(u'end_time'))
            kaid = event[u'kaid']
            content_id = event[u'content_id']
            # NOTE(review): intervaltree appears to reject intervals whose
            # end is not after their start, so a zero gap combined with an
            # instantaneous event would presumably raise here -- confirm.
            data[kaid][content_id].addi(
                start_time, end_time + gap, event[u'activity'])
    return data
if __name__ == '__main__':
    # Command-line entry point: read the session gap (in minutes) from the
    # first argument, load the sample, merge overlapping intervals into
    # sessions and print per-user averages.
    #
    # The print calls use the single-argument parenthesised form so the
    # script runs unchanged on both Python 2 and Python 3.
    usage = "Usage: {} SESSION_GAP_IN_MINUTES".format(sys.argv[0])
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    try:
        session_gap = int(sys.argv[1])
    except ValueError:
        # A non-integer argument gets the usage line, not a traceback.
        print(usage)
        sys.exit(1)

    data = parse_sample(session_gap=session_gap)

    session_counts = []
    session_counts_by_activity = {"READING": [], "PRACTICING": [], "WATCHING": []}
    session_lengths = []
    user_n = 0
    sessions_n = 0
    for intervals_dict in data.values():
        user_n += 1
        sessions_for_user = 0
        sessions_for_user_by_activity = collections.defaultdict(int)
        for tree in intervals_dict.values():
            # Collapse overlapping intervals into one session each; the
            # reducer keeps the data (activity) of the first interval.
            tree.merge_overlaps(data_reducer=lambda kept, _: kept)
            for interval in tree:
                # NOTE(review): parse_sample stores each interval with
                # end_time + session_gap, so these lengths include that
                # padding -- confirm whether this is intended.
                session_lengths.append(interval.length().total_seconds())
                sessions_for_user += 1
                sessions_for_user_by_activity[interval.data] += 1
        session_counts.append(sessions_for_user)
        sessions_n += sessions_for_user
        for k in session_counts_by_activity:
            session_counts_by_activity[k].append(sessions_for_user_by_activity[k])

    # Avoid ZeroDivisionError when the sample yields no usable events.
    if not user_n or not sessions_n:
        print("No sessions found in the sample; nothing to average.")
        sys.exit(1)

    # float() keeps true division under Python 2 as well.
    print("Average session count per user: {}".format(float(sum(session_counts)) / user_n))
    print("Average session length: {}".format(float(sum(session_lengths)) / sessions_n))
    for activity_type, counts in session_counts_by_activity.items():
        print("Average sessions of type {} per user: {}".format(activity_type, float(sum(counts)) / user_n))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment