Created
June 29, 2017 07:03
-
-
Save an-empty-string/a09850b83c253ca7c0cc8d369ba263c2 to your computer and use it in GitHub Desktop.
schedule data converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Schedule-to-JSON converter. | |
Usage: | |
dump_sections.py <term> [-i] [-o <filename>] | |
dump_sections.py (-h | --help) | |
Options: | |
-i --include-canceled Include canceled courses in the data. | |
-o --output=<filename> Send JSON output to a file. | |
Term examples: | |
sprg2017 Spring 2017 | |
fall2017 Fall 2017 | |
sum2017a Summer 2017 academic term | |
sum2017b Summer 2017 session 1 | |
sum2017c Summer 2017 session 2 | |
sum2017mm Summer 2017 Maymester session | |
""" | |
import collections | |
import docopt | |
import json | |
import pprint | |
import requests | |
import sys | |
from typing import Dict, List, Tuple | |
def decode(b: bytes) -> str:
    """Return the bytes *b* decoded as UTF-8 text."""
    text = b.decode(encoding="utf-8")
    return text
def avail_state(b: bytes) -> bool:
    """Return True unless the field is exactly ``Canceled``."""
    is_canceled = b == b"Canceled"
    return not is_canceled
def int_or_na(b: bytes):
    """Convert *b* to an int, mapping the ``####`` placeholder to 0."""
    return 0 if b == b"####" else int(b)
# Form-feed byte (0x0C); the downloaded page is split into sections on it.
SEP = b"\f"
# Fixed-width column layout of one schedule line, in left-to-right order.
# Each entry is (field name, converter applied to the stripped bytes, width).
# parse_course adds one to every width to cover the separating space.
COURSE_FIELDS = [
    ("kind", decode, 4),             # online? honors? etc
    ("crn", decode, 6),              # Course Reference Number
    ("num", decode, 3),              # course code
    ("sec", decode, 6),              # section code
    ("title", decode, 30),           # course name
    ("credit", float, 6),            # credit-hours granted
    ("cap", int_or_na, 4),           # enrollment cap
    ("enrolled", int, 4),            # current enrollment
    ("available", avail_state, 8),   # canceled or no?
    ("waitlist", int, 4),            # students on waitlist
    ("days", decode, 7),             # days of week course meets
    ("start", decode, 7),            # start time on days
    ("end", decode, 7),              # end time on days
    ("building", decode, 5),
    ("room", decode, 10),
    ("instructor", decode, 35),
]
# Ordered (attribute name, flag character) pairs. An attribute is considered
# set when its flag character occurs in a course's "kind" field. The mapping
# is written as a dict literal for readability and flattened back to the
# list-of-tuples shape that consumers iterate over.
COURSE_ATTRIBUTES = list({
    "summer_session_1": "1",
    "summer_session_2": "2",
    "summer_academic_term": "A",
    "distance_learning": "D",
    "honors": "H",
    "term_j": "J",
    "term_k": "K",
    "term_l": "L",
    "term_m": "M",
    "term_n": "N",
    "online": "O",
    "pass": "P",
    "special_offering": "S",
    "visiting_student": "V",
    "womens_studies": "W",
    "hybrid": "Y",
    # these attributes are set by our code specially
    "canceled": "=",
}.items())
# Field names copied unchanged from the first parsed row of a CRN onto the
# merged course section.
COPY_ATTRIBUTES = (
    "crn num sec title credit cap enrolled "
    "waitlist building room instructor"
).split()
# One schedule line after per-field conversion; the field names and their
# order mirror the entries of COURSE_FIELDS.
ParsedCourse = collections.namedtuple(
    "ParsedCourse",
    "kind crn num sec title credit cap enrolled "
    "available waitlist days start end building room instructor",
)
# A single meeting slot: the days it occurs on plus its start and end time.
MeetingTime = collections.namedtuple("MeetingTime", "days start end")
# A course section merged from every parsed row that shares a CRN.
CourseSection = collections.namedtuple(
    "CourseSection",
    "crn dept num sec title credit cap enrolled "
    "waitlist meetings building room instructor attrs short",
)
# Attribute name (see COURSE_ATTRIBUTES) -> whether it applies to a course.
CourseAttributes = Dict[str, bool]
# A department code paired with every course row parsed from its section.
# parse_section returns a *list* of ParsedCourse and interpret_department
# iterates over it, so the second element is List[ParsedCourse].
DepartmentData = Tuple[str, List[ParsedCourse]]
def get_raw(term) -> bytes:
    """Download the raw schedule page for *term*.

    Args:
        term: Term code that is substituted into the schedule URL
            (for example ``fall2017``).

    Returns:
        The undecoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never mistaken for schedule data.
        requests.RequestException: On connection failures or timeouts.
    """
    url = "http://www.uah.edu/schedules/{}.html".format(term)
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
def get_sections(data: bytes) -> List[bytes]:
    """Split *data* on SEP and return every chunk after the first one."""
    # The chunk in front of the first separator is blank and is discarded.
    _leading, *sections = data.split(SEP)
    return sections
def get_section_code(section: bytes) -> str:
    """Return the text between the first two double quotes of *section*.

    For a section that begins with ``<a name="ACC">`` this yields ``ACC``.
    """
    pieces = section.split(b'"', 2)
    code = pieces[1]
    return code.decode()
def get_course_data(section: bytes) -> List[bytes]:
    """Return the course lines contained in one department section.

    The text between the first ``<pre>`` and the following ``<HR>`` is
    stripped and split on newlines, and the three header lines at the top
    are dropped.

    Args:
        section: Raw bytes of a single department section.

    Returns:
        One bytes object per course line (possibly an empty list).

    Raises:
        IndexError: If *section* contains no ``<pre>`` marker.
    """
    body = section.split(b"<pre>", maxsplit=1)[1]
    body = body.split(b"<HR>", maxsplit=1)[0]
    return body.strip().split(b"\n")[3:]  # three lines of headers
def parse_course(course_line: bytes) -> ParsedCourse:
    """Slice one fixed-width schedule line into a ParsedCourse.

    The line is walked left to right following COURSE_FIELDS; every slice
    is stripped and passed through that field's converter.
    """
    values = {}
    offset = 0
    for name, convert, width in COURSE_FIELDS:
        # Each column is followed by one separating space.
        stop = offset + width + 1
        values[name] = convert(course_line[offset:stop].strip())
        offset = stop
    return ParsedCourse(**values)
def parse_attributes(course: ParsedCourse) -> CourseAttributes:
    """Map every attribute name to whether its flag occurs in ``course.kind``."""
    kind = course.kind
    return {name: flag in kind for name, flag in COURSE_ATTRIBUTES}
def parse_section(section: bytes) -> DepartmentData:
    """Parse one department section into ``(section code, parsed rows)``."""
    rows = list(map(parse_course, get_course_data(section)))
    return get_section_code(section), rows
def interpret_department(data: DepartmentData) -> List[CourseSection]:
    """Merge a department's parsed rows into one CourseSection per CRN.

    Rows sharing a CRN differ only in their meeting times, so everything
    except the meetings is taken from the first row of each group. The
    result keeps the order in which each CRN first appears.
    """
    dept_code, parsed_courses = data

    rows_by_crn = collections.defaultdict(list)
    for parsed in parsed_courses:
        rows_by_crn[parsed.crn].append(parsed)

    sections = []
    for rows in rows_by_crn.values():
        first = rows[0]

        # Fields that can be copied straight from the first row.
        fields = {name: getattr(first, name) for name in COPY_ATTRIBUTES}

        # "canceled" is the one attribute derived from availability rather
        # than from the kind flags.
        attrs = parse_attributes(first)
        if not first.available:
            attrs["canceled"] = True

        # Rows whose days read "TBA" contribute no meeting time.
        meetings = [
            MeetingTime(days=row.days, start=row.start, end=row.end)
            for row in rows
            if row.days != "TBA"
        ]

        fields.update(
            dept=dept_code,
            attrs=attrs,
            short="{}{}-{}".format(dept_code, first.num, first.sec),
            meetings=meetings,
        )
        sections.append(CourseSection(**fields))
    return sections
def interpret_all(data: List[bytes]) -> List[CourseSection]:
    """Parse and merge every department section into one flat list."""
    return [
        course_section
        for section in data
        for course_section in interpret_department(parse_section(section))
    ]
def drop_canceled(data: List[CourseSection]) -> List[CourseSection]:
    """Return only the sections whose ``canceled`` attribute is false."""
    # The CourseSection field is called "attrs" (plural).
    return [course for course in data if not course.attrs["canceled"]]


def to_jsonable(data: List[CourseSection]):
    """Convert course sections into plain dicts that ``json`` can encode.

    Each section becomes a dict keyed by field name, and its ``meetings``
    entry becomes a list of dicts instead of a list of namedtuples.
    """
    result = []
    for course in data:
        course_json = dict(course._asdict())
        course_json["meetings"] = [
            dict(meeting._asdict()) for meeting in course_json["meetings"]
        ]
        result.append(course_json)
    return result


def dump_data(term, out, include_canceled=False):
    """Download, parse and write the schedule for *term* as JSON.

    Args:
        term: Term code passed on to ``get_raw``.
        out: Writable text file object that receives the JSON document.
        include_canceled: When false (the default) canceled sections are
            removed before writing; when true they are kept.
    """
    data = get_raw(term)
    sections = get_sections(data)[1:]  # we do not care about the index section
    interpreted = interpret_all(sections)
    if not include_canceled:
        interpreted = drop_canceled(interpreted)
    json.dump(to_jsonable(interpreted), out)
def main():
    """Command-line entry point: parse the arguments and dump the schedule.

    Writes to the file named by ``--output`` when given, otherwise to
    standard output.
    """
    arguments = docopt.docopt(__doc__, version="Schedule-to-JSON converter 0.0.1")
    term = arguments["<term>"]
    include_canceled = arguments["--include-canceled"]
    output_path = arguments["--output"]
    if output_path:
        # The context manager guarantees the file is flushed and closed,
        # even when the download or parsing fails midway.
        with open(output_path, "w") as out:
            dump_data(term, out, include_canceled)
    else:
        dump_data(term, sys.stdout, include_canceled)
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment