DCU OpenTimeTable module events Scrapy code
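A small Scrapy project that exports every module's timetable events from DCU's OpenTimeTable site. The spider loads opentimetable.dcu.ie, lifts the basic-auth API token out of the main JavaScript bundle, pages through the module index, and requests each module's events for a fixed range of teaching weeks. The jq one-liner after the spider flattens the scraped feed to TSV, and the Scrapy settings keep the crawl throttled and cached.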
import datetime
import json
import re

import scrapy
import scrapy.http


def build_weeks():
    # The events API wants an explicit list of week descriptors.
    # Week 1 of the 2021/22 academic year starts 2021-09-20; weeks
    # 16-29 cover the semester being scraped.
    ret = []
    for weekNo in range(16, 30):
        week1 = datetime.date(2021, 9, 20)
        week = week1 + datetime.timedelta(weeks=weekNo - 1)
        ret.append({
            "WeekNumber": weekNo,
            "WeekLabel": str(weekNo),
            "FirstDayInWeek": f'{week.strftime("%Y-%m-%d")}T00:00:00+00:00',
        })
    return ret


class TimetableSpider(scrapy.Spider):
    name = 'timetable'
    allowed_domains = ['opentimetable.dcu.ie']

    def build_index_request_page(self, pageNo: int) -> scrapy.Request:
        # The filter endpoint expects a JSON array in the request body;
        # FormRequest would URL-encode it, so post the serialized JSON
        # directly (as the events request below does).
        return scrapy.Request(
            f"https://opentimetable.dcu.ie/broker/api/CategoryTypes/525fe79b-73c3-4b5c-8186-83c652b3adcc/Categories/Filter?pageNumber={pageNo}&query=",
            method='POST',
            body=json.dumps([{"Identity": "241e4d36-60e0-49f8-b27e-99416745d98d", "Values": ["null"]}]),
            headers={
                "Content-Type": "application/json",
                "Accept": "application/json",
                "Authorization": self.auth_token,
            },
            callback=self.parse_index_page,
        )

    def build_course_events_fetch(self, courseId: str) -> scrapy.Request:
        formdata = {
            "ViewOptions": {
                "Weeks": build_weeks(),
            },
            "CategoryIdentities": [
                courseId,
            ],
        }
        return scrapy.Request(
            "https://opentimetable.dcu.ie/broker/api/categoryTypes/525fe79b-73c3-4b5c-8186-83c652b3adcc/categories/events/filter",
            method='POST',
            body=json.dumps(formdata),
            headers={
                "Content-Type": "application/json",
                "Accept": "application/json",
                "Authorization": self.auth_token,
            },
            callback=self.parse_course_details,
        )

    def start_requests(self):
        yield scrapy.Request("https://opentimetable.dcu.ie", callback=self.parse_index_html)

    def parse_index_html(self, response: scrapy.http.HtmlResponse):
        # Find the webpack bundle that has the API credentials baked in.
        scripts = response.css("script::attr(src)").getall()
        for src in scripts:
            if 'main' in src and '.bundle.js' in src:
                yield scrapy.Request(response.urljoin(src), callback=self.extract_auth_token)

    def extract_auth_token(self, response: scrapy.http.Response):
        # The site authenticates every API call with a basic-auth token
        # hard-coded in its JavaScript, so lift it from the bundle.
        m = re.search(r'apiAuthentication:["](basic [^"]+)"', response.text)
        assert m is not None, "auth token not found in bundle"
        self.auth_token = m.group(1)
        yield self.build_index_request_page(1)

    def parse_index_page(self, response: scrapy.http.Response):
        data = response.json()
        # Every page re-queues all the remaining pages; Scrapy's duplicate
        # filter collapses the repeats. The +1 on both ends makes sure the
        # last page is actually fetched.
        for page in range(data['CurrentPage'] + 1, data['TotalPages'] + 1):
            yield self.build_index_request_page(page)
        for result in data["Results"]:
            # Export the index record itself, then fetch its events.
            yield result
            yield self.build_course_events_fetch(result["Identity"])

    def parse_course_details(self, response: scrapy.http.Response):
        data = response.json()
        for record in data:
            yield record
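Within the project the spider runs as `scrapy crawl timetable -o timetables.jl`. As a standalone sketch (assuming the spider class above is importable; the runner below is not part of the original gist), the same crawl can be driven from a plain Python script:

from scrapy.crawler import CrawlerProcess

# Hypothetical standalone runner: writes the same timetables.jl feed the
# jq one-liner below consumes, with throttling and caching left on.
process = CrawlerProcess(settings={
    "FEEDS": {"timetables.jl": {"format": "jsonlines"}},
    "AUTOTHROTTLE_ENABLED": True,
    "HTTPCACHE_ENABLED": True,
})
process.crawl(TimetableSpider)
process.start()  # blocks until the crawl finishes

The feed mixes the index Category records with the event records; the one-liner below keeps only the event lines (those containing "Extra") and flattens them to tab-separated values: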
grep Extra timetables.jl | jq '.CategoryEvents//[] | .[] | [(.Name | sub("[[].*"; "") ), .Name, .HostKey, .EventType, .Location, .Description, (.ExtraProperties//[] | .[] | select(.DisplayName == "Module Name") | .Value ), (.ExtraProperties//[] | .[] | select(.DisplayName == "Weeks") | .Value), .StartDateTime, .EndDateTime ] | @tsv' -r | pbcopy
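Each output row is the event Name with any bracketed suffix stripped, the full Name, HostKey, EventType, Location, Description, the "Module Name" and "Weeks" extra properties, and the start/end timestamps; pbcopy then puts the TSV on the macOS clipboard (substitute xclip or a file redirect elsewhere).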
# scrapy.cfg - automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = settings
# settings.py (the "settings" module referenced above)
BOT_NAME = 'dcu_opentimetable'
USER_AGENT = 'opentimetable scraper - please add bookmarks support'
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 10
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_TARGET_CONCURRENCY = 4
AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 3600
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = [503, 504, 505, 500, 400, 401, 402, 403, 404]
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'