-
-
Save john-hix/60ca80129c1127dbdb556e4fc3c8ec78 to your computer and use it in GitHub Desktop.
Scrape your Canvas website and download all content to a folder. Tested on 2020-05-14.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!bin/python3 | |
import argparse | |
import os | |
import re | |
from pathvalidate import sanitize_filename | |
from canvasapi import Canvas | |
from canvasapi.course import Course | |
from canvasapi.exceptions import Unauthorized, ResourceDoesNotExist | |
from canvasapi.file import File | |
from canvasapi.module import Module, ModuleItem | |
def extract_files(text):
    """Return the set of file IDs referenced by ``/files/<id>`` links in *text*.

    The search is case-insensitive and only the digit run following
    ``/files/`` is captured.

    Args:
        text: The text (typically an HTML body) to scan. ``None`` or an
            empty string is treated as containing no links.

    Returns:
        A set of the distinct file IDs found, each as a string of digits.
    """
    if not text:
        # Bodies and descriptions may be missing; there is nothing to scan.
        return set()
    return set(re.findall(r"/files/(\d+)", text, re.IGNORECASE))
def _download_linked_files(course, html, folder, files_downloaded):
    """Download every file linked from *html* into *folder*, once per course.

    Args:
        course: Course object used to resolve each linked file ID.
        html: Text to scan for ``/files/<id>`` links; may be empty.
        folder: Directory the files are written to (must already exist).
        files_downloaded: Set of file IDs, stored as strings, that have
            already been handled for this course. Updated in place.
    """
    for raw_id in extract_files(html):
        file_id = str(raw_id)
        if file_id in files_downloaded:
            continue
        try:
            file = course.get_file(file_id)
            files_downloaded.add(file_id)
            file.download(os.path.join(folder, sanitize_filename(file.attributes['filename'])))
        except ResourceDoesNotExist:
            # Best effort: the link points at a file that cannot be found
            # through this course, so skip it and keep going.
            pass


def get_course_files(course):
    """Download the module content of *course*, then any files not yet fetched.

    For every module item a line is printed and, depending on the item type:

    * ``File``: the file is downloaded into the module folder.
    * ``Page``: the page body is saved as ``<title>.html`` and files linked
      from it are downloaded.
    * ``ExternalUrl``: a ``<title>.url`` internet shortcut is written.
    * ``Assignment``: the description is saved as ``<title>.html`` and files
      linked from it are downloaded.

    Other item types are only printed. Afterwards every file returned by
    ``course.get_files()`` that was not downloaded above is saved directly
    into the course folder; if that listing is not permitted, the step is
    skipped.

    The function reads the module-level names ``output`` (destination root)
    and ``canvas`` (API client), which the ``__main__`` section assigns.

    Args:
        course: The course object to scrape.
    """
    course_name = course.attributes['name']
    course_dir = os.path.join(output, sanitize_filename(course_name))

    # IDs are kept as strings: links found in HTML yield strings, so the IDs
    # taken from API objects are normalised to the same type. Otherwise the
    # membership tests below would never match and files would be fetched twice.
    files_downloaded = set()

    for module in course.get_modules():
        module_name = module.attributes['name']
        path = os.path.join(course_dir, sanitize_filename(module_name))

        for item in module.get_module_items():
            # Created lazily so that modules without items leave no folder.
            os.makedirs(path, exist_ok=True)

            item_type = item.attributes["type"]
            print(f"{course.attributes['name']} - "
                  f"{module.attributes['name']} - "
                  f"{item.attributes['title']} ({item_type})")

            if item_type == "File":
                file = canvas.get_file(item.attributes["content_id"])
                files_downloaded.add(str(item.attributes["content_id"]))
                file.download(os.path.join(path, sanitize_filename(file.attributes['filename'])))

            elif item_type == "Page":
                page = course.get_page(item.attributes["page_url"])
                body = page.attributes["body"] or ""
                html_path = os.path.join(path, sanitize_filename(item.attributes['title']) + ".html")
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(body)
                _download_linked_files(course, body, path, files_downloaded)

            elif item_type == "ExternalUrl":
                url = item.attributes["external_url"]
                shortcut_path = os.path.join(path, sanitize_filename(item.attributes['title']) + ".url")
                with open(shortcut_path, "w") as f:
                    f.write("[InternetShortcut]\n")
                    f.write("URL=" + url)

            elif item_type == "Assignment":
                assignment = course.get_assignment(item.attributes["content_id"])
                description = assignment.attributes["description"] or ""
                html_path = os.path.join(path, sanitize_filename(item.attributes['title']) + ".html")
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(description)
                _download_linked_files(course, description, path, files_downloaded)

    # Second pass: files of the course that no module item or link referenced.
    try:
        for file in course.get_files():
            if str(file.attributes["id"]) in files_downloaded:
                continue
            print(f"{course.attributes['name']} - {file.attributes['filename']}")
            # The course folder may not exist yet when no module item was
            # processed above.
            os.makedirs(course_dir, exist_ok=True)
            file.download(os.path.join(course_dir, sanitize_filename(file.attributes['filename'])))
    except Unauthorized:
        # Listing the course files is not permitted for this user; the module
        # content gathered above is all that can be saved.
        pass
if __name__ == "__main__":
    # Command-line entry point: parse arguments, connect to Canvas and scrape
    # the selected courses (all visible courses when none are given).
    parser = argparse.ArgumentParser(description="Download all content from Canvas")
    parser.add_argument("url", help="URL to the Canvas website, e.g. https://canvas.utwente.nl")
    parser.add_argument("token", help="Token generated in the settings page on Canvas")
    parser.add_argument("output", help="Path to the output folder, e.g. output/")
    # For a positional argument with nargs="?", argparse only consults
    # `default` (None here) when the value is omitted; `const` has no effect.
    parser.add_argument("courses", help="Comma-separated course IDs or 'all'", nargs="?")
    args = parser.parse_args()

    # Handle args
    # NOTE: `output` and `canvas` are module-level names on purpose;
    # get_course_files() reads them.
    output = args.output.rstrip("/") + "/"
    if args.courses is None:
        args.courses = "all"
        print("No courses specified. Scraping all courses.")

    # Validate the ID list before talking to the server. Empty parts (e.g. a
    # trailing comma) are ignored rather than crashing in int().
    course_ids = []
    if args.courses != "all":
        try:
            course_ids = [int(part) for part in args.courses.split(",") if part.strip()]
        except ValueError:
            parser.error(f"invalid course ID list: {args.courses!r}")
        if not course_ids:
            parser.error("no course IDs given")

    canvas = Canvas(args.url, args.token)

    # Select courses to scrape, default to all
    if args.courses != "all":
        courses = [canvas.get_course(course_id) for course_id in course_ids]
    else:
        courses = canvas.get_courses()

    # Perform scrape
    for course in courses:
        get_course_files(course)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment