Last active
May 28, 2024 16:52
-
-
Save cboulanger/bd9300ad4481d29b87c3b35a5fcc798c to your computer and use it in GitHub Desktop.
Download a pad.gwdg.de presentation for local offline viewing. A script written with the help of ChatGPT-4.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin, urlparse | |
import argparse | |
def download_resource(session, url, base_dir, level=0):
    """Download one resource from *url* into *base_dir*, mirroring its URL path.

    If the payload is JavaScript or CSS, its text is scanned for further
    pad.gwdg.de URLs (absolute, or relative under /build, /css, /js), which
    are downloaded recursively. An HTML response for a non-``.html`` URL is
    treated as a 404 page served with status 200 and skipped. Errors are
    reported and swallowed so the crawl continues (best-effort).

    :param session: requests.Session-like object providing ``get``
    :param url: absolute URL of the resource to fetch
    :param base_dir: local root directory mirroring the server's path layout
    :param level: recursion depth, used only to indent log output
    """
    indent = " " * level
    try:
        response = session.get(url, stream=True)
        response.raise_for_status()
        # Check the content type to ensure it's not an HTML error page.
        content_type = response.headers.get('Content-Type', '')
        content_is_code = content_type.split("; ")[0] in ('application/javascript', 'text/javascript', 'text/css')
        if 'text/html' in content_type and not url.endswith(".html"):
            print(f"Skipped downloading {url}: MIME type is text/html (likely a 404 page)")
            return
        parsed_url = urlparse(url)
        resource_path = os.path.join(base_dir, parsed_url.path.lstrip('/'))
        os.makedirs(os.path.dirname(resource_path), exist_ok=True)
        print(f'{indent}Saving {parsed_url.path} to {resource_path}')
        # Buffer the full payload before patching/scanning: a per-chunk
        # decode+replace could split a multi-byte UTF-8 character or the
        # patch pattern across chunk boundaries.
        content = b"".join(response.iter_content(chunk_size=8192))
        # Monkey-patch the slide-pack bundle so it loads assets from the
        # local "build/" directory instead of the remote server URL.
        if resource_path.endswith('slide-pack.9fe42901cee029fba75d.js'):
            content = content.replace(b'src:serverurl+"/build/', b'src:"build/')
        with open(resource_path, 'wb') as f:
            f.write(content)
        # If the downloaded file is JavaScript or CSS, check for more URLs.
        if content_is_code:
            content_str = content.decode('utf-8')
            matches = re.findall(r'https://pad\.gwdg\.de/[^\'"\s\)\]]+', content_str)
            # Non-capturing group: with a capturing group, re.findall would
            # return only 'build'/'css'/'js' instead of the full path.
            matches += ['https://pad.gwdg.de' + p for p in re.findall(r'/(?:build|css|js)/[^\'"\s\)\]]+', content_str)]
            for match in matches:
                if not match.endswith("/"):
                    download_resource(session, match, base_dir, level + 1)
    except Exception as e:
        # Best-effort crawler: report the failure and move on.
        print(f'{indent}Failed to download {url}: {e}')
def remove_csp(soup):
    """Strip Content-Security-Policy <meta> tags from the parsed page.

    The CSP of the original site would prevent the now-local copies of the
    resources from being loaded when viewing the page offline.
    """
    csp_metas = [
        meta for meta in soup.find_all("meta")
        if meta.attrs.get('http-equiv') == 'Content-Security-Policy'
    ]
    for meta in csp_metas:
        meta.decompose()
def replace_and_download_resources(session, soup, base_url, base_dir):
    """Download every resource the page references and localize inline scripts.

    First fetches each <img>/<link>/<script> resource into *base_dir*; then
    scans inline <script> bodies for absolute pad.gwdg.de URLs, downloads
    those too, and rewrites them to relative paths in the soup.
    """
    # Attribute carrying the resource URL for each tag type.
    url_attr = {'img': 'src', 'link': 'href', 'script': 'src'}
    for tag in soup.find_all(['img', 'link', 'script']):
        attr = url_attr[tag.name]
        if tag.get(attr):
            download_resource(session, urljoin(base_url, tag[attr]), base_dir)
    # Handle dynamically generated URLs in inline script content.
    for script in soup.find_all('script'):
        if not script.string:
            continue
        rewritten = script.string
        for path in re.findall(r'https://pad\.gwdg\.de/([^\'"\s]+)', rewritten):
            absolute = urljoin(base_url, path)
            download_resource(session, absolute, base_dir)
            rewritten = rewritten.replace(absolute, path)
        script.string.replace_with(rewritten)
def download_uploads_resources(session, html_str, base_dir):
    """Fetch every pad.gwdg.de /uploads/ asset referenced in the HTML text."""
    upload_urls = re.findall(r'https://pad\.gwdg\.de/uploads/[^\s\'\"\)\]]+', html_str)
    for upload_url in upload_urls:
        download_resource(session, upload_url, base_dir)
def download_additional_resources(session, base_url, base_dir, additional_paths):
    """Download a hand-maintained list of extra paths relative to *base_url*."""
    for extra_path in additional_paths:
        download_resource(session, urljoin(base_url, extra_path), base_dir)
def download_html_and_resources(slide_id, output_html, base_dir, additional_paths):
    """Mirror the pad.gwdg.de presentation *slide_id* for offline viewing.

    Fetches the presentation page, strips its CSP, downloads all referenced
    resources (plus *additional_paths*) into *base_dir*, rewrites absolute
    URLs to local relative ones, and saves the page as *output_html*.
    """
    base_url = "https://pad.gwdg.de"
    session = requests.Session()
    response = session.get(f"{base_url}/p/{slide_id}")
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Remove CSP settings that would block the local resource copies.
    remove_csp(soup)
    # Download resources referenced by tags and inline scripts.
    replace_and_download_resources(session, soup, base_url, base_dir)
    # Serialize once so uploads can be found in the full HTML text.
    html_str = str(soup)
    download_uploads_resources(session, html_str, base_dir)
    # Download additional specified resources the crawl does not discover.
    download_additional_resources(session, base_url, base_dir, additional_paths)
    # Convert all remaining occurrences of the base URL to local paths.
    html_str = html_str.replace(base_url + '/', './')
    # Ensure the base directory exists before writing the page itself.
    os.makedirs(base_dir, exist_ok=True)
    with open(os.path.join(base_dir, output_html), 'w', encoding='utf-8') as f:
        f.write(html_str)
if __name__ == "__main__":
    # Command-line entry point: -i/--id selects the slide, -d/--dir the
    # local output directory; the page itself is saved as index.html.
    parser = argparse.ArgumentParser(description="Download and save HTML and resources from pad.gwdg.de")
    parser.add_argument('-i', '--id', required=True, help="The slide ID from pad.gwdg.de")
    parser.add_argument('-d', '--dir', required=True, help="The directory where the resources will be saved")
    args = parser.parse_args()
    # this is a manual collection of resources that were not discovered by the script, probably incomplete
    extra_resources = [
        "build/85934a8a31bd9b8b75e68eeb57b6859810055d48742953766c4a5c2b5a0d5266.woff",
        "build/8810ba3440bf482ced33d2f74b7803bba711f689d8e4caa7da5c6ae6844a1b49.woff2",
        "build/006708d6691753cfc46eec2dae88fbdafa22823a89194149d9f223050dc78998.woff",
        "build/4f319287827e35f841069eb471c092eccf97d2f7830aa4d8bd7301ded418bf49.ttf",
        "build/MathJax/jax/input/TeX/config.js?V=2.7.9",
        "build/MathJax/jax/input/MathML/config.js?V=2.7.9",
        "build/MathJax/jax/output/HTML-CSS/config.js?V=2.7.9",
        "build/MathJax/jax/output/NativeMML/config.js?V=2.7.9",
        "build/MathJax/jax/output/PreviewHTML/config.js?V=2.7.9",
        "build/MathJax/extensions/tex2jax.js?V=2.7.9",
        "build/MathJax/extensions/mml2jax.js?V=2.7.9",
        "build/MathJax/extensions/MathEvents.js?V=2.7.9",
        "build/MathJax/extensions/MathZoom.js?V=2.7.9",
        "build/MathJax/extensions/MathMenu.js?V=2.7.9",
        "build/MathJax/extensions/toMathML.js?V=2.7.9",
        "build/MathJax/extensions/TeX/noErrors.js?V=2.7.9",
        "build/MathJax/extensions/TeX/noUndefined.js?V=2.7.9",
        "build/MathJax/extensions/TeX/AMSmath.js?V=2.7.9",
        "build/MathJax/extensions/TeX/AMSsymbols.js?V=2.7.9",
        "build/MathJax/extensions/fast-preview.js?V=2.7.9",
        "build/MathJax/extensions/AssistiveMML.js?V=2.7.9",
        "build/MathJax/extensions/a11y/accessibility-menu.js?V=2.7.9",
        "build/MathJax/extensions/Safe.js?V=2.7.9",
        "build/29.5f5bdb9120d6b9c39930.js",
        "build/27.fbb6b5bbda6765f0a1f1.js",
        "build/reveal.js/plugin/notes/notes.js",
        "build/reveal.js/css/theme/white.css",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro.css",
        "build/reveal.js/css/print/paper.css",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro-regular.ttf",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro-regular.woff",
    ]
    download_html_and_resources(args.id, "index.html", args.dir, extra_resources)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment