Skip to content

Instantly share code, notes, and snippets.

Last active May 20, 2024 22:01
Show Gist options
  • Save carlosrobles/839e57f5c312f1f582e2374be8c97459 to your computer and use it in GitHub Desktop.
Save carlosrobles/839e57f5c312f1f582e2374be8c97459 to your computer and use it in GitHub Desktop.
Create a table of contents for Substack using Python
import html
import sys
import time
from urllib.parse import urlparse, urlunparse, urlencode, parse_qs

import requests
#you can install lxml with `sudo pip3 install lxml`
from lxml.html import fromstring
# The URL is mandatory: print the usage message and stop, instead of
# falling through to an IndexError when sys.argv[1] is read further down.
if len(sys.argv) < 2:
    print("No arguments were given. Use URL [output_file_name]")
    sys.exit(1)
# To avoid cached responses, we add a query parameter with the current timestamp
def add_timestamp_to_url(url):
    """Return *url* with a ``timestamp`` query parameter set to the current time.

    The parameter holds the current Unix time in whole seconds, so the URL
    differs from one second to the next. Existing query parameters are kept,
    including ones with an empty value; an existing ``timestamp`` parameter
    is replaced.

    Args:
        url: The URL to modify.

    Returns:
        The rebuilt URL string.
    """
    parsed_url = urlparse(url)
    # keep_blank_values=True: without it parse_qs silently drops parameters
    # such as "?draft=", which would change the URL beyond the cache-buster.
    query_params = parse_qs(parsed_url.query, keep_blank_values=True)
    query_params["timestamp"] = [str(int(time.time()))]
    new_query = urlencode(query_params, doseq=True)
    return urlunparse(parsed_url._replace(query=new_query))
def _find_anchor_id(header):
    """Return the first non-empty ``id`` under *header*, else the header's own, else None."""
    # iter() yields the header itself first; descendants are checked before it
    # so a heading whose first child carries the id resolves to that child.
    descendants = list(header.iter())[1:]
    for element in descendants + [header]:
        anchor = element.get("id")
        if anchor:
            return anchor
    return None


def build_toc(headers, url, log=None):
    """Render heading elements as a nested HTML ``<ul>`` table of contents.

    Args:
        headers: Iterable of element objects (lxml / ElementTree style API:
            ``tag``, ``text``, ``iter()``, ``itertext()``, ``get()``) in
            document order. Elements whose tag is not ``h1``..``h6`` are
            skipped.
        url: Page URL every entry links to. When a heading has an anchor id
            it is appended as the fragment; otherwise the entry links to
            *url* itself.
        log: Optional callable that receives progress messages (for example
            ``print``). ``None`` disables tracing.

    Returns:
        The table of contents as one HTML string. Titles and link targets
        are HTML-escaped.
    """
    emit = log if log is not None else (lambda _message: None)
    parts = ["<ul>"]
    # Levels of the lists that are currently open, outermost first. The
    # values are strictly increasing, and the last <li> of the innermost
    # list is left open so a deeper heading can nest inside it.
    open_levels = []

    for header in headers:
        tag = header.tag
        if not (isinstance(tag, str) and len(tag) == 2
                and tag[0] in "hH" and tag[1] in "123456"):
            continue
        level = int(tag[1])

        # Prefer the heading's leading text; fall back to the text of its
        # children when the title is wrapped in an inline element.
        title = (header.text or "").strip() or "".join(header.itertext()).strip()
        emit("H" + str(level) + " - " + title)

        if not open_levels:
            open_levels.append(level)
        elif level > open_levels[-1]:
            # Deeper heading: open a sub-list inside the still-open <li>.
            emit("nesting ")
            parts.append("<ul>")
            open_levels.append(level)
        else:
            parts.append("</li>")
            # Close only the lists this heading really leaves: stop as soon
            # as the parent list is shallower than the new heading.
            while len(open_levels) > 1 and open_levels[-2] >= level:
                emit("unnesting " + str(len(open_levels) - 1))
                parts.append("</ul></li>")
                open_levels.pop()
            open_levels[-1] = level

        anchor = _find_anchor_id(header)
        href = url + "#" + anchor if anchor else url
        parts.append(
            "<li><a href='" + html.escape(href, quote=True) + "'>"
            + html.escape(title) + "</a>"
        )

    if open_levels:
        # Close the last entry, then every sub-list that is still open.
        parts.append("</li>")
        for depth in range(len(open_levels) - 1, 0, -1):
            emit("unnesting " + str(depth))
            parts.append("</ul></li>")
    parts.append("</ul>")
    return "".join(parts)


if __name__ == "__main__":
    url = sys.argv[1]
    filename = sys.argv[2] if len(sys.argv) > 2 else None

    # Fetch the HTML; fail loudly on HTTP errors and never hang forever.
    response = requests.get(add_timestamp_to_url(url), timeout=30)
    response.raise_for_status()
    tree = fromstring(response.content)

    # All headings that carry the anchor-widget class.
    path = "//*[@class='header-with-anchor-widget']"
    output = build_toc(tree.xpath(path), url, log=print)

    if filename:
        with open(filename, "w", encoding="utf-8") as file:
            file.write(output)
        print("\nSaved to ./" + filename)
    else:
        # Without an output file the result would otherwise be lost.
        print(output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment