Skip to content

Instantly share code, notes, and snippets.

@brandonrobertz
Created August 14, 2024 04:41
Show Gist options
  • Save brandonrobertz/9be67d1d64aba90399dda050d20d9bc3 to your computer and use it in GitHub Desktop.
Save brandonrobertz/9be67d1d64aba90399dda050d20d9bc3 to your computer and use it in GitHub Desktop.
HTML to PDF when you need to preserve style and layout
#!/usr/bin/env python
import csv
import re
import sys
from playwright.sync_api import sync_playwright
from tablib import Dataset
from pdfplumber import PDF
# Raise the csv module's per-field size limit as far as the platform allows,
# since a single cell is expected to hold a whole HTML document.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    # Where a C long is 32 bits (e.g. Windows), sys.maxsize does not fit and
    # csv.field_size_limit raises; fall back to the largest 32-bit value.
    csv.field_size_limit(2**31 - 1)

# Name of the column that receives the extracted text in every output row.
outputTextColumn = "html_text_parsed"

# Three positional arguments are required. Indexing sys.argv from the end
# does NOT raise when only two are supplied (argv[0], the script name, fills
# the third slot), so the argument count is checked explicitly instead.
if len(sys.argv) < 4:
    print(f"USAGE: {sys.argv[0]} html_in_this_column infile.csv outfile.csv")
    sys.exit(1)

# The last three arguments are, in order: the input column holding HTML,
# the input CSV path, and the output CSV path.
htmlColumn = sys.argv[-3]
inputFilePath = sys.argv[-2]
outputFilePath = sys.argv[-1]

# Scratch file that each rendered PDF is written to (overwritten per row).
TEMP_PDF_PATH = ".tmp.pdf"
def html2pdf(page, html_text, retries=10, timeout=30_000):
    """Load *html_text* into *page* and print the page to TEMP_PDF_PATH.

    Args:
        page: A Playwright page object (must provide ``set_content`` and
            ``pdf``).
        html_text: The HTML markup to render.
        retries: Maximum number of attempts at loading the HTML.
        timeout: Per-attempt load timeout passed to ``set_content``.
            Playwright interprets this value in milliseconds.

    Returns:
        The path the PDF was written to (always TEMP_PDF_PATH).
    """
    # A bounded for-loop replaces the former ``while retries := 10`` loop,
    # which reset the counter to 10 on every pass and therefore never
    # terminated when loading kept failing.
    for _attempt in range(retries):
        try:
            page.set_content(
                html_text,
                wait_until="domcontentloaded",
                timeout=timeout,
            )
            break
        except Exception as e:
            # Broad on purpose: this is a best-effort retry and the
            # Playwright error types are not imported in this file.
            print("ERROR loading HTML", e)
    else:
        # Every attempt failed; still try to print whatever the page holds
        # rather than aborting the whole run.
        print(f"WARNING: giving up loading HTML after {retries} attempts")

    # Generate the PDF and save it to the shared scratch path.
    page.pdf(
        path=TEMP_PDF_PATH,
        format="A4",  # Page size; can be changed to another format.
    )
    return TEMP_PDF_PATH
def pdf2text(pdf_path):
    """Extract the text of every page of the PDF at *pdf_path*.

    Each page is extracted with ``layout=True``, every line is stripped of
    surrounding whitespace, and runs of three or more newlines are collapsed
    to two. The cleaned page text is echoed to stdout and the pages are
    returned concatenated, each followed by a newline.
    """
    chunks = []
    with PDF.open(pdf_path) as document:
        for pdf_page in document.pages:
            raw = pdf_page.extract_text(layout=True)
            trimmed_lines = [line.strip() for line in raw.split("\n")]
            cleaned = re.sub(r"\n\n\n+", "\n\n", "\n".join(trimmed_lines))
            print(cleaned)
            chunks.append(cleaned + "\n")
    return "".join(chunks)
def html2text(browser, html_text):
    """Convert *html_text* to plain text by rendering it to a PDF first.

    *browser* is handed straight to ``html2pdf`` as its page argument; the
    resulting PDF path is then fed to ``pdf2text``.
    """
    return pdf2text(html2pdf(browser, html_text))
def run(input_csv):
    """Add the extracted-text column to every row of *input_csv*.

    Launches one Chromium instance with a single page, reuses that page for
    every row, and stores the text derived from the ``htmlColumn`` cell
    under ``outputTextColumn``. Returns the list of augmented row dicts.
    """
    converted = []
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        tab = chromium.new_page()
        for record in input_csv.dict:
            source_html = record[htmlColumn]
            record[outputTextColumn] = html2text(tab, source_html)
            converted.append(record)
        chromium.close()
    return converted
# Read the input table. newline="" leaves line endings inside quoted fields
# untouched, as the csv module expects.
with open(inputFilePath, "r", newline="") as f:
    input_csv = Dataset().load(f)

out_rows = run(input_csv)

# Derive the output headers from the first converted row. When the input has
# no data rows, fall back to the input headers plus the new text column so
# an empty input still yields a header-only file instead of an IndexError.
if out_rows:
    out_headers = list(out_rows[0].keys())
else:
    out_headers = list(input_csv.headers or [])
    if outputTextColumn not in out_headers:
        out_headers.append(outputTextColumn)

out_csv = Dataset(headers=out_headers)
for row in out_rows:
    out_csv.append(list(row.values()))

# Write the CSV once, after all rows are collected. newline="" stops the
# text layer from translating the line endings the CSV export already holds.
with open(outputFilePath, "w", newline="") as f:
    f.write(out_csv.csv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment