Created
August 14, 2024 04:41
-
-
Save brandonrobertz/9be67d1d64aba90399dda050d20d9bc3 to your computer and use it in GitHub Desktop.
HTML to PDF when you need to preserve style and layout
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import csv
import re
import sys

from playwright.sync_api import sync_playwright
from tablib import Dataset
from pdfplumber import PDF

# Lift the csv module's per-field size cap (presumably because cells hold
# whole HTML documents). sys.maxsize does not fit in a C long on some
# platforms, so fall back to the largest 32-bit value there.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

# Name of the column added to every output row with the extracted text.
outputTextColumn = "html_text_parsed"

# The last three command-line arguments are, in order: the name of the column
# holding HTML, the input CSV path and the output CSV path. Checking the
# length explicitly matters: with only two arguments sys.argv[-3] would
# silently pick up the script path instead of raising IndexError.
if len(sys.argv) < 4:
    print(
        f"USAGE: {sys.argv[0]} html_in_this_column infile.csv outfile.csv",
        file=sys.stderr,
    )
    sys.exit(1)
htmlColumn, inputFilePath, outputFilePath = sys.argv[-3:]

# Scratch file that each rendered PDF is written to before text extraction.
TEMP_PDF_PATH = ".tmp.pdf"
def html2pdf(page, html_text, *, retries=10, timeout_ms=30_000, pdf_path=None):
    """Load an HTML string into a Playwright page and print it to a PDF file.

    Args:
        page: Playwright page (sync API) used to render the HTML.
        html_text: HTML markup to load into the page.
        retries: Maximum number of attempts to load the HTML.
        timeout_ms: Per-attempt load timeout, in milliseconds (Playwright's
            unit for ``timeout``).
        pdf_path: Destination of the PDF; defaults to ``TEMP_PDF_PATH``.

    Returns:
        The path the PDF was written to.

    Raises:
        RuntimeError: If the HTML could not be loaded within ``retries``
            attempts. Raising avoids printing whatever stale content the
            page still holds from a previous call.
    """
    if pdf_path is None:
        pdf_path = TEMP_PDF_PATH

    last_error = None
    # A bounded for-loop replaces `while retries := 10`, which reset the
    # counter on every iteration and therefore never terminated on failure.
    for _ in range(retries):
        try:
            page.set_content(
                html_text,
                wait_until='domcontentloaded',
                timeout=timeout_ms,
            )
        except Exception as e:
            print("ERROR loading HTML", e)
            last_error = e
        else:
            break
    else:
        raise RuntimeError(
            f"could not load HTML after {retries} attempts"
        ) from last_error

    # Generate the PDF and save it to the requested path.
    page.pdf(
        path=pdf_path,
        format='A4',  # You can customize the page format or size
        # print_background=True would include background colors and images
    )
    return pdf_path
def pdf2text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Each page is extracted with ``layout=True``, every line is stripped of
    surrounding whitespace, and runs of three or more newlines are collapsed
    to two. The cleaned text of each page is printed as it is produced and
    is followed by a single newline in the returned string.
    """
    chunks = []
    with PDF.open(pdf_path) as document:
        for sheet in document.pages:
            raw = sheet.extract_text(layout=True)
            trimmed = "\n".join(line.strip() for line in raw.split("\n"))
            cleaned = re.sub(r"\n\n\n+", "\n\n", trimmed)
            print(cleaned)
            chunks.append(cleaned + "\n")
    return "".join(chunks)
def html2text(browser, html_text):
    """Convert HTML to plain text by way of an intermediate PDF.

    ``browser`` is forwarded unchanged as the page argument of ``html2pdf``
    (the caller in this file passes a Playwright page, despite the name).
    The resulting PDF is then read back with ``pdf2text``.
    """
    return pdf2text(html2pdf(browser, html_text))
def run(input_csv):
    """Convert the HTML column of every row to text.

    Launches Chromium through Playwright, opens a single page and reuses it
    for every row dict in ``input_csv.dict``. The extracted text is stored
    on each row under ``outputTextColumn``.

    Returns:
        The list of row dicts, each with the extra text column.
    """
    converted = []
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        tab = chromium.new_page()
        for record in input_csv.dict:
            record[outputTextColumn] = html2text(tab, record[htmlColumn])
            converted.append(record)
        chromium.close()
    return converted
# Load the input CSV. newline="" leaves line-ending handling to the CSV
# parser, which matters for quoted fields that contain line breaks.
with open(inputFilePath, "r", newline="") as f:
    input_csv = Dataset().load(f)

out_rows = run(input_csv)

# An input without data rows yields no output rows; fall back to the input
# headers plus the new column instead of failing on out_rows[0].
if out_rows:
    out_headers = list(out_rows[0].keys())
else:
    out_headers = [*(input_csv.headers or []), outputTextColumn]

# Build the whole output dataset before opening the destination, so a
# failure here does not leave a truncated output file behind.
out_csv = Dataset(headers=out_headers)
for row in out_rows:
    out_csv.append(list(row.values()))

# newline="" stops the text layer from translating the line endings of the
# exported CSV again (which would double the carriage returns on Windows).
with open(outputFilePath, "w", newline="") as f:
    f.write(out_csv.csv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment