Last active
August 26, 2024 23:35
-
-
Save mara004/8ef3a803531fdd42b29bbfa2889ff7f3 to your computer and use it in GitHub Desktop.
PDF rendering with Ghostscript (via subprocess)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com> | |
# SPDX-FileCopyrightText: 2024 James R. Barlow <james@purplerock.ca> | |
# SPDX-License-Identifier: MPL-2.0 | |
# Initial code derived from ocrmypdf/_exec/ghostscript.py | |
# Note that Ghostscript is AGPL-licensed. However, we are calling it via subprocess here, so not sure whether copyleft would actually apply. | |
# See also https://www.gnu.org/licenses/gpl-faq.en.html#MereAggregation | |
import io | |
import os | |
import re | |
import sys | |
import shutil | |
import logging | |
import subprocess | |
import PIL.Image | |
logger = logging.getLogger(__name__) | |
def get_ghostscript():
    """
    Locate the Ghostscript command-line executable via $PATH.

    On Windows, 'gswin64c' is looked up first and 'gswin32c' is used as fallback.
    On any other platform, 'gs' is looked up.
    Returns the result of shutil.which(), i.e. the executable path, or None if nothing was found.
    """
    # TODO consider searching the windows registry, as ocrmypdf does
    # https://github.com/jbarlow83/OCRmyPDF/blob/master/src/ocrmypdf/subprocess/_windows.py
    if not sys.platform.startswith('win32'):
        return shutil.which('gs')
    # Only fall through to the second lookup if the first one found nothing.
    return shutil.which('gswin64c') or shutil.which('gswin32c')
def _gs_error_reported(stream):
    """
    Tell whether the text *stream* (a str) mentions an error.

    This is a case-insensitive search for the substring "error" anywhere in the text.
    Returns True on a match, False otherwise.
    """
    return re.search('error', stream, flags=re.IGNORECASE) is not None
def _gs_rasterise_pdf(
    input_file,
    *,
    pageno,
    raster_dpi,
    password=None,
    raster_device='png16m',
):
    """
    Rasterize one page of a PDF at resolution *raster_dpi*.

    *pageno* is the visual (1-based) page number.
    Note that Ghostscript takes /UserUnit into account on its own.

    Parameters:
        input_file: Path of the PDF (anything accepted by os.fspath()).
        pageno: Page to render; passed as both -dFirstPage and -dLastPage.
        raster_dpi: Resolution, applied to both axes (rounded to 6 decimals).
        password: Optional PDF password. If given, it is passed to Ghostscript
            as a command-line argument (-sPDFPassword=...).
        raster_device: Ghostscript output device name (default 'png16m').

    Returns:
        The object returned by PIL.Image.open() on the bytes Ghostscript wrote to stdout.

    Raises:
        RuntimeError: If the Ghostscript executable cannot be found, or if the
            process exits with a non-zero status (the CalledProcessError is
            attached as the cause).
    """
    raster_dpi = round(raster_dpi, 6)

    gs = get_ghostscript()
    if not gs:
        raise RuntimeError("Ghostscript could not be found. Make sure it is installed and added to $PATH.")

    args_gs = [
        gs,
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        '-dNOPROMPT',
        f'-sDEVICE={raster_device}',
        f'-dFirstPage={pageno}',
        f'-dLastPage={pageno}',
        f'-r{raster_dpi:f}x{raster_dpi:f}',
        '-dTextAlphaBits=4',
        '-dGraphicsAlphaBits=4',
        '-dInterpolateControl=-1',
    ]
    if password is not None:
        args_gs.append(f'-sPDFPassword={password}')
    args_gs += [
        # The rendered image goes to our stdout pipe ('-o -'), while
        # Ghostscript's own messages are sent to stderr instead.
        '-o',
        '-',
        '-sstdout=%stderr',
        '-dAutoRotatePages=/None',
        '-f',
        os.fspath(input_file),
    ]
    # NOTE: do not log args_gs verbatim - it may contain the PDF password.

    try:
        pipe = subprocess.run(
            args_gs,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        logger.error(exc.stderr.decode(errors='replace'))
        # Chain explicitly so the traceback shows the failed call as the cause.
        raise RuntimeError('Ghostscript rasterizing failed') from exc

    # The process may exit with status 0 and still report problems on stderr;
    # these are logged, but the output is returned nonetheless.
    stderr = pipe.stderr.decode(errors='replace')
    if _gs_error_reported(stderr):
        logger.error(stderr)

    return PIL.Image.open(io.BytesIO(pipe.stdout))
def invoke_ghostscript_shell(filepath, index, scale=4, password=None):
    """
    Render one page of the PDF at *filepath* through _gs_rasterise_pdf().

    *index* is the zero-based page index (it is converted to a 1-based page number).
    *scale* is multiplied by 72 to obtain the raster DPI.
    *password* is forwarded unchanged.
    Returns whatever _gs_rasterise_pdf() returns.
    """
    # Note, this does not support rotation.
    render_kwargs = dict(
        password=password,
        pageno=index + 1,
        raster_dpi=scale * 72,
    )
    return _gs_rasterise_pdf(filepath, **render_kwargs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
TODO: see if we can call ghostscript through the ABI-level bindings from https://gitlab.com/pdftools/python-ghostscript
I seem to remember having tried this a long time ago, but I don't remember exactly what the problem was. I think it might have been that I couldn't get the rendered data in memory and had to fall back to file output.