Last active
March 26, 2023 16:36
-
-
Save martok/403b71fdd9322b3b806f4dfb26c86edf to your computer and use it in GitHub Desktop.
Count Color/BW pages in PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import sys | |
from pathlib import Path | |
import subprocess | |
import tempfile | |
import cv2 | |
import numpy as np | |
# Name of the Ghostscript console executable, resolved through PATH.
# Windows installs ship the 64-bit console build as "gswin64c.exe"; on other
# platforms the interpreter is installed as plain "gs".
GS = r"gswin64c.exe" if sys.platform == "win32" else "gs"
# Human-readable label for each page kind returned by classify().
KIND_NAMES = dict(
    bw="B/W",
    gray="Grayscale",
    color="Color",
)
def gs(args):
    """Run Ghostscript with *args* and yield its standard output line by line.

    The executable named by ``GS`` is started with the given argument list and
    its stdout is read through an unbuffered pipe. Each line is yielded as
    soon as it has been read, with trailing whitespace removed. The generator
    finishes when the pipe reaches end of file; leaving the ``with`` block
    then waits for the process to exit.

    Args:
        args: Iterable of command-line arguments passed after the executable.

    Yields:
        str: One decoded output line at a time.
    """
    with subprocess.Popen([GS, *args], bufsize=0, stdout=subprocess.PIPE) as proc:
        for raw in iter(proc.stdout.readline, b''):
            # The output is not guaranteed to be valid UTF-8 (e.g. messages in
            # a local console code page); substitute undecodable bytes rather
            # than aborting the whole run with UnicodeDecodeError.
            yield raw.rstrip().decode("utf-8", errors="replace")
def gs_inkcov(file: Path):
    """Yield the output lines of Ghostscript's ``inkcov`` device run on *file*.

    The device output is directed to stdout (``-o -``), so the lines produced
    by ``gs`` are passed through unchanged.
    """
    inkcov_args = ["-o", "-", "-sDEVICE=inkcov", "-f", str(file)]
    for line in gs(inkcov_args):
        yield line
def gs_page_tiffsep(file: Path, pageno: int, dpi=150):
    """Render one page with Ghostscript's ``tiffsep`` device and load its CMYK planes.

    Ghostscript is asked to render only page *pageno* of *file* at *dpi* x *dpi*
    into a private temporary directory. The four process separations
    (Cyan, Magenta, Yellow, Black) are then read with OpenCV and stacked along
    a third axis in that order.

    A private directory is used so that concurrent runs cannot overwrite each
    other's files and so that every file Ghostscript writes there (not only
    the four planes read here) is removed when the function returns.

    Args:
        file: Path of the document to render.
        pageno: Page number passed to ``-sPageList``.
        dpi: Rendering resolution used for both axes.

    Returns:
        The separations combined with ``np.dstack`` in C, M, Y, K order, or
        ``None`` if any of the four separation files could not be read.
    """
    separations = ("Cyan", "Magenta", "Yellow", "Black")
    with tempfile.TemporaryDirectory(prefix="gspagecount") as workdir:
        tmpbase = Path(workdir) / f"gspagecount{pageno}.tif"
        gs_args = [
            "-o", str(tmpbase),
            f"-r{dpi}x{dpi}",
            "-sDEVICE=tiffsep",
            "-sPageList=" + str(pageno),
            "-f", str(file),
        ]
        # Drain the output so the generator runs Ghostscript to completion.
        for _ in gs(gs_args):
            pass

        planes = []
        for name in separations:
            plane_path = Path(workdir) / f"gspagecount{pageno}({name}).tif"
            plane = cv2.imread(str(plane_path), cv2.IMREAD_UNCHANGED)
            if plane is None:
                # cv2.imread signals a missing/unreadable file with None;
                # report failure instead of stacking placeholder objects.
                return None
            planes.append(plane)
    return np.dstack(planes)
def precise_measure(file: Path, current: int, dpi: int):
    """Measure per-channel ink usage of one page from its rendered separations.

    The page is rendered via ``gs_page_tiffsep``; sample values are mapped to
    ink usage as ``1 - value / 255`` (assumes 8-bit separations where 255
    means "no ink" - TODO confirm against the tiffsep output format).
    Pixels whose black usage exceeds the mean of their C/M/Y usage have their
    C/M/Y usage zeroed, so that darkened pixels do not count as color.

    Args:
        file: Path of the document.
        current: Page number to measure.
        dpi: Rendering resolution.

    Returns:
        Array with the mean usage of each of the four channels over the whole
        page, or ``None`` when the page could not be rendered.
    """
    cmyk = gs_page_tiffsep(file, current, dpi)
    if cmyk is None:
        return None
    color_use = 1.0 - cmyk / 255.0
    color_grayvalue = np.mean(color_use[..., :3], axis=2)
    # pixels that are printed with more black than their color grey value don't count as color
    darkened = color_use[..., 3] > color_grayvalue
    color_use[..., :3][darkened] = 0.0
    return color_use.mean(axis=0).mean(axis=0)
def classify(cmyk) -> str:
    """Return the page kind ("bw", "gray" or "color") for a C, M, Y, K coverage.

    A page counts as "bw" when the largest of the first three values is below
    1e-4, as "gray" when the first three values are exactly equal, and as
    "color" otherwise. The fourth value is not consulted.
    """
    chroma_peak = max(cmyk[:3])
    if chroma_peak < 1e-4:
        return "bw"
    balanced = cmyk[0] == cmyk[1] == cmyk[2]
    return "gray" if balanced else "color"
def run_file(pdf: Path, *,
             summarize: bool,
             precise_dpi: int):
    """Classify every page of *pdf* and print the result to stdout.

    The output of ``gs_inkcov`` is parsed with a three-state machine:

    1. wait for the ``Processing pages ... through N.`` line, which gives the
       page total and triggers the table header;
    2. wait for a ``Page N`` line, which gives the current page number;
    3. wait for a line starting with four numbers (C, M, Y, K coverage),
       classify the page, update the counters and print it.

    Args:
        pdf: Path of the document to analyse.
        summarize: When true, print a single running totals line (rewritten in
            place with a carriage return) instead of one line per page.
        precise_dpi: When non-zero, re-measure each page with
            ``precise_measure`` at this resolution and use that result instead
            of the inkcov values. If the precise measurement is unavailable
            (``None``), the inkcov values are kept.
    """
    # States of the parser that follows Ghostscript's output.
    WAIT_HEADER, WAIT_PAGE, WAIT_COVERAGE = range(3)

    state = WAIT_HEADER
    total = 0
    current = 0
    current_kind = ""
    cmyk = [0.0, 0.0, 0.0, 0.0]
    counts = {
        "total": 0,
        "bw": 0,
        "gray": 0,
        "color": 0,
    }

    if summarize:
        def print_head():
            print(f"{'Pages':>9s} {'BW':>6s} {'Gray':>6s} {'Color':>6s}")

        def print_page():
            # "\r" returns to the line start so the next call overwrites it.
            print(f"{counts['total']:>4}/{total:>4} {counts['bw']:>6} {counts['gray']:>6} {counts['color']:>6}", end="\r")
    else:
        def print_head():
            print(f"{'Page':>4s} {'Type':<6s} {'Cyan':>5s} {'Mag':>5s} {'Yell':>5s} {'Black':>5s}")

        def print_page():
            print(f"{current:>4} {KIND_NAMES.get(current_kind):<6s} {cmyk[0]:.3f} {cmyk[1]:.3f} {cmyk[2]:.3f} {cmyk[3]:.3f}")

    for ln in gs_inkcov(pdf):
        words = ln.split()
        if state == WAIT_HEADER:
            if ln.startswith("Processing pages"):
                # Fifth word is the last page number followed by a period.
                total = int(words[4].rstrip("."))
                print_head()
                state = WAIT_PAGE
        elif state == WAIT_PAGE:
            if ln.startswith("Page "):
                current = int(words[1])
                state = WAIT_COVERAGE
        elif state == WAIT_COVERAGE:
            try:
                coverage = [float(x) for x in words[0:4]]
            except ValueError:
                coverage = []
            if len(coverage) < 4:
                # Blank line or other message between "Page N" and its
                # coverage values: ignore it and keep waiting.
                continue
            cmyk = coverage
            counts["total"] += 1
            if precise_dpi:
                measured = precise_measure(pdf, current, precise_dpi)
                if measured is not None:
                    cmyk = measured
            current_kind = classify(cmyk)
            if current_kind:
                counts[current_kind] += 1
            print_page()
            state = WAIT_PAGE
    # Terminate the last (possibly "\r"-ended) line.
    print("")
def main():
    """Parse the command line and run the page count on the given PDF.

    ``-p``/``--precise`` takes an optional integer; given without a value it
    uses 36, and when omitted it is 0. Returns whatever ``run_file`` returns.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "FILE",
        type=str,
        help="PDF file",
    )
    cli.add_argument(
        "-s", "--summarize",
        action="store_true",
        help="Summarize Totals",
    )
    cli.add_argument(
        "-p", "--precise",
        type=int,
        default=0,
        nargs="?",
        const=36,
        help="Precise per-pixel count (account for overprinting)",
    )
    opts = cli.parse_args()
    pdf_path = Path(opts.FILE)
    return run_file(pdf_path, summarize=opts.summarize, precise_dpi=opts.precise)
# Script entry point: exit with the value returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment