Created
September 9, 2021 07:20
-
-
Save frankgeerlings/e9d59e1ecbce1adadf40c042fb398cc0 to your computer and use it in GitHub Desktop.
Bundle, rectify and OCR (sometimes poorly) scanned multi-page PDFs in current directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import subprocess
from glob import glob
from pprint import pprint
# Only the first os.walk() result is consumed: it lists the sub-directories
# and the files that sit directly in the current working directory.
walk_cwd = os.walk('.')
_, directories, files = next(walk_cwd)
def raad_taal_en_titel(directory):
    """Guess the OCR language and the title from a file or directory name.

    A name of the form "<code> <title>", where <code> is one of 'eng',
    'deu' or 'nld', is split at the first space into that language code
    and the remaining title. Any other name is treated as Dutch ('nld')
    and is used as the title in its entirety.

    Args:
        directory: Name of a directory, or of a file without its extension.

    Returns:
        A ``(language, title)`` tuple of two strings.
    """
    # partition() splits on the first space only, so titles may themselves
    # contain spaces; `scheiding` is empty when the name has no space at all.
    taal, scheiding, titel = directory.partition(' ')
    if scheiding and taal in ('eng', 'deu', 'nld'):
        return (taal, titel)
    return ('nld', directory)
# These are documents that consist of a single file; they do not warrant a
# directory of their own, so they are picked up straight from the current
# directory. The original note says such files end in '.PDF'; names ending
# in '.pdf' are accepted as well.
bronbestanden = [naam for naam in files if naam.endswith(('.PDF', '.pdf'))]
# The same names without their four-character extension; these carry the
# optional language prefix.
inputfiles = [naam[:-4] for naam in bronbestanden]
for file, bronbestand in zip(inputfiles, bronbestanden):
    taal, titel = raad_taal_en_titel(file)
    print(titel)
    # Pass the input under its real name: rebuilding it as f'{file}.PDF'
    # would point at a non-existent file for inputs that end in '.pdf' on a
    # case-sensitive filesystem.
    ocrmypdf = ['ocrmypdf', '-l', taal, '--skip-text', '--deskew', bronbestand, f'{titel}.pdf']
    subprocess.run(ocrmypdf)
# Every sub-directory holds the separate PDFs of one document: bundle them
# into a single '<title>.pdf' in the current directory and OCR that bundle.
for directory in directories:
    # The argument order given to pdfunite is the page order of the bundle,
    # and glob() does not guarantee any ordering, hence sorted(). set()
    # drops duplicates for the case where both patterns match the same file
    # (case-insensitive matching).
    files = sorted(set(glob(directory + '/*.PDF') + glob(directory + '/*.pdf')))
    if not files:
        print(f"De directory {directory} bevat geen PDFs")
        continue
    taal, titel = raad_taal_en_titel(directory)
    print(titel)
    pdfunite = ['pdfunite'] + files + [f"{titel}.pdf"]
    # Input and output are the same path: the bundle is OCR'ed in place.
    ocrmypdf = ['ocrmypdf', '-l', taal, '--skip-text', '--deskew', f'{titel}.pdf', f'{titel}.pdf']
    # Without a fresh bundle there is nothing sensible to OCR, so skip this
    # directory when pdfunite reports a failure.
    if subprocess.run(pdfunite).returncode != 0:
        print(f"pdfunite is mislukt voor de directory {directory}")
        continue
    subprocess.run(ocrmypdf)

# Shell notes kept from the original script (never executed):
#   pdfunite {directory}/*.{pdf,PDF} {directory}.pdf
#   exec ocrmypdf -l nld --skip-text --deskew {directory}.pdf --sidecar {directory}.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment