Last active
December 14, 2022 03:11
-
-
Save AlisterH/ca0bb34a18d4f265f6f1c92510ffda90 to your computer and use it in GitHub Desktop.
script for Puppy Linux to recursively find and combine pdf files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# joinpdf: script for Puppy Linux to recursively find and combine pdf files.
# Required: basename and qpdf.
# Also: which, realpath, file and gawk (any awk would work)
# Recommended: natsort
# Command line only version. See gjoinpdf for gui version.
# Version 9 by disciple, 14/12/2022.
# Switched back to processing each file with qpdf in a separate thread before merging - generally faster on a multiprocessor machine, but will be slower with some files than just using pdfunite.
# Sejda may be better, but will generate larger files in some cases, and requires java.
# http://www.murga-linux.com/puppy/viewtopic.php?p=149208#149208
# Currently has NO ERROR HANDLING for the actual join operation.
# You may experience errors if you have pdfs that are corrupted.
# Set defaults
# Default output name: timestamped to the minute, plus $$ (this shell's
# process ID) so that simultaneous instances do not collide on the same name.
OUTPUTFILE="combined-$(date '+%Y%m%d-%H%M')-PID$$.pdf"
# Choose the command used to order the files found under each input.
# Preference: natsort, then `sort -V` (version sort), then plain sort.
# `msort -l -w -c h` also gives good results but isn't designed for piping.
# SORT is kept as a plain string because it is expanded unquoted where it is
# used, so that "sort -V" splits into the command and its option.
if command -v natsort >/dev/null 2>&1; then
  SORT="natsort"
else
  echo "joinpdf: using sort"
  echo "for better results install natsort from https://github.com/jjk-jacky/natsort"
  # Probe for -V support on empty input and branch on the exit status directly.
  if sort -V </dev/null >/dev/null 2>&1; then
    SORT="sort -V"
  else
    SORT="sort"
    echo "joinpdf: sort does not support -V"
    echo "I hope you were careful with any file numbering"
  fi
fi
# Initialise filecount
# Each processed input is written to the temporary folder under the next
# number after this one, and the final merge picks them up with a glob, i.e.
# in lexical order. Starting at 1000 keeps every name four digits wide, so the
# order only holds for 1001-9999 (8999 inputs); raise this to join more.
# I have not tested to find the maximum number of files in each input directory,
# although it performed well on my system with an input directory containing 960 files.
# Note that if there are too many our pipes will break!
FILECOUNT=1000
# Print usage instructions and stop if run without any input arguments.
# The argument count is tested directly; the previous `test -sd "$@"` only
# worked because test rejects "-sd" as an operator once arguments are present.
# TODO: better usage message.
if [ "$#" -eq 0 ]; then
  echo "
joinpdf: at least one input pdf or directory is required
Usage: joinpdf [input files and/or folders]
Joins pdf inputs and pdfs in any folder inputs (found recursively in any subfolders and sorted in normal alphanumeric order) in the order specified.
Use gjoinpdf for gui version.
"
  # Nothing was joined, so report failure to the caller.
  exit 1
fi
# Just combine the pdfs if run with input arguments.
# Create a private temporary directory for the file lists and the per-input
# intermediate pdfs. Stop if that fails: later steps build paths from
# $TEMPFOLDER, and an empty value would make them resolve under /.
# Six X's are used because some mktemp implementations require that many.
TEMPFOLDER="$(mktemp -d -t joinpdf-XXXXXX)" || {
  echo "joinpdf: could not create temporary directory" >&2
  exit 1
}
# Remove the directory whenever the script exits, including early exits.
trap 'rm -rf -- "$TEMPFOLDER"' EXIT
# Find all files!
# Every regular file under each argument is listed here (symlinks followed and
# resolved with realpath); narrowing down to real pdfs by mime type is a
# separate, later step - see comment further down re mime types vs file
# extensions. The older approach selected by name instead:
#   find -L "$i" -type f -name '*.pdf' | $SORT
# `-exec ... +` may run realpath more than once for a very long list, but all
# of those runs write into the same pipe, so a single sort still sees them all.

# list_candidate_files INPUT
# Print the resolved path of every regular file at or below INPUT, one per
# line, ordered by the command held in $SORT.
list_candidate_files() {
  local start=$1
  # A leading "-" would make find parse the name as an option or expression.
  case "$start" in
    -*) start="./$start" ;;
  esac
  # $SORT is intentionally unquoted: it may hold a command plus an option.
  find -L "$start" -type f -exec realpath {} + | $SORT
}

for i in "$@"; do
  # Each argument is sorted on its own, so arguments keep the order given.
  list_candidate_files "$i" >> "${TEMPFOLDER:?}/files0.txt"
done
# get input files; detect pdfs by mime-type now
# this is more 'nixy and will pick up pdfs without a file extension, and not files with a .pdf extension that aren't actually pdfs
# need to use realpath above to pick up symlinks to pdfs (readlink would be a more portable alternative - would it be slower?)
# unfortunately (in terms of performance) we can't use the file command via find -exec because it doesn't get passed the realpath
# see https://unix.stackexchange.com/questions/79222/how-can-i-efficiently-dereference-all-symlinks-in-find-output-filenames
# make sure we create file so there is no error in the next while
touch "${TEMPFOLDER:?}/files.txt"
while IFS= read -r line; do
  # -b prints only the mime type, so the name never has to be cut back out of
  # file's output and names containing tabs cannot be truncated.
  if [ "$(file -b --mime-type -- "$line")" = "application/pdf" ]; then
    printf '%s\n' "$line" >> "$TEMPFOLDER/files.txt"
  fi
done < "$TEMPFOLDER/files0.txt"
# Make sure output file has an extension

# ensure_pdf_extension NAME
# Print NAME with a single ".pdf" suffix: one trailing ".pdf" or ".PDF" is
# dropped first so the suffix is never doubled, then ".pdf" is appended.
# Done with parameter expansion so whitespace and glob characters in NAME are
# preserved and no external awk is needed.
ensure_pdf_extension() {
  local name=$1
  case "$name" in
    *.pdf | *.PDF) name=${name%.*} ;;
  esac
  printf '%s.pdf\n' "$name"
}

OUTPUTFILE="$(ensure_pdf_extension "$OUTPUTFILE")"
# Rewrite each listed pdf with qpdf into the next numbered file in the
# temporary folder, one background job per input, then wait for all of them.
# IFS= and -r keep leading/trailing blanks and backslashes in file names
# intact, matching how the list was read when it was built.
while IFS= read -r line; do
  FILECOUNT=$((FILECOUNT + 1))
  qpdf --empty --remove-page-labels --pages "$line" -- "$TEMPFOLDER/$FILECOUNT" &
done < "${TEMPFOLDER:?}/files.txt"
wait
# Remove lists so that only the numbered intermediate pdfs remain.
# ${TEMPFOLDER:?} aborts rather than operating on / if the variable is empty.
rm -f -- "${TEMPFOLDER:?}/files0.txt" "$TEMPFOLDER/files.txt"

# Merge whatever intermediates exist. The glob expands in sorted order, which
# is the order the inputs were numbered in.
rc=0
parts=("$TEMPFOLDER"/*)
if [ ! -e "${parts[0]}" ]; then
  # The glob matched nothing and was left as the literal pattern.
  echo "error: no input pdfs found"
  rc=1
else
  if qpdf --empty --stream-data=compress --object-streams=generate --pages "${parts[@]}" -- "$OUTPUTFILE"; then
    echo "created: $OUTPUTFILE"
  else
    # Hand qpdf's own exit status back to the caller.
    rc=$?
  fi
fi

#remove temporary directory
rm -rf -- "$TEMPFOLDER"
exit "$rc"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment