AlisterH · December 14, 2022 03:11
diff --git a/joinpdf b/joinpdf
 #! /bin/bash
 # joinpdf: script for Puppy Linux to recursively find and combine pdf files.
 # Required: basename and qpdf.
 # Also:     which, realpath, file and gawk (any awk would work)
 # Recommended: natsort
 # Command line only version.  See gjoinpdf for gui version.
 # Version 9 by disciple, 14/12/2022.
 # Switched back to processing each file with qpdf in a separate background process before merging - generally faster on a multiprocessor machine, but will be slower with some files than just using pdfunite.
 # Sejda may be better, but will generate larger files in some cases, and requires java.
 # http://www.murga-linux.com/puppy/viewtopic.php?p=149208#149208
 # Currently has NO ERROR HANDLING for the actual join operation.
 # You may experience errors if you have pdfs that are corrupted.

 # ---- Defaults ----
 # Output name is combined-<YYYYMMDD>-<HHMM>-PID<pid>.pdf; $$ (the process ID)
 # prevents collisions between more than one simultaneous instance.
 printf -v OUTPUTFILE 'combined-%s-PID%s.pdf' "$(date '+%Y%m%d-%H%M')" "$$"

 # ---- Sort command ----
 # Preference order: natsort, then "sort -V", then plain sort.
 # (`msort -l -w -c h` also gives good results but isn't designed for piping.)
 # SORT may hold a command plus an option ("sort -V"), so it has to be
 # expanded unquoted where it is used.
 if hash natsort 2>/dev/null; then
   SORT="natsort"
 else
   echo "joinpdf: using sort"
   echo "for better results install natsort from https://github.com/jjk-jacky/natsort"
   # Probe: feed a single line to "sort -V"; success means -V is supported.
   if sort -V >/dev/null 2>&1 <<<1; then
     SORT="sort -V"
   else
     SORT="sort"
     echo "joinpdf: sort does not support -V"
     echo "I hope you were careful with any file numbering"
   fi
 fi

 # ---- Counter for temporary file names ----
 # Seed for the number given to each per-input temporary pdf.  Starting at
 # 1000 keeps the numbers 1001-9999 the same width; if the bare number is used
 # as the file name, joining 10000+ files needs a larger seed.
 # Untested: the maximum number of files in each input directory (an input
 # directory containing 960 files performed well on the author's system).
 # Note that if there are too many our pipes will break!
 FILECOUNT=1000

 # print usage instructions if run without input arguments.
 # The argument count is tested directly.  Probing with `test -sd "$@"` also
 # reports "no arguments" for argument lists that happen to form a true test
 # expression, e.g. `joinpdf -o x`.
 # TODO: better usage message.
 if [ "$#" -eq 0 ]
 then
   echo "
 joinpdf: at least one input pdf or directory is required

 Usage: joinpdf [input files and/or folders]

 Joins pdf inputs and pdfs in any folder inputs (found recursively in any subfolders and sorted in normal alphanumeric order) in the order specified.
 Use gjoinpdf for gui version.
 "
   exit
 fi

 # Just combine the pdfs if run with input arguments.

 # Create a private temporary directory for the file lists and the per-input
 # pdfs.  Stop if it cannot be created: every later step writes under
 # "$TEMPFOLDER"/, which would otherwise resolve to the filesystem root.
 TEMPFOLDER="$(mktemp -dt joinpdf-XXX)" || {
   echo "joinpdf: could not create temporary directory" >&2
   exit 1
 }
 # Remove the directory again whenever the script exits, including early exits.
 trap 'rm -rf -- "$TEMPFOLDER"' EXIT

 # Collect candidate files.
 # For each input argument, every regular file below it (symlinks followed)
 # is resolved with realpath, sorted, and appended to files0.txt, so the
 # order of the command-line arguments is kept.  Pdfs are picked out of this
 # list afterwards by mime type rather than by file extension.
 # Earlier extension-based variant, kept for reference:
 #   find -L "$i" -type f -name '*.pdf' | $SORT >> "$TEMPFOLDER"/files.txt
 # Author's note (unverified): this might cause problems if there are more
 # files than can be piped in one go - find may then feed two different
 # instances of sort, so they won't all be sorted.
 for input
 do
   # $SORT stays unquoted so that "sort -V" splits into command and option.
   find -L "$input" -type f -exec realpath {} + \
     | $SORT >> "$TEMPFOLDER"/files0.txt
 done

 # get input files; detect pdfs by mime-type now
 # this is more 'nixy and will pick up pdfs without a file extension, and not files with a .pdf extension that aren't actually pdfs
 # need to use realpath above to pick up symlinks to pdfs (readlink would be a more portable alternative - would it be slower?)
 # unfortunately (in terms of performance) we can't use the file command via find -exec because it doesn't get passed the realpath
 # see https://unix.stackexchange.com/questions/79222/how-can-i-efficiently-dereference-all-symlinks-in-find-output-filenames
 #
 # `file -f` reads the candidate names from files0.txt (one per line), so one
 # file/grep/cut pipeline handles the whole list instead of starting three
 # processes per candidate.  -F makes a tab the separator after the name, so
 # cut -f 1 returns just the name of each line whose type is application/pdf.
 # The redirection creates files.txt even when nothing matches, so the read
 # loop that follows always has a file to read.
 file -F $'\t' --mime-type -f "$TEMPFOLDER"/files0.txt \
   | grep 'application/pdf$' \
   | cut -f 1 > "$TEMPFOLDER"/files.txt

 # Make sure output file has an extension
 # ensure_pdf_extension NAME
 #   Prints NAME with one trailing ".pdf" or ".PDF" (if present) replaced by
 #   ".pdf"; any other name just gets ".pdf" appended.
 #   Uses parameter expansion only, so names containing blanks or glob
 #   characters pass through unchanged and gawk is not needed.
 ensure_pdf_extension() {
   local name=$1
   case "$name" in
     *.pdf) name=${name%.pdf} ;;
     *.PDF) name=${name%.PDF} ;;
   esac
   printf '%s\n' "$name.pdf"
 }
 OUTPUTFILE="$(ensure_pdf_extension "$OUTPUTFILE")"

 # Rewrite each listed pdf into the temporary directory with qpdf
 # (--remove-page-labels), one background job per input, then wait for all
 # jobs to finish.
 # The temporary files are numbered in list order and zero-padded to a fixed
 # width, because the merge step picks them up with a glob, i.e. in lexical
 # order.
 # IFS= and -r keep leading/trailing blanks and backslashes in paths intact.
 while IFS= read -r line
 do
   FILECOUNT=$((FILECOUNT + 1))
   printf -v part '%010d' "$FILECOUNT"
   qpdf --empty --remove-page-labels --pages "$line" -- "$TEMPFOLDER/$part" &
 done < "$TEMPFOLDER"/files.txt
 wait

 # The two list files are no longer needed; delete them so that only the
 # per-input pdfs (if any) are left in the temporary directory.
 rm -f "$TEMPFOLDER"/files0.txt "$TEMPFOLDER"/files.txt

 # Anything still in the directory is a pdf to merge.
 remaining="$(ls -A "$TEMPFOLDER")"
 if [ -n "$remaining" ]
 then
   # Merge in glob order; the output name is reported only if qpdf succeeds.
   qpdf --empty --stream-data=compress --object-streams=generate --pages "$TEMPFOLDER"/* -- "$OUTPUTFILE" && echo "created: $OUTPUTFILE"
 else
   echo "error: no input pdfs found"
 fi

 # Remove the temporary directory and everything in it.
 rm -rf "$TEMPFOLDER"
	#! /bin/bash
	# joinpdf: script for Puppy Linux to recursively find and combine pdf files.
	# Required: basename and qpdf.
	# Also: which, realpath, file and gawk (any awk would work)
	# Recommended: natsort
	# Command line only version. See gjoinpdf for gui version.
	# Version 9 by disciple, 14/12/2022.
	# Switched back to processing each file with qpdf in a separate background process before merging - generally faster on a multiprocessor machine, but will be slower with some files than just using pdfunite.
	# Sejda may be better, but will generate larger files in some cases, and requires java.
	# http://www.murga-linux.com/puppy/viewtopic.php?p=149208#149208
	# Currently has NO ERROR HANDLING for the actual join operation.
	# You may experience errors if you have pdfs that are corrupted.

	# ---- Defaults ----
	# Output name is combined-<YYYYMMDD>-<HHMM>-PID<pid>.pdf; $$ (the process ID)
	# prevents collisions between more than one simultaneous instance.
	printf -v OUTPUTFILE 'combined-%s-PID%s.pdf' "$(date '+%Y%m%d-%H%M')" "$$"

	# ---- Sort command ----
	# Preference order: natsort, then "sort -V", then plain sort.
	# (`msort -l -w -c h` also gives good results but isn't designed for piping.)
	# SORT may hold a command plus an option ("sort -V"), so it has to be
	# expanded unquoted where it is used.
	if hash natsort 2>/dev/null; then
	  SORT="natsort"
	else
	  echo "joinpdf: using sort"
	  echo "for better results install natsort from https://github.com/jjk-jacky/natsort"
	  # Probe: feed a single line to "sort -V"; success means -V is supported.
	  if sort -V >/dev/null 2>&1 <<<1; then
	    SORT="sort -V"
	  else
	    SORT="sort"
	    echo "joinpdf: sort does not support -V"
	    echo "I hope you were careful with any file numbering"
	  fi
	fi

	# ---- Counter for temporary file names ----
	# Seed for the number given to each per-input temporary pdf.  Starting at
	# 1000 keeps the numbers 1001-9999 the same width; if the bare number is used
	# as the file name, joining 10000+ files needs a larger seed.
	# Untested: the maximum number of files in each input directory (an input
	# directory containing 960 files performed well on the author's system).
	# Note that if there are too many our pipes will break!
	FILECOUNT=1000

	# print usage instructions if run without input arguments.
	# The argument count is tested directly.  Probing with `test -sd "$@"` also
	# reports "no arguments" for argument lists that happen to form a true test
	# expression, e.g. `joinpdf -o x`.
	# TODO: better usage message.
	if [ "$#" -eq 0 ]
	then
	  echo "
	joinpdf: at least one input pdf or directory is required

	Usage: joinpdf [input files and/or folders]

	Joins pdf inputs and pdfs in any folder inputs (found recursively in any subfolders and sorted in normal alphanumeric order) in the order specified.
	Use gjoinpdf for gui version.
	"
	  exit
	fi

	# Just combine the pdfs if run with input arguments.

	# Create a private temporary directory for the file lists and the per-input
	# pdfs.  Stop if it cannot be created: every later step writes under
	# "$TEMPFOLDER"/, which would otherwise resolve to the filesystem root.
	TEMPFOLDER="$(mktemp -dt joinpdf-XXX)" || {
	  echo "joinpdf: could not create temporary directory" >&2
	  exit 1
	}
	# Remove the directory again whenever the script exits, including early exits.
	trap 'rm -rf -- "$TEMPFOLDER"' EXIT

	# Collect candidate files.
	# For each input argument, every regular file below it (symlinks followed)
	# is resolved with realpath, sorted, and appended to files0.txt, so the
	# order of the command-line arguments is kept.  Pdfs are picked out of this
	# list afterwards by mime type rather than by file extension.
	# The pipe must be a real, unescaped "|": an escaped one is handed to find
	# as a literal argument and nothing gets sorted.
	# Earlier extension-based variant, kept for reference:
	#   find -L "$i" -type f -name '*.pdf' | $SORT >> "$TEMPFOLDER"/files.txt
	# Author's note (unverified): this might cause problems if there are more
	# files than can be piped in one go - find may then feed two different
	# instances of sort, so they won't all be sorted.
	for input
	do
	  # $SORT stays unquoted so that "sort -V" splits into command and option.
	  find -L "$input" -type f -exec realpath {} + \
	    | $SORT >> "$TEMPFOLDER"/files0.txt
	done

	# get input files; detect pdfs by mime-type now
	# this is more 'nixy and will pick up pdfs without a file extension, and not files with a .pdf extension that aren't actually pdfs
	# need to use realpath above to pick up symlinks to pdfs (readlink would be a more portable alternative - would it be slower?)
	# unfortunately (in terms of performance) we can't use the file command via find -exec because it doesn't get passed the realpath
	# see https://unix.stackexchange.com/questions/79222/how-can-i-efficiently-dereference-all-symlinks-in-find-output-filenames
	#
	# `file -f` reads the candidate names from files0.txt (one per line), so one
	# file/grep/cut pipeline handles the whole list instead of starting three
	# processes per candidate.  -F makes a tab the separator after the name, so
	# cut -f 1 returns just the name of each line whose type is application/pdf.
	# The pipes are real, unescaped "|" characters.
	# The redirection creates files.txt even when nothing matches, so the read
	# loop that follows always has a file to read.
	file -F $'\t' --mime-type -f "$TEMPFOLDER"/files0.txt \
	  | grep 'application/pdf$' \
	  | cut -f 1 > "$TEMPFOLDER"/files.txt

	# Make sure output file has an extension
	# ensure_pdf_extension NAME
	#   Prints NAME with one trailing ".pdf" or ".PDF" (if present) replaced by
	#   ".pdf"; any other name just gets ".pdf" appended.
	#   Uses parameter expansion only, so names containing blanks or glob
	#   characters pass through unchanged and neither a pipe nor gawk is needed.
	ensure_pdf_extension() {
	  local name=$1
	  case "$name" in
	    *.pdf) name=${name%.pdf} ;;
	    *.PDF) name=${name%.PDF} ;;
	  esac
	  printf '%s\n' "$name.pdf"
	}
	OUTPUTFILE="$(ensure_pdf_extension "$OUTPUTFILE")"

	# Rewrite each listed pdf into the temporary directory with qpdf
	# (--remove-page-labels), one background job per input, then wait for all
	# jobs to finish.
	# The temporary files are numbered in list order and zero-padded to a fixed
	# width, because the merge step picks them up with a glob, i.e. in lexical
	# order.
	# IFS= and -r keep leading/trailing blanks and backslashes in paths intact.
	while IFS= read -r line
	do
	  FILECOUNT=$((FILECOUNT + 1))
	  printf -v part '%010d' "$FILECOUNT"
	  qpdf --empty --remove-page-labels --pages "$line" -- "$TEMPFOLDER/$part" &
	done < "$TEMPFOLDER"/files.txt
	wait

	# The two list files are no longer needed; delete them so that only the
	# per-input pdfs (if any) are left in the temporary directory.
	rm -f "$TEMPFOLDER"/files0.txt "$TEMPFOLDER"/files.txt

	# Anything still in the directory is a pdf to merge.
	remaining="$(ls -A "$TEMPFOLDER")"
	if [ -n "$remaining" ]
	then
	  # Merge in glob order; the output name is reported only if qpdf succeeds.
	  qpdf --empty --stream-data=compress --object-streams=generate --pages "$TEMPFOLDER"/* -- "$OUTPUTFILE" && echo "created: $OUTPUTFILE"
	else
	  echo "error: no input pdfs found"
	fi

	# Remove the temporary directory and everything in it.
	rm -rf "$TEMPFOLDER"