Group duplicates of files into hardlinks, check similarity using BLAKE2/SHA256 and matching file size, primarily for Windows/Cygwin/MSYS2
#!/bin/zsh
[ $# -lt 2 ] && {
[ $# -eq 1 ] &&
echo 'you must specify more than one file to be deduped' &&
echo
echo 'dedupe [input files]'
echo '- replace multiple unchanging copies of'
echo ' the same files with hardlinks to save space'
echo '- as of now, it is recommended to execute this'
echo ' only on files that exist on a singular device'
exit 1
}
function userval() { export "$1"="$2" }
dotload=$(command -v dotload || echo /usr/share/dk15/dotload) && source "${dotload}" 'dedupe.conf' 2>/dev/null
mp=4
userval passes $mp
userval hash b2
userval force_relink 0
userval sanity_check 0
userval hide_invalid 0
hash_params=()
[ ! "$hash" = b3 ] && hash_params+=(-b)
[ "$hash" = b2 -o "$hash" = b3 ] && {
userval hash_length 20 # bytes, being conservative for big files
[ "$hash" = b2 ] && {
hash_params+=(-l $((${hash_length}<<3))) # b2sum takes the digest length in bits, so bytes*8
} || {
hash_threads=1
hash_params+=(-l ${hash_length} --num-threads ${hash_threads} --no-names)
# hashing a bunch of files with 16 threads each
# feels iffy; thread count should scale with file size
}
}
NUL='/dev/null'
[ $passes -ge 1 -a $passes -le 4 ] || {
echo "Invalid config: passes = ${passes}. Setting to ${mp}." 1>&2
passes=$mp
}
which fsutil.exe 2>$NUL >$NUL
fsutil=$?
[ $passes -ge 3 -a $fsutil -ne 0 ] && {
echo 'WARNING: Passes 3 and 4 may take a while to find and' 1>&2
echo 'merge groups of hardlinks using the inputs provided!' 1>&2
}
{ which "${hash}sum" } 2>$NUL >$NUL || {
echo "Cannot find ${hash}sum as a hashing program." 1>&2
hash=sha256
hash_params=(-b) # drop BLAKE-specific -l/--num-threads options for the fallback hasher
echo "Falling back to ${hash}." 1>&2
}
{ which "${hash}sum" } 2>$NUL >$NUL || {
echo "Fallback hashing program ${hash}sum does not exist. Aborting..." 1>&2
exit 1
}
hashstrlen=$((${hash_length:-32}<<1)) # two hex chars per byte; default to 32 bytes when hash_length is unset (e.g. plain sha256)
hashset=()
baseset=()
counters=() # for skipping unique files in subsequent passes that had no duplicates counted up
filter=() # 0 = if pass>1, don't process, 1 = retry on later passes
function wpathfail() { echo $2; }
function dsize() {
local test=(`df -xtmpfs -xdevtmpfs --output=avail --total`) && echo $((${test[-1]} * 1024)) # absurd
}
fsize=(stat -Lc%s) # attempts to reduce amount of new processes, but substitution will always require it
inode=(stat -Lc%i) # inode number, used to tell whether two paths already share the same data
mntpt=(df --output=target)
sfxlp=('/proc' '/dev' '/tmp' '/var/tmp' '/sys' '/boot')
sfind=()
for f in "${sfxlp[@]}"; do
sfind+=(! -path "${f}/*") # no embedded quotes; the pattern is passed to find as a single array element
done
unset sfxlp
#profile=(date '+%s%N')
#profileend() { echo "profile script" $(($(($($profile) - $1)) / 1000000000.0)) }
dls=() # drive letters
mps=() # mount points
(mount | sed -n 's/^\(\w:\)\\\?\son\s\(\/cygdrive\/\w\|\/mnt\/\w\|\/\w\).*/\1 \2/p') |
while read -r lt; do
l=(${(@s: :)lt}) # absurd
dls+=(${l[1]:l}) # C:
mps+=(${l[2]:l}) # /cygdrive/c/
done
# i hate linux
function wsl_path() {
l1="${2:l}"
local test
[[ ! "$l1" = /* ]] && {
[ "$1" = '-u' -a "$l1" = 'NUL' ] && {
echo "$NUL"
return
}
[ ! "$1" = '-u' -a "$l1" = "$NUL" ] && {
echo 'NUL'
return
}
wslpath $1 "$2"
return
} || {
[[ "$l1" = /cygdrive/* ]] &&
test=${mps[1][1,-2]}${2:10} || # strip the /cygdrive/ prefix from the original path, keeping its case
{
esc="${2//\\//}"
for i in {1..${#dls}}; do
[[ "$l1" = /mnt${mps[$i][-2,-1]}* ]] && { # wsl default
test="${mps[$i][1,-2]}${esc:5}"
break
}
[[ "$l1" = ${mps[$i][-2,-1]}/* ]] && { # msys
test="${mps[$i]}/${esc:3}"
break
}
[[ "$l1" = ${dls[$i]}* ]] && {
test="${mps[$i]}${esc:2}"
break
}
[[ "$l1" = ${mps[$i]}* ]] && {
test="${mps[$i]}${esc:${#mps[$i]}}"
break
}
done
test="${test:-$2}"
}
[ "$1" = '-u' ] && {
echo "$test"
return
}
}
wslpath $1 "$test" || echo "MISHANDLED PATH: $test" 1>&2
}
function ugh() {
echo -n "$2"
}
wpath=wpathfail
wpath=`which cygpath 2>$NUL` || {
which wslpath 2>$NUL >$NUL && wpath=wsl_path || {
# wtf to do here
#echo 'No path converter utility found.' 1>&2
#exit 1
wpath=ugh
# assume linux only, forgetting i keep testing on
# windows environments, lol, linooks suk
}
}
function errgate() {
[ "${map[12]}" = "${map[1]}" ] && return 11
# try to utilize KSH [ extension instead: -ef
# also applies to symlink :/
[ ! ${map[10]} = ${map[11]} ] && return 12
[ \
${map[6]} = ${map[7]} -a \
\( \
\( $force_relink -eq 0 \) -o \
\( $force_relink -ne 0 -a $pass -gt 1 \) \
\) \
] && return 13
[ ! ${map[4]} = ${map[5]} ] && return 14
return 0
}
units="kmgt"
function hus() {
[ $1 -lt 1024 ] && {
printf '%.0fb\n' $1
return 1
}
a=$1
u=0
until [ `printf '%.0f' $(($a-0.499999999999))` -lt 1024 ]; do # hate
a=$(($a/1024.0))
u=$(($u+1))
done
printf '%.2f%sb\n' $(($a-0.004999)) ${units[$u]}
}
err=''
rc=''
#total=0
lasttest=0
copycount=0
expected=0
rawphysd=0
test=''
IFS=$'\n'
infiles=()
preperrs=("not a file" "blank")
echo "dedupe - Building file list...${rc}"
# should probably also avoid deduping files smaller than
# about 8 bytes, e.g. near-blank text files that contain
# only a newline (LF or CR LF)
# this is why you should be selective about which files
# you dedupe, e.g. by limiting to certain extensions
function prep_check() # i don't want to have to rely on this as its own function
{
[ ! -e "$1" -o -d "$1" ] && return 1
[ ! -s "$1" ] && return 2
return 0
}
# this pains me as i'm processing thousands to ten thousand files
# wonder what would even be more noticeable, this or doing realpath
# for every pass right when a file needs to be scanned
# blazing fast on artix vm
i=1
for f in "$@"; do
echo -n "($i/$#)"$'\r'; i=$(($i+1))
prep_check "$f" || {
ERR=$?
[ $hide_invalid -eq 0 ] && echo "${err}$f is ${preperrs[$ERR]}.${rc}" 1>&2
continue
}
infiles+=("`realpath -s "$($wpath -u "$f")"`")
done
# next totally sane step: move hash generation to an array creation block
# because time will be wasted on subsequent passes
before=`dsize`
echo "(${#infiles} files)${rc}"
for pass in {1..$passes}; do
batched=0
i=1
for f in "${infiles[@]}"; do
[ $sanity_check -ne 0 ] && echo "$i (${counters[$i]:--}): $f"
[ \
$pass -gt 1 -a \( \
\( ! -z ${filter[$i]} -a ${filter[$i]} -ne 0 \) -o \
\( ! -z ${counters[$i]} -a ${counters[$i]} -le 0 \) \
\) \
] && { i=$(($i + 1)) && continue }
map=(
"$f"
`${hash}sum ${hash_params} "$f"`
"${f:t}"
)
tmp=${test:-0} # top-level code, not inside a function, so no 'local'
test=${map[2][0,$hashstrlen]}
check=${hashset[(Ie)$test]:-0}
[ $check -eq 0 ] && {
hashset+=($test)
baseset+=("$f")
counters[$i]=0
i=$(($i + 1))
filter+=(0)
continue
}
# FORGOT, NEED TO CHECK DIFFERENT MOUNT POINTS CONFLICTING IN THIS PLAIN ARRAY
base="${baseset[$check]}"
map+=($(
$fsize "$f" "$base"
$inode "$f" "$base"
dsize
$mntpt "$f" "$base"
)
"$base")
errgate ${map[@]}
ERR=$?
[ $pass -eq 1 ] && filter+=(${ERR})
[ $ERR -ne 0 ] && {
[ $pass -eq 1 ] && {
case ${ERR} in
'11')
errstr="${map[3]} cannot be linked to itself.";;
'12')
errstr="Mount points do not match for"
errstr="${errstr} ${map[10]}/.../${map[3]} and ${map[11]}/.../$(basename "$base")";;
'13')
#copycount=$(($copycount + 1)) # ...
errstr="${map[3]} is already hardlinked.";;
'14')
errstr="[${hashset[$check]:0:15}, ${map[4]}] ${map[3]} and [${test:0:15}, ${map[5]}]"
errstr="${errstr} $(basename "$base") have matching hashes but different size!!!!";;
*)
errstr="Uncaught error $ERR";;
esac
# there has to be a way to make this into an array thing instead
echo "${err}${errstr}${rc}" 1>&2
}
i=$(($i + 1))
continue
}
s=${map[4]}
b=${map[8]}
# prof=`$profile`
{
# TODO: handle permission denied error just
# so pass number text isn't printed prematurely
lasttest=$tmp
[ "$f" -nt "$base" ] && { # absurd but muh archives/history reasons
target="$f"
source="$base"
swap=1
} || {
target="$base"
source="$f"
swap=0
}
[ $copycount -eq 0 -a $pass -eq 1 ] && echo 'Deduping...'
[ $batched -eq 0 -a $passes -gt 1 ] && echo "Pass ${pass}${rc}"
[ $pass -eq 1 ] && copycount=$(($copycount + 1))
case "$pass" in
1) ;&
2) uhh=`dsize`
ln -f "$source" "$target" && {
rawphysd=$(($rawphysd + `dsize` - ${uhh}))
echo "[${test:0:15}]${rc}" \
"${base:t}${rc}" \
"←${rc} $f${rc}"
[ $pass -eq 1 ] && {
expected=$(($expected + $s))
counters[$i]=$((${counters[$i]}+1))
}
batched=$(($batched + 1))
# total=$(($total + 1))
}
;;
3) ;&
4) ffs=0
[ $fsutil -ne 0 ] && {
map=(
"${map[$((10+$swap))]}"
"${map[$((11-$swap))]}"
"${map[$((6+$swap))]}"
"${map[$((7-$swap))]}"
) && {
# NOT A GOOD IDEA
find "${map[2]}" -xdev ${sfind[@]} -inum ${map[4]} | while read -r hl; do
uhh=`dsize`
ln -f "$source" "$hl" 2>$NUL &&
ffs=$(($ffs + 1)) && # * MAKE FUNCTION
batched=$(($batched + 1)) &&
rawphysd=$(($rawphysd + `dsize` - ${uhh})) && {
[ $ffs -eq 1 ] && \
echo "[${test:0:15}]${rc}" \
"${source}${rc}" \
'<group merge>'
echo "↑${rc} ${hl}${rc}"
} || echo "${err}Failed to hardlink ${hl}.${rc}" 1>&2
done
[ $ffs -gt 0 ]
} || echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2
true
} || {
test2="`$wpath -m "$target"`"
point="${map[$((10+$swap))]}"
fsutil.exe hardlink list "$test2" >$NUL && {
ffs=0
lasttest=$test
fsutil.exe hardlink list "$test2" | sed 's/\r//g; s/\\/\//g' | while read -r hl; do
map=($(
$wpath -u "${point}${hl}" &&
dsize
)) && {
ln -f "${source}" "${map[1]}" &&
ffs=$(($ffs + 1)) && # *
batched=$(($batched + 1)) &&
rawphysd=$(($rawphysd + `dsize` - ${map[2]})) && {
[ $ffs -eq 1 ] && \
echo "[${test:0:15}]${rc}" \
"${source:t}${rc}" \
'<group merge>'
echo "↑${rc} ${map[1]}${rc}"
} || false
} || echo "${err}Failed to hardlink ${hl}.${rc}" 1>&2
done
[ $ffs -gt 0 ]
} || echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2
}
;;
esac
}
i=$(($i + 1))
# profileend $prof
done
[ $batched -eq 0 ] && break
done
map=($(
hus $before
hus `dsize`
hus $expected
[ $rawphysd -lt 1 ] && echo || echo +
hus $rawphysd
))
echo 'Done!'
echo 'Free space:'
echo "Before: ${map[1]}"
echo "After : ${map[2]}"
echo "Expected space saved: ${map[3]}"
echo "Disk space difference: ${map[4]}${map[5]}"
echo "Found ${#hashset[@]} unique files, $copycount duplicates"
#read -rsn
exit 0
.TH dedupe 1 "23 Aug 2024" "0.4-24.08.24"
.SH NAME
dedupe \- group duplicate copies of files into hardlinks
.SH SYNOPSIS
.B dedupe
[\fBFILES\fR] ...
.SH USAGE
Provide a list of at least two files that contain duplicates of one another to group them into hardlinks.
To dedupe an entire folder of duplicate items without specifying every single file, add
.B **/*.ext
to the argument list to recursively match files under the current directory.
It is recommended to run this on one drive at a time, on folders such as
old system backups, small to medium sized archival data, or old downloads.
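.PP
For example, a hypothetical invocation that dedupes every ZIP archive found
under an old downloads folder (the path and extension here are purely
illustrative) might look like:
.PP
.EX
cd /cygdrive/d/old-downloads
dedupe **/*.zip
.EE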
.SH CONFIGURATION
Configuration is stored in the user's home directory at \fI~/.config/dk15/dedupe\fR.
A list of acceptable values to change (an example file follows the list):
.TP
.BR passes " = " \fIcount\fR
Set the number of passes per command. \fIMinimum\fR = \fB1\fR, \fImaximum\fR = \fB4\fR.
For passes 1 and 2, the utility plainly hardlinks the files provided.
For passes 3 and 4, which exclusively work on Windows, the files' individual
hardlinks found on the current storage device are merged into the first copy
of said duplicates to ensure that there are not two or more separate hardlink
groups of the exact same file.
If no files are processed during a pass, the utility exits early.
.TP
.\" actual suffering
.BR hash " = " \fIsum_prefix\fR
Set the preferred hashing function. Recommended functions are BLAKE2 (\fBb2\fR),
BLAKE3 (\fBb3\fR), or SHA256 (\fBsha256\fR). The name of the chosen function must match
the name of an existing executable that is suffixed with -sum, like the above examples.
.TP
.BR hash_length " = " \fIbyte_count\fR
BLAKE2 and BLAKE3 only. Set the preferred hash length in bytes, adjustable
when handling an unusually large number of files at once.
.TP
.BR force_relink " = " [ \fI0\fR | \fI1\fR ]
Force the hardlinking of duplicates which have already been hardlinked. Only applies to pass 1.
.TP
.BR sanity_check " = " [ \fI0\fR | \fI1\fR ]
Print every file that is processed for testing.
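.PP
As an illustration only, a config file that runs a single pass with a shorter
BLAKE3 digest could look like the following (example values, not the defaults):
.PP
.EX
passes = 1
hash = b3
hash_length = 16
.EE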
.SH AUTHOR
donnaken15 <wesley@gmx.it>
# dedupe config
#
# Each pass processes the entire file list given to the command.
# Pass 1, 2: Plainly hardlinks exact copies of input files.
# Pass 3, 4 (Windows only): Check input duplicate files that
# already share hardlinks, or just became hardlinks from the
# current execution, and merge them together with the source
# file. Separate hardlink groups of the same file are an
# issue whose full explanation is complex.
passes = 4
# Preferred hashing function, recommended: BLAKE2 or BLAKE3
# The name of the chosen function must match the name of an
# existing executable that is suffixed with -sum. As such:
# SHA256: sha256sum, hash = sha256
# BLAKE3: b3sum, hash = b3
hash = b2
# Preferred BLAKE hash length in bytes, adjustable in the
# case of handling an unusually large number of files at once.
hash_length = 20
# Force hardlinking of duplicates which are already hardlinked.
# Only applies to pass 1.
force_relink = 0
# Print every file that is processed.
sanity_check = 0
# Don't print errors about files that don't exist or are blank.
hide_invalid = 0

Example logs:
Deduping my xampp folder, finds multiple copies in a row
Encountering already deduped files
Three passes

repeatedly committing now because I'm testing on live ISO
