Skip to content

Instantly share code, notes, and snippets.

@spuklo
Last active November 18, 2019 08:56
Show Gist options
  • Save spuklo/f707666807d95014de51 to your computer and use it in GitHub Desktop.
Save spuklo/f707666807d95014de51 to your computer and use it in GitHub Desktop.
Script that walks through directories and looks for duplicated files.

Script that walks through directories and looks for duplicated files. By default, this script looks for image files such as .jpg, .nef and .raf (the full default extension list is shown in the usage below). The first file found is considered the original, and all other files with a matching MD5 hash are considered duplicates. Script usage:

usage: duplicates.py [-h] [--remove] [-e EXTENSIONS] root_dir

positional arguments:
root_dir              Root directory that will be searched for duplicates

optional arguments:
-h, --help            show this help message and exit
--remove              Whether script should remove duplicates
-e EXTENSIONS, --ext EXTENSIONS
                    Comma-separated list of extensions of files to be
                    searched. Default:
                    jpg,jpeg,nef,raf,cr2,raw,dng,tif,tiff
from os.path import getsize, exists, isdir, join
from os import walk, remove
from sys import exit
from hashlib import md5
from argparse import ArgumentParser
# Comma-separated list of file extensions (without the leading dot) that are
# scanned when the user does not pass -e/--ext on the command line.
DEFAULT_EXTENSIONS = "jpg,jpeg,nef,raf,cr2,raw,dng,tif,tiff"
def md5sum_with_size(file_path, chunk_size=1024 * 1024):
    """Return the MD5 hex digest and the size in bytes of a file.

    The file is opened in binary mode so the digest covers the exact bytes on
    disk (text mode would decode the data or translate line endings), and it
    is hashed in chunks so large image files are never held in memory whole.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration.

    Returns:
        A ``(hex_digest, size_in_bytes)`` tuple.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    file_hash = md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            file_hash.update(chunk)
    return file_hash.hexdigest(), getsize(file_path)
def check_root_dir(root_dir):
    """Terminate the script unless ``root_dir`` is an existing directory.

    Args:
        root_dir: Path supplied by the user as the root of the search.

    Raises:
        SystemExit: With exit code -1 when the path does not exist or is not
            a directory; a diagnostic is written to standard error first.
    """
    # isdir() is already False for a missing path, so no separate exists() test.
    if isdir(root_dir):
        return
    # Imported locally: the module only imports ``exit`` from sys.
    from sys import stderr
    stderr.write("\"%s\" does not exist or is not a directory\n" % root_dir)
    exit(-1)
def find_duplicates(root_dir, is_to_be_checked):
    """Walk ``root_dir`` and group files that share the same MD5 digest.

    The first file seen with a given digest is treated as the original; every
    later file with the same digest is recorded as a duplicate of it.

    Args:
        root_dir: Directory to scan recursively. It is validated with
            ``check_root_dir`` before the walk starts.
        is_to_be_checked: Callable that receives a bare file name and returns
            a truthy value when that file should be hashed.

    Returns:
        A 4-tuple ``(duplicates, total_dirs, total_files, bytes_to_free)``:
        ``duplicates`` maps an MD5 hex digest to a list holding the original
        path followed by the paths of its duplicates; ``total_dirs`` is the
        number of directories visited; ``total_files`` is the number of files
        hashed; ``bytes_to_free`` is the combined size of all duplicates.
    """
    check_root_dir(root_dir)
    first_seen, duplicates = {}, {}
    total_files, total_dirs, bytes_to_free = 0, 0, 0
    duplicate_files = 0
    for root, _dirs, files in walk(root_dir):
        total_dirs += 1
        files_to_hash = [name for name in files if is_to_be_checked(name)]
        if files_to_hash:
            print("Checking %d files in %s" % (len(files_to_hash), root))
        for filename in files_to_hash:
            file_path = join(root, filename)
            try:
                file_hash, file_size = md5sum_with_size(file_path)
            except (IOError, OSError) as error:
                # One unreadable file should not abort the whole scan.
                print("Skipping unreadable file %s: %s" % (file_path, error))
                continue
            if file_hash in first_seen:
                bytes_to_free += file_size
                duplicate_files += 1
                # A new group starts with the original so it is listed first.
                duplicates.setdefault(file_hash, [first_seen[file_hash]]).append(file_path)
            else:
                first_seen[file_hash] = file_path
            total_files += 1
            if total_files % 100 == 0:
                print("... already checked %d files, found %d duplicates" % (total_files, duplicate_files))
    return duplicates, total_dirs, total_files, bytes_to_free
def process_duplicates(should_remove, extensions, duplicates, dirs_number, files_number, bytes_to_save):
    """Print a report of the duplicates found and optionally delete them.

    Args:
        should_remove: When truthy, every duplicate (never the original) is
            deleted from disk.
        extensions: Comma-separated extension list, used only in the report.
        duplicates: Mapping of MD5 hex digest to a list of paths whose first
            element is the original and whose remaining elements are its
            duplicates.
        dirs_number: Number of directories that were scanned.
        files_number: Number of files that were hashed.
        bytes_to_save: Combined size in bytes of all duplicate files.
    """
    print("In %d directories, have checked %d files with extensions %s." %
          (dirs_number, files_number, extensions.replace(',', ', ')))
    if not duplicates:
        print("No duplicates found")
        return

    # Count duplicate files, not hash groups, so the figure matches the byte
    # total, which is accumulated per duplicate file.
    duplicate_count = sum(max(len(paths) - 1, 0) for paths in duplicates.values())
    print("Found %d duplicates, which are taking %d bytes of your disk space." % (duplicate_count, bytes_to_save))
    for file_hash, duplicated_files in duplicates.items():
        print("MD5 hash: %s" % file_hash)
        for index, a_file in enumerate(duplicated_files):
            if index == 0:
                print("Original : %s" % a_file)
                continue
            print("Duplicate: %s" % a_file)
            if should_remove:
                try:
                    remove(a_file)
                except OSError as error:
                    # Report and carry on so one failure does not stop the rest.
                    print("Could not remove %s: %s" % (a_file, error))
        print("")
def get_file_checker(extensions):
    """Build a predicate that tells whether a file name has a wanted extension.

    Args:
        extensions: Comma-separated list of extensions, e.g. ``"jpg,nef"``.
            Matching is case-insensitive. Whitespace around an entry and a
            leading dot are ignored, and empty entries (such as the one left
            by a trailing comma) are dropped.

    Returns:
        A function taking a file name and returning ``True`` when the name
        ends with ``"." + extension`` for any of the given extensions.
    """
    # Normalise once here instead of on every call of the predicate.
    cleaned = (ext.strip().lstrip('.').lower() for ext in extensions.split(","))
    suffixes = tuple('.' + ext for ext in cleaned if ext)

    def is_to_be_checked(a_file):
        # str.endswith accepts a tuple; an empty tuple never matches.
        return a_file.lower().endswith(suffixes)

    return is_to_be_checked
if __name__ == "__main__":
    # Command-line interface: one positional root directory, an opt-in
    # removal switch and an optional override of the extensions to scan.
    cli = ArgumentParser()
    cli.add_argument(
        "root_dir",
        help="Root directory that will be searched for duplicates"
    )
    cli.add_argument(
        "--remove",
        action="store_true",
        help="Whether script should remove duplicates"
    )
    extensions_help = "Comma-separated list of extensions of files to be searched. Default: " + DEFAULT_EXTENSIONS
    cli.add_argument("-e", "--ext", dest="extensions", default=DEFAULT_EXTENSIONS, help=extensions_help)
    options = cli.parse_args()

    # Scan first, then report on (and optionally delete) what was found.
    file_checker = get_file_checker(options.extensions)
    scan_result = find_duplicates(options.root_dir, file_checker)
    process_duplicates(options.remove, options.extensions, *scan_result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment