|
from os.path import getsize, exists, isdir, join |
|
from os import walk, remove |
|
from sys import exit |
|
from hashlib import md5 |
|
from argparse import ArgumentParser |
|
|
|
# Extensions scanned when -e/--ext is not given, as a comma-separated string
# (the same format the command-line option accepts).
DEFAULT_EXTENSIONS = ",".join((
    "jpg", "jpeg", "nef", "raf", "cr2", "raw", "dng", "tif", "tiff",
))
|
|
|
|
|
def md5sum_with_size(file_path):
    """Return the MD5 hex digest and the size in bytes of a file.

    The file is opened in binary mode so the digest covers the exact bytes
    on disk (text mode may translate newlines, stop at Ctrl-Z on Windows, or
    fail to decode image data), and it is read in fixed-size chunks so large
    files are never loaded into memory at once.

    :param file_path: path of the file to hash.
    :return: tuple ``(hex_digest, size_in_bytes)``.
    :raises IOError/OSError: if the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024
    file_hash = md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel keeps calling f.read until it returns b"".
        for chunk in iter(lambda: f.read(chunk_size), b""):
            file_hash.update(chunk)
    return file_hash.hexdigest(), getsize(file_path)
|
|
|
|
|
def check_root_dir(root_dir):
    """Abort the program unless ``root_dir`` is an existing directory.

    :param root_dir: path that is expected to be a directory.
    :raises SystemExit: with exit code -1 when the path is missing or is not
        a directory; an explanatory message is written to stderr first.
    """
    # isdir() is already False for paths that do not exist, so a separate
    # exists() check is unnecessary.
    if not isdir(root_dir):
        import sys  # local import: the module only imports ``exit`` from sys

        # Diagnostics go to stderr so they do not mix with the report that
        # the script prints on stdout.
        sys.stderr.write("\"%s\" does not exist or is not a directory\n" % root_dir)
        exit(-1)
|
|
|
|
|
def find_duplicates(root_dir, is_to_be_checked):
    """Walk ``root_dir`` and group files that share the same MD5 digest.

    :param root_dir: directory to scan recursively; validated with
        ``check_root_dir`` (which exits the program if it is not a directory).
    :param is_to_be_checked: callable taking a file name and returning a
        truthy value when that file should be hashed.
    :return: tuple ``(duplicates, total_dirs, total_files, bytes_to_free)``
        where ``duplicates`` maps an MD5 hex digest to a list of paths whose
        first entry is the first file seen with that digest, ``total_dirs``
        is the number of directories visited, ``total_files`` the number of
        files hashed and ``bytes_to_free`` the summed size of every file
        found after the first one of its group.
    """
    check_root_dir(root_dir)

    files_hashes, duplicates = {}, {}
    total_files, total_dirs, bytes_to_free = 0, 0, 0

    for root, dirs, files in walk(root_dir):
        # Sorting in place makes walk() descend in a stable order, and the
        # files are sorted below, so which copy is reported as the
        # "original" no longer depends on the filesystem's listing order.
        dirs.sort()
        total_dirs += 1
        files_to_hash = sorted(f for f in files if is_to_be_checked(f))
        if files_to_hash:
            print("Checking %d files in %s" % (len(files_to_hash), root))

        for filename in files_to_hash:
            file_path = join(root, filename)
            hash_file, file_size = md5sum_with_size(file_path)
            if hash_file in files_hashes:
                bytes_to_free += file_size
                # The first duplicate of a digest creates the group, seeded
                # with the original path; later ones are appended to it.
                duplicates.setdefault(hash_file, [files_hashes[hash_file]]).append(file_path)
            else:
                files_hashes[hash_file] = file_path

            total_files += 1
            if total_files % 100 == 0:
                print("... already checked %d files, found %d duplicates" % (total_files, len(duplicates)))

    return duplicates, total_dirs, total_files, bytes_to_free
|
|
|
|
|
def process_duplicates(should_remove, extensions, duplicates, dirs_number, files_number,
                       bytes_to_save):
    """Print a report of the duplicates found and optionally delete them.

    :param should_remove: when truthy, every file listed after the first one
        of each group is deleted with ``os.remove``.
    :param extensions: comma-separated extensions that were scanned; only
        used in the summary line.
    :param duplicates: dict mapping an MD5 hex digest to a list of paths; the
        first path of each list is reported as the original and is never
        removed.
    :param dirs_number: number of directories scanned (for the summary).
    :param files_number: number of files checked (for the summary).
    :param bytes_to_save: number of bytes reported as taken by duplicates.
    :raises OSError: if removing a duplicate fails.
    """
    print("In %d directories, have checked %d files with extensions %s." %
          (dirs_number, files_number, extensions.replace(',', ', ')))

    if not duplicates:
        print("No duplicates found")
        return

    # Every group lists its original first, so a group of n paths holds
    # n - 1 duplicates; counting groups alone would under-report.
    duplicates_count = sum(len(group) - 1 for group in duplicates.values())
    print("Found %d duplicates, which are taking %d bytes of your disk space." %
          (duplicates_count, bytes_to_save))

    for file_hash, duplicated_files in duplicates.items():
        print("MD5 hash: %s" % file_hash)
        for index, a_file in enumerate(duplicated_files):
            if index == 0:
                print("Original : %s" % a_file)
            else:
                print("Duplicate: %s" % a_file)
                if should_remove:
                    remove(a_file)
        # Blank line separating one group of duplicates from the next.
        print("")
|
|
|
|
|
def get_file_checker(extensions):
    """Build a predicate telling whether a file name has a wanted extension.

    The extension list is normalised once, up front: surrounding whitespace
    and leading dots are stripped, case is folded to lower case, and empty
    entries (e.g. from a trailing comma) are dropped so they cannot match
    every name that merely ends with a dot.

    :param extensions: comma-separated extensions, e.g. ``"jpg,png"``.
    :return: function taking a file name and returning ``True`` when the
        name ends, case-insensitively, with ``.`` plus one of the extensions.
    """
    suffixes = []
    for raw_ext in extensions.split(","):
        ext = raw_ext.strip().lstrip('.').lower()
        if ext:
            suffixes.append('.' + ext)
    # str.endswith accepts a tuple of candidates and tests them in one call.
    suffixes = tuple(suffixes)

    def is_to_be_checked(a_file):
        return a_file.lower().endswith(suffixes)

    return is_to_be_checked
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, scan the directory tree
    # for duplicates and print the report (deleting duplicates on --remove).
    parser = ArgumentParser()
    parser.add_argument("root_dir", help="Root directory that will be searched for duplicates")
    parser.add_argument("--remove", action="store_true", help="Whether script should remove duplicates")
    extensions_help = "Comma-separated list of extensions of files to be searched. Default: " + DEFAULT_EXTENSIONS
    parser.add_argument("-e", "--ext", dest="extensions", default=DEFAULT_EXTENSIONS, help=extensions_help)
    options = parser.parse_args()

    file_checker = get_file_checker(options.extensions)
    # find_duplicates returns (duplicates, dirs, files, bytes), which are
    # exactly the trailing positional arguments of process_duplicates.
    scan_results = find_duplicates(options.root_dir, file_checker)
    process_duplicates(options.remove, options.extensions, *scan_results)