Skip to content

Instantly share code, notes, and snippets.

@yngwie74
Created February 27, 2013 10:39
Show Gist options
  • Save yngwie74/5047007 to your computer and use it in GitHub Desktop.
Save yngwie74/5047007 to your computer and use it in GitHub Desktop.
Find duplicate files within a file system sub-tree. It demonstrates a simple way to combine Functional and Object-Oriented Programming techniques -- no religious wars! -- using generator expressions, comprehensions and decorators, among others.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from os import path
from itertools import imap, chain
import fnmatch
# Root of the directory tree that is searched for duplicates.
# Set the base path to scan for duplicates here.
base = r'/media/file-rep/files'
def must_exist(f):
    '''Decorate a method so that it refuses to run on a missing directory.

    The decorated callable must be a method of an object that exposes an
    ``exists`` attribute (tested for truthiness) and a ``path`` attribute
    (interpolated into the error message).

    Returns a wrapper that raises ValueError at call time when
    ``self.exists`` is false and otherwise returns whatever ``f`` returns.
    '''
    # Imported locally so the decorator does not depend on the file's
    # top-level import block.
    from functools import wraps

    @wraps(f)  # keep f's __name__ and __doc__ visible on the wrapper
    def check(self, *args, **kwds):
        if not self.exists:
            raise ValueError('Path to directory %r does not exist.' % self.path)
        return f(self, *args, **kwds)

    return check
class Dir(object):
    '''Encapsulates directory and sub-tree scanning logic.

    A ``Dir`` wraps the absolute form of the path it was built from. The
    listing methods are guarded by ``must_exist``, so they raise ValueError
    when the path does not exist at the time of the call.
    '''

    def __init__(self, dirpath):
        '''Store the absolute form of ``dirpath``; nothing is checked here.'''
        self.path = path.abspath(dirpath)

    @property
    def exists(self):
        '''True when ``self.path`` currently exists.'''
        return path.exists(self.path)

    def _get_full_path_of(self, f):
        '''Return the case-normalised path of entry ``f`` in this directory.'''
        return path.normcase(path.join(self.path, f))

    def _get_entries(self, of_type, filter):
        '''Return an iterator over the names of the matching entries.

        ``of_type`` is a predicate called with each entry's full path (for
        example ``path.isdir``). ``filter`` is an fnmatch pattern applied to
        the bare entry name; a falsy value disables pattern filtering.

        The directory listing is read as soon as this method is called; the
        predicate and the pattern are applied lazily while iterating.
        '''
        # ``names`` rather than ``iter`` so the builtin is not shadowed.
        names = (f for f in os.listdir(self.path)
                 if of_type(self._get_full_path_of(f)))
        if filter:
            names = (f for f in names if fnmatch.fnmatch(f, filter))
        return names

    @must_exist
    def get_sub_dirs(self, filter=None):
        '''Return an iterator of ``Dir`` objects, one per sub-directory.

        ``filter`` is an optional fnmatch pattern for the directory names.
        '''
        names = self._get_entries(path.isdir, filter)
        return (Dir(self._get_full_path_of(f)) for f in names)

    @must_exist
    def get_files(self, filter=None):
        '''Return an iterator of full paths, one per regular file.

        ``filter`` is an optional fnmatch pattern for the file names.
        '''
        names = self._get_entries(path.isfile, filter)
        # A generator expression (as in get_sub_dirs) instead of the
        # Python-2-only itertools.imap; the result is still lazy.
        return (self._get_full_path_of(f) for f in names)

    def __str__(self):
        return self.path

    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.path)
#~ end class Dir
def find_depth(top, file_spec):
    '''Iterate over the files at or below ``top`` whose names match ``file_spec``.

    ``file_spec`` is handed to ``Dir.get_files`` as its name filter. Files
    located directly in ``top`` are produced first, then the results for each
    sub-directory, which are obtained by recursing lazily as the iterator is
    consumed.
    '''
    root = Dir(top)
    here = root.get_files(file_spec)
    # One recursive search per child directory, flattened into one stream.
    below = chain.from_iterable(
        find_depth(child.path, file_spec) for child in root.get_sub_dirs())
    return chain(here, below)
def find_duplicates_of(file_path):
    '''Return the sorted paths under ``base`` that share ``file_path``'s name.

    The search covers the whole tree rooted at the module-level ``base``
    directory. ``file_path`` itself is left out of the result when it lies
    inside that tree.
    '''
    file_name = path.basename(file_path)
    # The name is matched with fnmatch further down, so neutralise the
    # pattern metacharacters; otherwise a name such as 'a[1].txt' would not
    # match itself.
    literal_spec = ''.join('[%s]' % c if c in '*?[' else c for c in file_name)
    # find_depth produces absolute, case-normalised paths; normalise the
    # input the same way so the self-comparison is like-for-like.
    own_path = path.normcase(path.abspath(file_path))
    return [r for r in sorted(find_depth(base, literal_spec)) if r != own_path]
def print_duplicates_of(file_name, duplicates):
    '''Write ``file_name`` and its duplicates to standard error.

    The report starts with a line break, then ``file_name`` on its own line,
    followed by each entry of ``duplicates`` on a tab-indented line.
    ``duplicates`` may be any iterable of strings.
    '''
    # Local import: the module itself only imports sys under the __main__
    # guard, so the global is missing when this function is used from an
    # importing module.
    import sys
    # '\n\t' (not '\r\t') so every entry gets its own line instead of
    # overwriting the previous one on the terminal.
    file_list = '\n\t'.join([file_name] + list(duplicates))
    sys.stderr.write('\n%s\n' % file_list)
if __name__ == '__main__':
    # Script entry point: for every file in the current directory whose name
    # matches '_*.*', look for same-named files under ``base`` and report them.
    # Imported at module scope so ``sys`` is a module global for script runs.
    import sys

    for cur_file in Dir('.').get_files('_*.*'):
        # One dot per examined file as a progress indicator; flush so the
        # dot shows up immediately instead of sitting in the output buffer.
        sys.stdout.write('.')
        sys.stdout.flush()
        duplicates = find_duplicates_of(cur_file)
        if duplicates:
            print_duplicates_of(cur_file, duplicates)
@yngwie74
Copy link
Author

I created this little script to aid the maintenance of a media file collection. Files were ordered in directories by category and each directory had an MD5SUMS digest file, so checking for true duplicates vs. name clashes was trivial. However, I decided to leave the digest-related code out of the gist to keep things simple and focused.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment