tazdij · February 9, 2015 03:27
diff --git a/dupscout.py b/dupscout.py
 # Written by Don Duvall
 # Licensed under MIT
 # Date 2015-02-08
 # Website: http://deduvall.com/ & http://donaldduvall.com
 # Email: don@deduvall.com
 #
 import os
 import hashlib

 class DupScout:
    """Find files with identical content and interactively delete the extras.

    Files are compared by the MD5 digest of their content. ``main`` scans
    the given directory trees, asks on the console which copy of every
    duplicate group to keep, deletes the others and records each decision
    in ``log.txt`` in the current working directory.
    """

    # Index by MD5 of File -> list of filepath
    fileTree = dict()

    # Index by Directory -> list of MD5 (declared, never filled by this class)
    dirIdx = dict()

    # Index by canonical Filepath -> MD5 of File; doubles as the record of
    # which files have already been scanned
    fileMD5 = dict()

    def __init__(self):
        # Give every instance its own indexes. The class-level dicts above
        # are mutable, so without this all instances would share (and keep
        # accumulating into) one set of results.
        self.fileTree = dict()
        self.dirIdx = dict()
        self.fileMD5 = dict()

    def generateHash(self, filename, blocksize=2**20):
        """Return the hexadecimal MD5 digest of the content of ``filename``.

        The file is read in chunks of ``blocksize`` bytes, so it never has
        to fit in memory as a whole.

        Raises IOError/OSError if the file cannot be opened or read.
        """
        m = hashlib.md5()
        with open(filename, "rb") as f:
            # iter() with a sentinel stops at the empty read that marks EOF
            for buf in iter(lambda: f.read(blocksize), b""):
                m.update(buf)
        return m.hexdigest()

    def main(self, paths):
        """Scan ``paths`` for duplicate files and resolve them interactively.

        paths -- iterable of directory paths; each one is walked recursively.

        For every group of files sharing a hash the user is asked which
        location to keep; all other locations are deleted. The groups and
        the decisions are written to ``log.txt``.
        """
        self._scanPaths(paths)
        self._resolveDuplicates('log.txt')
        print('Finished, log.txt contains the results.')

    def _scanPaths(self, paths):
        # Walk every search path and index each file found by content hash.
        for filepath in paths:
            for root, dirs, files in os.walk(filepath, topdown=False):
                print('Scanning: ' + root)
                count = len(files)
                print('# of files: ' + str(count))

                for i, name in enumerate(files, 1):
                    self._indexFile(os.path.join(root, name))

                    # Progress report every 20 files
                    if not (i % 20):
                        print('# ' + str(i) + ' of ' + str(count))

    def _indexFile(self, filename):
        # Hash one file and add it to fileTree unless it was already indexed.
        canonical = os.path.normcase(os.path.realpath(filename))
        if canonical in self.fileMD5:
            # The same file was reached again (overlapping search paths or a
            # symlink). Listing it twice would present it as a duplicate of
            # itself, and choosing one entry would delete the only copy.
            return

        try:
            fhash = self.generateHash(filename)
        except (IOError, OSError) as err:
            # e.g. broken symlink or permission denied: skip it and go on
            # instead of aborting the whole scan
            print('Skipping unreadable file: ' + filename + ' (' + str(err) + ')')
            return

        self.fileMD5[canonical] = fhash
        self.fileTree.setdefault(fhash, []).append(filename)

    def _resolveDuplicates(self, logname):
        # Log every duplicate group, ask which copy to keep, delete the rest.
        # The with-block closes the log even if a prompt or a delete fails.
        with open(logname, 'w+') as log:
            # Iterate over a snapshot because entries are updated below
            for fhash, locations in list(self.fileTree.items()):
                if len(locations) < 2:
                    continue

                log.write(fhash + '\t' + str(locations) + '\n')
                print('Found ' + str(len(locations)) + ' locations. Please select a location # to keep.')
                for number, location in enumerate(locations):
                    print(str(number) + '. ' + location)

                keep = self._promptForKeeper(len(locations))
                log.write('Keeping Location: ' + locations[keep] + '\n')

                # Delete all other locations
                for number, location in enumerate(locations):
                    if number != keep:
                        os.unlink(location)
                        log.write('Deleting Location: ' + location + '\n')

                # Only the kept copy still exists; forget the deleted ones so
                # a later run does not try to delete them again
                self.fileTree[fhash] = [locations[keep]]

    def _promptForKeeper(self, count):
        # Ask until the user enters a valid location number (0 .. count - 1).
        try:
            readLine = raw_input  # Python 2
        except NameError:
            readLine = input  # Python 3

        while True:
            answer = readLine("What location should be kept? ")
            try:
                choice = int(answer)
            except ValueError:
                choice = -1
            if 0 <= choice < count:
                return choice
            # Files are about to be deleted: never guess, ask again
            print('Please enter a number between 0 and ' + str(count - 1) + '.')
    
    
 if __name__ == '__main__':
    # Script entry point: ask for the search paths and run the scan.
    # raw_input exists only on Python 2; Python 3 calls it input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    dupscout = DupScout()
    paths = readLine('Enter the path to scan for duplicates: ')
    # Several paths may be entered separated by ';'. Surrounding whitespace
    # and empty entries (e.g. from a trailing ';') are dropped.
    paths = [entry.strip() for entry in paths.split(';') if entry.strip()]

    dupscout.main(paths)
	# Written by Don Duvall
	# Licensed under MIT
	# Date 2015-02-08
	# Website: http://deduvall.com/ & http://donaldduvall.com
	# Email: don@deduvall.com
	#
	import os
	import hashlib

	class DupScout:
	    """Find files with identical content and interactively delete the extras.

	    Files are compared by the MD5 digest of their content. ``main`` scans
	    the given directory trees, asks on the console which copy of every
	    duplicate group to keep, deletes the others and records each decision
	    in ``log.txt`` in the current working directory.
	    """

	    # Index by MD5 of File -> list of filepath
	    fileTree = dict()

	    # Index by Directory -> list of MD5 (declared, never filled by this class)
	    dirIdx = dict()

	    # Index by canonical Filepath -> MD5 of File; doubles as the record of
	    # which files have already been scanned
	    fileMD5 = dict()

	    def __init__(self):
	        # Give every instance its own indexes. The class-level dicts above
	        # are mutable, so without this all instances would share (and keep
	        # accumulating into) one set of results.
	        self.fileTree = dict()
	        self.dirIdx = dict()
	        self.fileMD5 = dict()

	    def generateHash(self, filename, blocksize=2**20):
	        """Return the hexadecimal MD5 digest of the content of ``filename``.

	        The file is read in chunks of ``blocksize`` bytes, so it never has
	        to fit in memory as a whole.

	        Raises IOError/OSError if the file cannot be opened or read.
	        """
	        m = hashlib.md5()
	        with open(filename, "rb") as f:
	            # iter() with a sentinel stops at the empty read that marks EOF
	            for buf in iter(lambda: f.read(blocksize), b""):
	                m.update(buf)
	        return m.hexdigest()

	    def main(self, paths):
	        """Scan ``paths`` for duplicate files and resolve them interactively.

	        paths -- iterable of directory paths; each one is walked recursively.

	        For every group of files sharing a hash the user is asked which
	        location to keep; all other locations are deleted. The groups and
	        the decisions are written to ``log.txt``.
	        """
	        self._scanPaths(paths)
	        self._resolveDuplicates('log.txt')
	        print('Finished, log.txt contains the results.')

	    def _scanPaths(self, paths):
	        # Walk every search path and index each file found by content hash.
	        for filepath in paths:
	            for root, dirs, files in os.walk(filepath, topdown=False):
	                print('Scanning: ' + root)
	                count = len(files)
	                print('# of files: ' + str(count))

	                for i, name in enumerate(files, 1):
	                    self._indexFile(os.path.join(root, name))

	                    # Progress report every 20 files
	                    if not (i % 20):
	                        print('# ' + str(i) + ' of ' + str(count))

	    def _indexFile(self, filename):
	        # Hash one file and add it to fileTree unless it was already indexed.
	        canonical = os.path.normcase(os.path.realpath(filename))
	        if canonical in self.fileMD5:
	            # The same file was reached again (overlapping search paths or a
	            # symlink). Listing it twice would present it as a duplicate of
	            # itself, and choosing one entry would delete the only copy.
	            return

	        try:
	            fhash = self.generateHash(filename)
	        except (IOError, OSError) as err:
	            # e.g. broken symlink or permission denied: skip it and go on
	            # instead of aborting the whole scan
	            print('Skipping unreadable file: ' + filename + ' (' + str(err) + ')')
	            return

	        self.fileMD5[canonical] = fhash
	        self.fileTree.setdefault(fhash, []).append(filename)

	    def _resolveDuplicates(self, logname):
	        # Log every duplicate group, ask which copy to keep, delete the rest.
	        # The with-block closes the log even if a prompt or a delete fails.
	        with open(logname, 'w+') as log:
	            # Iterate over a snapshot because entries are updated below
	            for fhash, locations in list(self.fileTree.items()):
	                if len(locations) < 2:
	                    continue

	                log.write(fhash + '\t' + str(locations) + '\n')
	                print('Found ' + str(len(locations)) + ' locations. Please select a location # to keep.')
	                for number, location in enumerate(locations):
	                    print(str(number) + '. ' + location)

	                keep = self._promptForKeeper(len(locations))
	                log.write('Keeping Location: ' + locations[keep] + '\n')

	                # Delete all other locations
	                for number, location in enumerate(locations):
	                    if number != keep:
	                        os.unlink(location)
	                        log.write('Deleting Location: ' + location + '\n')

	                # Only the kept copy still exists; forget the deleted ones so
	                # a later run does not try to delete them again
	                self.fileTree[fhash] = [locations[keep]]

	    def _promptForKeeper(self, count):
	        # Ask until the user enters a valid location number (0 .. count - 1).
	        try:
	            readLine = raw_input  # Python 2
	        except NameError:
	            readLine = input  # Python 3

	        while True:
	            answer = readLine("What location should be kept? ")
	            try:
	                choice = int(answer)
	            except ValueError:
	                choice = -1
	            if 0 <= choice < count:
	                return choice
	            # Files are about to be deleted: never guess, ask again
	            print('Please enter a number between 0 and ' + str(count - 1) + '.')


	if __name__ == '__main__':
	    # Script entry point: ask for the search paths and run the scan.
	    # raw_input exists only on Python 2; Python 3 calls it input.
	    try:
	        readLine = raw_input
	    except NameError:
	        readLine = input

	    dupscout = DupScout()
	    paths = readLine('Enter the path to scan for duplicates: ')
	    # Several paths may be entered separated by ';'. Surrounding whitespace
	    # and empty entries (e.g. from a trailing ';') are dropped.
	    paths = [entry.strip() for entry in paths.split(';') if entry.strip()]

	    dupscout.main(paths)