Skip to content

Instantly share code, notes, and snippets.

@diego021
Last active May 27, 2024 21:10
Show Gist options
  • Save diego021/f487214de18d235dd45d83150b50bb6c to your computer and use it in GitHub Desktop.
Save diego021/f487214de18d235dd45d83150b50bb6c to your computer and use it in GitHub Desktop.
Deduplicate files recursively in a given directory.
#!/usr/bin/env python3
"""Recursively scan a given directory and find duplicate files within it.
You can use this module as is by invoking it from the command shell:
~$ python3 deduplicate.py /path/to/scan
"""
import dataclasses
import hashlib
import os
@dataclasses.dataclass(frozen=True)
class FileChecksum:
    """Immutable record pairing a checksum string with a file path.

    Only ``md5checksum`` takes part in equality and hashing, so two
    instances with the same checksum but different paths compare equal
    and occupy a single slot in a set.
    """

    # Checksum string used as the identity of the record.
    md5checksum: str
    # Location of the file; ignored by ``__eq__`` and ``__hash__``.
    path: str

    def __eq__(self, other) -> bool:
        """Return True when *other* is a FileChecksum with the same checksum."""
        return (
            isinstance(other, FileChecksum)
            and self.md5checksum == other.md5checksum
        )

    def __hash__(self):
        """Hash on the checksum alone, consistent with ``__eq__``."""
        return hash(self.md5checksum)
@dataclasses.dataclass
class ScanResult:
    """Mutable accumulator for the outcome of a directory scan.

    Every field defaults to a fresh, empty container, so instances never
    share state with each other.
    """

    # Paths recorded as visited.
    visited: list[str] = dataclasses.field(default_factory=list)
    # Distinct checksum records (set membership is decided by checksum).
    unique: set[FileChecksum] = dataclasses.field(default_factory=set)
    # Records whose checksum was already present in ``unique``.
    duplicated: list[FileChecksum] = dataclasses.field(default_factory=list)

    def __str__(self):
        """Return a one-line summary with the size of each collection."""
        summary = 'Total visited: {}, Unique: {}, Duplicated: {}'
        return summary.format(
            len(self.visited),
            len(self.unique),
            len(self.duplicated),
        )
def _compute_hash(path: str) -> str:
bdata = open(path, 'rb').read()
hash_ = hashlib.md5(bdata)
return hash_.hexdigest()
def scan_path(path: str) -> ScanResult:
    """Walk *path* recursively and classify its files by checksum.

    The first file seen with a given checksum is stored in
    ``result.unique``; every later file with the same checksum is
    appended to ``result.duplicated``. Each processed file path is also
    recorded in ``result.visited``.

    Symbolic links are skipped. ``os.walk`` lists links to files among
    the regular files, so hashing them would report a link and its
    target as duplicates of each other (deleting the target would leave
    a dangling link), and a broken link would make the read fail.

    Raises:
        OSError: if a file cannot be read while computing its checksum.
    """
    result = ScanResult()
    for root, _dirs, files in os.walk(path):
        for name in files:
            abspath = os.path.join(root, name)
            if os.path.islink(abspath):
                continue
            entry = FileChecksum(
                md5checksum=_compute_hash(path=abspath),
                path=abspath,
            )
            if entry in result.unique:
                result.duplicated.append(entry)
            else:
                result.unique.add(entry)
            result.visited.append(entry.path)
    # Internal invariant: every visited file is either unique or a duplicate.
    assert len(result.visited) == len(result.duplicated) + len(result.unique)
    return result
def remove_duplicates(result: ScanResult) -> None:
    """Interactively delete the files listed in ``result.duplicated``.

    For each duplicate the user is shown its path (red) and the path of
    the matching entry in ``result.unique`` (green) and asked to confirm.
    Answering ``y``/``yes`` removes that file, ``all`` removes it and
    every remaining duplicate without further prompts, and anything else
    keeps the file. Answers are case-insensitive and surrounding
    whitespace is ignored.

    Raises:
        KeyError: if a duplicate has no matching entry in ``result.unique``.
        OSError: if a file cannot be removed.
    """
    # Build the lookup once instead of scanning the whole set per prompt.
    originals = {entry: entry for entry in result.unique}
    remove_all = False
    for dup in result.duplicated:
        if not remove_all:
            kept = originals[dup]
            dpath = f'\033[1;31m{dup.path}\033[0m'
            upath = f'\033[1;32m{kept.path}\033[0m'
            answer = input(
                f'[y/N/all] Remove {dpath} with unique reference on {upath}? '
            ).strip().upper()
            remove_all = answer == 'ALL'
            if answer not in ('Y', 'YES', 'ALL'):
                continue
        os.remove(dup.path)
if __name__ == '__main__':
    # Command-line entry point: scan the directory given as the first
    # argument, print a summary, then interactively offer to delete the
    # duplicates that were found.
    import sys

    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} /path/to/scan')

    # Trailing slashes are dropped for a tidier display, but an argument
    # made only of slashes (the filesystem root) must not collapse to ''.
    path = sys.argv[1].rstrip('/') or sys.argv[1]
    if not os.path.isdir(path):
        # os.walk yields nothing for a missing path, which would otherwise
        # look like a successful scan of an empty directory.
        sys.exit(f'error: {path!r} is not a directory')

    print(f'Scaning duplicates in path \033[1m{path}\033[0m ...')
    print()
    scan_result = scan_path(path)
    print(scan_result)
    print()
    remove_duplicates(scan_result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment