Skip to content

Instantly share code, notes, and snippets.

@qxcv
Created October 1, 2022 02:45
Show Gist options
  • Save qxcv/052d652c47c33afd767d01175843fb58 to your computer and use it in GitHub Desktop.
Save qxcv/052d652c47c33afd767d01175843fb58 to your computer and use it in GitHub Desktop.
Find and delete large groups of files that are identical except for a single number (preserving the largest-numbered entry)
import collections
import re
import os
# Matches every maximal run of ASCII decimal digits in a string.
number_re = re.compile(r"([0-9]+)")


def number_split_permutations(filename):
    """Yield ``((prefix, suffix), number)`` for each digit run in `filename`.

    Every run of digits found by `number_re` is treated in turn as "the"
    number of the filename: `prefix` is the text before the run, `suffix` is
    the text after it, and `number` is the run parsed as a base-10 integer
    (so leading zeros are dropped). A filename without digits yields nothing.
    """
    for digits in number_re.finditer(filename):
        before = filename[:digits.start()]
        after = filename[digits.end():]
        value = int(digits.group(), base=10)
        yield (before, after), value
def build_number_table(filenames):
    """Group filenames that differ only in one embedded number.

    Returns a `collections.defaultdict` mapping each ``(prefix, suffix)`` key
    produced by `number_split_permutations` to the set of
    ``(number, filename)`` pairs that share that key. A filename containing
    several digit runs appears under several keys, once per run.
    """
    groups = collections.defaultdict(set)
    # Flatten the (filename x digit-run) pairs first, then bucket them by key.
    entries = (
        (template, (value, name))
        for name in filenames
        for template, value in number_split_permutations(name)
    )
    for template, member in entries:
        groups[template].add(member)
    return groups
def delete_files(directory, filenames, min_files, dry_run=True):
    """Prune numbered file groups among `filenames` inside `directory`.

    `filenames` are bucketed with `build_number_table`. Buckets holding fewer
    than `min_files` entries are ignored. In every remaining bucket the
    entries are sorted in descending ``(number, filename)`` order; the first
    one is reported as kept and the rest are scheduled for deletion. One
    summary line is printed per processed bucket.

    The deletion schedule is the union over all buckets, so a file reported
    as kept by one bucket is still removed if another bucket schedules it.

    When `dry_run` is true nothing is removed; otherwise every scheduled file
    is unlinked from `directory`, in sorted name order.
    """
    # The wording depends only on the mode, so compute it once up front.
    verb = "Would delete" if dry_run else "Will delete"
    scheduled = set()
    for key, members in build_number_table(filenames).items():
        if len(members) < min_files:
            continue
        # Highest number first: the head survives, the tail is removed.
        ranked = [name for _, name in sorted(members, reverse=True)]
        to_keep, *delete_list = ranked
        print(f"{verb} '{'*'.join(key)}' from '{directory}', except {to_keep} (deletes {len(delete_list)} files)")
        scheduled.update(delete_list)
    if dry_run:
        return
    for filename in sorted(scheduled):
        os.unlink(os.path.join(directory, filename))
def delete_files_recursive(root_directory, *, min_files=11, dry_run=True):
    """Recursively clean directories holding many files named `<prefix><number><suffix>`.

    Every directory visited by `os.walk(root_directory)` (the root included)
    is handled independently: its files are grouped so that the members of a
    group are identical except for a single number in the filename. For each
    group with at least `min_files` members, all files except the one with
    the highest number are deleted. This is useful for cleaning up numbered
    training snapshots in ML code.

    By default, `dry_run=True` prevents files from actually being deleted and
    only prints what would happen; pass `dry_run=False` to delete files.
    """
    for current_dir, _subdirs, names in os.walk(root_directory):
        delete_files(current_dir, names, min_files, dry_run=dry_run)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment