Skip to content

Instantly share code, notes, and snippets.

@cgcostume
Created January 31, 2020 19:53
Show Gist options
  • Save cgcostume/dc38c8120966ad5d21b6887c67416d86 to your computer and use it in GitHub Desktop.
Save cgcostume/dc38c8120966ad5d21b6887c67416d86 to your computer and use it in GitHub Desktop.
Create a data base with Directory Hashes
import argparse
import hashlib
import math
import numpy
import os
import sqlite3
import sys
import locale
locale.setlocale(locale.LC_ALL, '')
parser = argparse.ArgumentParser('This creates a new data base with directory hashes.')
parser.add_argument('source', help = 'path to SQL database containing hashes, file pathes, and file sizes', default = '.')
parser.add_argument('target', help = 'target SQL database file path to write hashes and file pathes to')
args = parser.parse_args()
source = os.path.abspath(args.source)
assert os.path.isfile(source)
target = os.path.abspath(args.target)
# assert not os.path.isfile(target)
# total = len(todo)
# progress = 0
# totalDecimals = math.ceil(math.log10(total))
# totalDecimals = totalDecimals + math.floor(totalDecimals / 3)
conn_in = sqlite3.connect(source)
result = conn_in.execute("""SELECT count(*) FROM hashes""")
total = int(result.fetchone()[0])
result = conn_in.execute("""SELECT file, size, hash FROM hashes""")
todo = {}
for file, size, hash in result.fetchall():
dirname = os.path.dirname(file)
basename = os.path.basename(file)
if dirname not in todo:
todo[dirname] = []
todo[dirname].append(basename)
todo[dirname].append(int(size))
todo[dirname].append(hash)
conn_out = sqlite3.connect(target)
conn_out.execute("""CREATE TABLE IF NOT EXISTS hashes (
hash TEXT,
path TEXT NOT NULL,
size INTEGER,
UNIQUE(hash, path)
)""")
conn_out.execute("""CREATE INDEX IF NOT EXISTS idx_hash ON hashes (hash)""")
# conn_out.execute("""DELETE FROM hashes""")
total = len(todo)
progress = 0
totalDecimals = math.ceil(math.log10(total))
totalDecimals = totalDecimals + math.floor(totalDecimals / 3)
hashes = {}
for path in todo:
progress += 1
print ('Processing file', f'{progress:n}'.rjust(totalDecimals), 'of', f'{total:n}', '...',
flush = True, end = ('\r' if progress < total else '\n'))
data = numpy.ravel(todo[path])
try:
size = int(data[1::3].astype(int).sum())
except Exception as e:
print('exception', e, data[1::3])
hasher = hashlib.sha256()
hasher.update(data)
hash = hasher.hexdigest()
conn_out.execute("""INSERT OR IGNORE INTO hashes VALUES (?, ?, ?)""", (hash, path, size))
conn_in.close()
conn_out.commit()
conn_out.close()
@cgcostume
Copy link
Author

Find duplicate files:

SELECT hashes.file, hashes.size, hashes.hash, duplicates.c FROM hashes INNER JOIN (SELECT hash, count(*) AS c  FROM hashes GROUP BY hash HAVING c > 1) AS duplicates ON duplicates.hash = hashes.hash

Find duplicate directories:

SELECT * FROM hashes INNER JOIN (SELECT hash, count(*) AS c  FROM hashes GROUP BY hash HAVING c > 1) AS duplicates ON duplicates.hash = hashes.hash ORDER BY c, hash, path

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment