Skip to content

Instantly share code, notes, and snippets.

@yangmillstheory
Created August 25, 2018 01:13
Show Gist options
  • Save yangmillstheory/4a330e77119efac29a6458630d2cd133 to your computer and use it in GitHub Desktop.
Save yangmillstheory/4a330e77119efac29a6458630d2cd133 to your computer and use it in GitHub Desktop.
Concurrent directory traversal
import argparse
import hashlib
import os
import sys
import threading
from collections import defaultdict
from queue import Queue
# Unbounded work queue of paths still waiting to be visited by a worker.
rem = Queue()
# Caps the number of workers touching the filesystem at the same time.
sem = threading.Semaphore(100)

# MD5 hex digest -> set of (path, size in bytes) for every file hashed so far.
_file_info_lock = threading.Lock()
_file_info = defaultdict(set)

# Paths that some worker has already claimed, so each is processed once.
_seen_lock = threading.Lock()
_seen = set()
def worker(path):
    """Consume paths from the shared ``rem`` queue forever.

    Every queued entry is resolved against ``path``; absolute entries (which
    is what this module enqueues) are used unchanged. A directory has its
    non-hidden children enqueued; anything else is MD5-hashed and recorded in
    ``_file_info`` as ``digest -> {(path, size_in_bytes), ...}``.

    The function never returns, so it is meant to run in its own thread.
    ``rem.task_done()`` is called exactly once per dequeued entry, including
    duplicates and entries that fail with an ``OSError``, so ``rem.join()``
    in the caller can always complete.

    Args:
        path: Base directory used to resolve queued entries.
    """
    base = path
    while True:
        # Resolve against the fixed base rather than rebinding ``path``, so a
        # relative entry never ends up joined to the previously handled path.
        current = os.path.join(base, rem.get())
        try:
            with _seen_lock:
                if current in _seen:
                    # ``finally`` below still marks this entry as done.
                    continue
                _seen.add(current)

            with sem:
                if os.path.isdir(current):
                    for name in os.listdir(current):
                        if name.startswith('.'):
                            continue
                        rem.put_nowait(os.path.join(current, name))
                else:
                    hasher = hashlib.md5()
                    with open(current, 'rb') as f:
                        # Hash in 1 MiB chunks so large files are not loaded
                        # into memory in one piece.
                        for chunk in iter(lambda: f.read(1 << 20), b''):
                            hasher.update(chunk)
                    # Stat before taking the lock to keep the critical
                    # section as short as possible.
                    size = os.stat(current).st_size
                    with _file_info_lock:
                        _file_info[hasher.hexdigest()].add((current, size))
        except OSError as exc:
            # An unreadable or vanished entry must not kill the worker thread;
            # report it and move on to the next queued path.
            print('skipping %s: %s' % (current, exc),
                  file=sys.stderr, flush=True)
        finally:
            rem.task_done()
def main():
    """Parse the command line, start the worker threads and wait for the walk.

    ``-d`` names the directory to traverse (resolved against the current
    working directory) and ``-n`` the number of worker threads. Both are
    required, and ``-n`` must be at least 1 because with no workers the
    queue would never drain and ``rem.join()`` would block forever.
    """
    parser = argparse.ArgumentParser(
        description='Concurrent directory traversal'
    )
    parser.add_argument('-d', required=True, help='directory to traverse')
    parser.add_argument('-n', type=int, required=True,
                        help='number of worker threads (>= 1)')
    args = parser.parse_args()
    if args.n < 1:
        parser.error('-n must be at least 1')

    for _ in range(args.n):
        # Daemon threads let the interpreter exit normally once the queue has
        # drained, even though each worker loops forever.
        threading.Thread(
            target=worker, args=(args.d,), daemon=True
        ).start()
    rem.put_nowait(os.path.join(os.getcwd(), args.d))
    rem.join()
    # The collected digests are left in _file_info; nothing is printed.
    # print(_file_info)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment