from collections import defaultdict
import hashlib
import os


def find_duplicates_normal(path):
    """Group files under *path* by MD5 digest, reading fresh 2 MiB chunks."""
    CHUNKSIZE = 2 * 1024**2
    duplicates = defaultdict(list)
    for root, dirs, files in os.walk(path):
        for file in files:
            file = os.path.join(root, file)
            with open(file, 'rb') as fd:
                h = hashlib.md5()
                # iter() with a sentinel keeps calling fd.read() until it
                # returns b'' at end of file.
                for chunk in iter(lambda: fd.read(CHUNKSIZE), b''):
                    h.update(chunk)
            duplicates[h.hexdigest()].append(file)
    return [d for d in duplicates.values() if len(d) > 1]


def find_duplicates_bytearray(path):
    """Same grouping, but reuse one preallocated 1 MiB buffer via
    readinto() instead of allocating a new bytes object per chunk."""
    CHUNKSIZE = 1 * 1024**2
    chunk = bytearray(CHUNKSIZE)
    view = memoryview(chunk)
    duplicates = defaultdict(list)
    for root, dirs, files in os.walk(path):
        for file in files:
            file = os.path.join(root, file)
            with open(file, 'rb') as fd:
                h = hashlib.md5()
                while True:
                    size = fd.readinto(view)
                    if not size:  # readinto() returns 0 at end of file
                        break
                    h.update(view[:size])
            duplicates[h.hexdigest()].append(file)
    return [d for d in duplicates.values() if len(d) > 1]
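# A minimal driver sketch for comparing the two variants. The target
# directory '/tmp/photos' and the timing harness are illustrative
# assumptions, not part of the original paste.
import time

if __name__ == '__main__':
    target = '/tmp/photos'  # hypothetical path; substitute your own
    for fn in (find_duplicates_normal, find_duplicates_bytearray):
        start = time.perf_counter()
        groups = fn(target)
        elapsed = time.perf_counter() - start
        print(f'{fn.__name__}: {len(groups)} duplicate groups '
              f'in {elapsed:.2f}s')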