Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
# POSTED ONLINE: https://pastebin.com/VimRpgv1
# dupFinder.py
# SOURCE: https://www.pythoncentral.io/finding-duplicate-files-with-python/
# SOURCE: https://www.pythoncentral.io/hashing-files-with-python/
- import os
- import sys
- import hashlib
def findDups(folders):
    """Collect groups of files with identical content found under *folders*.

    Each existing folder is scanned with findDup() and the per-folder results
    are combined with mergeDups(). Hashes that end up with fewer than two
    distinct files are dropped, so only real duplicate groups remain.

    Reads the module-level ``args`` namespace created by main(): when --redact
    is given without any --target the search is skipped entirely, because
    every path would be removed from the output anyway.

    Args:
        folders: Iterable of directory paths to search.

    Returns:
        A dict with the layout {hash: [paths]}. It is empty when the search
        was cancelled or no duplicates were found.
    """
    dups = {}  # Layout: {hash: [names]}
    if args.redact and args.target is None:
        print('WARNING: Search cancelled. Using --redact without --target means all output should be redacted.')
        return dups
    for path in folders:
        if os.path.exists(path):
            print('Searching "%s"' % path)
            # Find the duplicated files and append them to the dups.
            mergeDups(dups, findDup(path))
        else:
            print('%s does not exist.' % path)
    # Iterate over a snapshot of the keys because entries are deleted below.
    for file_hash in list(dups):
        # Overlapping or repeated folders (e.g. "a" and "a/sub", or "a" given
        # twice) make the same file show up more than once. Collapse those
        # entries so a file is never reported as a duplicate of itself.
        seen = set()
        distinct_paths = []
        for file_path in dups[file_hash]:
            identity = os.path.normcase(os.path.abspath(file_path))
            if identity not in seen:
                seen.add(identity)
                distinct_paths.append(file_path)
        if len(distinct_paths) > 1:
            dups[file_hash][:] = distinct_paths
        else:
            # A single file is not a duplicate group.
            del dups[file_hash]
    return dups
def findDup(parentFolder):
    """Hash every file below *parentFolder* and group the paths by hash.

    The tree is traversed with os.walk() and each file is hashed with
    hashfile(). Files that cannot be read (for example broken symbolic links
    or files without read permission) are reported on stderr and skipped, so
    one bad entry does not abort the whole scan.

    Args:
        parentFolder: Directory to walk recursively.

    Returns:
        A dict with the layout {hash: [paths]}. Hashes with a single path are
        included; the caller decides what counts as a duplicate.
    """
    dups = {}  # Layout: {hash: [names]}
    for dirName, _subdirs, fileList in os.walk(parentFolder):
        for filename in fileList:
            # Get the path to the file.
            path = os.path.join(dirName, filename)
            try:
                file_hash = hashfile(path)
            except OSError as err:
                print('Skipping unreadable file "%s": %s' % (path, err), file=sys.stderr)
                continue
            # Add or append the file path.
            dups.setdefault(file_hash, []).append(path)
    return dups
def mergeDups(a, b):
    """Merge the {hash: [paths]} mapping *b* into *a* in place.

    Paths for a hash already present in *a* are appended to the existing
    list. Paths for a new hash are copied into a fresh list, so *a* never
    shares list objects with *b* and later merges cannot modify *b*.

    Args:
        a: Destination dict; modified in place.
        b: Source dict; left unchanged.

    Returns:
        None.
    """
    for key, paths in b.items():
        a.setdefault(key, []).extend(paths)
def hashfile(path, blocksize=65536):
    """Return the SHA-256 hex digest of the file at *path*.

    The file is read in chunks of *blocksize* bytes so large files are never
    loaded into memory at once. The handle is closed even if reading fails.

    Args:
        path: Path of the file to hash.
        blocksize: Number of bytes to read per chunk.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.sha256()
    with open(path, 'rb') as f:
        # read() returns b'' at end of file, which ends the loop.
        while chunk := f.read(blocksize):
            hasher.update(chunk)
    return hasher.hexdigest()
def filterUnique(dups):
    """Hide one path per duplicate group when --unique is active.

    Reads the module-level ``args`` namespace created by main(). The lists in
    *dups* are modified in place; nothing happens when *dups* is empty or
    --unique was not given.

    * Without --target, the first path of every group is removed.
    * With --target, groups of one path or fewer are left alone. Otherwise
      the first path that matches no target word is removed, or the first
      path when every path matches a target word.

    Args:
        dups: Dict with the layout {hash: [paths]}.

    Returns:
        None.
    """
    if not dups or not args.unique:
        return

    targets = args.target
    if targets is None:
        for paths in dups.values():
            # Remove first item.
            del paths[:1]
        return

    def is_target(path):
        # A path is a target when it contains any of the target words.
        return any(target in path for target in targets)

    for paths in dups.values():
        # If list is too small.
        if len(paths) <= 1:
            continue
        # Index of the first non-target path; falls back to 0 (the first
        # path) when every path is a target.
        drop_index = next(
            (index for index, path in enumerate(paths) if not is_target(path)),
            0,
        )
        del paths[drop_index]
def filterRedact(dups):
    """Remove paths that match no target word when --redact is active.

    Reads the module-level ``args`` namespace created by main(). *dups* and
    its lists are modified in place; nothing happens when *dups* is empty or
    --redact was not given. Without any --target every result is removed.
    Groups emptied here are left in the dict as empty lists.

    Args:
        dups: Dict with the layout {hash: [paths]}.

    Returns:
        None.
    """
    if not dups or not args.redact:
        return

    targets = args.target
    if targets is None:
        # Remove every result.
        dups.clear()
        return

    # Remove non-targets from every result.
    for paths in dups.values():
        paths[:] = [path for path in paths if any(target in path for target in targets)]
def filterClean(dups):
    """Delete every entry of *dups* whose path list is empty.

    Other filters may leave a hash with no paths; those entries are removed
    here, in place.

    Args:
        dups: Dict with the layout {hash: [paths]}.

    Returns:
        None.
    """
    # Collect the keys first; a dict must not change size while iterated.
    emptied = []
    for file_hash, paths in dups.items():
        if len(paths) == 0:
            emptied.append(file_hash)
    for file_hash in emptied:
        dups.pop(file_hash)
def printResults(dups):
    """Print the duplicate groups in *dups* to standard output.

    A blank line is printed first. If *dups* is empty a "no duplicates"
    message follows. Otherwise each group is printed after a blank line, one
    sorted path per line, with the path wrapped in double quotes.

    Formatting comes from the module-level ``args`` namespace created by
    main(): --showhash prefixes each line with the group's hash and a space,
    and --prepend / --append add text directly before / after the quoted path.

    Args:
        dups: Dict with the layout {hash: [paths]}.

    Returns:
        None.
    """
    print()
    if not dups:
        print('No duplicates found.')
        return

    print('The following files are identical:')
    for file_hash, paths in dups.items():
        print()
        for path in sorted(paths):
            pieces = []
            if args.showhash:
                pieces.append(file_hash)
                pieces.append(' ')
            # Empty prepend/append strings contribute nothing to the line.
            pieces.append(args.prepend)
            pieces.append('"%s"' % path)
            pieces.append(args.append)
            print(''.join(pieces))
def wait_for_any_keypress():
    """Pause until the user presses a key, where the platform supports it.

    On Windows this runs the shell's ``pause`` command. On Linux and macOS it
    prints a prompt, puts the terminal into raw mode, reads one character and
    restores the previous terminal settings. On any other platform, or when
    standard input is not a terminal (piped or redirected input), it returns
    immediately without waiting.

    Returns:
        None.
    """
    if sys.platform == 'win32':
        os.system('pause')
        return
    if not (sys.platform.startswith('linux') or sys.platform == 'darwin'):
        return
    # Terminal attributes can only be read from a real terminal; without one
    # there is nobody to press a key, so do not wait.
    if sys.stdin is None or not sys.stdin.isatty():
        return

    # These modules exist only on POSIX systems, so import them lazily.
    import termios
    import tty

    print('Press any key to continue . . .')
    stdin_file_desc = sys.stdin.fileno()
    old_stdin_tty_attr = termios.tcgetattr(stdin_file_desc)
    try:
        # Raw mode delivers a single keypress without waiting for Enter.
        tty.setraw(stdin_file_desc)
        sys.stdin.read(1)
    finally:
        # Always restore the terminal, even if the read is interrupted.
        termios.tcsetattr(stdin_file_desc, termios.TCSADRAIN, old_stdin_tty_attr)
def main():
    """Parse the command line, search for duplicates and print the results.

    The parsed arguments are stored in the module-level ``args`` namespace,
    which the search, filter and print functions read.

    If argparse exits (for --help or a usage error), the function waits for a
    keypress so the message stays visible, then re-raises SystemExit so the
    process keeps argparse's exit status.

    Raises:
        SystemExit: Propagated from argparse after the keypress wait.
    """
    global args
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-r', '--redact', action='store_true',
        help='only display paths which contain a "target" word')
    parser.add_argument(
        '-a', '--append', type=str, default='',
        help='append this text after every path')
    parser.add_argument(
        '-p', '--prepend', type=str, default='',
        help='prepend this text before every path')
    # NOTE(review): this help text says groups are only displayed when a path
    # matches a target, but the filters only drop non-matching paths when
    # --redact is also given. Confirm which behaviour is intended.
    parser.add_argument(
        '-t', '--target', action='append',
        help='only display duplicate groups if one of the paths contains a "target" word')
    parser.add_argument(
        '-u', '--unique', action='store_true',
        help='do not display one of the paths in a duplicate group so you can delete the duplicates')
    parser.add_argument(
        '-s', '--showhash', action='store_true',
        help='print the SHA-256 hash for each duplicate file')
    parser.add_argument(
        'folders', type=str, nargs='+',
        help='the directory paths to compare')

    try:
        args = parser.parse_args()
    except SystemExit:
        wait_for_any_keypress()
        # Re-raise so a usage error is not reported as success.
        raise

    # Print args.
    print(sys.argv)

    dups = findDups(args.folders)
    filterUnique(dups)
    filterRedact(dups)
    filterClean(dups)
    printResults(dups)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement