FocusedWolf

Python: Duplicate File Finder

Apr 16th, 2022 (edited)
#!/usr/bin/env python3

# POSTED ONLINE: https://pastebin.com/VimRpgv1

# dupFinder.py
# SOURCE: https://www.pythoncentral.io/finding-duplicate-files-with-python/
# SOURCE: https://www.pythoncentral.io/hashing-files-with-python/

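# USAGE (illustrative examples only; the folder paths are hypothetical and the
# flags are the ones defined in main() below):
#
#   python3 dupFinder.py ~/Downloads ~/Documents
#       List every group of identical files found under both folders.
#
#   python3 dupFinder.py --unique ~/Downloads ~/Documents
#       Hide one path per group so the remaining listed copies can be deleted.
#
#   python3 dupFinder.py --unique --target Downloads ~/Downloads ~/Documents
#       As above, but prefer to hide a non-"Downloads" path, so the copies
#       under Downloads are the ones listed for deletion.
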
import os
import sys
import hashlib

def findDups(folders):
    dups = {} # This will be a dictionary of lists with this layout: {hash:[names]}

    if args.redact and args.target is None:
        print('WARNING: Search cancelled. Using --redact without --target would redact all output.')
        return dups

    for path in folders:
        if os.path.exists(path):
            print('Searching "%s"' % path)

            # Find the duplicated files and merge them into dups.
            mergeDups(dups, findDup(path))

        else:
            print('%s does not exist.' % path)

    # Delete every dictionary key that maps to a single file (i.e. no duplicates).
    for key in [key for key, value in dups.items() if len(value) == 1]:
        del dups[key]

    return dups

def findDup(parentFolder):
    dups = {} # This will be a dictionary of lists with this layout: {hash:[names]}

    for dirName, subdirs, fileList in os.walk(parentFolder):
        #  print('Searching %s' % dirName)

        for filename in fileList:
            # Get the path to the file.
            path = os.path.join(dirName, filename)
            # Calculate the hash.
            file_hash = hashfile(path)

            # Add or append the file path.
            if file_hash in dups:
                dups[file_hash].append(path)
            else:
                dups[file_hash] = [path]

    return dups

def mergeDups(a, b):
    for key in b.keys():
        if key in a:
            # Merge lists.
            a[key] += b[key]
        else:
            # Add list as a new entry.
            a[key] = b[key]

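# Worked example of the merge (the hashes and paths are hypothetical): given
#   a = {'h1': ['/old/a.txt']}
#   b = {'h1': ['/new/a.txt'], 'h2': ['/new/b.txt']}
# mergeDups(a, b) leaves a as
#   {'h1': ['/old/a.txt', '/new/a.txt'], 'h2': ['/new/b.txt']}
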
def hashfile(path, blocksize=65536):
    # Hash the file in fixed-size chunks so large files never have to fit in memory.
    hasher = hashlib.sha256()
    with open(path, 'rb') as f:
        buf = f.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = f.read(blocksize)
    return hasher.hexdigest()

def filterUnique(dups):
    if len(dups) == 0:
        return

    if not args.unique:
        return

    # If no targets were given, hide the first path in every group.
    if args.target is None:
        for key, value in dups.items():
            value[:] = value[1:]
        return

    for key, value in dups.items():
        # Skip groups that are already too small.
        if len(value) <= 1:
            continue

        # If every path in the group contains a target word, hide the first path.
        if all(any(target in path for target in args.target) for path in value):
            value[:] = value[1:]
            continue

        # Otherwise hide the first non-target path: the walrus expression counts
        # non-target paths, and only the first one (total == 1) is dropped.
        total = 0
        value[:] = [path for path in value if any(target in path for target in args.target) or (total := total + 1) > 1]

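# Worked example of the non-target filter above (the paths are hypothetical):
# with args.target = ['keep'] and value = ['/tmp/x', '/keep/x', '/srv/x'],
# '/tmp/x' is the first non-target path (total becomes 1, and 1 > 1 is False),
# so it is hidden; '/keep/x' matches a target and '/srv/x' is the second
# non-target (total becomes 2), so both are kept.
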
def filterRedact(dups):
    if len(dups) == 0:
        return

    if not args.redact:
        return

    if args.target is None:
        # Remove every result.
        dups.clear()

    else:
        # Remove non-targets from every result.
        for key, value in dups.items():
            value[:] = [path for path in value if any(target in path for target in args.target)]

def filterClean(dups):
    if len(dups) == 0:
        return

    # Delete every dictionary key whose list was emptied by the other filters.
    for key in [key for key, value in dups.items() if len(value) == 0]:
        del dups[key]

def printResults(dups):
    print()

    if len(dups) == 0:
        print('No duplicates found.')
        return

    print('The following files are identical:')

    for key, value in dups.items():
        print()
        for path in sorted(value):
            output = ''

            if args.showhash:
                output += key
                output += ' '

            if args.prepend != '':
                output += args.prepend

            output += '"%s"' % path

            if args.append != '':
                output += args.append

            print(output)

def wait_for_any_keypress():
    import sys
    if sys.platform == 'win32':
        import os
        os.system('pause')
    elif sys.platform.startswith('linux') or sys.platform == 'darwin':
        print('Press any key to continue . . .')
        import termios
        import tty
        stdin_file_desc = sys.stdin.fileno()
        old_stdin_tty_attr = termios.tcgetattr(stdin_file_desc)
        try:
            # Switch the terminal to raw mode so a single keypress is read
            # without waiting for Enter, then restore the old settings.
            tty.setraw(stdin_file_desc)
            sys.stdin.read(1)
        finally:
            termios.tcsetattr(stdin_file_desc, termios.TCSADRAIN, old_stdin_tty_attr)

def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--redact', action='store_true', help='only display paths which contain a "target" word', required=False)
    parser.add_argument('-a', '--append', help='append this text after every path', type=str, default='', required=False)
    parser.add_argument('-p', '--prepend', help='prepend this text before every path', type=str, default='', required=False)
    parser.add_argument('-t', '--target', action='append', help='only display duplicate groups if one of the paths contains a "target" word', required=False)
    parser.add_argument('-u', '--unique', action='store_true', help='do not display one of the paths in a duplicate group so you can delete the duplicates', required=False)
    parser.add_argument('-s', '--showhash', action='store_true', help='print the SHA-256 hash for each duplicate file', required=False)
    parser.add_argument('folders', help='the directory paths to compare', type=str, nargs='+')

    try:
        # args is shared as a module-level global with the filter functions above.
        global args
        args = parser.parse_args()
    except SystemExit:
        wait_for_any_keypress()
        return

    # Print args.
    import sys
    print(sys.argv)

    dups = findDups(args.folders)
    filterUnique(dups)
    filterRedact(dups)
    filterClean(dups)
    printResults(dups)

if __name__ == '__main__':
    main()