Advertisement
stream13

UniqueFilter

Apr 29th, 2015
373
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.45 KB | None | 0 0
  1. __author__ = 'Andrii Shelestov <streamx3@gmail.com>'
  2. # http://pastebin.com/FBP6zSKC
  3.  
  4. import sys
  5. import os
  6. import shutil
  7. import ntpath
  8. from hashlib import md5
  9.  
  10. #############
  11. # Functions #
  12. ################################################################################
  13.  
  14.  
  15. def error_and_leave(error):
  16.     print(error)
  17.     sys.exit(1)
  18.  
  19.  
  20. def assert_dir_exists(dir):
  21.     if not os.path.exists(dir):
  22.         error_and_leave('Dir "' + dir + '" does not exitst!')
  23.  
  24.  
  25. def get_filepaths(directory, ext=None):
  26.     file_paths = []
  27.  
  28.     for root, directories, files in os.walk(directory):
  29.         for filename in files:
  30.             filepath = os.path.join(root, filename)
  31.             if ext is None:
  32.                 file_paths.append(filepath)
  33.             else:
  34.                 if filepath.endswith(ext):
  35.                     file_paths.append(filepath)
  36.  
  37.     file_paths.sort(reverse=True)
  38.     return file_paths  # Self-explanatory.
  39.  
  40.  
  41. def md5sum(filename):
  42.     # Calculates MD5 hash of file content just like program md5sum
  43.     return md5(open(filename, 'rb').read()).hexdigest()
  44.  
  45.  
  46. def file_list2dict(filelist):
  47.     retval = {}
  48.     for filename in filelist:
  49.         retval[md5sum(filename)] = filename
  50.     return retval
  51.  
  52.  
  53. def filter_and_copy_unique_files(argv):
  54.     if len(argv) not in (3, 4):
  55.         error_and_leave('usage: ' + sys.argv[0] + ' srcDir dstDir [extension]')
  56.  
  57.     src_dir = argv[1]
  58.     dst_dir = argv[2]
  59.  
  60.     assert_dir_exists(src_dir)
  61.     assert_dir_exists(dst_dir)
  62.  
  63.     if len(argv) is 4:
  64.         file_list = get_filepaths(src_dir, argv[3])
  65.     else:
  66.         file_list = get_filepaths(src_dir)
  67.  
  68.     print('Files:\t' + str(len(file_list)))
  69.     file_dict = file_list2dict(file_list)
  70.     print('Unique:\t' + str(len(file_dict.items())))
  71.     print('')
  72.  
  73.     for key, filename in file_dict.items():
  74.         dst_file = os.path.join(dst_dir, ntpath.basename(filename))
  75.         if os.path.exists(dst_file):
  76.             if md5sum(dst_file) == md5sum(filename):
  77.                 print('Skipping  ' + key + ' "' + filename + '"')
  78.                 continue
  79.             else:
  80.                 print('Replacing ' + key + ' "' + filename + '"')
  81.         else:
  82.             print('Copying   ' + key + ' "' + filename + '"')
  83.         shutil.copy(filename, os.path.join(dst_dir, ntpath.basename(filename)))
  84.  
  85.  
  86. ########
  87. # Main #
  88. ################################################################################
  89.  
  90. filter_and_copy_unique_files(sys.argv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement