Advertisement
opexxx

downloadMIMEsort.py

Mar 30th, 2015
340
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.68 KB | None | 0 0
  1. #!/usr/bin/env python
  2. import errno
  3. import getopt
  4. import math
  5. import mimetypes
  6. import operator
  7. import os
  8. import shutil
  9. import sys
  10.  
  11. __author__ = "Eric Pruitt"
  12. __license__ = "Public Domain, FreeBSD, or PSF"
  13.  
  14. __all__ = ['IGNORED_FILES', 'MIXED_TYPES_LABEL', 'UNKNOWN_TYPES_LABEL',
  15.     'categorize', 'classify', 'diversity', 'guess_mime_type', 'organize']
  16.  
  17. MAX_DIVERSITY = 0.0
  18. MIXED_TYPES_LABEL = 'mixed'
  19. UNKNOWN_TYPES_LABEL = 'unknown'
  20.  
  21. # If python-magic is installed, it will be used as a fall-back when
  22. # mimetypes.guess_type cannot identify a file. I have tested this with two
  23. # versions of python-magic: the current github master as of 2011.09.26 and the
  24. # package found in the Debian Squeeze repositories. For the latter, we rely on
  25. # monkey-patching. The github master can be found at the following address:
  26. # https://github.com/ahupp/python-magic/raw/master/magic.py
  27. try:
  28.     import magic
  29.     if not hasattr(magic, 'from_file'):
  30.         cookie = magic.open(magic.MAGIC_MIME & ~magic.MAGIC_MIME_ENCODING)
  31.         cookie.load()
  32.         magic.from_file = lambda x, mime: cookie.file(x)
  33.     elif 'mime' not in magic.from_file.func_code.co_varnames:
  34.         raise AttributeError
  35.  
  36.     def guess_mime_type(path):
  37.         """
  38.        Guesses the MIME type of a file based on its extension using
  39.        mimetypes.guess_type and python-magic as a fall-back when the mimetypes
  40.        method is unable to identify the file. Returns a tuple containing the
  41.        MIME type and encoding (usually `None`).
  42.        """
  43.         mimetype, _ = mimetypes.guess_type(path)
  44.         if not mimetype or mimetype.endswith('/octet-stream'):
  45.             magicmime = magic.from_file(path, mime=True)
  46.             if magicmime and '/' not in magicmime:
  47.                 return None, None
  48.             return magicmime, None
  49.         return mimetype, _
  50.  
  51. except Exception as err:
  52.     if isinstance(err, AttributeError):
  53.         print >> sys.stderr, "Available version of python-magic not supported."
  54.     guess_mime_type = mimetypes.guess_type
  55.  
  56.  
  57. def diversity(elements):
  58.     """
  59.    Computes the Shannon index of diversity for `elements`. The calculated
  60.    index and a list containing the most frequently occurring element or
  61.    elements are returned.
  62.    """
  63.     bucket = dict()
  64.     mode = list()
  65.     maxfreq = 0
  66.  
  67.     # Tally up the elements in the list.
  68.     for element in elements:
  69.         bucket[element] = bucket.setdefault(element, 0) + 1
  70.  
  71.         if bucket[element] >= maxfreq:
  72.             if bucket[element] == maxfreq:
  73.                 mode.append(element)
  74.             else:
  75.                 maxfreq = bucket[element]
  76.                 mode = [element]
  77.  
  78.     total = float(len(elements))
  79.     freqs = [count / total for _, count in bucket.iteritems()]
  80.     diversity = -sum(map(operator.mul, freqs, map(math.log, freqs)))
  81.  
  82.     # Do not return a negative zero
  83.     return diversity or 0.0, mode
  84.  
  85.  
  86. def classify(path, guesser=None):
  87.     """
  88.    Returns more user-friendly version of a file's MIME type. For most MIME
  89.    types, this function will simply return the primary type, but more
  90.    ambiguous types like 'application' or 'text' are broken down into sub-types
  91.    where possible and certain prefixes stripped. When `guesser` is supplied,
  92.    `classify` will use it to identify the MIME type of a given path.  The
  93.    function should require only a single parameter -- the path of the file --
  94.    and return the MIME type and a second value that will be ignored.
  95.    """
  96.     if not guesser:
  97.         guesser = guess_mime_type
  98.     path = os.path.normcase(path)
  99.     mimetype, _ = guesser(path)
  100.     if not mimetype:
  101.         return None
  102.  
  103.     mediatype, mediasubtype = mimetype.split('/')
  104.     if mediatype in ('application', 'text'):
  105.         mediatype = mediasubtype
  106.         if mediatype == 'octet-stream':
  107.             return UNKNOWN_TYPES_LABEL
  108.  
  109.     if mediatype.startswith('vnd.'):
  110.         mediatype = mediatype.split('.')[-1]
  111.  
  112.     if mediatype.startswith('x-'):
  113.         mediatype = mediatype[2:]
  114.  
  115.     return mediatype
  116.  
  117.  
  118. def categorize(path, maxdiversity=MAX_DIVERSITY):
  119.     """
  120.    When `path` is a directory, two components are used to determine its
  121.    classification: a Shannon index of diversity generated by running
  122.    `classify` on every file found recursively under the directory and the
  123.    `maxdiversity` parameter. When the directory contains files whose
  124.    classifications differ, a Shannon index of diversity greater than
  125.    `maxdiversity` will cause the folder to classified as `mixed`. Otherwise,
  126.    the most prevalent classification is used.
  127.  
  128.    If `path` is a file, `categorize` is nothing more than an alias for
  129.    `classify` that returns a file's classification and `None`.
  130.    """
  131.     if os.path.isdir(path):
  132.         categories = list()
  133.         for root, directories, files in os.walk(path):
  134.             # I originally had `categories.extend(map(classify, files))` here,
  135.             # but I need to make sure the classify function has the full path
  136.             # since I have implemented support for python-magic.
  137.             for filename in files:
  138.                 categories.append(classify(os.path.join(root, filename)))
  139.  
  140.         dirdiversity, mode = diversity(categories)
  141.         path = os.path.basename(path)
  142.         if len(mode) == 1 and dirdiversity <= maxdiversity:
  143.             return mode[0], dirdiversity
  144.         return False, dirdiversity
  145.     else:
  146.         return classify(path), None
  147.  
  148.  
  149. def organize(folder, dest=False, detectdirs=True, maxdiversity=MAX_DIVERSITY):
  150.     """
  151.    Classifies the files in `folder` and moves like items into appropriate
  152.    named folders.  The `dest` parameter is the destination folder for sorted
  153.    files. When `dest` is False, the sorted contents of `folder` remain in
  154.    `folder`. When `dest` is `None`, organize will simply print a list showing
  155.    the Shannon index of diversity and classification for each item in
  156.    `folder`. The `maxdiversity` is passed to `categorized` unchanged.
  157.  
  158.    When `detectdirs` is True, the function ignores folders that appear to be
  159.    sorted based on the folders' names.
  160.    """
  161.     files = [os.path.join(folder, base) for base in os.listdir(folder)]
  162.     if detectdirs:
  163.         files = [F for F in files if os.path.basename(F) not in IGNORED_FILES]
  164.  
  165.     # Wrap categorize so I can provide maxdiversity while using map
  166.     _categorize = lambda x: categorize(x, maxdiversity=maxdiversity)
  167.  
  168.     for path, categorydata in zip(files, map(_categorize, files)):
  169.         category, pathdiversity = categorydata
  170.         if category is None:
  171.             category = UNKNOWN_TYPES_LABEL
  172.         elif category is False:
  173.             category = MIXED_TYPES_LABEL
  174.  
  175.         displayname = path if len(path) < 34 else path[:7] + '...' + path[-24:]
  176.         if pathdiversity is not None:
  177.             left = '%-34s           %.3f' % (displayname, pathdiversity)
  178.         else:
  179.             left = '%-34s                ' % (displayname)
  180.  
  181.         if len(left) > 40:
  182.             # Truncate spaces starting from the right side of the string
  183.             left = left[::-1].replace(' ', '', len(left) - 50)[::-1]
  184.         print left + ' ' + category
  185.  
  186.         if dest is None:
  187.             continue
  188.  
  189.         destination = os.path.join(dest or folder, category)
  190.         try:
  191.             os.makedirs(destination)
  192.         except OSError as (err, msg):
  193.             if err != errno.EEXIST:
  194.                 print >> sys.stderr, '%s: %s' % (destination, msg)
  195.                 exit(1)
  196.  
  197.         if os.path.isdir(destination):
  198.             if os.path.samefile(path, destination):
  199.                 print >> sys.stderr, '%s: Source is destination.' % path
  200.             else:
  201.                 try:
  202.                     shutil.move(path, destination)
  203.                 except Exception as err:
  204.                     print >> sys.stderr, '%s: %s' % (path, err.message)
  205.         else:
  206.             print >> sys.stderr, '%s: Destination is not a folder.' % path
  207.  
  208.  
  209. def main(args=sys.argv[1:]):
  210.     """
  211.    Entry point when run as a stand-alone script.
  212.    """
  213.     arguments, trailing = getopt.gnu_getopt(args, 'd:nihm')
  214.     argdict = dict(arguments)
  215.     maxdiversity = argdict.get('-d', MAX_DIVERSITY)
  216.     if '-h' in argdict:
  217.         print os.path.basename(__file__), '[OPTIONS] [DIR... [DEST]]'
  218.         print '\t-h         Display this message and quit'
  219.         print '\t-d NUMBER  Threshold for Shannon diversity index'
  220.         print '\t-i         Ignore folders that appear to be sorted'
  221.         print '\t-n         Display categorizations and exit'
  222.         print '\t-m         Do not use python-magic even if it is available'
  223.  
  224.     else:
  225.         if '-m' in argdict:
  226.             global guess_mime_type
  227.             guess_mime_type = mimetypes.guess_type
  228.  
  229.         detectdirs = '-i' not in argdict
  230.         dryrun = '-n' in argdict
  231.         promptuser = '-y' not in argdict
  232.         for folder in trailing or '.':
  233.             if len(trailing) > 1:
  234.                 dest = trailing.pop()
  235.             else:
  236.                 dest = folder
  237.  
  238.             if dryrun:
  239.                 print 'Destination folder: %s' % dest
  240.                 dest = None
  241.             elif not trailing and promptuser:
  242.                 response = raw_input('Sort %s? [N/y] ' % os.getcwd())
  243.                 if not response.strip().lower().startswith('y'):
  244.                     exit(0)
  245.  
  246.             organize(folder, dest, detectdirs, maxdiversity)
  247.  
  248.  
  249. # Generate set containing all possible folder names
  250. IGNORED_FILES = set((MIXED_TYPES_LABEL, UNKNOWN_TYPES_LABEL))
  251. strict, loose = mimetypes.MimeTypes().types_map
  252. for extension in strict.keys() + loose.keys():
  253.     IGNORED_FILES.add(classify('x' + extension, guesser=mimetypes.guess_type))
  254.  
  255. if __name__ == '__main__':
  256.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement