Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import errno
- import getopt
- import math
- import mimetypes
- import operator
- import os
- import shutil
- import sys
__author__ = "Eric Pruitt"
__license__ = "Public Domain, FreeBSD, or PSF"

# Public names of this module.
__all__ = [
    'IGNORED_FILES',
    'MIXED_TYPES_LABEL',
    'UNKNOWN_TYPES_LABEL',
    'categorize',
    'classify',
    'diversity',
    'guess_mime_type',
    'organize',
]

# Default threshold for the Shannon index of diversity. A directory whose
# index exceeds this value is labelled as mixed.
MAX_DIVERSITY = 0.0

# Folder name used for directories whose contents are too varied.
MIXED_TYPES_LABEL = 'mixed'

# Folder name used for items whose type could not be determined.
UNKNOWN_TYPES_LABEL = 'unknown'
# If python-magic is installed, it will be used as a fall-back when
# mimetypes.guess_type cannot identify a file. I have tested this with two
# versions of python-magic: the current github master as of 2011.09.26 and the
# package found in the Debian Squeeze repositories. For the latter, we rely on
# monkey-patching. The github master can be found at the following address:
# https://github.com/ahupp/python-magic/raw/master/magic.py
try:
    import magic

    if not hasattr(magic, 'from_file'):
        # Cookie-based bindings: emulate from_file() with one shared cookie
        # that reports the MIME type without the encoding part.
        cookie = magic.open(magic.MAGIC_MIME & ~magic.MAGIC_MIME_ENCODING)
        cookie.load()
        magic.from_file = lambda x, mime: cookie.file(x)
    else:
        # `__code__` is available on Python 2.6+ and 3.x, whereas `func_code`
        # only exists on Python 2.
        _from_file_code = getattr(magic.from_file, '__code__', None)
        if _from_file_code is None or 'mime' not in _from_file_code.co_varnames:
            raise AttributeError("magic.from_file does not accept 'mime'")

    def guess_mime_type(path):
        """
        Guesses the MIME type of a file based on its extension using
        mimetypes.guess_type and python-magic as a fall-back when the mimetypes
        method is unable to identify the file. Returns a tuple containing the
        MIME type and encoding (usually `None`).
        """
        mimetype, encoding = mimetypes.guess_type(path)
        if mimetype and not mimetype.endswith('/octet-stream'):
            return mimetype, encoding

        magicmime = magic.from_file(path, mime=True)
        # A non-empty answer without a slash is not a MIME type (for example
        # an error message), so report the file as unidentified.
        if magicmime and '/' not in magicmime:
            return None, None
        return magicmime, None

except Exception as err:
    # Any failure to set up python-magic (missing module, unsupported API,
    # initialisation error) falls back to extension-only detection. Only the
    # unsupported-API case is reported to the user.
    if isinstance(err, AttributeError):
        sys.stderr.write("Available version of python-magic not supported.\n")
    guess_mime_type = mimetypes.guess_type
def diversity(elements):
    """
    Computes the Shannon index of diversity for `elements`, which may be any
    iterable of hashable values (including a one-shot iterator).

    Returns a tuple `(index, mode)` where `index` is the Shannon index as a
    float (never a negative zero) and `mode` is a list of the most frequently
    occurring element or elements, in the order in which they reached the
    highest count. An empty input yields `(0.0, [])`.
    """
    bucket = dict()
    mode = list()
    maxfreq = 0

    # Tally up the elements and track the mode in the same pass.
    for element in elements:
        count = bucket.get(element, 0) + 1
        bucket[element] = count
        if count > maxfreq:
            maxfreq = count
            mode = [element]
        elif count == maxfreq:
            mode.append(element)

    # Derive the total from the tallies rather than len(elements) so that
    # iterators without a length are supported.
    total = float(sum(bucket.values()))
    freqs = [count / total for count in bucket.values()]
    index = -sum(freq * math.log(freq) for freq in freqs)

    # Do not return a negative zero
    return index or 0.0, mode
def classify(path, guesser=None):
    """
    Returns more user-friendly version of a file's MIME type. For most MIME
    types, this function will simply return the primary type, but more
    ambiguous types like 'application' or 'text' are broken down into sub-types
    where possible and certain prefixes stripped. When `guesser` is supplied,
    `classify` will use it to identify the MIME type of a given path. The
    function should require only a single parameter -- the path of the file --
    and return the MIME type and a second value that will be ignored.

    Returns `None` when the guesser cannot identify the file and
    `UNKNOWN_TYPES_LABEL` for 'octet-stream'. Parameters following a ';' in
    the MIME type (e.g. '; charset=us-ascii') are ignored, and a value without
    a '/' is treated as a bare primary type.
    """
    if not guesser:
        guesser = guess_mime_type

    mimetype, _ = guesser(os.path.normcase(path))
    if not mimetype:
        return None

    # Drop any parameters so they do not leak into the label.
    mimetype = mimetype.split(';', 1)[0].strip()
    if not mimetype:
        return None

    # partition() never raises, unlike unpacking the result of split('/').
    mediatype, _, mediasubtype = mimetype.partition('/')
    if mediatype in ('application', 'text') and mediasubtype:
        mediatype = mediasubtype

    if mediatype == 'octet-stream':
        return UNKNOWN_TYPES_LABEL

    # Vendor types keep only their last dotted component.
    if mediatype.startswith('vnd.'):
        mediatype = mediatype.split('.')[-1]

    if mediatype.startswith('x-'):
        mediatype = mediatype[2:]

    return mediatype
def categorize(path, maxdiversity=MAX_DIVERSITY):
    """
    When `path` is a directory, two components are used to determine its
    classification: a Shannon index of diversity generated by running
    `classify` on every file found recursively under the directory and the
    `maxdiversity` parameter. When the directory contains files whose
    classifications differ, a Shannon index of diversity greater than
    `maxdiversity` will cause the folder to be classified as `mixed`.
    Otherwise, the most prevalent classification is used.

    If `path` is a file, `categorize` is nothing more than an alias for
    `classify` that returns a file's classification and `None`.

    Returns a tuple `(category, index)`. For a directory, `category` is the
    single most common result of `classify` (which may itself be `None`), or
    `False` when there is no single most common result or the index exceeds
    `maxdiversity`; `index` is the computed diversity. A directory without
    any files therefore yields `False`.
    """
    if not os.path.isdir(path):
        return classify(path), None

    categories = list()
    for root, _, files in os.walk(path):
        # classify needs the full path rather than the bare file name because
        # python-magic support means the file itself may be inspected.
        for filename in files:
            categories.append(classify(os.path.join(root, filename)))

    dirdiversity, mode = diversity(categories)
    if len(mode) == 1 and dirdiversity <= maxdiversity:
        return mode[0], dirdiversity
    return False, dirdiversity
def organize(folder, dest=False, detectdirs=True, maxdiversity=MAX_DIVERSITY):
    """
    Classifies the files in `folder` and moves like items into appropriately
    named folders. The `dest` parameter is the destination folder for sorted
    files. When `dest` is False, the sorted contents of `folder` remain in
    `folder`. When `dest` is `None`, organize will simply print a list showing
    the Shannon index of diversity and classification for each item in
    `folder`. The `maxdiversity` is passed to `categorize` unchanged.

    When `detectdirs` is True, the function ignores folders that appear to be
    sorted based on the folders' names.

    One line per item is written to standard output; problems with individual
    items are reported on standard error. The process exits with status 1
    when a destination folder cannot be created.
    """
    files = [os.path.join(folder, base) for base in os.listdir(folder)]
    if detectdirs:
        files = [F for F in files if os.path.basename(F) not in IGNORED_FILES]

    # Categorize everything before moving anything so that the moves made
    # below cannot influence how a later directory is classified.
    results = [categorize(path, maxdiversity=maxdiversity) for path in files]

    for path, (category, pathdiversity) in zip(files, results):
        if category is None:
            category = UNKNOWN_TYPES_LABEL
        elif category is False:
            category = MIXED_TYPES_LABEL

        displayname = path if len(path) < 34 else path[:7] + '...' + path[-24:]
        if pathdiversity is not None:
            left = '%-34s %.3f' % (displayname, pathdiversity)
        else:
            left = '%-34s ' % (displayname)

        if len(left) > 40:
            # Truncate spaces starting from the right side of the string
            # until the column is back to 40 characters. The count must be
            # positive: a negative count makes replace() strip every space.
            left = left[::-1].replace(' ', '', len(left) - 40)[::-1]

        sys.stdout.write(left + ' ' + category + '\n')
        if dest is None:
            continue

        destination = os.path.join(dest or folder, category)
        try:
            os.makedirs(destination)
        except OSError as err:
            # An already existing destination is fine; anything else is fatal.
            if err.errno != errno.EEXIST:
                sys.stderr.write('%s: %s\n' % (destination, err.strerror))
                sys.exit(1)

        if not os.path.isdir(destination):
            sys.stderr.write('%s: Destination is not a folder.\n' % path)
        elif os.path.samefile(path, destination):
            sys.stderr.write('%s: Source is destination.\n' % path)
        else:
            try:
                shutil.move(path, destination)
            except Exception as err:
                # Use the exception's string form; `err.message` is empty for
                # OSError/IOError and does not exist on Python 3.
                sys.stderr.write('%s: %s\n' % (path, err))
def main(args=sys.argv[1:]):
    """
    Entry point when run as a stand-alone script.

    `args` is the list of command line arguments without the program name.
    Non-option arguments are the folders to sort; when more than one is
    given, the last one is the destination for all the others. Without any,
    the current directory is sorted in place after a confirmation prompt
    (suppressed by `-y`). An invalid `-d` value exits with status 2.
    """
    global guess_mime_type

    arguments, trailing = getopt.gnu_getopt(args, 'd:nihmy')
    argdict = dict(arguments)

    if '-h' in argdict:
        write = sys.stdout.write
        write('%s [OPTIONS] [DIR... [DEST]]\n' % os.path.basename(__file__))
        write('\t-h Display this message and quit\n')
        write('\t-d NUMBER Threshold for Shannon diversity index\n')
        write('\t-i Also sort folders that appear to be sorted already\n')
        write('\t-n Display categorizations and exit\n')
        write('\t-m Do not use python-magic even if it is available\n')
        write('\t-y Do not ask before sorting the current directory\n')
        return

    if '-d' in argdict:
        # The option value arrives as a string and must be numeric to be
        # compared against the computed diversity index.
        try:
            maxdiversity = float(argdict['-d'])
        except ValueError:
            sys.stderr.write('-d: %s is not a number\n' % argdict['-d'])
            sys.exit(2)
    else:
        maxdiversity = MAX_DIVERSITY

    if '-m' in argdict:
        guess_mime_type = mimetypes.guess_type

    detectdirs = '-i' not in argdict
    dryrun = '-n' in argdict
    promptuser = '-y' not in argdict

    # Split off the shared destination once, up front, instead of popping
    # from the argument list while iterating over it.
    sources = list(trailing) or ['.']
    shareddest = sources.pop() if len(sources) > 1 else None

    try:
        readline = raw_input
    except NameError:
        # Python 3 renamed raw_input to input.
        readline = input

    for folder in sources:
        dest = folder if shareddest is None else shareddest
        if dryrun:
            sys.stdout.write('Destination folder: %s\n' % dest)
            dest = None
        elif not trailing and promptuser:
            response = readline('Sort %s? [N/y] ' % os.getcwd())
            if not response.strip().lower().startswith('y'):
                sys.exit(0)
        organize(folder, dest, detectdirs, maxdiversity)
# Generate set containing all possible folder names: the two special labels
# plus the classification of every extension known to the mimetypes module.
IGNORED_FILES = set((MIXED_TYPES_LABEL, UNKNOWN_TYPES_LABEL))
for typemap in mimetypes.MimeTypes().types_map:
    # types_map holds two extension-to-type dictionaries; iterate over each
    # instead of concatenating key lists, which fails for Python 3 key views.
    for extension in typemap:
        label = classify('x' + extension, guesser=mimetypes.guess_type)
        # Extensions the guesser does not recognise classify as None, which
        # is not a folder name.
        if label is not None:
            IGNORED_FILES.add(label)

if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement