# ntfsUSN.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Parser for the NTFS USN change journal.

More information about the format can be found here:

http://msdn.microsoft.com/en-us/library/windows/desktop/aa365720%28v=vs.85%29.aspx
"""

import struct
from plaso.lib import event
from plaso.lib import eventdata
from plaso.lib import parser
from plaso.lib import errors
from plaso.lib import timelib
from functools import partial

class UsnJrnlEvent(event.TimestampEvent):
  """Timestamped event holding the fields of one USN change journal record."""

  DATA_TYPE = 'windows:metadata:usnjrnl'

  def __init__(self, target_file, majorvers, minorvers, mft, parentmft, usn,
               reasons, attributes, securityid, timestamp, timestamptype):
    """Initializes the event and copies every record field onto it.

    Args:
      target_file: value stored as the target_file attribute.
      majorvers: value stored as the majorvers attribute.
      minorvers: value stored as the minorvers attribute.
      mft: value stored as the mft attribute.
      parentmft: value stored as the parentmft attribute.
      usn: value stored as the usn attribute.
      reasons: value stored as the reasons attribute.
      attributes: value stored as the fileattrib attribute.
      securityid: value stored as the security attribute.
      timestamp: passed to the parent constructor and stored as the
                 timestamp attribute.
      timestamptype: passed to the parent constructor and stored as the
                     timestamptype attribute.
    """
    super(UsnJrnlEvent, self).__init__(
        timestamp, timestamptype, 'windows:metadata:usnjrnl')

    # These three are set again explicitly after the parent constructor ran.
    self.data_type = 'windows:metadata:usnjrnl'
    self.timestamp = timestamp
    self.timestamptype = timestamptype

    # Record identification.
    self.target_file = target_file
    self.majorvers = majorvers
    self.minorvers = minorvers
    self.mft = mft
    self.parentmft = parentmft
    self.usn = usn

    # Decoded flag fields; note the attribute names differ from the
    # parameter names for the last two.
    self.reasons = reasons
    self.fileattrib = attributes
    self.security = securityid

class UsnJrnlParser(parser.PlasoParser):
  """Parses the NTFS change journal.

  The NTFS update sequence number (USN) change journal resides at
  /$Extend/$UsnJrnl. It is present in NTFS since Windows XP. The most
  relevant data is contained in the named alternate data stream $J, thus
  this parser looks for the file /$Extend/$UsnJrnl:$J.
  Information about the journal itself will not be reconstructed.

  More details on the USN change journal can be found here:

  http://msdn.microsoft.com/en-us/library/windows/desktop/aa365720%28v=vs.85%29.aspx

  Two versions of USN change journal records exist. Major version 2 was
  used since Windows XP / Server 2003, Major version 3 was used since
  Windows 8 / Windows Server 2012.
  All records should be aligned at 64-bit boundaries.

  The two structures can be defined as follows:

  Major version 2: (Size in Byte)
  Offset Type          Size  Record
  0x00   DWORD           4   RecordLength
  0x04   WORD            2   MajorVersion
  0x06   WORD            2   MinorVersion
  0x08   DWORDLONG       8   FileReferenceNumber
  0x10   DWORDLONG       8   ParentFileReferenceNumber
  0x18   USN             8   Usn
  0x20   LARGE_INTEGER   8   TimeStamp
  0x28   DWORD           4   Reason
  0x2C   DWORD           4   SourceInfo
  0x30   DWORD           4   SecurityId
  0x34   DWORD           4   FileAttributes
  0x38   WORD            2   FileNameLength
  0x3A   WORD            2   FileNameOffset
  0x3C   WCHAR           *   FileName[1]

  Major version 3: (Size in Byte)
  Offset Type          Size  Record
  0x00   DWORD           4   RecordLength
  0x04   WORD            2   MajorVersion
  0x06   WORD            2   MinorVersion
  0x08   BYTE           16   FileReferenceNumber
  0x18   BYTE           16   ParentFileReferenceNumber
  0x28   USN             8   Usn
  0x30   LARGE_INTEGER   8   TimeStamp
  0x38   DWORD           4   Reason
  0x3C   DWORD           4   SourceInfo
  0x40   DWORD           4   SecurityId
  0x44   DWORD           4   FileAttributes
  0x48   WORD            2   FileNameLength
  0x4A   WORD            2   FileNameOffset
  0x4C   WCHAR           *   FileName[1]

  The major difference between the two versions is the size and type of
  the two FileReferenceNumbers. With the minor difference of the version
  number, the official reason strings are identical between the versions
  as well. SourceInfo and FileAttributes are version independent.
  """

  # Maps each Reason flag bit to its human readable description.
  REASONS = {
      0x00000001: 'The data in the file or directory is overwritten.',
      0x00000002: 'The file or directory is extended (added to).',
      0x00000004: 'The file or directory is truncated.',
      0x00000010: 'The data in one or more named data streams for a file is '
                  'overwritten.',
      0x00000020: 'The one or more named data streams for a file are extended '
                  '(added to).',
      0x00000040: 'The one or more named data streams for a file is truncated.',
      0x00000100: 'The file or directory is created for the first time.',
      0x00000200: 'The file or directory is deleted.',
      0x00000400: 'The user made a change to the extended attributes of a file '
                  'or directory. These NTFS file system attributes are not '
                  'accessible to Windows-based applications.',
      0x00000800: 'A change is made in the access rights to a file or '
                  'directory.',
      0x00001000: 'The file or directory is renamed, and the file name in the '
                  'USN_RECORD structure is the previous name.',
      0x00002000: 'A file or directory is renamed, and the file name in the '
                  'USN_RECORD_V2 structure is the new name.',
      0x00004000: 'A user changes the FILE_ATTRIBUTE_NOT_CONTENT_INDEXED '
                  'attribute. That is, the user changes the file or directory '
                  'from one where content can be indexed to one where content '
                  'cannot be indexed, or vice versa. Content indexing permits '
                  'rapid searching of data by building a database of selected '
                  'content.',
      0x00008000: 'A user has either changed one or more file or directory '
                  'attributes (for example, the read-only, hidden, system, '
                  'archive, or sparse attribute), or one or more time '
                  'stamps.',
      0x00010000: 'An NTFS file system hard link is added to or removed from '
                  'the file or directory. An NTFS file system hard link, '
                  'similar to a POSIX hard link, is one of several directory '
                  'entries that see the same file or directory.',
      0x00020000: 'The compression state of the file or directory is changed '
                  'from or to compressed.',
      0x00040000: 'The file or directory is encrypted or decrypted.',
      0x00080000: 'The object identifier of a file or directory is changed.',
      0x00100000: 'The reparse point that is contained in a file or directory '
                  'is changed, or a reparse point is added to or deleted from '
                  'a file or directory.',
      0x00200000: 'A named stream is added to or removed from a file, or a '
                  'named stream is renamed.',
      0x80000000: 'The file or directory is closed.',
  }

  # Maps each SourceInfo flag bit to its symbolic name. SourceInfo is read
  # from every record but is not part of the values readEntry returns.
  SOURCES = {
      0x00000002: 'USN_SOURCE_AUXILIARY_DATA',
      0x00000001: 'USN_SOURCE_DATA_MANAGEMENT',
      0x00000004: 'USN_SOURCE_REPLICATION_MANAGEMENT',
  }

  # Maps each FileAttributes flag bit to its symbolic name.
  # See http://msdn.microsoft.com/en-us/library/ee332330%28VS.85%29.aspx
  ATTRIBUTES = {
      0x00000001: 'FILE_ATTRIBUTE_READONLY',
      0x00000002: 'FILE_ATTRIBUTE_HIDDEN',
      0x00000004: 'FILE_ATTRIBUTE_SYSTEM',
      0x00000010: 'FILE_ATTRIBUTE_DIRECTORY',
      0x00000020: 'FILE_ATTRIBUTE_ARCHIVE',
      0x00000040: 'FILE_ATTRIBUTE_DEVICE',
      0x00000080: 'FILE_ATTRIBUTE_NORMAL',
      0x00000100: 'FILE_ATTRIBUTE_TEMPORARY',
      0x00000200: 'FILE_ATTRIBUTE_SPARSE_FILE',
      0x00000400: 'FILE_ATTRIBUTE_REPARSE_POINT',
      0x00000800: 'FILE_ATTRIBUTE_COMPRESSED',
      0x00001000: 'FILE_ATTRIBUTE_OFFLINE',
      0x00002000: 'FILE_ATTRIBUTE_NOT_CONTENT_INDEXED',
      0x00004000: 'FILE_ATTRIBUTE_ENCRYPTED',
      0x00010000: 'FILE_ATTRIBUTE_VIRTUAL',
  }

  # Reason bits grouped by the kind of timestamp they are reported as.
  # Any reason outside these groups (i.e. only "closed") maps to access time.
  _MODIFICATION_REASONS = (
      0x00000001 | 0x00000002 | 0x00000004 | 0x00000010 | 0x00000020 |
      0x00000040 | 0x00010000 | 0x00040000 | 0x00200000)
  _CREATION_REASONS = 0x00000100 | 0x00002000
  _DELETED_REASONS = 0x00000200 | 0x00001000
  _CHANGE_REASONS = (
      0x00000400 | 0x00000800 | 0x00004000 | 0x00008000 | 0x00020000 |
      0x00080000 | 0x00100000)

  # On-disk layouts. The journal is stored little-endian, so the byte order
  # is spelled out instead of relying on the byte order of the host.
  # Header: RecordLength, MajorVersion, MinorVersion (8 bytes).
  _HEADER = struct.Struct('<IHH')
  # Remainder of the fixed part of a version 2 record (0x34 bytes).
  _RECORD_V2 = struct.Struct('<QQQqIIIIHH')
  # Remainder of the fixed part of a version 3 record (0x44 bytes); each
  # 128-bit file reference is read as two 64-bit halves (low half first).
  _RECORD_V3 = struct.Struct('<QQQQQqIIIIHH')

  # Records are aligned at 64-bit boundaries.
  _ALIGNMENT = 8

  def Parse(self, file_object):
    """Verifies the file is a change journal and returns the parsed events.

    As the journal has no magic bytes or unique recognizable byte patterns,
    verification is done by checking the filename for $UsnJrnl and $J.

    Args:
      file_object: A seekable file-like object with a name attribute.

    Returns:
      The result of Scan(), a generator of UsnJrnlEvent objects.

    Raises:
      UnableToParseFile: when the file has the wrong name or its name
                         cannot be processed.
    """
    # Check the given filename ( *$usnjrnl*$J ) at least basically.
    try:
      name = file_object.name.lower()
      if not (name.endswith(u'$j') and u'$usnjrnl' in name):
        raise errors.UnableToParseFile(
            u'[%s] file %s not named *$UsnJrnl*$J' % (
                self.parser_name, file_object.name))
    except UnicodeError as error:
      # UnicodeError covers both encode and decode failures that can occur
      # when the name is compared against the unicode patterns above.
      raise errors.UnableToParseFile(
          u'[%s] unable to read name of file %s: %s' % (
              self.parser_name, file_object.name, error))
    return self.Scan(file_object)

  def Scan(self, file_object):
    """Locates the first record and returns a generator of journal events.

    Args:
      file_object: A seekable file-like object.

    Returns:
      A generator of UsnJrnlEvent objects. The records are parsed lazily,
      while the generator is iterated.

    Raises:
      UnableToParseFile: when locating the first record fails.
    """
    try:
      offset = self.ReadSparseOffset(file_object)
      # The first non-zero byte is not necessarily the first byte of a
      # record (a record length such as 0x100 starts with a zero byte), so
      # step back to the 64-bit boundary the record has to start on.
      offset -= offset % self._ALIGNMENT
      return self.OffsetParse(file_object, offset)
    except Exception as error:
      raise errors.UnableToParseFile(
          u'[%s] Exception with scan %s: %s' % (
              self.parser_name, file_object, error))

  def ReadSparseOffset(self, file_object):
    """Determines the offset of the next non-zero byte in file_object.

    Reads file_object in 1 MiB chunks, starting at the current position,
    until a chunk contains a non-zero byte or the end of file is reached.

    Args:
      file_object: A seekable file-like object.

    Returns:
      The offset of the first non-zero byte found, or the end-of-file
      offset when only zero bytes were left.
    """
    # $UsnJrnl may contain lots of leading zeros. Skip them quickly by
    # reading whole chunks and stripping the zeros.
    chunksize = 1024 * 1024
    remainder = 0
    while True:
      chunk = file_object.read(chunksize)
      if not chunk:
        break
      remainder = len(chunk.lstrip(b'\x00'))
      if remainder:
        break
    # The offset is the current file position minus the rest of the
    # current chunk.
    return file_object.tell() - remainder

  def OffsetParse(self, file_object, offset):
    """Parses the journal from file_object, omitting the first offset bytes.

    $UsnJrnl:$J often contains a large number of leading zeroes, the
    offset can be specified in order to skip over them.

    Args:
      file_object: A seekable file-like object.
      offset: The offset at which to start reading records.

    Yields:
      One UsnJrnlEvent per record, until the end of file is reached.

    Raises:
      UnableToParseFile: when seeking to offset fails.
    """
    try:
      file_object.seek(offset)
    except Exception as error:
      raise errors.UnableToParseFile(
          u'[%s] unable to seek offset %i in file %s: %s' % (
              self.parser_name, offset, file_object.name, error))

    while True:
      record_start = file_object.tell()
      try:
        entry = self.readEntry(file_object)
      except EndOfFileError:
        break
      except SparseError:
        # Jump over the sparse part, but only to a 64-bit boundary.
        resume = self.ReadSparseOffset(file_object)
        resume -= resume % self._ALIGNMENT
        if resume <= record_start:
          # Aligning down would land on the zero-length slot that was just
          # rejected and loop forever; move on to the next boundary.
          resume = (
              record_start - (record_start % self._ALIGNMENT) +
              self._ALIGNMENT)
        file_object.seek(resume)
        continue
      yield UsnJrnlEvent(*entry)

  @staticmethod
  def _DescribeFlags(table, value, separator):
    """Joins the descriptions of all flags of table that are set in value.

    Args:
      table: A dict mapping single flag bits to their description.
      value: The bit field to decode.
      separator: The unicode string placed between two descriptions.

    Returns:
      The descriptions ordered by flag value, or an empty string when none
      of the flags is set.
    """
    return separator.join(
        table[flag] for flag in sorted(table) if flag & value)

  def _GetTimestampType(self, reason):
    """Maps a Reason bit field to the timestamp type of the event.

    When bits of several groups are set, deletion wins over creation,
    creation over change and change over modification.

    Args:
      reason: The Reason bit field of a record.

    Returns:
      One of the eventdata.EventTimestamp values.
    """
    if reason & self._DELETED_REASONS:
      return eventdata.EventTimestamp.DELETED_TIME
    if reason & self._CREATION_REASONS:
      return eventdata.EventTimestamp.CREATION_TIME
    if reason & self._CHANGE_REASONS:
      return eventdata.EventTimestamp.CHANGE_TIME
    if reason & self._MODIFICATION_REASONS:
      return eventdata.EventTimestamp.MODIFICATION_TIME
    return eventdata.EventTimestamp.ACCESS_TIME

  def readEntry(self, file_object):
    """Reads one journal record at the current position of file_object.

    On success the file position is left at the end of the record.

    Args:
      file_object: A seekable file-like object.

    Returns:
      A list of [filename, major version, minor version, file reference,
      parent file reference, usn, reasons, attributes, security id,
      timestamp, timestamp type], in the order UsnJrnlEvent expects.

    Raises:
      EndOfFileError: when the file ends before the record is complete.
      SparseError: when the record length is zero. The file position is
                   then left 4 bytes after the start of the record.
    """
    record_start = file_object.tell()

    # Read record size and version numbers only.
    header = file_object.read(self._HEADER.size)
    if len(header) < self._HEADER.size:
      raise EndOfFileError(
          'Reached end of file at offset %i' % record_start)
    recordsize, majorversion, minorversion = self._HEADER.unpack(header)

    if recordsize == 0:
      file_object.seek(record_start + 4)
      raise SparseError(
          'Reached 0 Byte in recordsize at offset %i - sparse block?'
          % record_start)

    # Depending on the version, 52 or 68 more bytes make up the fixed part.
    # Every version other than 2 is read with the version 3 layout.
    if majorversion == 2:
      layout = self._RECORD_V2
    else:
      layout = self._RECORD_V3

    body = file_object.read(layout.size)
    if len(body) < layout.size:
      raise EndOfFileError(
          'Unexpectedly reached end of file at offset %i' % record_start)
    fields = layout.unpack(body)

    if majorversion == 2:
      mftref, mftparentref = fields[0], fields[1]
      common = fields[2:]
    else:
      # Combine the two 64-bit halves of each 128-bit file reference.
      mftref = (fields[1] << 64) | fields[0]
      mftparentref = (fields[3] << 64) | fields[2]
      common = fields[4:]

    # SourceInfo and FileNameOffset are part of the layout but not used:
    # the name is read directly after the fixed part of the record.
    (usn, timestamp, reason_id, _source_id, security_id, fileattrib,
     sizefilename, _name_offset) = common

    # And now read the filename.
    raw_name = file_object.read(sizefilename)
    if len(raw_name) < sizefilename:
      raise EndOfFileError(
          'Unexpectedly reached end of file at offset %i' % record_start)
    # File names are stored as little-endian UTF-16 without a byte order
    # mark. Undecodable sequences are replaced so that a single odd name
    # does not abort the whole journal.
    filename = raw_name.decode('utf-16-le', 'replace')

    # Jump over the padding that follows the name. Seeking, instead of
    # reading the padding, cannot slurp the rest of the file when the
    # record length is smaller than what was already consumed, nor
    # allocate a huge buffer for a bogus length.
    consumed = self._HEADER.size + layout.size + sizefilename
    if recordsize > consumed:
      file_object.seek(record_start + recordsize)

    # Reasons and file attributes are (in essence) bit-arrays, so they
    # can be read by checking which bits are set.
    reasons = self._DescribeFlags(self.REASONS, reason_id, u' ')
    attributes = self._DescribeFlags(self.ATTRIBUTES, fileattrib, u', ')

    # The timestamp is given in the NTFS filetime format.
    timestamp = timelib.Timestamp.FromFiletime(timestamp)

    # Event needs a type, so map the reason to one.
    timestamptype = self._GetTimestampType(reason_id)

    return [filename, majorversion, minorversion, mftref, mftparentref, usn,
            reasons, attributes, security_id, timestamp, timestamptype]

class EndOfFileError(Exception):
  """Raised when the end of the file is reached while reading a record."""

  def __init__(self, value):
    """Stores value on the exception and hands it to the base class.

    Args:
      value: The object describing the error, usually a message string.
    """
    super(EndOfFileError, self).__init__(value)
    self.value = value

  def __str__(self):
    """Returns the repr() form of the stored value."""
    return '%r' % (self.value,)

class SparseError(Exception):
  """Raised when a record with a length of zero is encountered."""

  def __init__(self, value):
    """Stores value on the exception and hands it to the base class.

    Args:
      value: The object describing the error, usually a message string.
    """
    super(SparseError, self).__init__(value)
    self.value = value

  def __str__(self):
    """Returns the repr() form of the stored value."""
    return '%r' % (self.value,)