opexxx

olevba-0.8b.py

Mar 27th, 2015
395
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 70.21 KB | None | 0 0
  1. #!/usr/bin/env python
  2. """
  3. olevba.py
  4.  
  5. olevba is a script to parse OLE and OpenXML files such as MS Office documents
  6. (e.g. Word, Excel), to extract VBA Macro code in clear text.
  7.  
  8. Supported formats:
  9. - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  10. - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  11. - PowerPoint 2007+ (.pptm, .ppsm)
  12.  
  13. Author: Philippe Lagadec - http://www.decalage.info
  14. License: BSD, see source code or documentation
  15.  
  16. olevba is part of the python-oletools package:
  17. http://www.decalage.info/python/oletools
  18.  
  19. olevba is based on source code from officeparser by John William Davison
  20. https://github.com/unixfreak0037/officeparser
  21. """
  22.  
  23. #=== LICENSE ==================================================================
  24.  
  25. # olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info)
  26. # All rights reserved.
  27. #
  28. # Redistribution and use in source and binary forms, with or without modification,
  29. # are permitted provided that the following conditions are met:
  30. #
  31. #  * Redistributions of source code must retain the above copyright notice, this
  32. #    list of conditions and the following disclaimer.
  33. #  * Redistributions in binary form must reproduce the above copyright notice,
  34. #    this list of conditions and the following disclaimer in the documentation
  35. #    and/or other materials provided with the distribution.
  36. #
  37. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  38. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  39. # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  40. # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  41. # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  42. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  43. # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  44. # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  46. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  47.  
  48.  
  49. # olevba contains modified source code from the officeparser project, published
  50. # under the following MIT License (MIT):
  51. #
  52. # officeparser is copyright (c) 2014 John William Davison
  53. #
  54. # Permission is hereby granted, free of charge, to any person obtaining a copy
  55. # of this software and associated documentation files (the "Software"), to deal
  56. # in the Software without restriction, including without limitation the rights
  57. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  58. # copies of the Software, and to permit persons to whom the Software is
  59. # furnished to do so, subject to the following conditions:
  60. #
  61. # The above copyright notice and this permission notice shall be included in all
  62. # copies or substantial portions of the Software.
  63. #
  64. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  65. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  66. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  67. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  68. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  69. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  70. # SOFTWARE.
  71.  
  72. #------------------------------------------------------------------------------
  73. # CHANGELOG:
  74. # 2014-08-05 v0.01 PL: - first version based on officeparser code
  75. # 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
  76. # 2014-08-15       PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
  77. # 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
  78. #                        and to find the VBA project root anywhere in the file
  79. # 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
  80. # 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
  81. #                      - added detect_vba_macros
  82. # 2014-12-10 v0.06 PL: - hide first lines with VB attributes
  83. #                      - detect auto-executable macros
  84. #                      - ignore empty macros
  85. # 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
  86. # 2014-12-15 v0.08 PL: - improved display for empty macros
  87. #                      - added pattern extraction
  88. # 2014-12-25 v0.09 PL: - added suspicious keywords detection
  89. # 2014-12-27 v0.10 PL: - added OptionParser, main and process_file
  90. #                      - uses xglob to scan several files with wildcards
  91. #                      - option -r to recurse subdirectories
  92. #                      - option -z to scan files in password-protected zips
  93. # 2015-01-02 v0.11 PL: - improved filter_vba to detect colons
  94. # 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns
  95. #                      - process_file: improved display, shows container file
  96. #                      - improved list of executable file extensions
  97. # 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display
  98. # 2015-01-08 v0.14 PL: - added hex strings detection and decoding
  99. #                      - fixed issue #2, decoding VBA stream names using
  100. #                        specified codepage and unicode stream names
  101. # 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d
  102. # 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")
  103. #                      - added several suspicious keywords
  104. #                      - added option -i to analyze VBA source code directly
  105. # 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions
  106. #                      - added scan_vba to run all detection algorithms
  107. #                      - decoded hex strings are now also scanned + reversed
  108. # 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules
  109. # 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex
  110. #                        strings and StrReverse
  111.  
  112. #######################
  113. # Base64 detection and decode added by James Habben
  114. #######################
  115.  
  116. __version__ = '0.19'
  117.  
  118. #------------------------------------------------------------------------------
  119. # TODO:
  120. # + do not use logging, but a provided logger (null logger by default)
  121. # + setup logging (common with other oletools)
  122.  
  123. # TODO later:
  124. # + do not show hex strings by default (add option --hex)
  125. # + performance improvement: instead of searching each keyword separately,
  126. #   first split vba code into a list of words (per line), then check each
  127. #   word against a dict. (or put vba words into a set/dict?)
  128. # + for regex, maybe combine them into a single re with named groups?
  129. # + add Yara support, include sample rules? plugins like balbuzard?
  130. # + add balbuzard support
  131. # + output to file (replace print by file.write, sys.stdout by default)
  132. # + look for VBA in embedded documents (e.g. Excel in Word)
  133. # + support SRP streams (see Lenny's article + links and sample)
  134. # - python 3.x support
  135. # - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic?
  136. # - check VBA macros in Visio, Access, Project, etc
  137. # - extract_macros: convert to a class, split long function into smaller methods
  138. # - extract_macros: read bytes from stream file objects instead of strings
  139. # - extract_macros: use combined struct.unpack instead of many calls
  140.  
  141. #------------------------------------------------------------------------------
  142. # REFERENCES:
  143. # - [MS-OVBA]: Microsoft Office VBA File Format Structure
  144. #   http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
  145. # - officeparser: https://github.com/unixfreak0037/officeparser
  146.  
  147.  
  148. #--- IMPORTS ------------------------------------------------------------------
  149.  
  150. import sys, logging
  151. import struct
  152. import cStringIO
  153. import math
  154. import zipfile
  155. import re
  156. import optparse
  157. import os.path
  158. import binascii
  159. import base64
  160.  
  161. import thirdparty.olefile as olefile
  162. from thirdparty.prettytable import prettytable
  163. from thirdparty.xglob import xglob
  164.  
  165. #--- CONSTANTS ----------------------------------------------------------------
  166.  
  167. TYPE_OLE     = 'OLE'
  168. TYPE_OpenXML = 'OpenXML'
  169.  
  170. MODULE_EXTENSION = "bas"
  171. CLASS_EXTENSION = "cls"
  172. FORM_EXTENSION = "frm"
  173.  
# Keywords to detect auto-executable macros.
# Each key is a human-readable description of when the macro runs; each value
# is a tuple of the macro/event-handler names that trigger that behaviour.
# NOTE(review): per the changelog (v0.07) matching against these names is
# case-insensitive; the matching code itself is not in this part of the file.
AUTOEXEC_KEYWORDS = {
    # MS Word:
    'Runs when the Word document is opened':
        ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'),
    'Runs when the Word document is closed':
        ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
    'Runs when the Word document is modified':
        ('DocumentChange',),
    'Runs when a new Word document is created':
        ('AutoNew', 'Document_New', 'NewDocument'),

    # MS Excel:
    'Runs when the Excel Workbook is opened':
        ('Auto_Open', 'Workbook_Open'),
    'Runs when the Excel Workbook is closed':
        ('Auto_Close', 'Workbook_Close'),

    #TODO: check the MS specifications for the full list of auto-executable names
}
  194.  
# Suspicious Keywords that may be used by malware
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
# Each key is a human-readable description of what the code may do; each value
# is a tuple of VBA keywords / identifiers hinting at that behaviour.
# Many keywords are very generic (e.g. 'Open', 'Run', 'Lib', 'Windows'), hence
# the "May ..." wording: a hit is an indicator, not a proof.
SUSPICIOUS_KEYWORDS = {
    #TODO: use regex to support variable whitespaces
    'May read system environment variables':
        ('Environ',),
    'May open a file':
        ('Open',),
    'May write to a file (if combined with Open)':
        #TODO: regex to find Open+Write on same line
        ('Write', 'Put', 'Output', 'Print #'),
    'May read or write a binary file (if combined with Open)':
        #TODO: regex to find Open+Binary on same line
        ('Binary',),
    'May copy a file':
        ('FileCopy', 'CopyFile'),
        #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
        #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May create a text file':
        ('CreateTextFile','ADODB.Stream', 'WriteText', 'SaveToFile'),
        #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
        #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command':
        # includes the window-style constants (vbHide, vbNormalFocus, ...) passed to Shell
        ('Shell', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus',
         'vbMinimizedNoFocus', 'WScript.Shell', 'Run'),
        #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
        #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May hide the application':
        ('Application.Visible', 'ShowWindow', 'SW_HIDE'),
    'May create a directory':
        ('MkDir',),
    'May save the current workbook':
        ('ActiveWorkbook.SaveAs',),
    'May change which directory contains files to open at startup':
        #TODO: confirm the actual effect
        ('Application.AltStartupPath',),
    'May create an OLE object':
        ('CreateObject',),
    'May run an application (if combined with CreateObject)':
        ('Shell.Application',),
    'May enumerate application windows (if combined with Shell.Application object)':
        ('Windows', 'FindWindow'),
    'May run code from a DLL':
        #TODO: regex to find declare+lib on same line
        ('Lib',),
    'May download files from the Internet':
        #TODO: regex to find urlmon+URLDownloadToFileA on same line
        ('URLDownloadToFileA',),
    'May control another application by simulating user keystrokes':
        ('SendKeys', 'AppActivate'),
        #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls':
        ('CallByName',),
        #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate specific strings':
        ('Chr', 'ChrB', 'ChrW', 'StrReverse'),
        #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
}
  253.  
# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Tuple of (pattern name, compiled regex) pairs.
RE_PATTERNS = (
    #TODO: check if this regex matches URLs with an IP address (various forms)
    # URL: http, https or ftp scheme only
    ('URL', re.compile(r'(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~])*[^\.\,\)\(\s]')),
    # IPv4 address: four dot-separated decimal numbers, each in the range 0-255
    ('IPv4 address', re.compile(r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b")),
    # E-mail address (case-insensitive), TLD of 2-12 letters or a punycode "xn--" TLD
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+(?:[A-Z]{2,12}|XN--[A-Z0-9]{4,18})\b')),
    # Domain name pattern, currently disabled:
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VB|VBS|JS|VBE|JSE|WS|WSF|WSC|WSH|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1|PS1XML|PS2|PS2XML|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    # Hex string pattern, currently disabled here (a separate module-level regex is used instead):
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
    )
  268.  
  269. # regex to detect strings encoded in hexadecimal
  270. re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')
  271.  
  272. # regex to detect strings encoded in base64
  273. re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
  274.  
  275.  
  276. #--- FUNCTIONS ----------------------------------------------------------------
  277.  
  278. def copytoken_help(decompressed_current, decompressed_chunk_start):
  279.     """
  280.    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help
  281.  
  282.    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
  283.    decompressed_chunk_start: offset of the current chunk in the decompressed container
  284.    return length_mask, offset_mask, bit_count, maximum_length
  285.    """
  286.     difference = decompressed_current - decompressed_chunk_start
  287.     bit_count = int(math.ceil(math.log(difference, 2)))
  288.     bit_count = max([bit_count, 4])
  289.     length_mask = 0xFFFF >> bit_count
  290.     offset_mask = ~length_mask
  291.     maximum_length = (0xFFFF >> bit_count) + 3
  292.     return length_mask, offset_mask, bit_count, maximum_length
  293.  
  294.  
  295. def decompress_stream (compressed_container):
  296.     """
  297.    Decompress a stream according to MS-OVBA section 2.4.1
  298.  
  299.    compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
  300.    return the decompressed container as a string (bytes)
  301.    """
  302.     # 2.4.1.2 State Variables
  303.  
  304.     # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
  305.     # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
  306.     # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
  307.     #                    decompression or to be written by compression.
  308.  
  309.     # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
  310.     # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
  311.     #                       CompressedContainer (section 2.4.1.1.1).
  312.  
  313.     # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
  314.     # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
  315.     #                      decompression or to be read by compression.
  316.     # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).
  317.  
  318.     # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
  319.     # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
  320.     #                         DecompressedBuffer (section 2.4.1.1.2).
  321.  
  322.     decompressed_container = ''  # result
  323.     compressed_current = 0
  324.  
  325.     sig_byte = ord(compressed_container[compressed_current])
  326.     if sig_byte != 0x01:
  327.         raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
  328.  
  329.     compressed_current += 1
  330.  
  331.     #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
  332.     # CompressedRecordEnd = len(compressed_container)
  333.     while compressed_current < len(compressed_container):
  334.         # 2.4.1.1.5
  335.         compressed_chunk_start = compressed_current
  336.         # chunk header = first 16 bits
  337.         compressed_chunk_header = struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
  338.         # chunk size = 12 first bits of header + 3
  339.         chunk_size = (compressed_chunk_header & 0x0FFF) + 3
  340.         # chunk signature = 3 next bits - should always be 0b011
  341.         chunk_signature = (compressed_chunk_header >> 12) & 0x07
  342.         if chunk_signature != 0b011:
  343.             raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
  344.         # chunk flag = next bit - 1 == compressed, 0 == uncompressed
  345.         chunk_flag = (compressed_chunk_header >> 15) & 0x01
  346.         logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))
  347.  
  348.         #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
  349.         # The minimum size is 3 bytes
  350.         # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
  351.         # in chunk header before adding 3.
  352.         # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
  353.         if chunk_flag == 1 and chunk_size > 4098:
  354.             raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
  355.         if chunk_flag == 0 and chunk_size != 4098:
  356.             raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')
  357.  
  358.         # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
  359.         #TODO: raise an exception?
  360.         if compressed_chunk_start + chunk_size > len(compressed_container):
  361.             logging.warning('Chunk size is larger than remaining compressed data')
  362.         compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size])
  363.         # read after chunk header:
  364.         compressed_current = compressed_chunk_start + 2
  365.  
  366.         if chunk_flag == 0:
  367.             # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
  368.             # uncompressed chunk: read the next 4096 bytes as-is
  369.             #TODO: check if there are at least 4096 bytes left
  370.             decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
  371.             compressed_current += 4096
  372.         else:
  373.             # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
  374.             # compressed chunk
  375.             decompressed_chunk_start = len(decompressed_container)
  376.             while compressed_current < compressed_end:
  377.                 # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
  378.                 # logging.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
  379.                 # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
  380.                 # copy tokens (reference to a previous literal token)
  381.                 flag_byte = ord(compressed_container[compressed_current])
  382.                 compressed_current += 1
  383.                 for bit_index in xrange(0, 8):
  384.                     # logging.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
  385.                     if compressed_current >= compressed_end:
  386.                         break
  387.                     # MS-OVBA 2.4.1.3.5 Decompressing a Token
  388.                     # MS-OVBA 2.4.1.3.17 Extract FlagBit
  389.                     flag_bit = (flag_byte >> bit_index) & 1
  390.                     #logging.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
  391.                     if flag_bit == 0: # LiteralToken
  392.                         # copy one byte directly to output
  393.                         decompressed_container += compressed_container[compressed_current]
  394.                         compressed_current += 1
  395.                     else: # CopyToken
  396.                         # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
  397.                         copy_token = struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]
  398.                         #TODO: check this
  399.                         length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
  400.                             len(decompressed_container), decompressed_chunk_start)
  401.                         length = (copy_token & length_mask) + 3
  402.                         temp1 = copy_token & offset_mask
  403.                         temp2 = 16 - bit_count
  404.                         offset = (temp1 >> temp2) + 1
  405.                         #logging.debug('offset=%d length=%d' % (offset, length))
  406.                         copy_source = len(decompressed_container) - offset
  407.                         for index in xrange(copy_source, copy_source + length):
  408.                             decompressed_container += decompressed_container[index]
  409.                         compressed_current += 2
  410.     return decompressed_container
  411.  
  412.  
  413. def _extract_vba (ole, vba_root, project_path, dir_path):
  414.     """
  415.    Extract VBA macros from an OleFileIO object.
  416.    Internal function, do not call directly.
  417.  
  418.    vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
  419.    vba_project: path to the PROJECT stream
  420.    This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
  421.    """
  422.     # Open the PROJECT stream:
  423.     project = ole.openstream(project_path)
  424.  
  425.     # sample content of the PROJECT stream:
  426.  
  427.     ##    ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
  428.     ##    Document=ThisDocument/&H00000000
  429.     ##    Module=NewMacros
  430.     ##    Name="Project"
  431.     ##    HelpContextID="0"
  432.     ##    VersionCompatible32="393222000"
  433.     ##    CMG="F1F301E705E705E705E705"
  434.     ##    DPB="8F8D7FE3831F2020202020"
  435.     ##    GC="2D2FDD81E51EE61EE6E1"
  436.     ##
  437.     ##    [Host Extender Info]
  438.     ##    &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
  439.     ##    &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
  440.     ##
  441.     ##    [Workspace]
  442.     ##    ThisDocument=22, 29, 339, 477, Z
  443.     ##    NewMacros=-4, 42, 832, 510, C
  444.  
  445.     code_modules = {}
  446.  
  447.     for line in project:
  448.         line = line.strip()
  449.         if '=' in line:
  450.             # split line at the 1st equal sign:
  451.             name, value = line.split('=', 1)
  452.             # looking for code modules
  453.             # add the code module as a key in the dictionary
  454.             # the value will be the extension needed later
  455.             # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
  456.             value = value.lower()
  457.             if name == 'Document':
  458.                 # split value at the 1st slash, keep 1st part:
  459.                 value = value.split('/', 1)[0]
  460.                 code_modules[value] = CLASS_EXTENSION
  461.             elif name == 'Module':
  462.                 code_modules[value] = MODULE_EXTENSION
  463.             elif name == 'Class':
  464.                 code_modules[value] = CLASS_EXTENSION
  465.             elif name == 'BaseClass':
  466.                 code_modules[value] = FORM_EXTENSION
  467.  
  468.     # read data from dir stream (compressed)
  469.     dir_compressed = ole.openstream(dir_path).read()
  470.  
  471.     def check_value(name, expected, value):
  472.         if expected != value:
  473.             logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value))
  474.  
  475.     dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))
  476.  
  477.     # PROJECTSYSKIND Record
  478.     PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0]
  479.     check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id)
  480.     PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0]
  481.     check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size)
  482.     PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0]
  483.     if PROJECTSYSKIND_SysKind == 0x00:
  484.         logging.debug("16-bit Windows")
  485.     elif PROJECTSYSKIND_SysKind == 0x01:
  486.         logging.debug("32-bit Windows")
  487.     elif PROJECTSYSKIND_SysKind == 0x02:
  488.         logging.debug("Macintosh")
  489.     elif PROJECTSYSKIND_SysKind == 0x03:
  490.         logging.debug("64-bit Windows")
  491.     else:
  492.         logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind))
  493.  
  494.     # PROJECTLCID Record
  495.     PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0]
  496.     check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id)
  497.     PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0]
  498.     check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size)
  499.     PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0]
  500.     check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid)
  501.  
  502.     # PROJECTLCIDINVOKE Record
  503.     PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0]
  504.     check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id)
  505.     PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  506.     check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size)
  507.     PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0]
  508.     check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke)
  509.  
  510.     # PROJECTCODEPAGE Record
  511.     PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0]
  512.     check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id)
  513.     PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  514.     check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size)
  515.     PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0]
  516.  
  517.     # PROJECTNAME Record
  518.     PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
  519.     check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id)
  520.     PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0]
  521.     if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128:
  522.         logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName))
  523.     PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName)
  524.  
  525.     # PROJECTDOCSTRING Record
  526.     PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0]
  527.     check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id)
  528.     PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
  529.     if PROJECTNAME_SizeOfProjectName > 2000:
  530.         logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString))
  531.     PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString)
  532.     PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  533.     check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved)
  534.     PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  535.     if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0:
  536.         logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
  537.     PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode)
  538.  
  539.     # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
  540.     PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0]
  541.     check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id)
  542.     PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0]
  543.     if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260:
  544.         logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1))
  545.     PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1)
  546.     PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  547.     check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved)
  548.     PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0]
  549.     if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1:
  550.         logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
  551.     PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2)
  552.     if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1:
  553.         logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")
  554.  
  555.     # PROJECTHELPCONTEXT Record
  556.     PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0]
  557.     check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id)
  558.     PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  559.     check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size)
  560.     PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
  561.  
  562.     # PROJECTLIBFLAGS Record
  563.     PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0]
  564.     check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id)
  565.     PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0]
  566.     check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size)
  567.     PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0]
  568.     check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags)
  569.  
  570.     # PROJECTVERSION Record
  571.     PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0]
  572.     check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id)
  573.     PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  574.     check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved)
  575.     PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0]
  576.     PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0]
  577.  
  578.     # PROJECTCONSTANTS Record
  579.     PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0]
  580.     check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id)
  581.     PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0]
  582.     if PROJECTCONSTANTS_SizeOfConstants > 1015:
  583.         logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants))
  584.     PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants)
  585.     PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  586.     check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved)
  587.     PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  588.     if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0:
  589.         logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
  590.     PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode)
  591.  
  592.     # array of REFERENCE records
  593.     check = None
  594.     while True:
  595.         check = struct.unpack("<H", dir_stream.read(2))[0]
  596.         logging.debug("reference type = {0:04X}".format(check))
  597.         if check == 0x000F:
  598.             break
  599.  
  600.         if check == 0x0016:
  601.             # REFERENCENAME
  602.             REFERENCE_Id = check
  603.             REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0]
  604.             REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName)
  605.             REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  606.             check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved)
  607.             REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  608.             REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode)
  609.             continue
  610.  
  611.         if check == 0x0033:
  612.             # REFERENCEORIGINAL (followed by REFERENCECONTROL)
  613.             REFERENCEORIGINAL_Id = check
  614.             REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0]
  615.             REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal)
  616.             continue
  617.  
  618.         if check == 0x002F:
  619.             # REFERENCECONTROL
  620.             REFERENCECONTROL_Id = check
  621.             REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore
  622.             REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0]
  623.             REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled)
  624.             REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore
  625.             check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1)
  626.             REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore
  627.             check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2)
  628.             # optional field
  629.             check2 = struct.unpack("<H", dir_stream.read(2))[0]
  630.             if check2 == 0x0016:
  631.                 REFERENCECONTROL_NameRecordExtended_Id = check
  632.                 REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0]
  633.                 REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName)
  634.                 REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  635.                 check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved)
  636.                 REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  637.                 REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode)
  638.                 REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
  639.             else:
  640.                 REFERENCECONTROL_Reserved3 = check2
  641.  
  642.             check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3)
  643.             REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0]
  644.             REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0]
  645.             REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended)
  646.             REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
  647.             REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
  648.             REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16)
  649.             REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0]
  650.             continue
  651.  
  652.         if check == 0x000D:
  653.             # REFERENCEREGISTERED
  654.             REFERENCEREGISTERED_Id = check
  655.             REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0]
  656.             REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0]
  657.             REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid)
  658.             REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
  659.             check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1)
  660.             REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
  661.             check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2)
  662.             continue
  663.  
  664.         if check == 0x000E:
  665.             # REFERENCEPROJECT
  666.             REFERENCEPROJECT_Id = check
  667.             REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  668.             REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0]
  669.             REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute)
  670.             REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0]
  671.             REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative)
  672.             REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0]
  673.             REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0]
  674.             continue
  675.  
  676.         logging.error('invalid or unknown check Id {0:04X}'.format(check))
  677.         sys.exit(0)
  678.  
  679.     PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0]
  680.     check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id)
  681.     PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0]
  682.     check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size)
  683.     PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0]
  684.     PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0]
  685.     check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id)
  686.     PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0]
  687.     check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size)
  688.     PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
  689.  
  690.     logging.debug("parsing {0} modules".format(PROJECTMODULES_Count))
  691.     for x in xrange(0, PROJECTMODULES_Count):
  692.         MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
  693.         check_value('MODULENAME_Id', 0x0019, MODULENAME_Id)
  694.         MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0]
  695.         MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName)
  696.         # account for optional sections
  697.         section_id = struct.unpack("<H", dir_stream.read(2))[0]
  698.         if section_id == 0x0047:
  699.             MODULENAMEUNICODE_Id = section_id
  700.             MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  701.             MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode)
  702.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  703.         if section_id == 0x001A:
  704.             MODULESTREAMNAME_id = section_id
  705.             MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0]
  706.             MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName)
  707.             MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  708.             check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved)
  709.             MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  710.             MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode)
  711.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  712.         if section_id == 0x001C:
  713.             MODULEDOCSTRING_Id = section_id
  714.             check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id)
  715.             MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
  716.             MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString)
  717.             MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  718.             check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved)
  719.             MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  720.             MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode)
  721.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  722.         if section_id == 0x0031:
  723.             MODULEOFFSET_Id = section_id
  724.             check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id)
  725.             MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0]
  726.             check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size)
  727.             MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0]
  728.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  729.         if section_id == 0x001E:
  730.             MODULEHELPCONTEXT_Id = section_id
  731.             check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id)
  732.             MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  733.             check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size)
  734.             MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
  735.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  736.         if section_id == 0x002C:
  737.             MODULECOOKIE_Id = section_id
  738.             check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id)
  739.             MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  740.             check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size)
  741.             MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
  742.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  743.         if section_id == 0x0021 or section_id == 0x0022:
  744.             MODULETYPE_Id = section_id
  745.             MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  746.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  747.         if section_id == 0x0025:
  748.             MODULEREADONLY_Id = section_id
  749.             check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id)
  750.             MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  751.             check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved)
  752.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  753.         if section_id == 0x0028:
  754.             MODULEPRIVATE_Id = section_id
  755.             check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id)
  756.             MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  757.             check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved)
  758.             section_id = struct.unpack("<H", dir_stream.read(2))[0]
  759.         if section_id == 0x002B: # TERMINATOR
  760.             MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  761.             check_value('MODULE_Reserved', 0x0000, MODULE_Reserved)
  762.             section_id = None
  763.         if section_id != None:
  764.             logging.warning('unknown or invalid module section id {0:04X}'.format(section_id))
  765.  
  766.         logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage)
  767.         vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage
  768.         logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName))
  769.         logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName)))
  770.         streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec)
  771.         logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
  772.         logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode)))
  773.         logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
  774.  
  775.         code_path = vba_root + u'VBA/' + streamname_unicode
  776.         #TODO: test if stream exists
  777.         logging.debug('opening VBA code stream %s' % repr(code_path))
  778.         code_data = ole.openstream(code_path).read()
  779.         logging.debug("length of code_data = {0}".format(len(code_data)))
  780.         logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset))
  781.         code_data = code_data[MODULEOFFSET_TextOffset:]
  782.         if len(code_data) > 0:
  783.             code_data = decompress_stream(code_data)
  784.             # case-insensitive search in the code_modules dict to find the file extension:
  785.             filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')
  786.             filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
  787.             #TODO: also yield the codepage so that callers can decode it properly
  788.             yield (code_path, filename, code_data)
  789.             # print '-'*79
  790.             # print filename
  791.             # print ''
  792.             # print code_data
  793.             # print ''
  794.             logging.debug('extracted file {0}'.format(filename))
  795.         else:
  796.             logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
  797.     return
  798.  
  799.  
  800. def filter_vba(vba_code):
  801.     """
  802.    Filter VBA source code to remove the first lines starting with "Attribute VB_",
  803.    which are automatically added by MS Office and not displayed in the VBA Editor.
  804.    This should only be used when displaying source code for human analysis.
  805.  
  806.    Note: lines are not filtered if they contain a colon, because it could be
  807.    used to hide malicious instructions.
  808.  
  809.    :param vba_code: str, VBA source code
  810.    :return: str, filtered VBA source code
  811.    """
  812.     vba_lines = vba_code.splitlines()
  813.     start = 0
  814.     for line in vba_lines:
  815.         if line.startswith("Attribute VB_") and not ':' in line:
  816.             start += 1
  817.         else:
  818.             break
  819.     #TODO: also remove empty lines?
  820.     vba = '\n'.join(vba_lines[start:])
  821.     return vba
  822.  
  823.  
  824. def detect_autoexec(vba_code):
  825.     """
  826.    Detect if the VBA code contains keywords corresponding to macros running
  827.    automatically when triggered by specific actions (e.g. when a document is
  828.    opened or closed).
  829.  
  830.    :param vba_code: str, VBA source code
  831.    :return: list of str tuples (keyword, description)
  832.    """
  833.     #TODO: merge code with detect_suspicious
  834.     # case-insensitive search
  835.     #vba_code = vba_code.lower()
  836.     results = []
  837.     for description, keywords in AUTOEXEC_KEYWORDS.items():
  838.         for keyword in keywords:
  839.             #TODO: if keyword is already a compiled regex, use it as-is
  840.             # search using regex to detect word boundaries:
  841.             if re.search(r'(?i)\b'+keyword+r'\b', vba_code):
  842.             #if keyword.lower() in vba_code:
  843.                 results.append((keyword, description))
  844.     return results
  845.  
  846.  
  847. def detect_suspicious(vba_code):
  848.     """
  849.    Detect if the VBA code contains suspicious keywords corresponding to
  850.    potential malware behaviour.
  851.  
  852.    :param vba_code: str, VBA source code
  853.    :return: list of str tuples (keyword, description)
  854.    """
  855.     # case-insensitive search
  856.     #vba_code = vba_code.lower()
  857.     results = []
  858.     for description, keywords in SUSPICIOUS_KEYWORDS.items():
  859.         for keyword in keywords:
  860.             # search using regex to detect word boundaries:
  861.             if re.search(r'(?i)\b'+keyword+r'\b', vba_code):
  862.             #if keyword.lower() in vba_code:
  863.                 results.append((keyword, description))
  864.     return results
  865.  
  866.  
  867. def detect_patterns(vba_code):
  868.     """
  869.    Detect if the VBA code contains specific patterns such as IP addresses,
  870.    URLs, e-mail addresses, executable file names, etc.
  871.  
  872.    :param vba_code: str, VBA source code
  873.    :return: list of str tuples (pattern type, value)
  874.    """
  875.     results = []
  876.     found = set()
  877.     for pattern_type, pattern_re in RE_PATTERNS:
  878.         for match in pattern_re.finditer(vba_code):
  879.             value = match.group()
  880.             if value not in found:
  881.                 results.append((pattern_type, value))
  882.                 found.add(value)
  883.     return results
  884.  
  885.  
  886. def detect_hex_strings(vba_code):
  887.     """
  888.    Detect if the VBA code contains strings encoded in hexadecimal.
  889.  
  890.    :param vba_code: str, VBA source code
  891.    :return: list of str tuples (encoded string, decoded string)
  892.    """
  893.     results = []
  894.     found = set()
  895.     for match in re_hex_string.finditer(vba_code):
  896.         value = match.group()
  897.         if value not in found:
  898.             decoded = binascii.unhexlify(value)
  899.             results.append((value, decoded))
  900.             found.add(value)
  901.     return results
  902.  
  903. def detect_base64_strings(vba_code):
  904.     """
  905.    Detect if the VBA code contains strings encoded in base64.
  906.  
  907.    :param vba_code: str, VBA source code
  908.    :return: list of str tuples (encoded string, decoded string)
  909.    """
  910.     results = []
  911.     found = set()
  912.     for match in re_base64_string.finditer(vba_code):
  913.         value = match.group()
  914.         if value not in found:
  915.             decoded = base64.b64decode(value)
  916.             results.append((value, decoded))
  917.             found.add(value)
  918.     return results
  919.  
  920. def scan_vba(vba_code):
  921.     """
  922.    Analyze the provided VBA code to detect suspicious keywords,
  923.    auto-executable macros, IOC patterns, obfuscation patterns
  924.    such as hex-encoded strings.
  925.  
  926.    :param vba_code: str, VBA source code to be analyzed
  927.    :return: list of tuples (type, keyword, description)
  928.    (type = 'AutoExec', 'Suspicious', 'IOC' or 'Hex String')
  929.    """
  930.     # First, detect and extract hex-encoded strings:
  931.     hex_strings = detect_hex_strings(vba_code)
  932.     base64_strings = detect_base64_strings(vba_code)
  933.     # detect if the code contains StrReverse:
  934.     if 'strreverse' in vba_code.lower(): strreverse = True
  935.     else: strreverse = False
  936.     # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords:
  937.     for encoded, decoded in hex_strings:
  938.         vba_code += '\n'+decoded
  939.         # if the code contains "StrReverse", also append the hex strings in reverse order:
  940.         if strreverse:
  941.             # StrReverse after hex decoding:
  942.             vba_code += '\n'+decoded[::-1]
  943.             # StrReverse before hex decoding:
  944.             vba_code += '\n'+binascii.unhexlify(encoded[::-1])
  945.             #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
  946.     #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
  947.     autoexec_keywords = detect_autoexec(vba_code)
  948.     suspicious_keywords = detect_suspicious(vba_code)
  949.     # If hex-encoded strings were discovered, add an item to suspicious keywords:
  950.     if hex_strings:
  951.         suspicious_keywords.append(('Hex Strings', 'Hex-encoded strings were detected, may be used to obfuscate strings (option --hex to see all)'))
  952.     if base64_strings:
  953.         suspicious_keywords.append(('Base64 Strings', 'Base64-encoded strings were detected, may be used to obfuscate strings'))
  954.     patterns = detect_patterns(vba_code)
  955.     results = []
  956.     for keyword, description in autoexec_keywords:
  957.         results.append(('AutoExec', keyword, description))
  958.     for keyword, description in suspicious_keywords:
  959.         results.append(('Suspicious', keyword, description))
  960.     for pattern_type, value in patterns:
  961.         results.append(('IOC', value, pattern_type))
  962.     # Only if option --hex:
  963.     # for encoded, decoded in hex_strings:
  964.     #     results.append(('Hex String', repr(decoded), encoded))
  965.     for encoded, decoded in base64_strings:
  966.          results.append(('Base64 String', repr(decoded), encoded))
  967.     return results
  968.  
  969.  
  970. #=== CLASSES =================================================================
  971.  
  972. class VBA_Parser(object):
  973.     """
  974.    Class to parse MS Office files, to detect VBA macros and extract VBA source code
  975.    Supported file formats:
  976.    - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  977.    - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  978.    - PowerPoint 2007+ (.pptm, .ppsm)
  979.    """
  980.  
  981.     def __init__(self, filename, data=None):
  982.         """
  983.        Constructor for VBA_Parser
  984.  
  985.        :param _file: path of file to parse, file-like object or file content
  986.        :param filename: actual filename if _file is a  file-like object or file content
  987.        in a bytes string
  988.        """
  989.         #TODO: filename should be mandatory, optional data is a string or file-like object
  990.         #TODO: also support olefile and zipfile as input
  991.         if data is None:
  992.             # open file from disk:
  993.             _file = filename
  994.         else:
  995.             # file already read in memory, make it a file-like object for zipfile:
  996.             _file = cStringIO.StringIO(data)
  997.         #self.file = _file
  998.         self.ole_file = None
  999.         self.ole_subfiles = []
  1000.         self.filename = filename
  1001.         self.type = None
  1002.         self.vba_projects = None
  1003.         # if filename is None:
  1004.         #     if isinstance(_file, basestring):
  1005.         #         if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
  1006.         #             self.filename = _file
  1007.         #         else:
  1008.         #             self.filename = '<file in bytes string>'
  1009.         #     else:
  1010.         #         self.filename = '<file-like object>'
  1011.         if olefile.isOleFile(_file):
  1012.             # This looks like an OLE file
  1013.             logging.info('Parsing OLE file %s' % self.filename)
  1014.             # Open and parse the OLE file, using unicode for path names:
  1015.             self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
  1016.             self.type = TYPE_OLE
  1017.             #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
  1018.         elif zipfile.is_zipfile(_file):
  1019.             # This looks like a zip file, need to look for vbaProject.bin inside
  1020.             # It can be any OLE file inside the archive
  1021.             #...because vbaProject.bin can be renamed:
  1022.             # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
  1023.             logging.info('Opening ZIP/OpenXML file %s' % self.filename)
  1024.             self.type = TYPE_OpenXML
  1025.             z = zipfile.ZipFile(_file)
  1026.             #TODO: check if this is actually an OpenXML file
  1027.             # check each file within the zip if it is an OLE file, by reading its magic:
  1028.             for subfile in z.namelist():
  1029.                 magic = z.open(subfile).read(len(olefile.MAGIC))
  1030.                 if magic == olefile.MAGIC:
  1031.                     logging.debug('Opening OLE file %s within zip' % subfile)
  1032.                     ole_data = z.open(subfile).read()
  1033.                     try:
  1034.                         self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
  1035.                     except:
  1036.                         logging.debug('%s is not a valid OLE file' % subfile)
  1037.                         continue
  1038.             z.close()
  1039.         else:
  1040.             msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
  1041.             logging.error(msg)
  1042.             raise TypeError(msg)
  1043.  
  1044.     def find_vba_projects (self):
  1045.         """
  1046.        Finds all the VBA projects stored in an OLE file.
  1047.  
  1048.        Return None if the file is not OLE but OpenXML.
  1049.        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
  1050.        vba_root is the path of the root OLE storage containing the VBA project,
  1051.        including a trailing slash unless it is the root of the OLE file.
  1052.        project_path is the path of the OLE stream named "PROJECT" within the VBA project.
  1053.        dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.
  1054.  
  1055.        If this function returns an empty list for one of the supported formats
  1056.        (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
  1057.        file does not contain VBA macros.
  1058.  
  1059.        :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
  1060.        for each VBA project found if OLE file
  1061.        """
  1062.         # if the file is not OLE but OpenXML, return None:
  1063.         if self.ole_file is None:
  1064.             return None
  1065.  
  1066.         # if this method has already been called, return previous result:
  1067.         if self.vba_projects is not None:
  1068.             return self.vba_projects
  1069.  
  1070.         # Find the VBA project root (different in MS Word, Excel, etc):
  1071.         # - Word 97-2003: Macros
  1072.         # - Excel 97-2003: _VBA_PROJECT_CUR
  1073.         # - PowerPoint 97-2003: not supported yet (different file structure)
  1074.         # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
  1075.         # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
  1076.         # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
  1077.         # - Visio 2007: not supported yet (different file structure)
  1078.  
  1079.         # According to MS-OVBA section 2.2.1:
  1080.         # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
  1081.         # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
  1082.         # - all names are case-insensitive
  1083.  
  1084.         # start with an empty list:
  1085.         self.vba_projects = []
  1086.         # Look for any storage containing those storage/streams:
  1087.         ole = self.ole_file
  1088.         for storage in ole.listdir(streams=False, storages=True):
  1089.             # Look for a storage ending with "VBA":
  1090.             if storage[-1].upper() == 'VBA':
  1091.                 logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
  1092.                 vba_root = '/'.join(storage[:-1])
  1093.                 # Add a trailing slash to vba_root, unless it is the root of the OLE file:
  1094.                 # (used later to append all the child streams/storages)
  1095.                 if vba_root != '':
  1096.                     vba_root += '/'
  1097.                 logging.debug('Checking vba_root="%s"' % vba_root)
  1098.  
  1099.                 def check_vba_stream(ole, vba_root, stream_path):
  1100.                     full_path = vba_root + stream_path
  1101.                     if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
  1102.                         logging.debug('Found %s stream: %s' % (stream_path, full_path))
  1103.                         return full_path
  1104.                     else:
  1105.                         logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
  1106.                         return False
  1107.  
  1108.                 # Check if the VBA root storage also contains a PROJECT stream:
  1109.                 project_path = check_vba_stream(ole, vba_root, 'PROJECT')
  1110.                 if not project_path: continue
  1111.                 # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
  1112.                 vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
  1113.                 if not vba_project_path: continue
  1114.                 # Check if the VBA root storage also contains a VBA/dir stream:
  1115.                 dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
  1116.                 if not dir_path: continue
  1117.                 # Now we are pretty sure it is a VBA project structure
  1118.                 logging.debug('VBA root storage: "%s"' % vba_root)
  1119.                 # append the results to the list as a tuple for later use:
  1120.                 self.vba_projects.append((vba_root, project_path, dir_path))
  1121.         return self.vba_projects
  1122.  
  1123.     def detect_vba_macros(self):
  1124.         """
  1125.        Detect the potential presence of VBA macros in the file, by checking
  1126.        if it contains VBA projects. Both OLE and OpenXML files are supported.
  1127.  
  1128.        Important: for now, results are accurate only for Word, Excel and PowerPoint
  1129.        EXCEPT Powerpoint 97-2003, which has a different structure for VBA.
  1130.  
  1131.        Note: this method does NOT attempt to check the actual presence or validity
  1132.        of VBA macro source code, so there might be false positives.
  1133.        It may also detect VBA macros in files embedded within the main file,
  1134.        for example an Excel workbook with macros embedded into a Word
  1135.        document without macros may be detected, without distinction.
  1136.  
  1137.        :return: bool, True if at least one VBA project has been found, False otherwise
  1138.        """
  1139.         #TODO: return None or raise exception if format not supported like PPT 97-2003
  1140.         #TODO: return the number of VBA projects found instead of True/False?
  1141.         # if OpenXML, check all the OLE subfiles:
  1142.         if self.ole_file is None:
  1143.             for ole_subfile in self.ole_subfiles:
  1144.                 if ole_subfile.detect_vba_macros():
  1145.                     return True
  1146.             return False
  1147.         # otherwise it's an OLE file, find VBA projects:
  1148.         vba_projects = self.find_vba_projects()
  1149.         if len(vba_projects) == 0:
  1150.             return False
  1151.         else:
  1152.             return True
  1153.  
  1154.  
  1155.     def extract_macros (self):
  1156.         """
  1157.        Extract and decompress source code for each VBA macro found in the file
  1158.  
  1159.        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
  1160.        If the file is OLE, filename is the path of the file.
  1161.        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
  1162.        within the zip archive, e.g. word/vbaProject.bin.
  1163.        """
  1164.         if self.ole_file is None:
  1165.             for ole_subfile in self.ole_subfiles:
  1166.                 for results in ole_subfile.extract_macros():
  1167.                     yield results
  1168.         else:
  1169.             self.find_vba_projects()
  1170.             for vba_root, project_path, dir_path in self.vba_projects:
  1171.                 # extract all VBA macros from that VBA root storage:
  1172.                 for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
  1173.                     yield (self.filename, stream_path, vba_filename, vba_code)
  1174.  
  1175.  
  1176.     def close(self):
  1177.         """
  1178.        Close all the open files. This method must be called after usage, if
  1179.        the application is opening many files.
  1180.        """
  1181.         if self.ole_file is None:
  1182.             for ole_subfile in self.ole_subfiles:
  1183.                 ole_subfile.close()
  1184.         else:
  1185.             self.ole_file.close()
  1186.  
  1187.  
  1188. def print_analysis(vba_code):
  1189.     """
  1190.    Analyze the provided VBA code, and print the results in a table
  1191.  
  1192.    :param vba_code: str, VBA source code to be analyzed
  1193.    :return: None
  1194.    """
  1195.     results = scan_vba(vba_code)
  1196.     if results:
  1197.         t = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
  1198.         t.align = 'l'
  1199.         t.max_width['Type'] = 10
  1200.         t.max_width['Keyword'] = 20
  1201.         t.max_width['Description'] = 39
  1202.         for kw_type, keyword, description in results:
  1203.             t.add_row((kw_type, keyword, description))
  1204.         print t
  1205.     else:
  1206.         print 'No suspicious keyword or IOC found.'
  1207.  
  1208.  
  1209.  
  1210. def process_file (container, filename, data):
  1211.     """
  1212.    Process a single file
  1213.  
  1214.    :param container: str, path and filename of container if the file is within
  1215.    a zip archive, None otherwise.
  1216.    :param filename: str, path and filename of file on disk, or within the container.
  1217.    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
  1218.    """
  1219.     #TODO: replace print by writing to a provided output file (sys.stdout by default)
  1220.     if container:
  1221.         display_filename = '%s in %s' % (filename, container)
  1222.     else:
  1223.         display_filename = filename
  1224.     print '='*79
  1225.     print 'FILE:', display_filename
  1226.     try:
  1227.         #TODO: handle olefile errors, when an OLE file is malformed
  1228.         vba = VBA_Parser(filename, data)
  1229.         print 'Type:', vba.type
  1230.         if vba.detect_vba_macros():
  1231.             #print 'Contains VBA Macros:'
  1232.             for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
  1233.                 # hide attribute lines:
  1234.                 #TODO: option to disable attribute filtering
  1235.                 vba_code = filter_vba(vba_code)
  1236.                 print '-'*79
  1237.                 print 'VBA MACRO %s ' % vba_filename
  1238.                 print 'in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))
  1239.                 print '- '*39
  1240.                 # detect empty macros:
  1241.                 if vba_code.strip() == '':
  1242.                     print '(empty macro)'
  1243.                 else:
  1244.                     print vba_code
  1245.                     print '- '*39
  1246.                     print 'ANALYSIS:'
  1247.                     print_analysis(vba_code)
  1248.         else:
  1249.             print 'No VBA macros found.'
  1250.     except: #TypeError:
  1251.         #raise
  1252.         #TODO: print more info if debug mode
  1253.         print sys.exc_value
  1254.     print ''
  1255.  
  1256.  
  1257. def process_file_triage (container, filename, data):
  1258.     """
  1259.    Process a single file
  1260.  
  1261.    :param container: str, path and filename of container if the file is within
  1262.    a zip archive, None otherwise.
  1263.    :param filename: str, path and filename of file on disk, or within the container.
  1264.    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
  1265.    """
  1266.     #TODO: replace print by writing to a provided output file (sys.stdout by default)
  1267.     nb_macros = 0
  1268.     nb_autoexec = 0
  1269.     nb_suspicious = 0
  1270.     nb_iocs = 0
  1271.     nb_hexstrings = 0
  1272.     # ftype = 'Other'
  1273.     message = ''
  1274.     try:
  1275.         #TODO: handle olefile errors, when an OLE file is malformed
  1276.         vba = VBA_Parser(filename, data)
  1277.         if vba.detect_vba_macros():
  1278.             for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
  1279.                 nb_macros += 1
  1280.                 if vba_code.strip() != '':
  1281.                     #TODO: same changes as scan_vba, or modify scan_vba to return these counts
  1282.                     nb_autoexec += len(detect_autoexec(vba_code))
  1283.                     nb_suspicious += len(detect_suspicious(vba_code))
  1284.                     nb_iocs += len(detect_patterns(vba_code))
  1285.                     nb_hexstrings += len(detect_hex_strings(vba_code))
  1286.         if vba.type == TYPE_OLE:
  1287.             flags = 'O'
  1288.         else:
  1289.             flags = 'X'
  1290.         macros = autoexec = suspicious = iocs = hexstrings = '-'
  1291.         if nb_macros: macros = 'M'
  1292.         if nb_autoexec: autoexec = 'A'
  1293.         if nb_suspicious: suspicious = 'S'
  1294.         if nb_iocs: iocs = 'I'
  1295.         if nb_hexstrings: hexstrings = 'H'
  1296.         flags += '%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings)
  1297.  
  1298.         # macros = autoexec = suspicious = iocs = hexstrings = 'no'
  1299.         # if nb_macros: macros = 'YES:%d' % nb_macros
  1300.         # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec
  1301.         # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious
  1302.         # if nb_iocs: iocs = 'YES:%d' % nb_iocs
  1303.         # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings
  1304.         # # 2nd line = info
  1305.         # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (vba.type, macros, autoexec, suspicious, iocs, hexstrings)
  1306.     except TypeError:
  1307.         # file type not OLE nor OpenXML
  1308.         flags = '?'
  1309.         message = 'File format not supported'
  1310.     except:
  1311.         # another error occurred
  1312.         #raise
  1313.         #TODO: print more info if debug mode
  1314.         #TODO: distinguish real errors from incorrect file types
  1315.         flags = '!ERROR'
  1316.         message = sys.exc_value
  1317.     line = '%-6s %s' % (flags, filename)
  1318.     if message:
  1319.         line += ' - %s' % message
  1320.     print line
  1321.  
  1322.     # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'),
  1323.     #     header=False, border=False)
  1324.     # t.align = 'l'
  1325.     # t.max_width['filename'] = 30
  1326.     # t.max_width['type'] = 10
  1327.     # t.max_width['macros'] = 6
  1328.     # t.max_width['autoexec'] = 6
  1329.     # t.max_width['suspicious'] = 6
  1330.     # t.max_width['ioc'] = 6
  1331.     # t.max_width['hexstrings'] = 6
  1332.     # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings))
  1333.     # print t
  1334.  
  1335. def main_triage_quick():
  1336.     pass
  1337.  
  1338. #=== MAIN =====================================================================
  1339.  
  1340. def main():
  1341.     """
  1342.    Main function, called when olevba is run from the command line
  1343.    """
  1344.     usage = 'usage: %prog [options] <filename> [filename2 ...]'
  1345.     parser = optparse.OptionParser(usage=usage)
  1346.     # parser.add_option('-o', '--outfile', dest='outfile',
  1347.     #     help='output file')
  1348.     # parser.add_option('-c', '--csv', dest='csv',
  1349.     #     help='export results to a CSV file')
  1350.     parser.add_option("-r", action="store_true", dest="recursive",
  1351.         help='find files recursively in subdirectories.')
  1352.     parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
  1353.         help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
  1354.     parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
  1355.         help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
  1356.     parser.add_option("-t", action="store_true", dest="triage_mode",
  1357.         help='triage mode, display results as a summary table (default for multiple files)')
  1358.     parser.add_option("-d", action="store_true", dest="detailed_mode",
  1359.         help='detailed mode, display full results (default for single file)')
  1360.     parser.add_option("-i", "--input", dest='input', type='str', default=None,
  1361.         help='input file containing VBA source code to be analyzed (no parsing)')
  1362.  
  1363.     (options, args) = parser.parse_args()
  1364.  
  1365.     # Print help if no arguments are passed
  1366.     if len(args) == 0 and not options.input:
  1367.         print __doc__
  1368.         parser.print_help()
  1369.         sys.exit()
  1370.  
  1371.     logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
  1372.     # For now, all logging is disabled:
  1373.     logging.disable(logging.CRITICAL)
  1374.  
  1375.     if options.input:
  1376.         # input file provided with VBA source code to be analyzed directly:
  1377.         print 'Analysis of VBA source code from %s:' % options.input
  1378.         vba_code = open(options.input).read()
  1379.         print_analysis(vba_code)
  1380.         sys.exit()
  1381.  
  1382.     # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr')
  1383.     # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7)
  1384.     if not options.detailed_mode or options.triage_mode:
  1385.         print '%-6s %-72s' % ('Flags', 'Filename')
  1386.         print '%-6s %-72s' % ('-'*6, '-'*72)
  1387.     previous_container = None
  1388.     count = 0
  1389.     container = filename = data = None
  1390.     for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
  1391.         zip_password=options.zip_password, zip_fname=options.zip_fname):
  1392.         # ignore directory names stored in zip files:
  1393.         if container and filename.endswith('/'):
  1394.             continue
  1395.         if options.detailed_mode and not options.triage_mode:
  1396.             # fully detailed output
  1397.             process_file(container, filename, data)
  1398.         else:
  1399.             # print container name when it changes:
  1400.             if container != previous_container:
  1401.                 if container is not None:
  1402.                     print '\nFiles in %s:' % container
  1403.                 previous_container = container
  1404.             # summarized output for triage:
  1405.             process_file_triage(container, filename, data)
  1406.         count += 1
  1407.     if not options.detailed_mode or options.triage_mode:
  1408.         print '\n(Flags: O=OLE, X=OpenXML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex-encoded strings, ?=Unknown)\n'
  1409.  
  1410.     if count == 1 and not options.triage_mode and not options.detailed_mode:
  1411.         # if options -t and -d were not specified and it's a single file, print details:
  1412.         #TODO: avoid doing the analysis twice by storing results
  1413.         process_file(container, filename, data)
  1414.  
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module:
if __name__ == '__main__':
    main()
  1417.  
  1418. # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness
Add Comment
Please, Sign In to add comment