Advertisement
lilo_booter

JPEG Scraper from a Corrupt or Formatted HD

Nov 11th, 2015
171
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.72 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import sys
  4. import array
  5. import os
  6.  
  7. # Usage message if arguments incomplete
  8. if len( sys.argv ) < 3:
  9.     print "Usage: %s disk-image directory" % sys.argv[ 0 ]
  10.     sys.exit( 0 )
  11.  
  12. # Deal with arguments
  13. disk_image = sys.argv[ 1 ]
  14. directory = sys.argv[ 2 ]
  15. pattern = directory + "/%05d.jpg"
  16.  
  17. # Create the output directory if doesn't exist
  18. if not os.path.exists( directory ):
  19.     os.makedirs( directory )
  20.  
  21. # Open the disk image
  22. fd = open( disk_image, "rb" )
  23.  
  24. # Create an array to hold the bytes read
  25. buffer = array.array( "B" )
  26.  
  27. # Holds the number of images found
  28. found = 0
  29.  
  30. # Start and End of image constants
  31. SOI = chr( 0xff ) + chr( 0xd8 ) + chr( 0xff )
  32. EOI = chr( 0xff ) + chr( 0xd9 )
  33.  
  34. while True:
  35.     # Keep track of where we are in the file (debug mostly)
  36.     # NB: buffer might not be empty here - see below - hence real offset is
  37.     # current position minus the length of the buffer
  38.     offset = fd.tell( ) - len( buffer )
  39.  
  40.     # Add 1K of data to the array
  41.     buffer.fromfile( fd, 1024 )
  42.     if len( buffer ) == 0: break
  43.  
  44.     # Ensure that we don't miss the marker or length - keep adding until we
  45.     # are sure we won't run out of bytes during the parsing of the current buffer
  46.     while buffer[ -2 ] == 0xff or buffer[ -1 ] == 0xff:
  47.         buffer.fromfile( fd, 4 )
  48.  
  49.     # Determine if and where the array contains a start of image
  50.     # NB: It could possibly contain multiple SOI's - we'll deal with that below
  51.     soi = buffer.tostring( ).find( SOI )
  52.  
  53.     if soi != -1:
  54.         # Determine the absolute soi (for diagnostics)
  55.         abs_soi = offset + soi
  56.  
  57.         # There is definitely nothing of interest before the SOI, so discard what comes before
  58.         buffer = buffer[ soi : ]
  59.  
  60.         # Current eoi is immediately after the soi
  61.         eoi = 2
  62.  
  63.         # Keep reading while the current eoi is a valid jpeg section (starting with 0xff) and
  64.         # not followed immediately by 0xd9
  65.         while buffer[ eoi ] == 0xff and buffer[ eoi + 1 ] != 0xd9:
  66.  
  67.             # All sections apart from the data (0xda) should follow with a length
  68.             if buffer[ eoi + 1 ] != 0xda:
  69.  
  70.                 # There must be a 2 byte length following the JPEG marker here
  71.                 if len( buffer ) < eoi + 4:
  72.                     buffer.fromfile( fd, eoi + 4 - len( buffer ) )
  73.  
  74.                 # Calculate the size of the section plus the following marker and length
  75.                 collect = buffer[ eoi + 2 ] * 256 + buffer[ eoi + 3 ] + 4
  76.  
  77.                 # Calculate number of bytes not in the buffer already and fetch them
  78.                 remainder = collect - len( buffer ) + eoi
  79.                 if remainder > 0:
  80.                     buffer.fromfile( fd, remainder )
  81.  
  82.                 # Current eoi should point at the start of a JPEG section (0xff)
  83.                 eoi += collect - 2
  84.  
  85.             else:
  86.                 # Don't like this - assumes a SOX (0xff 0xda) will be followed by an EOI eventually..
  87.                 while True:
  88.                     buffer.fromfile( fd, 1024 )
  89.                     e = buffer[ eoi : ].tostring( ).find( EOI )
  90.                     if e != -1:
  91.                         eoi += e
  92.                         break
  93.                     else:
  94.                         # Just in case the EOI marker falls between the last bytes read and the following
  95.                         eoi = len( buffer ) - 1
  96.  
  97.             # Ensure that we don't encounter an inner SOI here and that we are pointing at a 0xff value
  98.             if buffer[ eoi ] != 0xff or buffer[ eoi + 1 ] == 0xd8: break
  99.  
  100.         # Determine if a valid image has been detected (SOI ... EOI)
  101.         if buffer[ eoi ] == 0xff and buffer[ eoi + 1 ] == 0xd9:
  102.  
  103.             # Report and save the image
  104.             print found, ": found at", abs_soi, eoi + 2
  105.             buffer[ : eoi + 2 ].tofile( open( pattern % found, "wb" ) )
  106.             found += 1
  107.  
  108.             # Retain everything after the image
  109.             buffer = buffer[ eoi + 2 : ]
  110.         else:
  111.             # Discard all but the first 2 bytes (SOI) in case another SOI exists in the buffer
  112.             print "false positive at", abs_soi, "of", eoi + 2
  113.             buffer = buffer[ 2 : ]
  114.     else:
  115.         # Definitely nothing so empty the buffer and continue
  116.         buffer = buffer[ 0 : 0 ]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement