Advertisement
snake5

SGScript - .tar parsing code used on a 35 GB file

Mar 19th, 2014
409
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. include "io", "string", "fmt", "math", "re";
  2.  
  3.  
  // Archive to scan. It is validated right after opening so nothing below
  // runs against an unusable handle.
  // NOTE(review): assumes a file object that failed to open converts to
  // false, as the original check did - confirm against the io_file docs.
  TARFILE = io_file( "D:/data/data.tar", FILE_READ );
  if( !TARFILE )
      ERROR( "no tarfile" );

  // Output roots; the tar entry name is appended to one of these.
  OUTDIR_ARTICLES = "./for_articles/";
  OUTDIR_CONTRACTS = "./for_contracts/";

  // reslist.txt holds the wanted entries, one per line.
  // The raw text is checked before it is split, so a missing or empty
  // file is reported as "no reslist" instead of failing inside
  // string_explode / unique.
  RESLIST_TEXT = io_file_read( "reslist.txt" );
  if( !RESLIST_TEXT )
      ERROR( "no reslist" );

  RESLIST = string_explode( RESLIST_TEXT, "\n" );

  // Normalise each entry: drop bytes 0x80-0xff and decode "%20" to a space.
  // The return value is not assigned, so this relies on array_process
  // updating RESLIST in place (as the original code did).
  array_process( RESLIST, function( x )
  {
      x = re_replace( x, "#[\x80-\xff]#", "" );
      return string_replace( x, "%20", " " );
  });

  // De-duplicate after normalising, so entries that only differed in the
  // stripped/decoded characters collapse into one.
  RESLIST = RESLIST.unique();

  if( !RESLIST )
      ERROR( "no reslist" );
  22.  
  23.  
  // Split the wanted list into two lookup lists by the two-character tag at
  // the start of each entry: "A|" marks an article, anything else is
  // treated as a contract. The tag is stripped from the stored name.
  RESLIST_ARTICLES = [];
  RESLIST_CONTRACTS = [];
  foreach( item : RESLIST )
  {
      citem = string_part( item, 2 );

      // Skip blank entries (e.g. from a trailing newline in reslist.txt).
      // An empty name would otherwise match every tar header whose name
      // field is empty, including the zero blocks that end an archive.
      if( citem === null || citem == "" )
          continue;

      if( string_part( item, 0, 2 ) == "A|" )
          RESLIST_ARTICLES.push( citem );
      else
          RESLIST_CONTRACTS.push( citem );
  }
  34.  
  35.  
  // Cache the archive size once; it is reused for the progress percentage.
  TARFILE_size = TARFILE.size;

  // Startup summary: archive size and number of wanted entries.
  println( "size = " $ TARFILE_size );
  println( "items to find = " $ RESLIST.size );
  42.  
  43.  
  44. /**************** FUNCTIONS ******************/
  // Returns the part of `text` that precedes its first NUL byte, or `text`
  // unchanged when it contains no NUL byte.
  function cstring( text )
  {
      nul_at = string_find( text, "\0" );
      if( nul_at !== null )
          return string_part( text, 0, nul_at );
      return text;
  }
  52.  
  // Decodes one 512-byte tar header block into a dict.
  //
  // `data` is unpacked as fixed-width fields; the ones used here are:
  //   [0]  100 bytes - entry name
  //   [4]   12 bytes - size, stored as octal text
  //   [7]    1 byte  - type flag
  //   [8]  100 bytes - link name
  //   [14] 155 bytes - file name prefix
  // Text fields are cut at their first NUL byte.
  //
  // Returns { name, size, type_num, type_char, linkname, filename_prefix }.
  function parse_tarhead( data )
  {
      fields = fmt_unpack( "100s8s8s8s12s12s8s c100s 8s 32s32s8s8s 155s", data );

      entry_name = cstring( fields[0] );
      entry_size = fmt_string_parser( cstring( fields[4] ) ).read_octal_int();
      typeflag = fields[7];

      return
      {
          name = entry_name,
          size = entry_size,
          type_num = typeflag,
          // the same flag as a one-character string, for comparing against "0"
          type_char = string_frombytes( typeflag ),
          linkname = cstring( fields[8] ),
          filename_prefix = cstring( fields[14] ),
      };
  }
  66.  
  67.  
  // Creates every directory on the path leading to `filename` (the last
  // path component is taken to be the file itself and is not created).
  // Backslashes are treated as path separators.
  //
  // Differences from a plain "shift the first part, create the rest" walk:
  //  - the first component is created as well, so "a/b/file" works when
  //    "a" does not exist yet;
  //  - a filename without any directory part is a no-op instead of
  //    shifting from an empty array.
  // Results of io_dir_create are not checked, because already-existing
  // directories are expected on repeated calls.
  function create_dir_for( filename )
  {
      filename = string_replace( filename, "\\", "/" );
      parts = string_explode( filename, "/" );
      parts.pop(); // drop the file name itself

      path = null;
      foreach( part : parts )
      {
          if( path === null )
              path = part;
          else
              path = path $ "/" $ part;

          // "" (leading slash) and "." need no creation
          if( path != "" && path != "." )
              io_dir_create( path );
      }
  }
  80.  
  // Reads `size` bytes of entry payload from `file`, then seeks past the
  // padding that fills the payload up to the next 512-byte boundary.
  // Returns the payload bytes.
  function read_file( file, size )
  {
      payload = file.read( size );

      remainder = size % 512;
      if( remainder )
      {
          padding = 512 - remainder;
          file.seek( padding, SEEK_CUR );
      }

      return payload;
  }
  89.  
  // Seeks `file` forward past an entry payload of `size` bytes, rounded up
  // to a whole number of 512-byte blocks.
  function skip_file( file, size )
  {
      block_count = toint( ceil( size / 512 ) );
      file.seek( block_count * 512, SEEK_CUR );
  }
  94.  
  95. /**************** END FUNCTIONS ******************/
  96.  
  97.  
  // Main scan: walk the archive header by header. Regular-file entries whose
  // name (with bytes 0x80-0xff removed) is in one of the wanted lists are
  // written under the matching output directory; all other entries are
  // skipped.
  //
  // NOTE(review): only the 100-byte name field is used for matching and for
  // the output path; parse_tarhead's filename_prefix is ignored, so entries
  // that store part of their path in the prefix field will not match.
  // NOTE(review): data.name is appended to the output directory unchecked;
  // a name containing ".." would be written outside of it.
  count = 0;
  while( !TARFILE.error && !TARFILE.eof )
  {
      header = TARFILE.read( 512 );
      if( !header )
          break;
      if( header.length < 512 )
      {
          INFO( "offset = " $ TARFILE.offset );
          WARNING( "partial data, stopping: " $ header );
          break;
      }
      data = parse_tarhead( header );

      // Type flag "0" or a NUL byte marks a regular file.
      is_file = data.type_char == "0" || data.type_num == 0;

      // Headers with an empty name (such as the zero-filled blocks that end
      // an archive) also have a NUL type flag; never treat them as a match.
      is_article = false;
      is_contract = false;
      if( is_file && data.name != "" )
      {
          findname = re_replace( data.name, "#[\x80-\xff]#", "" );
          is_article = RESLIST_ARTICLES.find( findname ) !== null;
          is_contract = RESLIST_CONTRACTS.find( findname ) !== null;
      }

      if( !is_article && !is_contract )
      {
          skip_file( TARFILE, data.size );
          continue;
      }

      println( "at " $ ( TARFILE.offset / TARFILE_size ) * 100 $ "%" );
      println( "found ", data.name );

      // An entry present in both lists goes to the articles directory.
      fullpath = if( is_article, OUTDIR_ARTICLES, OUTDIR_CONTRACTS ) $ data.name;
      create_dir_for( fullpath );
      filedata = read_file( TARFILE, data.size );
      // Report a failed write instead of silently dropping it.
      if( !io_file_write( fullpath, filedata ) )
          WARNING( "could not write " $ fullpath );
      count++;
  }
  // Release the archive handle once scanning is done.
  TARFILE.close();
  println( "found " $ count $ " files" );
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement