Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- include "io", "string", "fmt", "math", "re";
// ---- inputs and configuration ----

// Open the archive and report a failed open before any other work is done.
TARFILE = io_file( "D:/data/data.tar", FILE_READ );
if( !TARFILE )
    ERROR( "no tarfile" );

// Output roots; the archive entry name is appended to one of these.
OUTDIR_ARTICLES = "./for_articles/";
OUTDIR_CONTRACTS = "./for_contracts/";

// Check the raw list text before it is split and processed, so that a
// missing or empty reslist.txt is reported up front.
RESLIST_TEXT = io_file_read( "reslist.txt" );
if( !RESLIST_TEXT )
    ERROR( "no reslist" );

// One entry per line. Each entry is normalised the same way archive names
// are normalised before lookup: bytes 0x80-0xff are removed; additionally
// "%20" becomes a space and stray carriage returns (CRLF lists) are dropped.
RESLIST = string_explode( RESLIST_TEXT, "\n" );
array_process( RESLIST, function( x )
{
    x = string_replace( x, "\r", "" );
    x = re_replace( x, "#[\x80-\xff]#", "" );
    return string_replace( x, "%20", " " );
});
// De-duplicate after normalisation so entries that became equal collapse.
RESLIST = RESLIST.unique();

// Split the list by its two-character tag: "A|" entries are articles,
// everything else is treated as a contract. The tag is cut off in both cases.
// NOTE(review): assumes every non-article line also carries a two-character
// tag (the first two characters are always dropped) - confirm with the list format.
RESLIST_ARTICLES = [];
RESLIST_CONTRACTS = [];
foreach( item : RESLIST )
{
    citem = string_part( item, 2 );
    // Blank lines and bare tags would otherwise register an empty name,
    // which matches the empty name parsed from zero-filled archive blocks.
    if( citem == "" )
        continue;
    if( string_part( item, 0, 2 ) == "A|" )
        RESLIST_ARTICLES.push( citem );
    else
        RESLIST_CONTRACTS.push( citem );
}
if( !RESLIST_ARTICLES.size && !RESLIST_CONTRACTS.size )
    ERROR( "no reslist" );

// Total archive size, used later for progress output.
TARFILE_size = TARFILE.size;
printlns
(
    "size = " $ TARFILE_size,
    "items to find = " $ RESLIST.size
);
- /**************** FUNCTIONS ******************/
// Returns `text` cut at its first NUL byte; a string that contains no NUL
// byte is returned unchanged.
function cstring( text )
{
    nul_at = string_find( text, "\0" );
    if( nul_at !== null )
        return string_part( text, 0, nul_at );
    return text;
}
// Decodes one raw archive header block into a dict.
//
// `data` is the raw header string (the caller passes a 512-byte block).
// It is unpacked into fixed-width fields of 100, 8, 8, 8, 12, 12 and 8 bytes,
// one single byte, then 100, 8, 32, 32, 8, 8 and 155 bytes.
// NOTE(review): this looks like the POSIX ustar header layout - confirm.
//
// Returned keys:
//   name            - field 0, cut at the first NUL
//   size            - field 4, cut at the first NUL and parsed as an octal integer
//   type_num        - field 7 as unpacked (single "c" item)
//   type_char       - field 7 converted to a one-character string
//   linkname        - field 8, cut at the first NUL
//   filename_prefix - field 14, cut at the first NUL
function parse_tarhead( data )
{
    fields = fmt_unpack( "100s8s8s8s12s12s8s c100s 8s 32s32s8s8s 155s", data );
    typeflag = fields[7];
    size_text = cstring( fields[4] );
    return
    {
        name = cstring( fields[0] ),
        size = fmt_string_parser( size_text ).read_octal_int(),
        type_num = typeflag,
        type_char = string_frombytes( typeflag ),
        linkname = cstring( fields[8] ),
        filename_prefix = cstring( fields[14] ),
    };
}
// Creates the chain of parent directories for `filename`.
//
// Backslashes are treated as forward slashes. The last path segment (the
// file itself) is not created, and the first segment is used only as the
// starting point - no create call is issued for it. Each deeper level is
// created in turn; the result of io_dir_create is not checked.
function create_dir_for( filename )
{
    segments = string_explode( string_replace( filename, "\\", "/" ), "/" );
    segments.pop();             // drop the file name
    current = segments.shift(); // starting point, not created
    foreach( segment : segments )
    {
        current = current $ "/" $ segment;
        io_dir_create( current );
    }
}
// Reads `size` bytes of entry data from `file` and returns them.
//
// When `size` is not a multiple of 512, the position is then moved forward
// by the remaining `512 - size % 512` bytes, i.e. past the padding that
// fills the last 512-byte block of the entry.
function read_file( file, size )
{
    contents = file.read( size );
    remainder = size % 512;
    if( remainder != 0 )
        file.seek( 512 - remainder, SEEK_CUR );
    return contents;
}
// Moves `file` forward past an entry's data without reading it: `size` is
// rounded up to a whole number of 512-byte blocks and that many bytes are
// skipped relative to the current position.
function skip_file( file, size )
{
    block_count = toint( ceil( size / 512 ) );
    file.seek( block_count * 512, SEEK_CUR );
}
- /**************** END FUNCTIONS ******************/
// ---- main pass: walk the archive header by header ----
//
// Each iteration reads one 512-byte header. Entries whose (normalised) name
// is on one of the two lists are written below the matching output root;
// the data of every other entry is skipped.
count = 0;
while( !TARFILE.error && !TARFILE.eof )
{
    header = TARFILE.read( 512 );
    if( !header )
        break;
    if( header.length < 512 )
    {
        INFO( "offset = " $ TARFILE.offset );
        WARNING( "partial data, stopping: " $ header );
        break;
    }
    data = parse_tarhead( header );

    is_article = false;
    is_contract = false;
    if( data.type_char == "0" || data.type_num == 0 ) // file
    {
        // Same high-byte stripping that is applied to the list entries.
        findname = re_replace( data.name, "#[\x80-\xff]#", "" );
        // Zero-filled blocks parse as type 0 with an empty name; an empty
        // name must never count as a match, otherwise the output directory
        // path itself would be used as the file path.
        if( findname != "" )
        {
            is_article = RESLIST_ARTICLES.find( findname ) !== null;
            is_contract = RESLIST_CONTRACTS.find( findname ) !== null;
        }
    }

    // Anything that is not a wanted file: step over its data blocks.
    if( !is_article && !is_contract )
    {
        skip_file( TARFILE, data.size );
        continue;
    }

    println( "at " $ ( TARFILE.offset / TARFILE_size ) * 100 $ "%" );
    println( "found ", data.name );
    // An entry present on both lists goes to the articles directory.
    // NOTE(review): data.filename_prefix is parsed but not used here, so an
    // entry that stores part of its path in the prefix field would be matched
    // and written without it - confirm whether the archive uses prefixes.
    // NOTE(review): data.name comes straight from the archive and is appended
    // to the output root unchecked ("../" segments are not filtered) - fine
    // for a trusted archive, confirm before using on untrusted input.
    fullpath = if( is_article, OUTDIR_ARTICLES, OUTDIR_CONTRACTS ) $ data.name;
    create_dir_for( fullpath );
    filedata = read_file( TARFILE, data.size );
    io_file_write( fullpath, filedata );
    count++;
}
println( "found " $ count $ " files" );
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement