Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
// Sample haystack: two <img> tags, one closed with </img> and one self-closed.
$test = "crud <img src='foo' alt='heh'></img>"
      . " crud <img src='blah' alt='foo' />"
      . " crud";

// Earlier, more verbose pattern syntax (kept for reference, not supported by the tokenizer below):
//$expr = "string '<img' not string '/>' string 'src=' extract quoted '\\'','\\\"' '\\\\' not string '<img' string '/>'";

// Pattern under test, written out term by term and joined with single spaces.
$expr = implode( ' ', array(
    "'<img'",
    "not", "'/>',", "'</img'", "'src='",
    "not", "'<img'", "'>'",
) );
- /*
- Searching in a multimatch environment...
- > goal is to grab the first possible match
- > need anything shorter? use "not" to avoid repetition
- - " A A B A C C "
- 1) "string 'A' string 'C'" => "A A B A C"
- 2) "string 'A' not string 'A' string 'C'" => "A C"
- 3) "string 'A' not string 'B' string 'C'" => "A C"
- --- process ---
- 1) find A => 1, find C => 9 || 1-10
- 2) find A => 1, ( find A => 3, find C => 9 ) => not FAIL, restart from 2 ..
- .. find A => 3, ( find A => 7, find C => 9 ) => not FAIL, restart from 4 ..
- .. find A => 7, ( find A => false, find C => 9 ) => not SUCCESS || 7-10
- 3) find A => 1, ( find B => 5, find C => 9 ) => not FAIL, restart from 2 ..
- .. find A => 3, ( find B => 5, find C => 9 ) => not FAIL, restart from 4 ..
- .. find A => 7, ( find B => false, find C => 9 ) => not SUCCESS || 7-10
- --- rules ---
- */
/**
 * Escape a string for inclusion in HTML output.
 *
 * Single and double quotes are both encoded (ENT_QUOTES). ENT_SUBSTITUTE is
 * set so that an invalid UTF-8 sequence is replaced by U+FFFD; without it
 * htmlspecialchars() returns an empty string for the whole input, which would
 * silently blank out the debug output.
 *
 * @param string $txt Raw text.
 * @return string The HTML-escaped text.
 */
function hent( $txt )
{
    return htmlspecialchars( $txt, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' );
}
/**
 * Print a section heading as an <h3> element followed by a newline.
 *
 * The name is written out verbatim (it is not HTML-escaped).
 *
 * @param string $name Heading text.
 * @return void
 */
function hdr( $name )
{
    echo '<h3>', $name, "</h3>\n";
}
// Show the haystack and the pattern that the rest of the script works on.
hdr( 'input' );
printf( "test data: \" <b>%s</b> \"\n<br />\n", hent( $test ) );
printf( "pattern: \" <b>%s</b> \"\n<br />\n", hent( $expr ) );
- /*
- Tokenize
- */
/**
 * Render a list of tokens as one-line, human-readable strings for dumping.
 *
 * @param array $toks Token objects exposing ->type, ->data and ->pos.
 * @return array One "{ type=…, data='…', pos=… }" string per token, re-indexed from 0.
 */
function coltok( $toks )
{
    $lines = array();
    foreach( $toks as $t )
    {
        $lines[] = sprintf( "{ type=%s, data='%s', pos=%s }", $t->type, $t->data, $t->pos );
    }
    return $lines;
}
/**
 * Create a token object.
 *
 * @param string $type Token category (the tokenizer uses 'special', 'string' and 'ident').
 * @param string $data Token text.
 * @param int    $pos  Offset of the token in the source expression.
 * @return stdClass Object with the properties type, data and pos (in that order).
 */
function mktoken( $type, $data, $pos )
{
    $token = new stdClass();
    $token->type = $type;
    $token->data = $data;
    $token->pos = $pos;
    return $token;
}
/**
 * Consume a run of decimal digits starting at $expr[$i] and append it as a token.
 *
 * On return $i is the index of the LAST digit consumed, the same convention
 * te_parse_string() uses for its closing quote. The calling loop performs its
 * own ++$i, so leaving $i one past the token (as this function used to) made
 * the caller skip the character that directly follows the number.
 *
 * The digit run is measured with strspn(), which never reads past the end of
 * the string. The previous character-by-character loop indexed $expr[$i]
 * beyond the last byte; that yields "" and, since strpos() accepts an empty
 * needle from PHP 8 on, the loop never terminated for a number at the very
 * end of the expression.
 *
 * NOTE(review): the token is labelled 'ident', not 'number'. This looks like a
 * copy-paste from te_parse_ident(); it is kept unchanged because no consumer in
 * this file distinguishes the two - confirm before relabelling.
 *
 * @param string $expr   Expression being tokenized.
 * @param int    $i      In: index of the first digit. Out: index of the last digit.
 * @param int    $len    Length of $expr (unused here; kept for signature parity with the other te_parse_* helpers).
 * @param array  $out    Token list, appended to.
 * @param array  $errors Error list (not written by this function).
 * @return void
 */
function te_parse_number( $expr, &$i, $len, &$out, &$errors )
{
    $pos = $i;
    $run = strspn( $expr, "0123456789", $pos );
    $out[] = mktoken( 'ident', substr( $expr, $pos, $run ), $pos );
    // Park the cursor on the last consumed character; stay put if nothing matched.
    if( $run > 0 )
        $i = $pos + $run - 1;
}
/**
 * Consume an identifier (letters, digits, underscore) starting at $expr[$i]
 * and append it as an 'ident' token.
 *
 * On return $i is the index of the LAST identifier character, the same
 * convention te_parse_string() uses for its closing quote. The calling loop
 * performs its own ++$i, so leaving $i one past the token (as this function
 * used to) made the caller skip the next character - e.g. the opening quote
 * in "not'/>'".
 *
 * The run is measured with strspn(), which never reads past the end of the
 * string. The previous loop indexed $expr[$i] beyond the last byte; that
 * yields "" and, since strpos() accepts an empty needle from PHP 8 on, the
 * loop never terminated for an identifier at the very end of the expression.
 *
 * @param string $expr   Expression being tokenized.
 * @param int    $i      In: index of the first identifier character. Out: index of the last one.
 * @param int    $len    Length of $expr (unused here; kept for signature parity with the other te_parse_* helpers).
 * @param array  $out    Token list, appended to.
 * @param array  $errors Error list (not written by this function).
 * @return void
 */
function te_parse_ident( $expr, &$i, $len, &$out, &$errors )
{
    $pos = $i;
    $run = strspn( $expr, "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", $pos );
    $out[] = mktoken( 'ident', substr( $expr, $pos, $run ), $pos );
    // Park the cursor on the last consumed character; stay put if nothing matched.
    if( $run > 0 )
        $i = $pos + $run - 1;
}
/**
 * Consume a quoted string literal starting at $expr[$i] and append it as a
 * 'string' token.
 *
 * The character at $expr[$i] is the delimiter; the literal runs up to the next
 * unescaped occurrence of that same character. Escapes understood inside the
 * literal:
 *   \n  -> newline          \t  -> tab
 *   \\  -> backslash        \'  and \"  -> the quote character itself
 * Any other backslash sequence is kept as written (backslash included).
 *
 * On success $i is left on the closing delimiter; the calling loop's ++$i then
 * steps past it.
 *
 * Fixes over the previous version:
 *  - an unterminated literal is now reported through $errors instead of being
 *    dropped without a token and without a message;
 *  - a backslash that is the last character of the input no longer reads
 *    beyond the end of $expr;
 *  - an escaped quote yields the quote itself. Before, the backslash was kept,
 *    so a string consisting of just a quote character could not be written.
 *
 * @param string $expr   Expression being tokenized.
 * @param int    $i      In: index of the opening delimiter. Out: index of the closing delimiter
 *                       (or >= $len when the literal is unterminated).
 * @param int    $len    Length of $expr.
 * @param array  $out    Token list, appended to on success.
 * @param array  $errors Error list, appended to when the literal is unterminated.
 * @return void
 */
function te_parse_string( $expr, &$i, $len, &$out, &$errors )
{
    $pos = $i;
    $ec = $expr[ $i ];
    $outstr = '';
    for( $i++; $i < $len; ++$i )
    {
        $c = $expr[ $i ];
        if( $c === '\\' )
        {
            $i++;
            // a trailing backslash has nothing to escape: fall through to the error below
            if( $i >= $len )
                break;
            $nc = $expr[ $i ];
            if( $nc === 'n' ) $outstr .= "\n";
            elseif( $nc === 't' ) $outstr .= "\t";
            elseif( $nc === '\\' || $nc === '\'' || $nc === '"' ) $outstr .= $nc;
            else
                $outstr .= $c.$nc;
        }
        elseif( $c === $ec )
        {
            $out[] = mktoken( 'string', $outstr, $pos );
            return;
        }
        else
            $outstr .= $c;
    }
    $errors[] = "unterminated string starting at position {$pos}";
}
/**
 * Split a pattern expression into tokens.
 *
 * Recognised input:
 *   - whitespace (space, tab, CR, LF) is skipped;
 *   - ','                     -> 'special' token;
 *   - ' or "                  -> start of a string literal, handed to te_parse_string();
 *   - a digit                 -> handed to te_parse_number();
 *   - a letter or underscore  -> handed to te_parse_ident().
 * The te_parse_* helpers receive $i by reference and advance it themselves.
 *
 * Any other character is recorded as an error. If at least one error was
 * collected, the tokens read so far and the errors are dumped and the script
 * terminates.
 *
 * Fix: the string-delimiter test used the literal "\'\"", which in a
 * double-quoted PHP string contains a real backslash. A stray backslash in the
 * expression was therefore treated as an opening quote instead of being
 * reported as an invalid character.
 *
 * @param string $expr Pattern expression.
 * @return array List of token objects as created by mktoken().
 */
function tokenize_expr( $expr )
{
    $len = strlen( $expr );
    $out = array();
    $errors = array();
    for( $i = 0; $i < $len; ++$i )
    {
        $c = $expr[ $i ];
        // whitespace
        if( strpos( " \t\n\r", $c ) !== false )
            continue;
        // special symbols
        elseif( $c === ',' )
            $out[] = mktoken( 'special', $c, $i );
        // strings (single- or double-quoted)
        elseif( $c === '\'' || $c === '"' )
            te_parse_string( $expr, $i, $len, $out, $errors );
        // numbers
        elseif( strpos( "0123456789", $c ) !== false )
            te_parse_number( $expr, $i, $len, $out, $errors );
        // keywords
        elseif( strpos( "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", $c ) !== false )
            te_parse_ident( $expr, $i, $len, $out, $errors );
        else
            $errors[] = "invalid character at position {$i} ({$c})";
    }
    if( count( $errors ) )
    {
        // debugging script: show what was understood so far, then stop
        var_dump( coltok( $out ), $errors );
        die;
    }
    return $out;
}
// Stage 1: turn the pattern text into tokens and dump them in readable form.
hdr( 'tokenizer' );
$tokens = tokenize_expr( $expr );
var_dump(
    coltok( $tokens )
);
/**
 * Create a parse-tree node.
 *
 * @param string $type Node kind (the parser emits 'string' and 'not').
 * @param mixed  $data Node payload.
 * @return stdClass Object with the properties type and data (in that order).
 */
function mkparseitem( $type, $data )
{
    $node = new stdClass();
    $node->type = $type;
    $node->data = $data;
    return $node;
}
/**
 * Build a parse tree from the token list.
 *
 * Grammar, as implemented:
 *   item  := string { ',' string }      -> node 'string', data = list of alternatives
 *          | 'not' item item            -> node 'not',    data = array( <forbidden>, <terminator> ),
 *                                          each being a one-item sub-tree
 *   tree  := { item }
 *
 * When $from is supplied (recursive use for the operands of 'not') exactly one
 * item is parsed starting at token index $from, and $from is updated to the
 * index of the last token that item consumed.
 *
 * If any error is collected, the partial tree and the errors are dumped and
 * the script terminates.
 *
 * Fixes over the previous version:
 *  - the "expected string after ','" error was ASSIGNED to $errors (turning the
 *    array into a string, which breaks the later count()) instead of appended;
 *  - after that error the code stepped back onto the comma and re-entered the
 *    loop on the same comma, never terminating;
 *  - a trailing comma read $tokens[] past its end;
 *  - 'not' with fewer than two operands was accepted silently and produced a
 *    node with empty sub-trees.
 *
 * @param array    $tokens Token objects (->type, ->data) as produced by tokenize_expr().
 * @param int|null $from   Optional, by reference: start index for single-item parsing (see above).
 * @return array List of parse-tree nodes as created by mkparseitem().
 */
function gen_parsetree( $tokens, &$from = null )
{
    $ptree = array();
    $len = count( $tokens );
    $errors = array();
    $i = 0;
    $onlyone = false;
    if( isset( $from ) )
    {
        $i = $from;
        $onlyone = true;
    }
    for( ; $i < $len; ++$i )
    {
        $token = $tokens[ $i ];
        if( $token->type == 'string' )
        {
            $items = array( $token->data );
            // Look ahead for ", string" pairs; $i always rests on the last token consumed.
            while( $i + 1 < $len && $tokens[ $i + 1 ]->type == 'special' && $tokens[ $i + 1 ]->data == ',' )
            {
                if( $i + 2 >= $len || $tokens[ $i + 2 ]->type != 'string' )
                {
                    $errors[] = "expected string after string and ','";
                    // step over the dangling ',' so it is not reported a second time
                    $i++;
                    break;
                }
                $items[] = $tokens[ $i + 2 ]->data;
                $i += 2;
            }
            $ptree[] = mkparseitem( 'string', $items );
        }
        elseif( $token->type == 'ident' && $token->data == 'not' )
        {
            // first operand: what must NOT occur ...
            $i++;
            $notwhat = gen_parsetree( $tokens, $i );
            // ... second operand: before this terminator
            $i++;
            $befwhat = gen_parsetree( $tokens, $i );
            if( !$notwhat || !$befwhat )
                $errors[] = "expected two patterns after 'not'";
            $ptree[] = mkparseitem( 'not', array( $notwhat, $befwhat ) );
        }
        else
        {
            $errors[] = "unexpected token found: {$token->type} '{$token->data}'";
        }
        if( $onlyone )
        {
            // report back how far the single item reached
            $from = $i;
            break;
        }
    }
    if( count( $errors ) )
    {
        var_dump( $ptree, $errors );
        die;
    }
    return $ptree;
}
// Stage 2: build the parse tree from the tokens and dump it.
hdr( 'parsing tree' );
$ptree = gen_parsetree( $tokens );
var_dump(
    $ptree
);
/**
 * Find the first match of a parse tree in a string, searching from $ofs.
 *
 * The tree's items are matched left to right:
 *   - a 'string' item matches the nearest occurrence (at or after the current
 *     offset) of any of its alternatives; on a tie the alternative listed
 *     first wins. The offset then moves to just past that occurrence.
 *   - a 'not' item holds two sub-trees, <forbidden> and <terminator>. Both are
 *     searched from the current offset. The item succeeds when the terminator
 *     is found and the forbidden pattern does not start before it; the offset
 *     then moves to the end of the terminator. Otherwise the whole tree is
 *     retried one character after the start of the current attempt (or one
 *     character after the current offset if nothing has matched yet).
 *
 * @param array  $ptree Parse-tree nodes (->type, ->data) as built by gen_parsetree().
 * @param string $str   Text to search.
 * @param int    $ofs   Offset to start searching from.
 * @return array|false  array( start, end ) with end exclusive, or false when there is no match.
 */
function do_matching( $ptree, $str, $ofs = 0 )
{
    $total = strlen( $str );

    // Each pass of this loop is one attempt to match the whole tree from $ofs.
    while( $ofs < $total )
    {
        $first = null;
        foreach( $ptree as $node )
        {
            if( $node->type == 'string' )
            {
                // pick the alternative that occurs earliest
                $best = $total;
                $hit = null;
                foreach( $node->data as $alt )
                {
                    $at = strpos( $str, $alt, $ofs );
                    if( $at !== false && $at < $best )
                    {
                        $best = $at;
                        $hit = $alt;
                    }
                }
                if( $best >= $total )
                {
                    return false;
                }
                if( $first === null )
                {
                    $first = $best;
                }
                $ofs = $best + strlen( $hit );
            }
            elseif( $node->type == 'not' )
            {
                $forbidden = do_matching( $node->data[0], $str, $ofs );
                $terminator = do_matching( $node->data[1], $str, $ofs );
                $blocked = $forbidden !== false && $forbidden[0] < $terminator[0];
                if( $terminator === false || $blocked )
                {
                    // abandon this attempt and start over a little further right
                    $ofs = ( $first === null ? $ofs : $first ) + 1;
                    continue 2;
                }
                if( $first === null )
                {
                    $first = $terminator[0];
                }
                $ofs = $terminator[1];
            }
        }
        return $first === null ? false : array( $first, $ofs );
    }
    return false;
}
/**
 * Collect every match of a parse tree in a string.
 *
 * After each hit the search resumes one character after that hit's START, so
 * the returned matches may overlap.
 *
 * @param array  $ptree Parse-tree nodes as built by gen_parsetree().
 * @param string $str   Text to search.
 * @return array List of array( start, end ) pairs in order of discovery; empty when nothing matches.
 */
function multimatch( $ptree, $str )
{
    $found = array();
    $hit = do_matching( $ptree, $str );
    while( $hit !== false )
    {
        $found[] = $hit;
        $hit = do_matching( $ptree, $str, $hit[0] + 1 );
    }
    return $found;
}
// Stage 3a: first match only.
hdr( 'matching' );
$out = do_matching( $ptree, $test );
var_dump( $out );
if( $out !== false )
{
    // $out is array( start, end ): cut that slice out of the haystack and escape it for display
    $match = hent( substr( $test, $out[0], $out[1] - $out[0] ) );
    echo "match: '{$match}'\n";
}
else
{
    echo "no matches\n";
}

// Stage 3b: every match.
hdr( 'multimatch' );
$out = multimatch( $ptree, $test );
var_dump( $out );
foreach( $out as $mtch )
{
    $match = hent( substr( $test, $mtch[0], $mtch[1] - $mtch[0] ) );
    echo "match: '", $match, "'<br/>\n";
}
- /* output:
- http://pastehtml.com/view/ctzxi3h87.html
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement