Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
// Sample haystack that the compiled expression is matched against.
$test = "crud <img src='blah' alt='foo' /> crud";

// Richer variant kept for reference only: it relies on an 'extract quoted'
// clause, which gen_parsetree() does not recognise (it handles 'string' and 'not').
//$expr = "string '<img' not string '/>' string 'src=' extract quoted '\\'','\\\"' '\\\\' not string '<img' string '/>'";

// Active expression: '<img', then 'src=' with no '/>' before it,
// then '/>' with no further '<img' before it.
$expr = "string '<img' not string '/>' string 'src=' not string '<img' string '/>'";
/**
 * Print a section heading for the demo output.
 *
 * @param string $name Heading text; written verbatim (no HTML escaping)
 *                     inside an <h3> element followed by a newline.
 */
function hdr( $name )
{
    echo '<h3>', $name, '</h3>', "\n";
}
- /*
- Tokenize
- */
/**
 * Render tokens as one-line strings suitable for a debug dump.
 *
 * @param array $toks Tokens (objects exposing ->type, ->data and ->pos).
 * @return array List of "{ type=..., data='...', pos=... }" strings,
 *               indexed from 0 regardless of the input keys.
 */
function coltok( $toks )
{
    $lines = array();
    foreach( $toks as $token )
    {
        $lines[] = sprintf(
            "{ type=%s, data='%s', pos=%s }",
            $token->type,
            $token->data,
            $token->pos
        );
    }
    return $lines;
}
/**
 * Build a token record.
 *
 * @param string $type Token category (e.g. 'ident', 'string', 'special').
 * @param string $data Token text.
 * @param int    $pos  Offset in the expression where the token starts.
 * @return stdClass Object with ->type, ->data and ->pos properties.
 */
function mktoken( $type, $data, $pos )
{
    $token = new stdClass();
    $token->type = $type;
    $token->data = $data;
    $token->pos = $pos;
    return $token;
}
/**
 * Consume a run of decimal digits starting at $i and append a 'number' token.
 *
 * On return $i rests on the LAST digit consumed, because the caller's for-loop
 * performs ++$i afterwards; leaving $i one past the token would make the caller
 * skip the character that directly follows the number.
 *
 * @param string $expr   Expression being tokenized.
 * @param int    $i      Current offset (by reference); must point at a digit.
 * @param int    $len    Length of $expr.
 * @param array  $out    Token list (by reference); one token is appended.
 * @param array  $errors Error list (by reference); not written by this function.
 */
function te_parse_number( $expr, &$i, $len, &$out, &$errors )
{
    $pos = $i;
    // Bound the scan: reading past the end yields an empty string, which
    // PHP 8's strpos() reports as found at offset 0 and would loop forever.
    while( $i < $len && strpos( "0123456789", $expr[ $i ] ) !== false )
        $i++;
    $out[] = mktoken( 'number', substr( $expr, $pos, $i - $pos ), $pos );
    // Step back onto the last consumed digit (see header comment).
    if( $i > $pos )
        $i--;
}
/**
 * Consume an identifier (letters, digits, underscore) starting at $i and
 * append an 'ident' token.
 *
 * On return $i rests on the LAST character consumed, because the caller's
 * for-loop performs ++$i afterwards; leaving $i one past the token would make
 * the caller skip the character that directly follows the identifier
 * (e.g. the comma in "foo,bar" or the quote in "string'x'").
 *
 * @param string $expr   Expression being tokenized.
 * @param int    $i      Current offset (by reference); must point at an identifier character.
 * @param int    $len    Length of $expr.
 * @param array  $out    Token list (by reference); one token is appended.
 * @param array  $errors Error list (by reference); not written by this function.
 */
function te_parse_ident( $expr, &$i, $len, &$out, &$errors )
{
    $pos = $i;
    // Bound the scan: reading past the end yields an empty string, which
    // PHP 8's strpos() reports as found at offset 0 and would loop forever.
    while( $i < $len && strpos( "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", $expr[ $i ] ) !== false )
        $i++;
    $out[] = mktoken( 'ident', substr( $expr, $pos, $i - $pos ), $pos );
    // Step back onto the last consumed character (see header comment).
    if( $i > $pos )
        $i--;
}
/**
 * Consume a quoted string literal starting at $i and append a 'string' token.
 *
 * The character at $i is taken as the delimiter. Recognised escapes:
 *   \n -> newline, \t -> tab, \\ -> backslash, \' -> ', \" -> "
 * Any other backslash pair is kept verbatim (backslash included).
 *
 * On success $i rests on the closing delimiter. If the input ends before the
 * closing delimiter (including a dangling backslash), no token is emitted and
 * a message is appended to $errors instead of silently dropping the literal.
 *
 * @param string $expr   Expression being tokenized.
 * @param int    $i      Current offset (by reference); must point at the opening quote.
 * @param int    $len    Length of $expr.
 * @param array  $out    Token list (by reference).
 * @param array  $errors Error list (by reference).
 */
function te_parse_string( $expr, &$i, $len, &$out, &$errors )
{
    $pos = $i;
    $ec = $expr[ $i ];
    $outstr = '';
    for( $i++; $i < $len; ++$i )
    {
        $c = $expr[ $i ];
        if( $c === '\\' )
        {
            $i++;
            // A backslash as the very last character has nothing to escape.
            if( $i >= $len )
                break;
            $nc = $expr[ $i ];
            if( $nc === 'n' ) $outstr .= "\n";
            elseif( $nc === 't' ) $outstr .= "\t";
            // Escaped backslash or quote stands for the character itself.
            elseif( $nc === '\\' || $nc === '\'' || $nc === '"' ) $outstr .= $nc;
            else
                $outstr .= $c.$nc;
        }
        elseif( $c === $ec )
        {
            $out[] = mktoken( 'string', $outstr, $pos );
            return;
        }
        else
            $outstr .= $c;
    }
    // Ran off the end of the input without seeing the closing delimiter.
    $errors[] = "unterminated string starting at position {$pos}";
}
/**
 * Split a match expression into tokens.
 *
 * Whitespace is skipped; ',' becomes a 'special' token; quotes, digits and
 * identifier characters are delegated to te_parse_string(), te_parse_number()
 * and te_parse_ident(), which advance $i by reference. The loop increments $i
 * after each handler returns, so the character a handler leaves $i on is not
 * examined again.
 *
 * If any error is collected, the tokens and errors are dumped and the script
 * terminates.
 *
 * @param string $expr Expression source.
 * @return array List of token objects as produced by mktoken().
 */
function tokenize_expr( $expr )
{
    $len = strlen( $expr );
    $out = array();
    $errors = array();
    for( $i = 0; $i < $len; ++$i )
    {
        $c = $expr[ $i ];
        // whitespace
        if( strpos( " \t\n\r", $c ) !== false )
            continue;
        // special symbols
        elseif( strpos( ",", $c ) !== false )
            $out[] = mktoken( 'special', $c, $i );
        // strings -- compare against the two quote characters explicitly;
        // "\'" inside a double-quoted PHP literal keeps its backslash, which
        // would make a stray backslash count as a string delimiter
        elseif( $c === '\'' || $c === '"' )
            te_parse_string( $expr, $i, $len, $out, $errors );
        // numbers
        elseif( strpos( "0123456789", $c ) !== false )
            te_parse_number( $expr, $i, $len, $out, $errors );
        // keywords
        elseif( strpos( "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", $c ) !== false )
            te_parse_ident( $expr, $i, $len, $out, $errors );
        else
            $errors[] = "invalid character at position {$i} ({$c})";
    }
    if( count( $errors ) )
    {
        var_dump( coltok( $out ), $errors );
        die;
    }
    return $out;
}
// Stage 1: tokenize the expression and dump the tokens in readable form.
hdr( 'tokenizer' );
$tokens = tokenize_expr( $expr );
var_dump(
    coltok( $tokens )
);
/**
 * Build a parse-tree node.
 *
 * @param string $type Node kind (e.g. 'string', 'not').
 * @param mixed  $data Node payload.
 * @return stdClass Object with ->type and ->data properties.
 */
function mkparseitem( $type, $data )
{
    $node = new stdClass();
    $node->type = $type;
    $node->data = $data;
    return $node;
}
/**
 * Turn a token list into a parse tree.
 *
 * Grammar handled here:
 *   string '<literal>'   -> node 'string' with the literal as data
 *   not <item> <item>    -> node 'not' with data array( <first>, <second> ),
 *                           each operand being a one-item parse tree
 *
 * Called without $from, the whole token list is parsed. Called with $from
 * (by reference), exactly one item is parsed starting at that index and $from
 * is updated to the index of the last token consumed; this is how the operands
 * of 'not' are read recursively.
 *
 * If any error is collected, the partial tree and errors are dumped and the
 * script terminates.
 *
 * @param array    $tokens Token objects (->type, ->data).
 * @param int|null $from   Optional start index (by reference), see above.
 * @return array List of parse-tree nodes as produced by mkparseitem().
 */
function gen_parsetree( $tokens, &$from = null )
{
    $ptree = array();
    $len = count( $tokens );
    $errors = array();
    $i = 0;
    $onlyone = false;
    if( isset( $from ) )
    {
        $i = $from;
        $onlyone = true;
    }
    for( ; $i < $len; ++$i )
    {
        $token = $tokens[ $i ];
        if( $token->type == 'ident' && $token->data == 'string' )
        {
            // Check the bound first: 'string' may be the final token, in
            // which case there is no following token to inspect.
            if( $i + 1 >= $len || $tokens[ $i + 1 ]->type != 'string' )
            {
                $errors[] = "expected string after 'string'";
                continue;
            }
            $i++;
            $ptree[] = mkparseitem( 'string', $tokens[ $i ]->data );
        }
        elseif( $token->type == 'ident' && $token->data == 'not' )
        {
            $i++;
            $notwhat = gen_parsetree( $tokens, $i );
            $i++;
            $befwhat = gen_parsetree( $tokens, $i );
            // A recursive call returns an empty tree when the token list is
            // exhausted; report that instead of building a 'not' node with
            // empty operands.
            if( !count( $notwhat ) || !count( $befwhat ) )
                $errors[] = "expected two expressions after 'not'";
            else
                $ptree[] = mkparseitem( 'not', array( $notwhat, $befwhat ) );
        }
        else
        {
            $errors[] = "unexpected token found: {$token->type} '{$token->data}'";
        }
        if( $onlyone )
        {
            // Report back where the single parsed item ended.
            $from = $i;
            break;
        }
    }
    if( count( $errors ) )
    {
        var_dump( $ptree, $errors );
        die;
    }
    return $ptree;
}
// Stage 2: build the parse tree from the tokens and dump it.
hdr( 'parsing tree' );
$ptree = gen_parsetree( $tokens );
var_dump(
    $ptree
);
/**
 * Match a parse tree against a string, left to right, without backtracking.
 *
 * Node handling:
 *   'string' - locate the literal at or after the current offset; fail if absent.
 *   'not'    - data is array( forbidden-tree, required-tree ). Both are matched
 *              from the current offset; the node fails if the required tree
 *              does not match, or if the forbidden tree matches at an earlier
 *              start offset than the required one. Otherwise the required
 *              match is consumed.
 * Nodes of any other type are ignored.
 *
 * @param array  $ptree Parse-tree nodes (objects with ->type and ->data).
 * @param string $str   Subject string.
 * @param int    $ofs   Offset at which matching starts.
 * @return array|false  array( start, end ): start offset of the first matched
 *                      item (null if nothing was matched) and the offset just
 *                      past the last matched item; false if the tree does not match.
 */
function do_matching( $ptree, $str, $ofs = 0 )
{
    $start = null;
    $cursor = $ofs;
    foreach( $ptree as $node )
    {
        if( $node->type == 'string' )
        {
            $hit = strpos( $str, $node->data, $cursor );
            if( $hit === false )
                return false;
            $span = array( $hit, $hit + strlen( $node->data ) );
        }
        elseif( $node->type == 'not' )
        {
            $forbidden = do_matching( $node->data[0], $str, $cursor );
            $required = do_matching( $node->data[1], $str, $cursor );
            if( $required === false )
                return false;
            if( $forbidden !== false && $forbidden[0] < $required[0] )
                return false;
            $span = $required;
        }
        else
        {
            continue;
        }

        // Remember where the overall match began, then advance past this item.
        if( $start === null )
            $start = $span[0];
        $cursor = $span[1];
    }
    return array( $start, $cursor );
}
// Stage 3: run the parse tree against the sample input and report the result.
hdr( 'matching' );
$out = do_matching( $ptree, $test );
var_dump( $out );
if( $out !== false )
{
    // $out is array( start, end ); cut the matched slice out of the input.
    $match = substr( $test, $out[0], $out[1] - $out[0] );
    echo "match: '{$match}'\n";
}
else
{
    echo "no matches\n";
}
/* final output:
match: '<img src='blah' alt='foo' />'
*/
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement