Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Generate a regex matching all ABAP keywords
- # The point here is to not simply generate an alternation of all keywords but finds common "head parts" of words,
- # generating sub-alternations of different "tails" for common heads.
- # Source for ABAP keyword list: https://help.sap.com/doc/abapdocu_751_index_htm/7.51/en-us/abenabap_words.htm
- my $data_start = tell DATA; # first position of DATA, needed for a second loop over <DATA> (in the test sub)
- # Build a hash of arrays, comprising keywords with identical beginning part ("head")
- # The "head" will be the key, and the different "tails" are the corresponding value array
- my %groups = %{ do_split( ) };
- # Now produce the array of regular sub-expressions to be alternated
- my @final = @{ finalize( \%groups ) };
- # The final regex
- my $pattern = '\\b(?:' . join( '|', @final ) . ')\\b';
- print $pattern;
- # Test: the generated pattern should match all keywords of the list
- test( $pattern, $data_start );
- sub do_split {
- # Find common "head" parts of successive entries
- # Starting with MAX_HEAD_LENGTH down to MIN_HEAD_LENGTH
- use constant {
- MIN_HEAD_LENGTH => 2,
- MAX_HEAD_LENGTH => 15
- };
- my %result;
- my @all_keywords = sort <DATA>;
- my $keywords = \@all_keywords;
- for (my $len=MAX_HEAD_LENGTH;$len>=MIN_HEAD_LENGTH;$len--) {
- my $r = split_at_length( $len, $keywords );
- my @remaining = ();
- for my $key (sort keys %$r) {
- my $value = $r->{$key};
- if ( MIN_HEAD_LENGTH == $len or
- ( length($key) >= $len and scalar @$value > 1 ) ) {
- $result{$key}=$value; # ok, put it in final result
- } else {
- for my $v (sort @$value) {
- push @remaining,$key.$v;
- }
- }
- }
- last if (scalar @remaining == 0);
- $keywords = \@remaining;
- }
- return \%result;
- }
- sub split_at_length {
- my %r = ();
- my ($keylen,$data) = @_;
- foreach my $word (@$data) {
- chomp $word;
- if (length($word)<=$keylen) {
- $r{$word} = [''];
- } else {
- my $key = substr($word,0,$keylen);
- $r{$key} = [] unless exists $r{$key};
- push @{$r{$key}}, substr($word,$keylen);
- }
- }
- return \%r;
- }
- sub finalize {
- my $r = shift;
- foreach my $key (sort keys %$r) {
- my $expr = conv($key);
- my @tails = map { conv($_) } @{$r->{$key}};
- my $num_tails = scalar @tails;
- if ($num_tails == 1) {
- $expr .= $tails[0];
- }
- if ($num_tails > 1) {
- my $also_empty_tail = ( $tails[0] eq '' );
- shift @tails if $also_empty_tail;
- $expr .= '(?:' . join( '|', @tails ) . ')';
- $expr .= '?' if $also_empty_tail;
- }
- push @final, $expr;
- }
- return \@final;
- }
- # From an earlier version: find max "head" part exploiting regex greediness and recursion
- sub find_max_pieces {
- my $v = shift;
- my $vl = scalar @$v;
- return 0 unless $vl > 1;
- my $test = '=' . ( join '=', @$v );
- my $pattern = "(\\=\\w+)(.*\\1){". ($vl - 1) . "}";
- my $re = qr/$pattern/;
- my ($max) = ( $test =~ $re );
- my $ml = length($max);
- return 0 unless $ml;
- my @tails = map { substr( $_, $ml-1 ) } @$v;
- return [ substr($max,1), \@tails ];
- }
- sub conv {
- my $arg = shift;
- my $result = ($arg =~ s/([-*?])/\\$1/gr);
- return lc($result);
- }
- sub test {
- # Check that $pattern matches all keywords
- my ($pattern, $data_start) = @_;
- # Rewind data
- seek DATA, $data_start, 0;
- my $re = qr/$pattern/i;
- foreach my $keyword (<DATA>) {
- print "No match for $keyword\n" unless ( $keyword =~ $re );
- }
- }
- # Keyword list is shortened here:
- __DATA__
- ABAP_SYSTEM_TIMEZONE
- ABAP_USER_TIMEZONE
- ABAP-SOURCE
- ABBREVIATED
- ABS
- ABSTRACT
- ACCEPT
- ACCEPTING
- ACCORDING
- ACTIVATION
- ACTUAL
- ADABAS
- ADD
- ADD-CORRESPONDING
- ADJACENT
- AFTER
- Z
- ZERO
- ZONE
Add Comment
Please, Sign In to add comment