rplantiko

Generate a regex matching a list of keywords

Jan 7th, 2021 (edited)
641
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 3.75 KB | None | 0 0
  1. # Generate a regex matching all ABAP keywords
  2. # The point here is to not simply generate an alternation of all keywords but finds common "head parts" of words,
  3. # generating sub-alternations of different "tails" for common heads.
  4. # Source for ABAP keyword list: https://help.sap.com/doc/abapdocu_751_index_htm/7.51/en-us/abenabap_words.htm
  5.  
  6. my $data_start = tell DATA;  # first position of DATA, needed for a second loop over <DATA> (in the test sub)
  7.  
  8. # Build a hash of arrays, comprising keywords with identical beginning part ("head")
  9. # The "head" will be the key, and the different "tails" are the corresponding value array
  10. my %groups = %{ do_split( ) };
  11.  
  12. # Now produce the array of regular sub-expressions to be alternated
  13. my @final  = @{ finalize( \%groups ) };
  14.  
  15. # The final regex
  16. my $pattern = '\\b(?:' . join( '|', @final ) . ')\\b';
  17. print $pattern;
  18.  
  19. # Test: the generated pattern should match all keywords of the list
  20. test( $pattern, $data_start );
  21.  
  22. sub do_split {
  23.  
  24. # Find common "head" parts of successive entries
  25. # Starting with MAX_HEAD_LENGTH down to MIN_HEAD_LENGTH
  26.   use constant {
  27.     MIN_HEAD_LENGTH => 2,
  28.     MAX_HEAD_LENGTH => 15
  29.   };
  30.  
  31.   my %result;
  32.   my @all_keywords = sort <DATA>;
  33.   my $keywords = \@all_keywords;
  34.   for (my $len=MAX_HEAD_LENGTH;$len>=MIN_HEAD_LENGTH;$len--) {
  35.     my $r = split_at_length( $len, $keywords );
  36.     my @remaining = ();
  37.     for my $key (sort keys %$r) {
  38.       my $value = $r->{$key};
  39.       if ( MIN_HEAD_LENGTH == $len or
  40.            ( length($key) >= $len and scalar @$value > 1 ) ) {
  41.         $result{$key}=$value; # ok, put it in final result
  42.       } else {
  43.         for my $v (sort @$value) {
  44.           push @remaining,$key.$v;
  45.         }
  46.       }
  47.     }
  48.     last if (scalar @remaining == 0);
  49.     $keywords = \@remaining;
  50.   }
  51.   return \%result;
  52. }
  53.  
  54. sub split_at_length {
  55.   my %r = ();
  56.   my ($keylen,$data) = @_;
  57.   foreach my $word (@$data) {
  58.     chomp $word;
  59.     if (length($word)<=$keylen) {
  60.       $r{$word} = [''];
  61.     } else {
  62.       my $key = substr($word,0,$keylen);
  63.       $r{$key} = [] unless exists $r{$key};
  64.       push @{$r{$key}}, substr($word,$keylen);
  65.     }  
  66.   }
  67.   return \%r;
  68. }
  69.  
  70. sub finalize {
  71.   my $r = shift;
  72.   foreach my $key (sort keys %$r) {  
  73.     my $expr = conv($key);
  74.     my @tails = map { conv($_) } @{$r->{$key}};
  75.     my $num_tails = scalar @tails;
  76.     if ($num_tails == 1) {
  77.       $expr .= $tails[0];
  78.     }
  79.     if ($num_tails > 1) {
  80.       my $also_empty_tail = ( $tails[0] eq '' );
  81.       shift @tails if $also_empty_tail;
  82.       $expr .= '(?:' . join( '|', @tails ) . ')';
  83.       $expr .= '?' if $also_empty_tail;
  84.     }
  85.     push @final, $expr;
  86.   }
  87.   return \@final;
  88. }
  89.  
  90. # From an earlier version: find max "head" part exploiting regex greediness and recursion
  91. sub find_max_pieces {
  92.   my $v = shift;
  93.   my $vl = scalar @$v;
  94.   return 0 unless $vl > 1;
  95.   my $test = '=' . ( join '=', @$v );
  96.   my $pattern = "(\\=\\w+)(.*\\1){". ($vl - 1) . "}";
  97.   my $re = qr/$pattern/;
  98.   my ($max) = ( $test =~ $re );
  99.   my $ml = length($max);
  100.   return 0 unless $ml;
  101.   my @tails = map { substr( $_, $ml-1 ) } @$v;
  102.   return [ substr($max,1), \@tails ];
  103. }
  104.  
  105.  
  106. sub conv {
  107.   my $arg = shift;
  108.   my $result = ($arg =~ s/([-*?])/\\$1/gr);
  109.   return lc($result);
  110. }
  111.  
  112. sub test {
  113. # Check that $pattern matches all keywords  
  114.   my ($pattern, $data_start) = @_;
  115.   # Rewind data
  116.   seek DATA, $data_start, 0;
  117.   my $re = qr/$pattern/i;
  118.   foreach my $keyword (<DATA>) {
  119.     print "No match for $keyword\n" unless ( $keyword =~ $re );
  120.   }
  121. }
  122.  
  123. # Keyword list is shortened here:
  124. __DATA__
  125. ABAP_SYSTEM_TIMEZONE
  126. ABAP_USER_TIMEZONE
  127. ABAP-SOURCE
  128. ABBREVIATED
  129. ABS
  130. ABSTRACT
  131. ACCEPT
  132. ACCEPTING
  133. ACCORDING
  134. ACTIVATION
  135. ACTUAL
  136. ADABAS
  137. ADD
  138. ADD-CORRESPONDING
  139. ADJACENT
  140. AFTER
  141.  
  142. Z
  143. ZERO
  144. ZONE
Add Comment
Please, Sign In to add comment