Advertisement
Paul_Pedant

Similar: Fuzzy String Match in awk

Jun 3rd, 2020
3,838
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Awk 2.11 KB | None | 0 0
  1. #! /bin/bash
  2.  
  3. LC_ALL="C"
  4.  
  5. #### Ranking pairs of strings for similarity.
  6.  
  7. function Ranking {
  8.  
  9.     local AWK='''
  10. BEGIN { FS = "/"; Db = 0; }
  11. #- Given pair of substrings, return length of common initial string.
  12. function nSimil (ta, tb, Local, n, am, bm) {
  13.     am = length (ta); bm = length (tb);
  14.     for (n = 1; n <= am  &&  n <= bm; n++)
  15.         if (substr (ta, n, 1) != substr (tb, n, 1)) break;
  16.     return (n - 1);
  17. }
  18. #- Given pair of strings, return longest exact match included.
  19. function vSimil (ta, tb, Local, s, k, n, na, nb, a, b) {
  20.     if (Db) printf ("/%s/%s/\n", ta, tb);
  21.     n = 0; for (a = 1; length (ta) > n; a++) {
  22.         s = tb; for (b = 0; length (ta) > n  &&  length (s) > n; b += k) {
  23.             if ((k = index (s, substr (ta, 1, n + 1))) == 0) break;
  24.             n += 1 + nSimil( substr (ta, n + 2), substr (s, k + n + 1));
  25.             na = a; nb = b + k; s = substr (s, k + 1);
  26.         }
  27.         ta = substr (ta, 2);
  28.     }
  29.     if (Db) printf ("%.4d%.4d%.4d\n", n, na, nb);
  30.     return (sprintf ("%.4d%.4d%.4d", n, na, nb));
  31. }
  32. #- Given a pair of outer strings, recursively hunt for matches.
  33. function rSimil (ta, tb, Local, tn, q, n, a, b) {
  34.     q = vSimil( ta, tb); if (0 + substr (q, 1, 4) == 0) return (tn);
  35.     n = 0 + substr (q, 1, 4); tn += n;
  36.     a = 0 + substr (q, 5, 4); b = 0 + substr (q, 9, 4);
  37.     if (a > 1  ||  b > 1)
  38.         tn += rSimil( substr (ta, 1, a - 1), substr (tb, 1, b - 1));
  39.     if (a + n < length (ta)  ||  b + n < length (tb))
  40.         tn += rSimil( substr (ta, a + n), substr (tb, b + n));
  41.     return (tn);
  42. }
  43. #- Action a comparison.
  44. function Similar (a, b, Local, n, m) {
  45.     n = (a == b) ? length (a) : rSimil( a, b);
  46.     m = length (a) + length (b);
  47.     return ((m > 0) ? (n + n) / m : 1.00);
  48. }
  49. #- Process the input strings.
  50. { printf ("%.3f%s%s%s%s\n", Similar( $1, $2), FS, $1, FS, $2); }
  51. '''
  52.     awk -f <( echo "${AWK}" )
  53. }
  54.  
  55. #### Script Body Starts Here.
  56.  
  57.     {
  58.         echo 'Similar/Similar'
  59.         echo 'r/barry'
  60.         echo '/'
  61.         echo 'One/'
  62.         echo '/Two'
  63.         echo 'supersciliously/pernicious'
  64.         echo 'JOE KERR/JOKER'
  65.         echo 'TOE/TO'
  66.         echo 'TO/TWO'
  67.         echo 'TWO/TOE'
  68.         echo "Many of my friends have social tenderness/\
  69. Man of my fiends have socialist tendencies"
  70.     } | Ranking
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement