Advertisement
Paul_Pedant

Sim.c Fuzzy string matching in C.

Jun 3rd, 2020
370
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 5.90 KB | None | 0 0
  1. /*
  2.  * Sim.c: Fuzzy string comparison,
  3.  * Author: Paul Stillman 1998-2008.
  4.  */
  5.  
  6. static char *Usage =
  7.  
  8. "\nUsage: Sim [-h] [-H] [-f] [-b] [-w] [-t nn] [-] [string...]"
  9. "\n";
  10.  
  11. static char *Help [] = {
  12.  
  13. "Sim reports the similarity of strings on a scale of 0.000 to 1.000.",
  14. "Ranking takes account of successively shorter sub-strings that may",
  15. "be separated by non-matching parts. The final ranking is the count",
  16. "of pairs of ordered characters, divided by the total strings length.",
  17. "There are no patterns or wildcards recognised by the matching.",
  18. "",
  19. "One or more strings may be defined as command line arguments.",
  20. "Strings are also read (one per line) from standard input.",
  21. "Every such string is compared against every argument string.",
  22. "If there is more than one command-line string, each ranking report",
  23. "includes a number 00-99 to indicate the argument to which it relates.",
  24. "",
  25. "Sample output with one argument and two strings:",
  26. "0.543 First String",
  27. "0.712 Second String",
  28. "",
  29. "Sample output with two arguments and two strings:",
  30. "0.543 00 First String",
  31. "0.712 01 First String",
  32. "0.373 00 Second String",
  33. "0.962 01 Second String",
  34. "",
  35. "Options:",
  36. "",
  37. "-h\tUsage statement.",
  38. "-H\tThis Help page.",
  39. "-f\tFolds lower-case letters into upper case.",
  40. "-b\tIgnores leading and trailing blanks (spaces and tabs)",
  41. "\tand treats other strings of blanks as equivalent.",
  42. "-w\tIgnores all blanks (space and tab characters); for",
  43. "\texample, `if ( a == b )' will compare equal to `if(a==b)'.",
  44. "-t nn\tThreshold: minimum similarity to be reported. Format of nn",
  45. "\tis a decimal number <= 1.000. Default is to report all lines.",
  46. "-\tEnd options: useful where argument string starts with -.",
  47. "",
  48. };
  49.  
  50. #include <stdio.h>
  51. #include <stdlib.h>
  52. #include <string.h>
  53. #define __XPG4_CHAR_CLASS__ 1
  54. #include <ctype.h>
  55.  
  56. #define  PUBLIC
  57. #define  LOCAL  static
  58. #define  EMPTY
  59. #define  esac break
  60.  
  61. LOCAL    FILE *OUT = NULL;
  62. LOCAL    FILE *ERR = NULL;
  63.  
  64. #define  SZ_BUF 4000
  65.  
  66. #define  NUL  '\0'
  67. #define  NL   '\n'
  68. #define  DOT  '.'
  69. #define  TAB  '\t'
  70. #define  BLK  ' '
  71.  
  72. /*----- Local constants. */
  73.  
  74. LOCAL  char *Prog   = "Sim";
  75. LOCAL  int    fOpt  = 0;
  76. LOCAL  int    bOpt  = 0;
  77. LOCAL  int    wOpt  = 0;
  78. LOCAL  double tOpt  = -0.001;
  79.  
  80. /*----- Return the longest matching string within two inputs. */
  81.  
  82. LOCAL int LMS (char **ax, char **bx, char *ap, char *ae, char *bp, char *be)
  83.  
  84. {
  85. int m = 0; char *am = ap, *bm = bp;
  86. char *aq, *bq, *ar, *br, *af = ae, *bf = be;
  87.  
  88.     for (aq = ap; aq < af; aq++) {
  89.         for (bq = bp; bq < bf; bq++) {
  90.             for (ar = aq, br = bq; ar < af && br < bf && *ar == *br;
  91.                     ar++, br++) EMPTY;
  92.             if (m < ar - aq) {
  93.                 m = ar - aq; am = aq; bm = bq; af = ae - m; bf = be - m;
  94.     }   }   }
  95.     *ax = am; *bx = bm; return (m);
  96. }
  97.  
  98.  
  99. /*----- Recurse through the entire partitioning. */
  100.  
  101. LOCAL int RPS (char *ap, char *ae, char *bp, char *be)
  102.  
  103. {
  104. int n; char *aq, *bq;
  105.  
  106.     if ((n = LMS (& aq, & bq, ap, ae, bp, be)) > 0) {
  107.         if (aq + n < ae && bq + n < be) n += RPS (aq + n, ae, bq + n, be);
  108.         if (ap     < aq && bp     < bq) n += RPS (ap,     aq, bp,     bq);
  109.     }
  110.     return (n);
  111. }
  112.  
  113.  
  114. /*----- Copy a string dropping all or multiple whitespace. ----- */
  115.  
  116. LOCAL char *wDrop (char *t, char *p)
  117.  
  118. {
  119. char *z = t; char *e; for (e = p; *e; e++) EMPTY;
  120.  
  121.     while (p < e && isspace (*p)) p++;
  122.     while (p < e && isspace (*(e - 1))) e--;
  123.     while (p < e)
  124.         if (! isspace (*p)) *t++ = *p++;
  125.         else { if (! wOpt) *t++ = BLK; while (p < e && isspace (*p)) p++; }
  126.     *t++ = NUL; return (z);
  127. }
  128.  
  129.  
  130. /*----- Evaluate similarity of two strings. ----- */
  131.  
  132. LOCAL double Similar (char *ap, char *bp)
  133.  
  134. {
  135. int n, n1, n2, t; char *ae, *be;
  136. char *p, *q;
  137. char aX [SZ_BUF], bX [SZ_BUF];
  138.  
  139.     if (bOpt || wOpt) {
  140.         ap = wDrop (aX, ap); if (fOpt) for (q = ap; *q; q++) *q = toupper (*q);
  141.         bp = wDrop (bX, bp); if (fOpt) for (q = bp; *q; q++) *q = toupper (*q);
  142.     } else if (fOpt) {
  143.         p = ap; ap = q = aX; while (*q++ = toupper (*p++)) EMPTY;
  144.         p = bp; bp = q = bX; while (*q++ = toupper (*p++)) EMPTY;
  145.     }
  146.     for (ae = ap; *ae; ae++) EMPTY; for (be = bp; *be; be++) EMPTY;
  147.     n1 = RPS (ap, ae, bp, be); n2 = RPS (bp, be, ap, ae);
  148.     n = (n1 > n2) ? n1 : n2; t = (ae - ap) + (be - bp);
  149.     return ((double) (n + n) / (double) t);
  150. }
  151.  
  152.  
  153. /*----- Compare each argument against every input string. */
  154.  
  155. LOCAL void Comparer (char **p, char **e)
  156.  
  157. {
  158. int  a, m;
  159. char **q;
  160. char *z; char Buf [SZ_BUF];
  161. double s;
  162.  
  163.     while (fgets (Buf, sizeof (Buf), stdin) != NULL) {
  164.         for (z = Buf; *z; z++) EMPTY; if (Buf < z && *(--z) == NL) *z = NUL;
  165.         if (p + 1 < e) {
  166.             for (a = 0, q = p; q < e; a++, q++) {
  167.                 s = Similar (*q, Buf);
  168.                 if (s >= tOpt) printf ("%5.3f %.2d %s\n", s, a, Buf);
  169.             }
  170.         } else if (p < e) {
  171.             s = Similar (*p, Buf);
  172.             if (s >= tOpt) printf ("%5.3f %s\n", s, Buf);
  173.         }
  174.     }
  175. }
  176.  
  177.  
  178. /*----- Argument parsing. */
  179.  
  180. PUBLIC int main (int argc, char **argv)
  181.  
  182. {
  183. char *s, *e;
  184.  
  185.     OUT = stdout;
  186.     ERR = stderr;
  187.  
  188.     argc--; argv++;
  189.  
  190.     while (argc > 0) {
  191.         if (**argv != '-') break;
  192.         if (**argv == '-' && *((*argv) + 1) == NUL) { argc--; argv++; break; }
  193.  
  194.         switch (*(*argv + 1)) {
  195.         case 'h':
  196.             fprintf (OUT, "%s\n", Usage); return (2);
  197.         esac;
  198.  
  199.         case 'H':  { char **hs, **he;
  200.             fprintf (OUT, "%s\n", Usage);
  201.             for (hs = Help, he = Help + sizeof (Help) / sizeof (*Help);
  202.                 hs < he; hs++) fprintf (OUT, "%s\n", *hs);
  203.             return (2);
  204.         } esac;
  205.  
  206.         case 'f': fOpt = 1; argc--; argv++; esac;
  207.         case 'b': bOpt = 1; argc--; argv++; esac;
  208.         case 'w': wOpt = 1; argc--; argv++; esac;
  209.  
  210.         case 't':
  211.             if (argc < 2) { fprintf (ERR, "%s\n", Usage); return (2); }
  212.             tOpt = atof (*(argv + 1));
  213.             argc -= 2; argv += 2;
  214.         esac;
  215.  
  216.         default:  {
  217.             fprintf (OUT, "%s\n", Usage); return (2);
  218.         } esac;
  219.  
  220.         } /* END switch (*(*argv + 1)) */
  221.     } /* END while (argc > 0) */
  222.  
  223.     Comparer (argv, argv + argc);
  224.  
  225.     return (0);
  226. }
  227.  
  228. /* Build with
  229.     HP:   cc -D_HPUX_SOURCE -Aa -o Sim Sim.c -lm
  230.     DEC:  cc -o Sim Sim.c -lm
  231.  */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement