Advertisement
Paul_Pedant

Sim.c Version 2: Fuzzy Matching enhanced.

Jun 8th, 2020
377
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 10.24 KB | None | 0 0
  1. /*
  2.  * Sim.c: Fuzzy string comparison,
  3.  * Author: Paul Stillman 1998-2020.
  4.  */
  5.  
  6. static char *Usage =
  7.  
  8. "\nUsage: Sim [-h] [-H] [-f] [-p] [-b] [-w] [-t nn] [-|file] [-|file]"
  9. "\n";
  10.  
  11. static char *Help [] = {
  12.  
  13. "Sim reports the similarity of strings on a scale of 0.000 to 1.000.",
  14. "Ranking takes account of successively shorter sub-strings that may",
  15. "be separated by non-matching parts. The final ranking is the count",
  16. "of pairs of ordered characters, divided by the total strings length.",
  17. "There are no patterns or wildcards recognised by the matching.",
  18. "Output is sorted by Rank, highest first.",
  19. "",
  20. "If two file names are given (one of which may be - for stdin),",
  21. "each line in each file is matched against the other file.",
  22. "If one name is given (or - or no name for stdin), then every line",
  23. "in that file is matched against every other line.",
  24. "",
  25. "Sample output:",
  26. "1.000|Similar|Similar|",
  27. "0.811|socialist tendencies|social tenderness|",
  28. "0.800|TWO|TO|",
  29. "0.800|TOE|TO|",
  30. "0.769|JOKER|JOE KERR|",
  31. "",
  32. "Options:",
  33. "",
  34. "-h\tUsage statement.",
  35. "-H\tThis Help page.",
  36. "-f\tFolds lower-case letters into upper case.",
  37. "-p\tIgnores all punctuation.",
  38. "-b\tIgnores leading and trailing blanks (spaces and tabs)",
  39. "\tand treats other strings of blanks as equivalent.",
  40. "-w\tIgnores all blanks (space and tab characters); for",
  41. "\texample, `if ( a == b )' will compare equal to `if(a==b)'.",
  42. "-t nn\tThreshold: minimum similarity to be reported. Format of nn",
  43. "\tis a decimal number <= 1.000. Default is to report all lines.",
  44. "",
  45. };
  46.  
  47. #include <stdio.h>
  48. #include <stdlib.h>
  49. #include <string.h>
  50. #include <ctype.h>
  51. #include <sys/types.h>
  52. #include <sys/stat.h>
  53. #include <fcntl.h>
  54. #include <unistd.h>
  55.  
  56. #define  PUBLIC
  57. #define  LOCAL  static
  58. #define  EMPTY
  59. #define  esac break
  60.  
  61. LOCAL    FILE *OUT = NULL;
  62. LOCAL    FILE *ERR = NULL;
  63.  
  64. #define  SZ_BUF 8000
  65.  
  66. #define  NUL  '\0'
  67. #define  NL   '\n'
  68. #define  DOT  '.'
  69. #define  TAB  '\t'
  70. #define  BLK  ' '
  71.  
  72. ////.. Local constants.
  73.  
  74. LOCAL  char *Prog   = "Sim";
  75. LOCAL  int    fOpt  = 0;
  76. LOCAL  int    pOpt  = 0;
  77. LOCAL  int    bOpt  = 0;
  78. LOCAL  int    wOpt  = 0;
  79. LOCAL  double tOpt  = -0.001;
  80.  
  81. ////.. Storage for text lines.
  82.  
  83. #define INIT_TEXT (16 * 1024)
  84. #define INCR_TEXT (32 * 1024)
  85.  
  86. struct File_t {     //.. Each file gets its own list of texts.
  87.  
  88.     char    *fName;     //.. Points back to argv.
  89.     int     fd;
  90.  
  91.     char    *Text;      //.. malloc: Complete text from input file.
  92.     size_t  nText;      //.. Size of text read.
  93.     size_t  nSize;      //.. Size of allocation.
  94.  
  95.     char    **Rows;     //.. malloc: Array of pointers to rows.
  96.     size_t  nRows;
  97. };
  98. typedef struct File_t File_t;
  99.  
  100. LOCAL File_t File_Group [2] = { 0 };
  101.  
  102. LOCAL File_t *pFileA = & File_Group[0];
  103. LOCAL File_t *pFileB = & File_Group[1];
  104.  
  105. ////.. Storage for results.
  106.  
  107. struct Out_t {
  108.  
  109.     char    Rank[8];    //.. As a string.
  110.     char    *A;
  111.     char    *B;
  112. };
  113. typedef struct Out_t Out_t;
  114.  
  115. struct Results_t {
  116.  
  117.     size_t  nMax;
  118.     size_t  nUse;
  119.     Out_t   *pOut;
  120. };
  121. typedef struct Results_t Results_t;
  122.  
  123. LOCAL Results_t Results = { 0 };
  124.  
  125. LOCAL Results_t *Out = & Results;
  126.  
  127.  
  128. ////.. Load a file into a File_t list.
  129.  
  130. LOCAL int fileLoader (File_t *pF, char *fn)
  131.  
  132. {
  133. struct stat Stat;
  134. ssize_t Init = INIT_TEXT;
  135. ssize_t Incr = INCR_TEXT;
  136. ssize_t rc;
  137.  
  138.     if (! strcmp (fn, "-")) {
  139.         pF->fName = "stdin"; pF->fd = 0;
  140.     } else {
  141.         pF->fName = fn; pF->fd = open (fn, O_RDONLY);
  142.     }
  143.     if (pF->fd < 0) { perror (pF->fName); exit (1); }
  144.    
  145.     //.. For a regular file, we can find the available size.
  146.     //.. If stdin is a pipe, we have to guess and extend.
  147.     if (! fstat (pF->fd, & Stat) && S_ISREG (Stat.st_mode)) {
  148.         Init = Stat.st_size + 100; Incr = 1024;
  149.     }
  150.     while (1) {
  151.         if (pF->nSize - pF->nText < 80) {
  152.             pF->nSize = (pF->nSize == 0) ? (Init) : (pF->nSize + Incr);
  153.             pF->Text = realloc ((void*) pF->Text, pF->nSize);
  154.         }
  155.         rc = read (pF->fd, (void *) pF->Text + pF->nText,
  156.             pF->nSize - pF->nText - 4);
  157.         if (rc > 0) pF->nText += rc; else break;
  158.     }
  159.     if (pF->fd > 0) close (pF->fd);
  160.     return (0);
  161. }
  162.  
  163. ////.. Index the rows in a file.
  164.  
  165. LOCAL int fileIndex (File_t *pF)
  166.  
  167. {
  168. char *p, *q, *r, *e;
  169. size_t nRow = 0;
  170.  
  171.     p = pF->Text; e = p + pF->nText;
  172.  
  173.     if (*(e - 1) != NL) {
  174.         *e++ = NL;
  175.         printf ("Last line needs newline\n");
  176.     }
  177.     *e = NUL;
  178.  
  179.     //.. Count the rows in the data.
  180.     for (q = p; q < e; q++) if (*q == NL) ++nRow;
  181.  
  182.     pF->Rows = malloc ((nRow + 2) * sizeof (pF->Rows));
  183.  
  184.     //.. Make each input line a null-terminated string, and keep its address.
  185.     for (q = p; q < e; q = r) {
  186.         for (r = q; r < e; r++) {
  187.             if (*r == NL) {
  188.                 *(pF->Rows + pF->nRows++) = q;
  189.                 *r++ = NUL; break;
  190.             }
  191.         }
  192.     }
  193. }
  194.  
  195. ////.. Return the longest matching string within two inputs.
  196.  
  197. LOCAL int strMatch (char **ax, char **bx, char *ap, char *ae, char *bp, char *be)
  198.  
  199. {
  200. int m = 0; char *am = ap, *bm = bp;
  201. char *aq, *bq, *ar, *br, *af = ae, *bf = be;
  202.  
  203.     for (aq = ap; aq < af; aq++) {
  204.         for (bq = bp; bq < bf; bq++) {
  205.             for (ar = aq, br = bq; ar < af && br < bf && *ar == *br;
  206.                     ar++, br++) EMPTY;
  207.             if (m < ar - aq) {
  208.                 m = ar - aq; am = aq; bm = bq; af = ae - m; bf = be - m;
  209.     }   }   }
  210.     *ax = am; *bx = bm; return (m);
  211. }
  212.  
  213. ////.. Recurse through the entire partitioning.
  214.  
  215. LOCAL int strPart (char *ap, char *ae, char *bp, char *be)
  216.  
  217. {
  218. int n; char *aq, *bq;
  219.  
  220.     //.. Partition at the longest matching string.
  221.     if ((n = strMatch (& aq, & bq, ap, ae, bp, be)) > 0) {
  222.         //.. Repeat on the right-hand remnant.
  223.         if (aq + n < ae && bq + n < be) n += strPart (aq + n, ae, bq + n, be);
  224.         //.. Repeat on the left-hand remnant.
  225.         if (ap     < aq && bp     < bq) n += strPart (ap,     aq, bp,     bq);
  226.     }
  227.     return (n);
  228. }
  229.  
  230. ////.. Copy a string, discarding all or multiple whitespace.
  231.  
  232. LOCAL inline void wDrop (char *t, char *p)
  233.  
  234. {
  235. char *e; for (e = p; *e; e++) EMPTY;
  236.  
  237.     while (p < e && isspace (*p)) p++;
  238.     while (p < e && isspace (*(e - 1))) e--;
  239.     while (p < e)
  240.         if (! isspace (*p)) *t++ = *p++;
  241.         else { if (! wOpt) *t++ = BLK; while (p < e && isspace (*p)) p++; }
  242.     *t++ = NUL;
  243. }
  244.  
  245. ////.. Discard punctuation from a string in-situ.
  246.  
  247. LOCAL inline void pDrop (char *tx)
  248.  
  249. {
  250. char *p, *q;
  251.  
  252.     for (p = tx, q = p; *p; p++) if (! ispunct (*p)) *q++ = *p;
  253.     *q++ = NUL;
  254. }
  255.  
  256. ////.. Fold lowercase into uppercase in situ.
  257.  
  258. LOCAL inline void cFold (char *tx)
  259.  
  260. {
  261. char *p;
  262.  
  263.     for (p = tx; *p; p++) *p = toupper (*p);
  264. }
  265.  
  266. ////.. Evaluate similarity of two strings.
  267.  
  268. LOCAL double Similar (char *aStr, char *bStr)
  269.  
  270. {
  271. char aX [SZ_BUF], bX [SZ_BUF];
  272. int n, n1, n2, t;
  273. char *ap = aX, *bp = bX;
  274. char *ae, *be;
  275.  
  276.     //.. Copy original lines to buffers, applying options.
  277.     if (bOpt || wOpt) { wDrop (ap, aStr); wDrop (bp, bStr); }
  278.     if (pOpt) { pDrop (ap); pDrop (bp); }
  279.     if (fOpt) { cFold (ap); cFold (bp); }
  280.     //.. Make pointers to initial terminators.
  281.     for (ae = ap; *ae; ae++) EMPTY; for (be = bp; *be; be++) EMPTY;
  282.  
  283.     //.. Test both ways round, and return the best outcome.
  284.     n1 = strPart (ap, ae, bp, be); n2 = strPart (bp, be, ap, ae);
  285.     n = (n1 > n2) ? n1 : n2; t = (ae - ap) + (be - bp);
  286.     return ((double) (n + n) / (double) t);
  287. }
  288.  
  289. ////.. Compare n rows from one file, 2 at a time. (n-C-r)
  290.  
  291. LOCAL void doOneFile (File_t *fA)
  292.  
  293. {
  294. char **pA, **eA;
  295. char **pB;
  296. double s;
  297. Out_t *pOut;
  298.  
  299.     Out->nMax = (fA->nRows * fA->nRows / 2);
  300.     Out->pOut = malloc (Out->nMax * sizeof (Out_t));
  301.  
  302.     for (pA = fA->Rows, eA = pA + fA->nRows; pA < eA; pA++) {
  303.         for (pB = fA->Rows; pB < pA; pB++) {
  304.             s = Similar (*pA, *pB);
  305.             if (s >= tOpt) {
  306.                 pOut = Out->pOut + Out->nUse++;
  307.                 sprintf (pOut->Rank, "%5.3f", s);
  308.                 pOut->A = *pA; pOut->B = *pB;
  309.             }
  310.         }
  311.     }
  312. }
  313.  
  314. ////.. Compare m rows from one file with n rows from another (m x n).
  315.  
  316. LOCAL void doTwoFiles (File_t *fA, File_t *fB)
  317.  
  318. {
  319. char **pA, **eA;
  320. char **pB, **eB;
  321. double s;
  322. Out_t *pOut;
  323.  
  324.     Out->nMax = (fA->nRows * fB->nRows);
  325.     Out->pOut = malloc (Out->nMax * sizeof (Out_t));
  326.  
  327.     for (pA = fA->Rows, eA = pA + fA->nRows; pA < eA; pA++) {
  328.         for (pB = fB->Rows, eB = pB + fB->nRows; pB < eB; pB++) {
  329.             s = Similar (*pA, *pB);
  330.             if (s >= tOpt) {
  331.                 pOut = Out->pOut + Out->nUse++;
  332.                 sprintf (pOut->Rank, "%5.3f", s);
  333.                 pOut->A = *pA; pOut->B = *pB;
  334.             }
  335.         }
  336.     }
  337. }
  338.  
  339. ////.. Output result compare function, as per qsort interface.
  340.  
  341. LOCAL int compareOut (const void *p, const void *q)
  342.  
  343. {
  344. Out_t *P = (Out_t *) p; Out_t *Q = (Out_t *) q;
  345. int r;
  346.  
  347.     r = strcmp (Q->Rank, P->Rank); if (r) return (r);
  348.     r = strcmp (P->A, Q->A); if (r) return (r);
  349.     return (strcmp (P->B, Q->B));
  350. }
  351.  
  352. ////.. Process one or two files.
  353.  
  354. LOCAL int FileNames (char *fnA, char *fnB)
  355.  
  356. {
  357. Out_t *p, *e;
  358.  
  359.     if (fnA != NULL) {
  360.         fileLoader (pFileA, fnA);
  361.         fileIndex (pFileA);
  362.     }
  363.     if (fnB != NULL) {
  364.         fileLoader (pFileB, fnB);
  365.         fileIndex (pFileB);
  366.     }
  367.     if (pFileA->nRows > 0 && pFileB->nRows == 0) doOneFile  (pFileA);
  368.     if (pFileA->nRows > 0 && pFileB->nRows > 0)  doTwoFiles (pFileA, pFileB);
  369.  
  370.     qsort ((void *) Out->pOut, Out->nUse, sizeof (Out_t), compareOut);
  371.     for (p = Out->pOut, e = p + Out->nUse; p < e; p++) {
  372.         printf ("%s|%s|%s|\n", p->Rank, p->A, p->B);
  373.     }
  374.     return (0);
  375. }
  376.  
  377. ////.. Argument parsing.
  378.  
  379. PUBLIC int main (int argc, char **argv)
  380.  
  381. {
  382. char *s, *e;
  383.  
  384.     OUT = stdout;
  385.     ERR = stderr;
  386.  
  387.     argc--; argv++;
  388.  
  389.     while (argc > 0) {
  390.         if (**argv != '-') break;
  391.         if (**argv == '-' && *(*argv + 1) == NUL) break;
  392.  
  393.         switch (*(*argv + 1)) {
  394.         case 'h':
  395.             fprintf (ERR, "%s\n", Usage); return (2);
  396.         esac;
  397.  
  398.         case 'H':  { char **hs, **he;
  399.             fprintf (OUT, "%s\n", Usage);
  400.             for (hs = Help, he = Help + sizeof (Help) / sizeof (*Help);
  401.                 hs < he; hs++) fprintf (OUT, "%s\n", *hs);
  402.             return (2);
  403.         } esac;
  404.  
  405.         case 'f': fOpt = 1; argc--; argv++; esac;
  406.         case 'p': pOpt = 1; argc--; argv++; esac;
  407.         case 'b': bOpt = 1; argc--; argv++; esac;
  408.         case 'w': wOpt = 1; argc--; argv++; esac;
  409.  
  410.         case 't':
  411.             if (argc < 2) { fprintf (ERR, "%s\n", Usage); return (2); }
  412.             tOpt = atof (*(argv + 1));
  413.             argc -= 2; argv += 2;
  414.         esac;
  415.  
  416.         default:  {
  417.             fprintf (ERR, "%s\n", Usage); return (2);
  418.         } esac;
  419.  
  420.         } //.. END switch (*(*argv + 1))
  421.     } //.. END while (argc > 0)
  422.  
  423.     if (argc == 0) return (FileNames ("-", NULL));
  424.     if (argc == 1) return (FileNames (*(argv + 0), NULL));
  425.     if (argc == 2) return (FileNames (*(argv + 0), *(argv + 1)));
  426.  
  427.     fprintf (ERR, "%s\n", Usage); return (2);
  428. }
  429.  
  430. ////.. Build with: cc -o Sim Sim.c -lm
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement