Advertisement
Paul_Pedant

SimFile: Fuzzy match of all pairs of strings in a file.

Jun 3rd, 2020
365
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 2.49 KB | None | 0 0
  1. #! /bin/ksh
  2. #! Author: Paul Stillman. Copyright (2008) Pipe Dreams Ltd.
  3.  
  # Usage: print the command synopsis on standard output.
  # The text is filtered through "expand -4" so that any tab characters
  # in it are rendered at 4-column tab stops.
  Usage ()
  {
      expand -4 <<'USAGE_END'

Usage:  SimFile [-d x] [-r] [-f] [-b] [-w] [-t nn]
        SimFile [-h] [-H]

USAGE_END
  }
  # Help: print the synopsis (via Usage) followed by the full description
  # of the tool and its options, all on standard output.
  # The description is filtered through "expand -4" like the synopsis.
  Help ()
  {
      Usage
      expand -4 <<'HELP_END'
SimFile reports the similarity between every combination (two at a time)
of text lines read from standard input, on a scale of 0.000 to 1.000.
Blank and duplicate lines are omitted from the comparisons.
Each report line is e.g. "0.778|This line|That line"

Caution: n lines take 0.5 x n x (n - 1) comparisons.

Options:
-h  Usage statement.
-H  This Help page.
-d  Delimiter for data pairs in report. May need quoting. Default is "|".
-r  Rank: sorts output with best matches first. Default is unsorted.
-f  Folds lower-case letters into upper case.
-b  Ignores leading and trailing blanks (spaces and tabs)
    and treats other strings of blanks as equivalent.
-w  Ignores all blanks (space and tab characters); for
    example, `if ( a == b )' will compare equal to `if(a==b)'.
-t  Threshold: minimum similarity to be reported. Format of nn
    is a decimal number <= 1.000. Default is to report all lines.

See Also: Sim -H

HELP_END
  }
  36.  
  #### Shell Variables.

  # Command that Rank runs for every reference line. The path is relative,
  # so it is resolved against the current working directory.
  Sim='./Sim'
  40.  
  41. #### Do a comparison between all unique non-empty elements.
  42.  
  function Rank {     #:: (dOpt, fOpt, bOpt, wOpt, tOpt) < strings

      # Compare every unordered pair of distinct, non-blank input lines.
      #
      # Arguments:
      #   $1  delimiter placed between the three report fields
      #   $2  fold flag for Sim   (empty, or e.g. "-f")
      #   $3  blank flag for Sim  (empty, or e.g. "-b")
      #   $4  white flag for Sim  (empty, or e.g. "-w")
      #   $5  threshold handed to Sim as the value of -t
      # Globals:  Sim (read) - command run for each reference line.
      # Input:    text lines on standard input.
      # Output:   one line per reported pair: score, delimiter, reference
      #           line, delimiter, compared line.
      #
      # Kept as a ksh "function name" definition so that typeset variables
      # are local to the function.

      typeset Delim="${1}" Fold="${2}" Blank="${3}" White="${4}" Thresh="${5}"
      typeset Lines Total Ref
      typeset Idx=1

      # Drop lines holding only spaces/tabs, then sort and de-duplicate.
      Lines="$( awk '! /^[ \011]*$/' | sort | uniq )"

      # Count the lines with a command substitution rather than "wc | read",
      # which only works in shells that run the last pipeline stage in the
      # current process. The $(( )) wrapper strips any padding wc adds.
      Total=$(( $( printf '%s\n' "${Lines}" | wc -l ) ))

      # Merge step: a "9.999 text" marker line sets the current reference
      # text; every other line is taken to be Sim output and is re-emitted
      # as score, D, reference, D, text.
      # NOTE(review): assumes each Sim output line is a 5-character score,
      # one separator character, then the text - confirm against Sim.
      typeset Merge='
  $1 == 9.999 { A = substr ($0, 7); next; }
  { printf ("%5s%s%s%s%s\n", substr ($0, 1, 5), D, A, D, substr ($0, 7)); }
  '
      # Line Idx is the reference; only the lines after it are sent to Sim,
      # so each pair is compared once. printf '%s\n' is used throughout so
      # that backslashes in the data are passed through unmodified.
      while (( Idx < Total )); do
          Ref="$( printf '%s\n' "${Lines}" | tail -n "+${Idx}" | head -n 1 )"
          printf '%s\n' "9.999 ${Ref}"
          Idx=$(( Idx + 1 ))
          printf '%s\n' "${Lines}" | tail -n "+${Idx}" |
              "${Sim}" ${Fold:+"${Fold}"} ${Blank:+"${Blank}"} \
                  ${White:+"${White}"} -t "${Thresh}" - "${Ref}"
      done | awk -v D="${Delim}" "${Merge}"
      # The delimiter goes in with -v instead of being spliced into the awk
      # program text, so a delimiter such as a double quote no longer breaks
      # the program; escape sequences like \t are still interpreted by awk.
  }
  62.  
  63.  
  64. #### Script Body Starts here.
  65.  
  # Option state. Defaults: "|" as delimiter, no ranking, no flags for Sim.
  # The default threshold of -0.001 presumably lies below every possible
  # score, so that all pairs are reported (see the Help text).
  typeset dOpt="|" rOpt fOpt bOpt wOpt tOpt="-0.001"

  # Consume the command line one word at a time. Every word must be a
  # recognised option; anything else is reported and ends the script.
  while [[ "${#}" -gt 0 ]]; do
      case "${1}" in
      (-h)
          Usage
          exit 2
          ;;
      (-H)
          Help
          exit 2
          ;;
      (-d)
          # Takes a value; a missing or empty value falls back to "|".
          shift
          dOpt="${1:-|}"
          [[ "${#}" -gt 0 ]] && shift 1
          ;;
      (-r)
          shift
          rOpt=rOpt
          ;;
      (-f)
          shift
          fOpt="-f"
          ;;
      (-b)
          shift
          bOpt="-b"
          ;;
      (-w)
          shift
          wOpt="-w"
          ;;
      (-t)
          # Takes a value; a missing or empty value falls back to -0.001.
          shift
          tOpt="${1:--0.001}"
          [[ "${#}" -gt 0 ]] && shift 1
          ;;
      (*)
          print -u2 - "Unknown option ${1}"
          exit 1
          ;;
      esac
  done

  # With -r the report is passed through a reverse sort, which puts the
  # highest scores first; otherwise it is emitted in comparison order.
  if [[ "${rOpt}" = "rOpt" ]]; then
      Rank "${dOpt}" "${fOpt}" "${bOpt}" "${wOpt}" "${tOpt}" | sort -r
  else
      Rank "${dOpt}" "${fOpt}" "${bOpt}" "${wOpt}" "${tOpt}"
  fi
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement