Advertisement
cldscchttn

How to Write a Spelling Corrector

Jul 20th, 2017
921
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.72 KB | None | 0 0
  1. # Spelling corrector in R
  2. # Claudio Sacchettini
  3. #
  4. # translated from
  5. # How to Write a Spelling Corrector (Peter Norvig)
  6. # http://norvig.com/spell-correct.html
  7.  
  8.  
  # Split text into lowercase words.
  #
  # Lowercases `text` and returns every maximal run of the letters a-z found
  # in its first element, in order of appearance. The runs are extracted
  # directly (instead of splitting on the separators) so that text starting
  # with a non-letter does not produce a spurious empty-string "word".
  #
  # text: character vector; only the first element is used.
  # Returns a character vector of words (character(0) when there are none).
  words <- function(text) {
    lowered <- tolower(text)
    regmatches(lowered, gregexpr("[a-z]+", lowered))[[1]]
  }
  10.  
  # Count how often each distinct value occurs.
  #
  # features: vector of observations (here, the words of the corpus).
  # Returns the tapply() result: one count per distinct value of `features`,
  # named by that value.
  train <- function(features) {
    tapply(X = features, INDEX = features, FUN = length)
  }
  12.  
  # Build NWORDS, the word-frequency table, from the training corpus big.txt.
  # The file is opened in binary mode, which is the mode readChar() is meant
  # to be used with, and the connection is closed even if reading or counting
  # fails.
  # Note: at most 10000000 characters are read; a larger corpus is truncated.
  con <- file("big.txt", "rb")
  NWORDS <- tryCatch(
    train(words(readChar(con, 10000000))),
    finally = close(con)
  )
  16.  
  # Letters that may be substituted or inserted when generating edits.
  alphabet <- "abcdefghijklmnopqrstuvwxyz"
  18.  
  # Generate every string that is one simple edit away from `word`.
  #
  # The edits are: deleting one character, swapping two adjacent characters,
  # replacing one character with a letter of `alphabet`, and inserting a
  # letter of `alphabet` at any position (including both ends).
  #
  # word: a single character string.
  # Returns a character vector of unique candidates. Replacing a character
  # with itself is one of the edits, so a non-empty `word` is among them.
  edits1 <- function(word) {
    n <- nchar(word)
    cuts <- 0:n
    # Every way to split `word` into a head and a tail, empty parts included.
    heads <- substring(word, 1, cuts)
    tails <- substring(word, cuts + 1, n)
    letter_set <- strsplit(alphabet, NULL)[[1]]
    n_letters <- length(letter_set)

    has_tail <- tails != ""
    deletes <- paste0(heads[has_tail], substring(tails[has_tail], 2))

    # A swap needs at least two characters in the tail; shorter tails would
    # only reproduce the unchanged word.
    can_swap <- nchar(tails) > 1
    transposes <- paste0(
      heads[can_swap],
      substring(tails[can_swap], 2, 2),
      substring(tails[can_swap], 1, 1),
      substring(tails[can_swap], 3)
    )

    replaces <- paste0(
      rep(heads[has_tail], each = n_letters),
      rep(letter_set, times = sum(has_tail)),
      rep(substring(tails[has_tail], 2), each = n_letters)
    )

    # There is one insertion slot per split (n + 1 of them), so the letters
    # are repeated once per split; the empty word then yields each letter.
    inserts <- paste0(
      rep(heads, each = n_letters),
      rep(letter_set, times = length(heads)),
      rep(tails, each = n_letters)
    )

    unique(c(deletes, transposes, replaces, inserts))
  }
  31.  
  # Collect the known words that are two edits away from `word`.
  #
  # Applies edits1() to every result of edits1(word) and keeps the unique
  # candidates that appear among the names of NWORDS.
  #
  # word: a single character string.
  # Returns a character vector of unique known words (possibly empty).
  known_edits2 <- function(word) {
    # The first-level edits are computed once, not once per loop iteration,
    # and the second-level edits are gathered without growing a vector.
    first_edits <- edits1(word)
    second_edits <- as.character(
      unlist(lapply(first_edits, edits1), use.names = FALSE)
    )
    unique(second_edits[second_edits %in% names(NWORDS)])
  }
  39.  
  # Keep only the candidates that occur in the vocabulary.
  #
  # words: character vector of candidate words.
  # Returns the elements of `words` that are names of NWORDS, in their
  # original order (duplicates are kept).
  known <- function(words) {
    vocabulary <- names(NWORDS)
    is_known <- words %in% vocabulary
    words[is_known]
  }
  41.  
  # Return the most probable spelling correction for `word`.
  #
  # Candidates come from the first non-empty set among: `word` itself if it is
  # known, known words one edit away, known words two edits away. If all are
  # empty, `word` is returned unchanged. Otherwise the candidate with the
  # highest count in NWORDS is returned (the first one on ties).
  #
  # word: a single character string.
  # Returns a single character string.
  correction <- function(word) {
    # Each tier is evaluated at most once, and only when the cheaper tiers
    # found nothing; the two-edit search is by far the most expensive.
    candidates <- known(word)
    if (length(candidates) == 0) {
      candidates <- known(edits1(word))
    }
    if (length(candidates) == 0) {
      candidates <- known_edits2(word)
    }
    if (length(candidates) == 0) {
      candidates <- word
    }

    # Either the word is known or nothing better was found: keep it as is.
    if (length(candidates) == 1 && candidates[1] == word) {
      return(candidates)
    }
    names(which.max(NWORDS[names(NWORDS) %in% candidates]))
  }
Advertisement
Add Comment
Please sign in to add a comment
Advertisement