Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Spelling corrector in R
# Claudio Sacchettini
#
# Translated from "How to Write a Spelling Corrector" by Peter Norvig:
# http://norvig.com/spell-correct.html
# Split text into lower-case alphabetic tokens.
#
# Args:
#   text: character vector of raw text.
# Returns:
#   Character vector holding every maximal run of the letters a-z found in
#   `text` (after lower-casing), in order of appearance; character(0) when
#   there is none.
words <- function(text) {
  tokens <- unlist(strsplit(tolower(text), "[^a-z]+"), use.names = FALSE)
  # strsplit() produces a leading "" whenever the text starts with a
  # separator; drop it (and any NA) so it is never treated as a word.
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]
  as.character(tokens)
}
# Count how often each distinct value occurs in `features`.
#
# Args:
#   features: vector of tokens.
# Returns:
#   The result of tapply(): one count per distinct value, labelled by that
#   value.
train <- function(features) {
  tapply(X = features, INDEX = features, FUN = length)
}
# Build the word-frequency table from the training corpus "big.txt".
# At most 10,000,000 characters are read; anything beyond that is ignored.
con <- file("big.txt", "r")
NWORDS <- tryCatch(
  train(words(readChar(con, 10000000))),
  # Close the connection even when reading or tokenizing fails.
  finally = close(con)
)

# Letters used when generating candidate edits of a word.
alphabet <- "abcdefghijklmnopqrstuvwxyz"
# Generate every string that is one simple edit away from `word`.
#
# An edit is a single-character deletion, a swap of two adjacent characters,
# a replacement of one character by a letter of the global `alphabet`, or an
# insertion of one such letter at any position.
#
# Args:
#   word: a single character string.
# Returns:
#   Character vector of the distinct edited strings. Because a character can
#   be replaced by itself, `word` is part of the result whenever it is
#   non-empty.
edits1 <- function(word) {
  alphabet_chars <- unlist(strsplit(alphabet, NULL))
  n_letters <- length(alphabet_chars)
  n_chars <- nchar(word)

  # Every way of cutting the word into a head and a tail, including the
  # empty head and the empty tail.
  cuts <- 0:n_chars
  heads <- substring(word, 1, cuts)
  tails <- substring(word, cuts + 1, n_chars)

  has_tail <- nzchar(tails)
  # A swap needs at least two characters after the cut.
  can_swap <- nchar(tails) > 1

  deletes <- paste0(heads[has_tail], substring(tails[has_tail], 2))
  transposes <- paste0(
    heads[can_swap],
    substring(tails[can_swap], 2, 2),
    substring(tails[can_swap], 1, 1),
    substring(tails[can_swap], 3)
  )
  replaces <- paste0(
    rep(heads[has_tail], each = n_letters),
    rep(alphabet_chars, times = sum(has_tail)),
    rep(substring(tails[has_tail], 2), each = n_letters)
  )
  # One insertion point per cut, so the alphabet is repeated length(heads)
  # times; this also covers the empty word, which has exactly one cut.
  inserts <- paste0(
    rep(heads, each = n_letters),
    rep(alphabet_chars, times = length(heads)),
    rep(tails, each = n_letters)
  )

  unique(c(deletes, transposes, replaces, inserts))
}
# Collect the known words that are two simple edits away from `word`.
#
# Applies edits1() to every result of edits1(word) and keeps only the strings
# that appear among the names of the global frequency table NWORDS.
#
# Args:
#   word: a single character string.
# Returns:
#   Character vector of distinct known words; character(0) when none is found.
known_edits2 <- function(word) {
  vocabulary <- names(NWORDS)
  # Compute the first-level edits once instead of on every iteration.
  first_edits <- edits1(word)
  # Filter each batch immediately so the full two-edit set is never held in
  # memory, and avoid growing a vector inside a loop.
  hits <- lapply(first_edits, function(edit) {
    second_edits <- edits1(edit)
    second_edits[second_edits %in% vocabulary]
  })
  unique(as.character(unlist(hits, use.names = FALSE)))
}
# Keep only the entries of `words` that appear among the names of the global
# frequency table NWORDS.
#
# Args:
#   words: character vector of candidate words.
# Returns:
#   The elements of `words` found in names(NWORDS), in their original order.
known <- function(words) {
  vocabulary <- names(NWORDS)
  is_known <- !is.na(match(words, vocabulary))
  words[is_known]
}
# Suggest the most likely spelling for `word`.
#
# Candidate sets are tried in order and the first non-empty one is used:
# the word itself if it is known, known words one edit away, known words two
# edits away, and finally the unchanged word.
#
# Args:
#   word: a single character string.
# Returns:
#   `word` itself when it is the only candidate; otherwise the candidate with
#   the highest count in the global frequency table NWORDS.
correction <- function(word) {
  # Each candidate set is computed at most once, and the costly two-edit
  # search runs only when the cheaper steps found nothing.
  candidates <- known(word)
  if (length(candidates) == 0) {
    candidates <- known(edits1(word))
  }
  if (length(candidates) == 0) {
    candidates <- known_edits2(word)
  }
  if (length(candidates) == 0) {
    candidates <- word
  }

  if (length(candidates) == 1 && candidates[1] == word) {
    return(candidates)
  }

  candidate_counts <- NWORDS[names(NWORDS) %in% candidates]
  names(which.max(candidate_counts))
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement