----------------------------------------PRACTICAL-1-------------------------------------------------------
# create the data
age <- c(21, 2, 18, 221, 34)
agegroup <- c("adult", "child", "adult", "elderly", "child")
height <- c(6.0, 3, 5.7, 5, -7)
status <- c("single", "married", "married", "widowed", "married")
yearsmarried <- c(-1, 0, 20, 2, 3)
# combine the data into a data frame
people <- data.frame(age, agegroup, height, status, yearsmarried)
# write the data frame to a text file
write.table(people, "people.txt", sep="\t", row.names=FALSE)
# read the data from the file
people <- read.table("people.txt", header=TRUE, sep="\t")
# create the ruleset E with the validate package
library(validate)
E <- validator(
  AgeRange   = age > 0 & age <= 150,                           # age should be in the range 0-150
  AgeMarried = age > yearsmarried,                             # age should be greater than yearsmarried
  Status     = status %in% c("married", "single", "widowed"),  # status should be married, single, or widowed
  GroupChild = if (age < 18) agegroup == "child",              # agegroup should match age
  GroupAdult = if (age >= 18 & age <= 65) agegroup == "adult",
  GroupElder = if (age > 65) agegroup == "elderly"
)
# apply the ruleset E to the data
cf <- confront(people, E)
# summarize the results
summary(cf)
# visualize the results
library(ggplot2)
vals <- values(cf)   # logical matrix: one row per record, one column per rule
violations <- data.frame(
  row  = rep(seq_len(nrow(vals)), times = ncol(vals)),
  rule = rep(colnames(vals), each = nrow(vals)),
  ok   = as.vector(vals)
)
violations <- subset(violations, !ok | is.na(ok))   # keep only the failed checks
ggplot(violations, aes(x=rule, y=row)) +
  geom_point(size=3, color="red") +
  ggtitle("Violations of Ruleset E") +
  ylab("Row") +
  xlab("Rule")
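# As a follow-up, the validate package also provides violating(), which
# returns just the records that break at least one record-level rule;
# a minimal sketch:
violating(people, E)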
-----------------------------------------PRACTICAL-2-------------------------------------
# load the dataset
dirty_iris <- read.csv("dirty_iris.csv", header=TRUE)
# calculate the number and percentage of complete observations
complete_obs <- complete.cases(dirty_iris)
num_complete <- sum(complete_obs)
perc_complete <- mean(complete_obs) * 100
cat("Number of complete observations:", num_complete, "\n")
cat("Percentage of complete observations:", perc_complete, "%\n")
# replace all special values (NaN, Inf, -Inf) with NA
is.special <- function(x) if (is.numeric(x)) !is.finite(x) else is.na(x)
dirty_iris[sapply(dirty_iris, is.special)] <- NA
# rules for the dataset, written to iris_rules.txt
writeLines(c(
  'Species %in% c("setosa", "versicolor", "virginica")',
  'Sepal.Length > 0',
  'Sepal.Length <= 30',
  'Sepal.Length > Petal.Length',
  'Petal.Length >= 2 * Petal.Width'
), "iris_rules.txt")
# read the rules using the editrules package
library(editrules)
rules <- editfile("iris_rules.txt")
print(rules)
# apply the rules to the dataset and count the number of violations
violations <- violatedEdits(rules, dirty_iris)
num_violations <- sum(violations, na.rm = TRUE)
cat("Number of violations:", num_violations, "\n")
# summarize the violations
summary(violations)
# plot the violations
plot(violations)
# create a boxplot of sepal length
boxplot(dirty_iris$Sepal.Length)
# calculate the outliers using boxplot.stats
sepallength_stats <- boxplot.stats(dirty_iris$Sepal.Length)
outliers <- sepallength_stats$out
cat("Number of outliers in sepal length:", length(outliers), "\n")
-----------------------------------------PRACTICAL-3--------------------------------------
# load the wine dataset (all columns are assumed numeric)
wine <- read.csv("wine.csv", header = TRUE)
# check if all attributes are standardized (mean 0, sd 1)
mean_wine <- apply(wine, 2, mean)
sd_wine <- apply(wine, 2, sd)
if(all(abs(mean_wine) < 1e-10) && all(abs(sd_wine - 1) < 1e-10)) {
  cat("All attributes are standardized.\n")
} else {
  # standardize the attributes
  wine_std <- scale(wine)
  cat("Attributes have been standardized.\n")
}
# load the Iris dataset
iris <- read.csv("iris.csv", header = TRUE)
# check if all numeric attributes are standardized
mean_iris <- apply(iris[,1:4], 2, mean)
sd_iris <- apply(iris[,1:4], 2, sd)
if(all(abs(mean_iris) < 1e-10) && all(abs(sd_iris - 1) < 1e-10)) {
  cat("All attributes are standardized.\n")
} else {
  # standardize the numeric attributes and keep the Species column;
  # data.frame() preserves Species as-is (cbind() on a matrix would coerce it)
  iris_std <- data.frame(scale(iris[,1:4]), Species = iris[,5])
  cat("Attributes have been standardized.\n")
}
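# A minimal sanity check on the result (assuming the else branch above ran and
# created iris_std): after scale(), column means should be ~0 and sds ~1.
round(colMeans(iris_std[,1:4]), 10)
round(apply(iris_std[,1:4], 2, sd), 10)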
---------------------------------------PRACTICAL-4-------------------------------------
# generate example data
transactions <- list(
  c("beer", "chips", "nuts", "salsa"),
  c("beer", "chips", "nuts"),
  c("beer", "chips"),
  c("beer", "salsa"),
  c("beer", "nuts"),
  c("chips", "nuts", "salsa"),
  c("chips", "nuts"),
  c("chips", "salsa"),
  c("nuts", "salsa")
)
# load the arules package
library(arules)
# convert the transaction data to a transactions object
trans <- as(transactions, "transactions")
# run the Apriori algorithm for frequent itemsets (support >= 0.5)
frequentItemsets <- apriori(trans, parameter = list(supp = 0.5, target = "frequent itemsets"))
# view the frequent itemsets
inspect(frequentItemsets)
# run the Apriori algorithm for association rules (support >= 0.5, confidence >= 0.75)
associationRules <- apriori(trans, parameter = list(supp = 0.5, conf = 0.75, target = "rules"))
# view the association rules
inspect(associationRules)
# repeat with support >= 0.6 and confidence >= 0.6
frequentItemsets <- apriori(trans, parameter = list(supp = 0.6, target = "frequent itemsets"))
inspect(frequentItemsets)
associationRules <- apriori(trans, parameter = list(supp = 0.6, conf = 0.6, target = "rules"))
inspect(associationRules)
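# Useful follow-up: rank the mined rules by lift and inspect the strongest ones.
inspect(head(sort(associationRules, by = "lift"), 5))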
-------------------------------------------------------------PRACTICAL-5-------------------------
NAIVE BAYES----------------------------
library(caTools)
library(e1071)
set.seed(123)
# Load iris dataset
data(iris)
# Split dataset into training and testing sets (75% train / 25% test)
split = sample.split(iris$Species, SplitRatio = 0.75)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)
# Train Naive Bayes classifier
model = naiveBayes(Species ~ ., data = train)
# Make predictions on testing set
predictions = predict(model, test)
# Calculate confusion matrix
table(predictions, test$Species)
# Calculate accuracy
mean(predictions == test$Species)
# Repeat with a fresh 75/25 split
split = sample.split(iris$Species, SplitRatio = 0.75)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)
model = naiveBayes(Species ~ ., data = train)
predictions = predict(model, test)
mean(predictions == test$Species)
# Repeat with a 66.6/33.3 split (2/3 train, 1/3 test)
split = sample.split(iris$Species, SplitRatio = 0.666)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)
model = naiveBayes(Species ~ ., data = train)
predictions = predict(model, test)
mean(predictions == test$Species)
# And once more with a 75/25 split
split = sample.split(iris$Species, SplitRatio = 0.75)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)
model = naiveBayes(Species ~ ., data = train)
predictions = predict(model, test)
mean(predictions == test$Species)
# Random subsampling: average accuracy over 100 random 75/25 splits
accuracy = numeric(100)
for(i in 1:100) {
  split = sample.split(iris$Species, SplitRatio = 0.75)
  train = subset(iris, split == TRUE)
  test = subset(iris, split == FALSE)
  model = naiveBayes(Species ~ ., data = train)
  predictions = predict(model, test)
  accuracy[i] = mean(predictions == test$Species)
}
mean(accuracy)
# 10-fold cross-validation, written out by hand (e1071 has no built-in cv helper for naiveBayes)
folds = sample(rep(1:10, length.out = nrow(iris)))
accuracy = sapply(1:10, function(k)
  mean(predict(naiveBayes(Species ~ ., data = iris[folds != k, ]),
               iris[folds == k, ]) == iris$Species[folds == k]))
mean(accuracy)
# Scale data using the training set's center and spread, keeping the Species column
ctr = colMeans(train[,1:4])
sds = apply(train[,1:4], 2, sd)
train_scaled = data.frame(scale(train[,1:4], center = ctr, scale = sds), Species = train$Species)
test_scaled = data.frame(scale(test[,1:4], center = ctr, scale = sds), Species = test$Species)
# Train Naive Bayes classifier on scaled data
model = naiveBayes(Species ~ ., data = train_scaled)
predictions = predict(model, test_scaled)
mean(predictions == test_scaled$Species)
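# Optional sketch: naiveBayes() also accepts a laplace argument for additive
# smoothing. It only affects categorical predictors, so on iris' all-numeric
# features it leaves the fit unchanged, but it matters on discrete data.
model_laplace = naiveBayes(Species ~ ., data = train, laplace = 1)
mean(predict(model_laplace, test) == test$Species)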
KNN---------------------------------------
# Load the Iris dataset
data(iris)
# Split the dataset into training and testing sets
set.seed(123)
train_index <- sample(1:nrow(iris), 0.75*nrow(iris))
train_data <- iris[train_index,]
test_data <- iris[-train_index,]
# Scale the data, reusing the training set's center and spread for the test set
train_data_scaled <- scale(train_data[,1:4])
test_data_scaled <- scale(test_data[,1:4],
                          center = attr(train_data_scaled, "scaled:center"),
                          scale = attr(train_data_scaled, "scaled:scale"))
# Train the k-NN model using the training set
library(class)
k <- 3
knn_model <- knn(train_data_scaled, test_data_scaled, train_data$Species, k)
# Evaluate the model on the testing set
table(knn_model, test_data$Species)
accuracy <- sum(knn_model == test_data$Species) / length(test_data$Species)
print(paste("Accuracy:", round(accuracy, 4)))
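# A minimal sketch for tuning k: try several values and compare test accuracy.
for (k in c(1, 3, 5, 7, 9)) {
  pred <- knn(train_data_scaled, test_data_scaled, train_data$Species, k)
  cat("k =", k, "accuracy =", round(mean(pred == test_data$Species), 4), "\n")
}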
DECISION TREE----------------------------------
library(caret)
library(rpart)
data(iris)
# Situation 5.1 a) Training set = 75%, Test set = 25%
set.seed(123)
trainIndex <- createDataPartition(iris$Species, p = 0.75, list = FALSE)
train <- iris[trainIndex,]
test <- iris[-trainIndex,]
# Situation 5.1 b) Training set = 66.6% (2/3 of total), Test set = 33.3%
set.seed(123)
trainIndex <- createDataPartition(iris$Species, p = 0.666, list = FALSE)
train <- iris[trainIndex,]
test <- iris[-trainIndex,]
# Situation 5.2 i) hold-out method
set.seed(123)
trainIndex <- sample(nrow(iris), 0.75*nrow(iris))
train <- iris[trainIndex,]
test <- iris[-trainIndex,]
# Situation 5.2 ii) Random subsampling
set.seed(123)
subsamples <- split(iris, sample(1:5, nrow(iris), replace = TRUE))
train <- do.call(rbind, subsamples[-1])
test <- subsamples[[1]]
# Situation 5.2 iii) Cross-validation
ctrl <- trainControl(method = "cv", number = 10)
model <- train(Species ~ ., data = iris, method = "rpart", trControl = ctrl)
# Scale the numeric attributes of the current train/test split
train[,1:4] <- scale(train[,1:4])
test[,1:4] <- scale(test[,1:4])
# Build Decision tree classifiers with different control settings
model1 <- rpart(Species ~ ., data = train, method = "class")
model2 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(cp = 0.01))
model3 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(minsplit = 20))
model4 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(maxdepth = 2))
# Make predictions and calculate accuracy for each model
pred1 <- predict(model1, newdata = test, type = "class")
confusionMatrix(pred1, test$Species)$overall[1]
pred2 <- predict(model2, newdata = test, type = "class")
confusionMatrix(pred2, test$Species)$overall[1]
pred3 <- predict(model3, newdata = test, type = "class")
confusionMatrix(pred3, test$Species)$overall[1]
pred4 <- predict(model4, newdata = test, type = "class")
confusionMatrix(pred4, test$Species)$overall[1]
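# A minimal sketch for visualizing the fitted tree with base rpart graphics
# (rpart.plot::rpart.plot gives a prettier rendering if that package is installed):
plot(model1, margin = 0.1)
text(model1, use.n = TRUE)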
------------------------------------------------------PRACTICAL-6---------------------------------------
# Load the iris dataset
data(iris)
# Select only the numeric variables for clustering
iris_numeric <- iris[, 1:4]
# Scale the variables to have mean = 0 and standard deviation = 1
iris_scaled <- scale(iris_numeric)
# Simple K-means clustering
set.seed(123)
kmeans_result <- kmeans(iris_scaled, centers = 3, nstart = 20)
# DBSCAN clustering
library(dbscan)
dbscan_result <- dbscan(iris_scaled, eps = 0.4, minPts = 5)
# Hierarchical clustering
hclust_result <- hclust(dist(iris_scaled), method = "complete")
hclust_groups <- cutree(hclust_result, k = 3)
# Compare the performance of the clustering algorithms
library(cluster)
library(factoextra)
# Silhouette analysis for K-means
fviz_nbclust(iris_scaled, kmeans, method = "silhouette")
# Elbow method for K-means
fviz_nbclust(iris_scaled, kmeans, method = "wss")
# Plot DBSCAN results
fviz_cluster(dbscan_result, data = iris_scaled)
# Dendrogram for hierarchical clustering
fviz_dend(hclust_result, k = 3, cex = 0.5)
# Silhouette analysis for hierarchical clustering
sil <- silhouette(hclust_groups, dist(iris_scaled))
summary(sil)
# Adjust the parameters and repeat the analysis to compare the performance of the algorithms
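# One simple way to compare the three clusterings on iris: cross-tabulate each
# set of cluster assignments against the known species labels
# (in DBSCAN output, cluster 0 is noise).
table(kmeans = kmeans_result$cluster, species = iris$Species)
table(dbscan = dbscan_result$cluster, species = iris$Species)
table(hclust = hclust_groups, species = iris$Species)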