DM_PRACTICALS
----------------------------------------PRACTICAL-1-------------------------------------------------------
# create the data
age <- c(21, 2, 18, 221, 34)
agegroup <- c("adult", "child", "adult", "elderly", "child")
height <- c(6.0, 3, 5.7, 5, -7)
status <- c("single", "married", "married", "widowed", "married")
yearsmarried <- c(-1, 0, 20, 2, 3)

# combine the data into a data frame
people <- data.frame(age, agegroup, height, status, yearsmarried)

# write the data frame to a text file
write.table(people, "people.txt", sep="\t", row.names=FALSE)
# read the data back from the file
people <- read.table("people.txt", header=TRUE, sep="\t")

# create the ruleset E with the validate package
# (check()/validate() are not dplyr functions; validator()/confront()
# from the validate package are the standard way to express and apply rules)
library(validate)

E <- validator(
  age_range   = age > 0 & age <= 150,
  age_vs_ym   = age > yearsmarried,
  status_set  = status %in% c("married", "single", "widowed"),
  agegroup_ok = ifelse(age < 18, agegroup == "child",
                       ifelse(age <= 65, agegroup == "adult", agegroup == "elderly"))
)

# apply the ruleset E to the data
cf <- confront(people, E)

# summarize the results
summary(cf)

# visualize the results
library(ggplot2)

# values(cf) is a logical matrix (records x rules); FALSE marks a violation
viol_idx <- which(!values(cf), arr.ind = TRUE)
violations <- data.frame(row  = viol_idx[, "row"],
                         rule = colnames(values(cf))[viol_idx[, "col"]])

ggplot(violations, aes(x=rule, y=row)) +
  geom_point(size=3, color="red") +
  ggtitle("Violations of Ruleset E") +
  ylab("Row") +
  xlab("Rule")
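# Optional check (a minimal sketch using the same validate package):
# violating() returns the offending records themselves, which is often more
# useful than row/rule indices when cleaning the data by hand.
violating(people, E[1])   # records breaking the age-range rule
violating(people, E)      # records breaking any rule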
-----------------------------------------PRACTICAL-2-------------------------------------
# load the dataset
dirty_iris <- read.csv("dirty_iris.csv", header=TRUE)
# calculate the number and percentage of complete observations
complete_obs <- complete.cases(dirty_iris)
num_complete <- sum(complete_obs)
perc_complete <- mean(complete_obs) * 100
cat("Number of complete observations:", num_complete, "\n")
cat("Percentage of complete observations:", perc_complete, "%\n")
# replace all special values (NaN, Inf, -Inf) with NA;
# comparing against the strings "NA"/"N/A"/"?" would miss numeric specials
is.special <- function(x) if (is.numeric(x)) !is.finite(x) else is.na(x)
dirty_iris[sapply(dirty_iris, is.special)] <- NA

# the ruleset, written to iris_rules.txt so editfile() can read it below
writeLines(c(
  'Species %in% c("setosa", "versicolor", "virginica")',
  'Sepal.Length > 0',
  'Sepal.Length <= 30',
  'Sepal.Length > Petal.Length',
  'Petal.Length >= 2 * Petal.Width'
), "iris_rules.txt")
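# Quick look (a minimal sketch, not part of the original listing): how many
# NAs each column carries after the special values were replaced.
colSums(is.na(dirty_iris))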
# read the rules using the editrules package
library(editrules)

rules <- editfile("iris_rules.txt")
print(rules)
# apply the rules to the dataset and count the number of violations
# (editrules uses violatedEdits(); it returns a logical record-by-rule matrix)
violations <- violatedEdits(rules, dirty_iris)
num_violations <- sum(violations, na.rm = TRUE)
cat("Number of violations:", num_violations, "\n")

# summarize the violations
summary(violations)

# plot the violations
plot(violations)
# create a boxplot of sepal length
boxplot(dirty_iris$Sepal.Length)

# calculate the outliers using boxplot.stats
sepallength_stats <- boxplot.stats(dirty_iris$Sepal.Length)
outliers <- sepallength_stats$out
cat("Number of outliers in sepal length:", length(outliers), "\n")
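# Optional cleanup (a sketch of one common choice, not prescribed by the
# assignment): treat the boxplot outliers as errors, set them to NA, then
# impute every missing sepal length with the column median.
dirty_iris$Sepal.Length[dirty_iris$Sepal.Length %in% outliers] <- NA
dirty_iris$Sepal.Length[is.na(dirty_iris$Sepal.Length)] <-
  median(dirty_iris$Sepal.Length, na.rm = TRUE)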
-----------------------------------------PRACTICAL-3--------------------------------------
# load the wine dataset (assumed here to contain only numeric attributes;
# drop any class column first if your copy has one)
wine <- read.csv("wine.csv", header = TRUE)

# check if all attributes are standardized (mean 0, sd 1)
mean_wine <- apply(wine, 2, mean)
sd_wine <- apply(wine, 2, sd)
if(all(abs(mean_wine) < 1e-10) && all(abs(sd_wine - 1) < 1e-10)) {
  cat("All attributes are already standardized.\n")
} else {
  # standardize the attributes
  wine_std <- scale(wine)
  cat("Attributes have been standardized.\n")
}
# load the Iris dataset
iris <- read.csv("iris.csv", header = TRUE)

# check if all attributes are standardized
mean_iris <- apply(iris[,1:4], 2, mean)
sd_iris <- apply(iris[,1:4], 2, sd)
if(all(abs(mean_iris) < 1e-10) && all(abs(sd_iris - 1) < 1e-10)) {
  cat("All attributes are already standardized.\n")
} else {
  # standardize the numeric attributes and re-attach the Species column;
  # a data.frame avoids cbind() coercing everything to character
  iris_std <- data.frame(scale(iris[,1:4]), Species = iris[,5])
  colnames(iris_std) <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  cat("Attributes have been standardized.\n")
}
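# Alternative normalization (a sketch, in case min-max scaling is wanted
# instead of z-scores): rescale each attribute to the [0, 1] range.
minmax <- function(x) (x - min(x)) / (max(x) - min(x))
iris_mm <- as.data.frame(lapply(iris[, 1:4], minmax))
summary(iris_mm)   # every column should now run from 0 to 1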
---------------------------------------PRACTICAL-4-------------------------------------
# generate example data
transactions <- list(
  c("beer", "chips", "nuts", "salsa"),
  c("beer", "chips", "nuts"),
  c("beer", "chips"),
  c("beer", "salsa"),
  c("beer", "nuts"),
  c("chips", "nuts", "salsa"),
  c("chips", "nuts"),
  c("chips", "salsa"),
  c("nuts", "salsa")
)
# load the arules package
library(arules)

# convert the transaction data to a transactions object
trans <- as(transactions, "transactions")

# run the Apriori algorithm for frequent itemsets
# (apriori() mines rules by default; ask for itemsets explicitly)
frequentItemsets <- apriori(trans, parameter = list(supp = 0.5, target = "frequent itemsets"))

# view the frequent itemsets
inspect(frequentItemsets)

# mine the association rules at the same support with 75% confidence
associationRules <- apriori(trans, parameter = list(supp = 0.5, conf = 0.75))

# view the association rules
inspect(associationRules)

# repeat with support = 0.6 and confidence = 0.6
frequentItemsets <- apriori(trans, parameter = list(supp = 0.6, target = "frequent itemsets"))

# view the frequent itemsets
inspect(frequentItemsets)

# extract the association rules
associationRules <- apriori(trans, parameter = list(supp = 0.6, conf = 0.6))

# view the association rules
inspect(associationRules)
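# Optional (a minimal sketch): rank the mined rules so the strongest
# associations surface first; sort() and inspect() are standard arules calls.
inspect(head(sort(associationRules, by = "lift"), 5))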
-------------------------------------------------------------PRACTICAL-5-------------------------
NAIVE BAYES----------------------------
library(caTools)
library(e1071)
set.seed(123)

# Load iris dataset
data(iris)

# Split dataset into training and testing sets (75% / 25%)
split = sample.split(iris$Species, SplitRatio = 0.75)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)
# Train Naive Bayes classifier
model = naiveBayes(Species ~ ., data = train)
# Make predictions on testing set
predictions = predict(model, test)
# Calculate confusion matrix
table(predictions, test$Species)

# Calculate accuracy
mean(predictions == test$Species)

# Repeat with a fresh 75% / 25% hold-out split
split = sample.split(iris$Species, SplitRatio = 0.75)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)

model = naiveBayes(Species ~ ., data = train)
predictions = predict(model, test)

mean(predictions == test$Species)

# Repeat with a 66.6% / 33.3% split
split = sample.split(iris$Species, SplitRatio = 0.666)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)

model = naiveBayes(Species ~ ., data = train)
predictions = predict(model, test)

mean(predictions == test$Species)

# Random subsampling: average accuracy over 100 random 75/25 splits
accuracy = numeric(100)

for(i in 1:100) {
  split = sample.split(iris$Species, SplitRatio = 0.75)
  train = subset(iris, split == TRUE)
  test = subset(iris, split == FALSE)

  model = naiveBayes(Species ~ ., data = train)
  predictions = predict(model, test)

  accuracy[i] = mean(predictions == test$Species)
}

mean(accuracy)

# Cross-validation: e1071 has no cv.accuracy() helper, so run 10-fold CV by hand
folds = sample(rep(1:10, length.out = nrow(iris)))
cv_accuracy = sapply(1:10, function(k) {
  model_k = naiveBayes(Species ~ ., data = iris[folds != k, ])
  mean(predict(model_k, iris[folds == k, ]) == iris$Species[folds == k])
})
mean(cv_accuracy)

# Scale the predictors and re-attach the class label
# (naiveBayes() needs the Species column, which scale() alone would drop)
train_scaled = data.frame(scale(train[,1:4]), Species = train$Species)
test_scaled = data.frame(scale(test[,1:4]), Species = test$Species)

# Train Naive Bayes classifier on scaled data
model = naiveBayes(Species ~ ., data = train_scaled)
predictions = predict(model, test_scaled)
mean(predictions == test_scaled$Species)
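# Optional (a minimal sketch): per-class performance from the same confusion
# table, using only base R; rows are predictions, columns are true labels.
cm = table(predictions, test_scaled$Species)
diag(cm) / colSums(cm)   # recall for each species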
KNN---------------------------------------
# Load the Iris dataset
data(iris)

# Split the dataset into training and testing sets
set.seed(123)
train_index <- sample(1:nrow(iris), 0.75*nrow(iris))
train_data <- iris[train_index,]
test_data <- iris[-train_index,]

# Scale the data to standard format; the test set is scaled with the
# training set's center and spread so both live on the same scale
train_data_scaled <- scale(train_data[,1:4])
test_data_scaled <- scale(test_data[,1:4],
                          center = attr(train_data_scaled, "scaled:center"),
                          scale  = attr(train_data_scaled, "scaled:scale"))

# Train the k-NN model using the training set
library(class)
k <- 3
knn_model <- knn(train_data_scaled, test_data_scaled, train_data$Species, k)

# Evaluate the model on the testing set
table(knn_model, test_data$Species)
accuracy <- sum(knn_model == test_data$Species) / length(test_data$Species)
print(paste("Accuracy:", round(accuracy, 4)))
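# Optional (a sketch): sweep a few values of k and report the accuracy of
# each; odd values avoid ties in the majority vote.
for (k in c(1, 3, 5, 7, 9)) {
  pred <- knn(train_data_scaled, test_data_scaled, train_data$Species, k)
  cat("k =", k, "accuracy =", mean(pred == test_data$Species), "\n")
}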
DECISION TREE----------------------------------
library(caret)
library(rpart)
data(iris)
# Situation 5.1 a) Training set = 75%, Test set = 25%
set.seed(123)
trainIndex <- createDataPartition(iris$Species, p = 0.75, list = FALSE)
train <- iris[trainIndex,]
test <- iris[-trainIndex,]

# Situation 5.1 b) Training set = 66.6% (2/3 of total), Test set = 33.3%
set.seed(123)
trainIndex <- createDataPartition(iris$Species, p = 0.666, list = FALSE)
train <- iris[trainIndex,]
test <- iris[-trainIndex,]

# Situation 5.2 i) hold-out method
set.seed(123)
trainIndex <- sample(nrow(iris), 0.75*nrow(iris))
train <- iris[trainIndex,]
test <- iris[-trainIndex,]

# Situation 5.2 ii) Random subsampling
set.seed(123)
subsamples <- split(iris, sample(1:5, nrow(iris), replace = TRUE))
train <- do.call(rbind, subsamples[-1])
test <- subsamples[[1]]

# Situation 5.2 iii) Cross-validation (10-fold, handled by caret)
trainControl <- trainControl(method = "cv", number = 10)
model <- train(Species ~ ., data = iris, method = "rpart", trControl = trainControl)

# Scale the numeric attributes of the current train/test split
train[,1:4] <- scale(train[,1:4])
test[,1:4] <- scale(test[,1:4])

# Build decision tree classifiers with different parameter settings
model1 <- rpart(Species ~ ., data = train, method = "class")
model2 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(cp = 0.01))
model3 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(minsplit = 20))
model4 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(maxdepth = 2))

# Make predictions and calculate accuracy for the first model
# (models 2-4 are scored the same way; see the loop after this block)
pred1 <- predict(model1, newdata = test, type = "class")
confusionMatrix(pred1, test$Species)$overall[1]
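# Optional (a minimal sketch): score all four trees the same way so the
# parameter settings can be compared side by side.
for (m in list(model1, model2, model3, model4)) {
  pred <- predict(m, newdata = test, type = "class")
  print(confusionMatrix(pred, test$Species)$overall["Accuracy"])
}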
------------------------------------------------------PRACTICAL-6---------------------------------------
# Load the iris dataset
data(iris)

# Select only the numeric variables for clustering
iris_numeric <- iris[, 1:4]

# Scale the variables to have mean = 0 and standard deviation = 1
iris_scaled <- scale(iris_numeric)

# Simple k-means clustering
set.seed(123)
kmeans_result <- kmeans(iris_scaled, centers = 3, nstart = 20)

# DBSCAN clustering
library(dbscan)
dbscan_result <- dbscan(iris_scaled, eps = 0.4, minPts = 5)

# Hierarchical clustering
hclust_result <- hclust(dist(iris_scaled), method = "complete")
hclust_groups <- cutree(hclust_result, k = 3)

# Compare the performance of the clustering algorithms
library(cluster)
library(factoextra)

# Silhouette analysis for k-means
fviz_nbclust(iris_scaled, kmeans, method = "silhouette")

# Elbow method for k-means
fviz_nbclust(iris_scaled, kmeans, method = "wss")

# Plot DBSCAN results (fviz_cluster() takes the data via its data argument)
fviz_cluster(dbscan_result, data = iris_scaled)

# Dendrogram for hierarchical clustering
fviz_dend(hclust_result, k = 3, cex = 0.5)

# Silhouette analysis for hierarchical clustering
summary(silhouette(hclust_groups, dist(iris_scaled)))

# Adjust the parameters (k, eps, minPts, linkage) and repeat the analysis to
# compare the performance of the algorithms
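# Optional (a minimal sketch): cross-tabulate each clustering against the
# true species labels; DBSCAN's cluster 0 is its noise group.
table(kmeans_result$cluster, iris$Species)
table(dbscan_result$cluster, iris$Species)
table(hclust_groups, iris$Species)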