Advertisement
garrisonnn

pfda

Jan 22nd, 2024 (edited)
134
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 4.63 KB | None | 0 0
  1. # Mark Anthony Varghese, TP072591
  2. # Tharmasenar A/L Devandran, TP072827
  3. # Movinraj Saravanan, TP072789
  4. # Garrison Goh Zen Ken, TP072740
  5.  
  6. library(tidyr)
  7. library(dplyr)
  8. library(ggplot2)
  9.  
  10. #==========
  11. # add csv file
  12. #==========
  13.  
  14. df = read.csv("C:\\Users\\garri\\OneDrive\\Documents\\School\\Year 2 - Semester 1\\Programming for Data Analysis\\Assignment Files\\kl_prop_data_assignment.csv", header=TRUE)
  15.  
  16. #==========
  17. # reading the csv file
  18. #==========
  19.  
  20. View(df) # view current dataset
  21. glimpse(df)
  22.  
  23. #==========
  24. # cleaning
  25. #==========
  26.  
  27. # removing duplicates from dataset
  28. dfClean = unique(df)
  29. View(df)
  30.  
  31. # dropping unnecessary columns
  32. dfClean = subset(dfClean, select = -c(3,4,5))
  33. View(dfClean)
  34.  
  35. # naming the headers
  36. names(dfClean) = c("LOCATION", "PRICE_IN_RM", "PROPERTY_TYPE", "SIZE", "FURNISHING")
  37. View(dfClean)
  38.  
  39. # separating size column into 2 columns, building type and size
  40. dfClean = separate(dfClean, col = SIZE, into = c("BUILDING_TYPE","SIZE"), sep= " : ")
  41. View(dfClean)
  42.  
  43. # removing the commas from size column
  44. dfClean$SIZE <- gsub(",", "", dfClean$SIZE)
  45. # removing the quotation marks from size column
  46. dfClean$SIZE <- gsub("'", "", dfClean$SIZE)
  47. # removing the tilde from size column
  48. dfClean$SIZE <- gsub("`", "", dfClean$SIZE)
  49.  
  50.  
  51. # replacing x with * in SIZE column
  52. dfClean$SIZE <- gsub("x", "*", dfClean$SIZE)
  53. # replacing X with * in SIZE column
  54. dfClean$SIZE <- gsub("X", "*", dfClean$SIZE)
  55.  
  56.  
  57. # extract the unit of area in size column
  58. dfClean$UNIT_OF_AREA <- gsub("[^sq.m.ft]", "", dfClean$SIZE)
  59.  
  60. dfClean$SIZE <- gsub("[^0-9*+]", "", dfClean$SIZE)
  61.  
  62.  
  63. # Define a regular expression to match rows with only numbers and mathematical operators
  64. pattern <- "^[0-9+*]+$"
  65.  
  66. # Use grepl to filter rows
  67. dfClean <- dfClean[grepl(pattern, dfClean$SIZE), ]
  68.  
  69. # Replace consecutive * symbols with a single *
  70. dfClean$SIZE <- gsub("\\*+", "*", dfClean$SIZE)
  71.  
  72. # Function to evaluate expressions
  73. evaluate_expression <- function(x) {
  74.   result <- tryCatch({
  75.     eval(parse(text = x))
  76.   }, error = function(e) {
  77.     return(NA)  # return NA in case of a parse error
  78.   })
  79.   return(result)
  80. }
  81.  
  82. # Apply the function to the column
  83. dfClean$SIZE <- sapply(dfClean$SIZE, evaluate_expression)
  84.  
  85. # converting size column from char to numeric
  86. dfClean$SIZE <- as.numeric(dfClean$SIZE)
  87.  
  88. # convert price column from char to numeric
  89. dfClean$PRICE_IN_RM <- as.numeric(gsub("[^0-9]", "", dfClean$PRICE_IN_RM))
  90.  
  91. generalize_property_type <- function(property_type) {
  92.   # Use gsub to replace specific patterns
  93.   property_type <- gsub("\\s*\\(.*\\)", "", property_type)
  94.   return(property_type)
  95. }
  96.  
  97. # Apply the function to the PROPERTY_TYPE column
  98. dfClean$PROPERTY_TYPE <- sapply(dfClean$PROPERTY_TYPE, generalize_property_type)
  99.  
  100.  
  101. # omit observations with size = 0
  102. dfClean <- dfClean %>%
  103.   filter(SIZE != 0)
  104.  
  105. View(dfClean)
  106. summary(dfClean)
  107.  
  108.  
  109. #======
  110. # filter the dataset to damansara heights and kepong locations only and remove null rows
  111. df_Kepong_DMSRHeights = filter(dfClean, (LOCATION == "Kepong, Kuala Lumpur" | LOCATION == "Damansara Heights, Kuala Lumpur") &
  112.                                  (!is.na(PRICE_IN_RM) & !is.na(PROPERTY_TYPE) & !is.na(BUILDING_TYPE) &
  113.                                     !is.na(SIZE) & !is.na(FURNISHING)))
  114. View(df_Kepong_DMSRHeights)
  115.  
  116. # visualize outliers in the data
  117. options(scipen =999)
  118. ggplot(df_Kepong_DMSRHeights, aes(x = PRICE_IN_RM, y = PROPERTY_TYPE)) + geom_boxplot()
  119. ggplot(df_Kepong_DMSRHeights, aes(x = PRICE_IN_RM, y = PROPERTY_TYPE)) + geom_boxplot(outlier.shape = NA)
  120.  
  121. ggplot(df_Kepong_DMSRHeights, aes(x = SIZE, y = PROPERTY_TYPE)) + geom_boxplot()
  122.  
  123. # removing the outliers in the data
  124. # PRICE column
  125. df_Kepong_DMSRHeights <- df_Kepong_DMSRHeights %>%
  126.   group_by(PROPERTY_TYPE) %>%
  127.   filter(PRICE_IN_RM > quantile(PRICE_IN_RM, 0.25) - (1.5 * IQR(PRICE_IN_RM)) &
  128.            PRICE_IN_RM < quantile(PRICE_IN_RM, 0.75) + (1.5 * IQR(PRICE_IN_RM)))
  129.  
  130. # SIZE column
  131. df_Kepong_DMSRHeights <- df_Kepong_DMSRHeights %>%
  132.   group_by(PROPERTY_TYPE) %>%
  133.   filter(SIZE > quantile(SIZE, 0.25) - (1.5 * IQR(SIZE)) &
  134.            SIZE < quantile(SIZE, 0.75) + (1.5 * IQR(SIZE)))
  135.  
  136. #===FURNISHING column===
  137. # set all blank data in FURNISHING column to Unknown
  138. df_Kepong_DMSRHeights$FURNISHING[df_Kepong_DMSRHeights$FURNISHING == ""] <- "Unknown"
  139.  
  140. #======
  141. # separating into 3 dataframes (df_DMSRHeights, df_Kepong, df_DMSRHeights_kepong)
  142. df_DMSRHeights = filter(df_Kepong_DMSRHeights, LOCATION == "Damansara Heights, Kuala Lumpur")
  143. View(df_DMSRHeights)
  144.  
  145. df_Kepong = filter(df_Kepong_DMSRHeights, LOCATION == "Kepong, Kuala Lumpur")
  146. View(df_Kepong)
  147.  
  148. summary(df_Kepong_DMSRHeights)
  149. View(df_Kepong_DMSRHeights)
  150.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement