Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Mark Anthony Varghese, TP072591
- # Tharmasenar A/L Devandran, TP072827
- # Movinraj Saravanan, TP072789
- # Garrison Goh Zen Ken, TP072740
- library(tidyr)
- library(dplyr)
- library(ggplot2)
- #==========
- # add csv file
- #==========
- df = read.csv("C:\\Users\\garri\\OneDrive\\Documents\\School\\Year 2 - Semester 1\\Programming for Data Analysis\\Assignment Files\\kl_prop_data_assignment.csv", header=TRUE)
- #==========
- # reading the csv file
- #==========
- View(df) # view current dataset
- glimpse(df)
- #==========
- # cleaning
- #==========
- # removing duplicates from dataset
- dfClean = unique(df)
- View(df)
- # dropping unnecessary columns
- dfClean = subset(dfClean, select = -c(3,4,5))
- View(dfClean)
- # naming the headers
- names(dfClean) = c("LOCATION", "PRICE_IN_RM", "PROPERTY_TYPE", "SIZE", "FURNISHING")
- View(dfClean)
- # separating size column into 2 columns, building type and size
- dfClean = separate(dfClean, col = SIZE, into = c("BUILDING_TYPE","SIZE"), sep= " : ")
- View(dfClean)
- # removing the commas from size column
- dfClean$SIZE <- gsub(",", "", dfClean$SIZE)
- # removing the quotation marks from size column
- dfClean$SIZE <- gsub("'", "", dfClean$SIZE)
- # removing the tilde from size column
- dfClean$SIZE <- gsub("`", "", dfClean$SIZE)
- # replacing x with * in SIZE column
- dfClean$SIZE <- gsub("x", "*", dfClean$SIZE)
- # replacing X with * in SIZE column
- dfClean$SIZE <- gsub("X", "*", dfClean$SIZE)
- # extract the unit of area in size column
- dfClean$UNIT_OF_AREA <- gsub("[^sq.m.ft]", "", dfClean$SIZE)
- dfClean$SIZE <- gsub("[^0-9*+]", "", dfClean$SIZE)
- # Define a regular expression to match rows with only numbers and mathematical operators
- pattern <- "^[0-9+*]+$"
- # Use grepl to filter rows
- dfClean <- dfClean[grepl(pattern, dfClean$SIZE), ]
- # Replace consecutive * symbols with a single *
- dfClean$SIZE <- gsub("\\*+", "*", dfClean$SIZE)
- # Function to evaluate expressions
- evaluate_expression <- function(x) {
- result <- tryCatch({
- eval(parse(text = x))
- }, error = function(e) {
- return(NA) # return NA in case of a parse error
- })
- return(result)
- }
- # Apply the function to the column
- dfClean$SIZE <- sapply(dfClean$SIZE, evaluate_expression)
- # converting size column from char to numeric
- dfClean$SIZE <- as.numeric(dfClean$SIZE)
- # convert price column from char to numeric
- dfClean$PRICE_IN_RM <- as.numeric(gsub("[^0-9]", "", dfClean$PRICE_IN_RM))
- generalize_property_type <- function(property_type) {
- # Use gsub to replace specific patterns
- property_type <- gsub("\\s*\\(.*\\)", "", property_type)
- return(property_type)
- }
- # Apply the function to the PROPERTY_TYPE column
- dfClean$PROPERTY_TYPE <- sapply(dfClean$PROPERTY_TYPE, generalize_property_type)
- # omit observations with size = 0
- dfClean <- dfClean %>%
- filter(SIZE != 0)
- View(dfClean)
- summary(dfClean)
- #======
- # filter the dataset to damansara heights and kepong locations only and remove null rows
- df_Kepong_DMSRHeights = filter(dfClean, (LOCATION == "Kepong, Kuala Lumpur" | LOCATION == "Damansara Heights, Kuala Lumpur") &
- (!is.na(PRICE_IN_RM) & !is.na(PROPERTY_TYPE) & !is.na(BUILDING_TYPE) &
- !is.na(SIZE) & !is.na(FURNISHING)))
- View(df_Kepong_DMSRHeights)
- # visualize outliers in the data
- options(scipen =999)
- ggplot(df_Kepong_DMSRHeights, aes(x = PRICE_IN_RM, y = PROPERTY_TYPE)) + geom_boxplot()
- ggplot(df_Kepong_DMSRHeights, aes(x = PRICE_IN_RM, y = PROPERTY_TYPE)) + geom_boxplot(outlier.shape = NA)
- ggplot(df_Kepong_DMSRHeights, aes(x = SIZE, y = PROPERTY_TYPE)) + geom_boxplot()
- # removing the outliers in the data
- # PRICE column
- df_Kepong_DMSRHeights <- df_Kepong_DMSRHeights %>%
- group_by(PROPERTY_TYPE) %>%
- filter(PRICE_IN_RM > quantile(PRICE_IN_RM, 0.25) - (1.5 * IQR(PRICE_IN_RM)) &
- PRICE_IN_RM < quantile(PRICE_IN_RM, 0.75) + (1.5 * IQR(PRICE_IN_RM)))
- # SIZE column
- df_Kepong_DMSRHeights <- df_Kepong_DMSRHeights %>%
- group_by(PROPERTY_TYPE) %>%
- filter(SIZE > quantile(SIZE, 0.25) - (1.5 * IQR(SIZE)) &
- SIZE < quantile(SIZE, 0.75) + (1.5 * IQR(SIZE)))
- #===FURNISHING column===
- # set all blank data in FURNISHING column to Unknown
- df_Kepong_DMSRHeights$FURNISHING[df_Kepong_DMSRHeights$FURNISHING == ""] <- "Unknown"
- #======
- # separating into 3 dataframes (df_DMSRHeights, df_Kepong, df_DMSRHeights_kepong)
- df_DMSRHeights = filter(df_Kepong_DMSRHeights, LOCATION == "Damansara Heights, Kuala Lumpur")
- View(df_DMSRHeights)
- df_Kepong = filter(df_Kepong_DMSRHeights, LOCATION == "Kepong, Kuala Lumpur")
- View(df_Kepong)
- summary(df_Kepong_DMSRHeights)
- View(df_Kepong_DMSRHeights)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement