pfda

# Mark Anthony Varghese, TP072591
# Tharmasenar A/L Devandran, TP072827
# Movinraj Saravanan, TP072789
# Garrison Goh Zen Ken, TP072740

library(tidyr)
library(dplyr)
library(ggplot2)

#==========
# add csv file
#==========

df = read.csv("C:\\Users\\garri\\OneDrive\\Documents\\School\\Year 2 - Semester 1\\Programming for Data Analysis\\Assignment Files\\kl_prop_data_assignment.csv", header=TRUE)

#==========
# reading the csv file
#==========

View(df) # view current dataset
glimpse(df)

#==========
# cleaning
#==========

# removing duplicates from dataset
dfClean = unique(df)
View(df)

# dropping unnecessary columns
dfClean = subset(dfClean, select = -c(3,4,5))
View(dfClean)

# naming the headers
names(dfClean) = c("LOCATION", "PRICE_IN_RM", "PROPERTY_TYPE", "SIZE", "FURNISHING")
View(dfClean)

# separating size column into 2 columns, building type and size
dfClean = separate(dfClean, col = SIZE, into = c("BUILDING_TYPE","SIZE"), sep= " : ")
View(dfClean)

# removing the commas from size column
dfClean$SIZE <- gsub(",", "", dfClean$SIZE)
# removing the quotation marks from size column
dfClean$SIZE <- gsub("'", "", dfClean$SIZE)
# removing the tilde from size column
dfClean$SIZE <- gsub("`", "", dfClean$SIZE)


# replacing x with * in SIZE column
dfClean$SIZE <- gsub("x", "*", dfClean$SIZE)
# replacing X with * in SIZE column
dfClean$SIZE <- gsub("X", "*", dfClean$SIZE)


# extract the unit of area in size column
dfClean$UNIT_OF_AREA <- gsub("[^sq.m.ft]", "", dfClean$SIZE)

dfClean$SIZE <- gsub("[^0-9*+]", "", dfClean$SIZE)


# Define a regular expression to match rows with only numbers and mathematical operators
pattern <- "^[0-9+*]+$"

# Use grepl to filter rows
dfClean <- dfClean[grepl(pattern, dfClean$SIZE), ]

# Replace consecutive * symbols with a single *
dfClean$SIZE <- gsub("\\*+", "*", dfClean$SIZE)

# Function to evaluate expressions
evaluate_expression <- function(x) {
  result <- tryCatch({
    eval(parse(text = x))
  }, error = function(e) {
    return(NA)  # return NA in case of a parse error
  })
  return(result)
}

# Apply the function to the column
dfClean$SIZE <- sapply(dfClean$SIZE, evaluate_expression)

# converting size column from char to numeric
dfClean$SIZE <- as.numeric(dfClean$SIZE)

# convert price column from char to numeric
dfClean$PRICE_IN_RM <- as.numeric(gsub("[^0-9]", "", dfClean$PRICE_IN_RM))

generalize_property_type <- function(property_type) {
  # Use gsub to replace specific patterns
  property_type <- gsub("\\s*\\(.*\\)", "", property_type)
  return(property_type)
}

# Apply the function to the PROPERTY_TYPE column
dfClean$PROPERTY_TYPE <- sapply(dfClean$PROPERTY_TYPE, generalize_property_type)


# omit observations with size = 0
dfClean <- dfClean %>%
  filter(SIZE != 0)

View(dfClean)
summary(dfClean)


#======
# filter the dataset to damansara heights and kepong locations only and remove null rows
df_Kepong_DMSRHeights = filter(dfClean, (LOCATION == "Kepong, Kuala Lumpur" | LOCATION == "Damansara Heights, Kuala Lumpur") &
                                 (!is.na(PRICE_IN_RM) & !is.na(PROPERTY_TYPE) & !is.na(BUILDING_TYPE) &
                                    !is.na(SIZE) & !is.na(FURNISHING)))
View(df_Kepong_DMSRHeights)

# visualize outliers in the data
options(scipen =999)
ggplot(df_Kepong_DMSRHeights, aes(x = PRICE_IN_RM, y = PROPERTY_TYPE)) + geom_boxplot()
ggplot(df_Kepong_DMSRHeights, aes(x = PRICE_IN_RM, y = PROPERTY_TYPE)) + geom_boxplot(outlier.shape = NA)

ggplot(df_Kepong_DMSRHeights, aes(x = SIZE, y = PROPERTY_TYPE)) + geom_boxplot()

# removing the outliers in the data
# PRICE column
df_Kepong_DMSRHeights <- df_Kepong_DMSRHeights %>%
  group_by(PROPERTY_TYPE) %>%
  filter(PRICE_IN_RM > quantile(PRICE_IN_RM, 0.25) - (1.5 * IQR(PRICE_IN_RM)) &
           PRICE_IN_RM < quantile(PRICE_IN_RM, 0.75) + (1.5 * IQR(PRICE_IN_RM)))

# SIZE column
df_Kepong_DMSRHeights <- df_Kepong_DMSRHeights %>%
  group_by(PROPERTY_TYPE) %>%
  filter(SIZE > quantile(SIZE, 0.25) - (1.5 * IQR(SIZE)) &
           SIZE < quantile(SIZE, 0.75) + (1.5 * IQR(SIZE)))

#===FURNISHING column===
# set all blank data in FURNISHING column to Unknown
df_Kepong_DMSRHeights$FURNISHING[df_Kepong_DMSRHeights$FURNISHING == ""] <- "Unknown"

#======
# separating into 3 dataframes (df_DMSRHeights, df_Kepong, df_DMSRHeights_kepong)
df_DMSRHeights = filter(df_Kepong_DMSRHeights, LOCATION == "Damansara Heights, Kuala Lumpur")
View(df_DMSRHeights)

df_Kepong = filter(df_Kepong_DMSRHeights, LOCATION == "Kepong, Kuala Lumpur")
View(df_Kepong)

summary(df_Kepong_DMSRHeights)
View(df_Kepong_DMSRHeights)