# Untitled

library(corrplot)
library(ggplot2)
library(reshape2)


# Fix the RNG state so every run draws the same simulated sample
set.seed(123)

# Simulated house sizes in square feet: 100 normal draws, mean 1500, sd 500
size <- rnorm(100, 1500, 500)

# Simulated prices: a linear trend in size (50000 base plus 100 per square
# foot) with zero-mean normal noise (sd 50000) added on top
price <- 50000 + 100 * size + rnorm(100, sd = 50000)

# Bundle both variables into one data frame, one row per house
df <- data.frame(size = size, price = price)

# Correlation coefficient between the size and price columns, reported on
# the console
correlation <- with(df, cor(size, price))
cat("Correlation between size and price: ", correlation, "\n")

# Scatter plot of price against size, drawn with solid points (pch = 19)
plot(
  x = df$size,
  y = df$price,
  pch = 19,
  xlab = "Size",
  ylab = "Price",
  main = "Scatter plot of size vs price"
)

# Pairwise correlations between every column of df
corr_matrix <- cor(df)
print(corr_matrix)

# Colour-coded heatmap of the correlation matrix: upper triangle only,
# variables ordered by hierarchical clustering
corrplot(
  corr_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black", # overlay the coefficient values in black
  tl.col = "black",      # colour of the variable labels
  tl.srt = 45            # rotation of the variable labels, in degrees
)

# Simple linear regression of house price (dependent variable) on house size
# (independent variable).
#   H0 (null):        no linear relationship between size and price; the
#                     slope is zero.
#   H1 (alternative): a linear relationship exists; the slope is not zero.
model <- lm(formula = price ~ size, data = df)

# Redraw the scatter plot and overlay the fitted regression line in red
plot(
  x = df$size,
  y = df$price,
  pch = 19,
  xlab = "Size",
  ylab = "Price",
  main = "Scatter plot of size vs price"
)
abline(model, col = "red")


# Print a summary of the regression model, which includes the coefficients,
# the R-squared value, and the p-value. print() is called explicitly because
# a bare top-level expression is only auto-printed interactively or under
# Rscript; when the script is run through source() it would show nothing.
print(summary(model))

# Interpretation Hints:
# 1. Look at the 'Estimate' for 'size' in the 'Coefficients' table. This is the slope of the regression line, and it indicates the change in house price for each one-unit increase in house size. If the p-value associated with this estimate is less than 0.05, then the relationship is significant, and we can reject the null hypothesis.
# 2. The '(Intercept)' in the 'Coefficients' table is the y-intercept of the regression line, which is the predicted price when the size is zero.
# 3. The 'Residuals' section gives you information about the distribution of the residuals, which should ideally be normally distributed. Look for any large deviations in these values.
# 4. The 'R-squared' value indicates the proportion of variance in the dependent variable that can be explained by the independent variable(s). The closer this value is to 1, the better the fit of the model.

# Predict the price of every house in df from the fitted model. The
# predictions are computed once and reused for both the console listing and
# the plot below.
predicted_prices <- predict(model, newdata = df)
print("Predicted prices based on the model:")
print(predicted_prices)

# Pair each observed price with its prediction for plotting
plot_df <- data.frame(Actual = df$price, Predicted = predicted_prices)

# Actual vs predicted prices. The dashed red line is y = x, where a point
# would fall if its prediction matched the actual price exactly.
actual_vs_predicted_plot <- ggplot(
  data = plot_df,
  aes(x = Actual, y = Predicted)
) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Actual vs Predicted Prices",
    x = "Actual Price",
    y = "Predicted Price"
  ) +
  theme_minimal()

# ggplot objects are only drawn when printed; print explicitly so the figure
# also appears when the script is run through source()
print(actual_vs_predicted_plot)

# Extract the residuals of the fitted model and list them on the console
residuals <- stats::residuals(model)
print("Residuals of the model:")
print(residuals)

# Histogram of the residuals, as a visual check of whether their
# distribution looks roughly normal
hist(
  residuals,
  col = "lightblue",
  xlab = "Residuals",
  main = "Histogram of Residuals"
)