Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Simulated house-price example: correlation and simple linear regression.
#
# The script draws synthetic house sizes and prices, reports their
# correlation, fits `price ~ size` with lm(), and then plots the fitted line,
# the actual-vs-predicted prices, and the residuals.
#
# Results that would otherwise rely on top-level auto-printing (the model
# summary and the ggplot object) are printed explicitly so the script shows
# the same output when run through source() as it does interactively.

library(corrplot)
library(ggplot2)
# NOTE(review): nothing in this script appears to use reshape2 -- confirm
# before removing.
library(reshape2)

# Simulate the data ----

# Set a seed for reproducibility of results
set.seed(123)

# Number of simulated houses. Shared by both rnorm() calls below so the size
# vector and the noise vector always have the same length.
n_houses <- 100

# Size of houses in square feet
size <- rnorm(n_houses, mean = 1500, sd = 500)

# Prices of houses. The '50000 + 100 * size' part simulates a linear
# relationship between size and price, and the rnorm() part simulates random
# noise.
price <- 50000 + 100 * size + rnorm(n_houses, mean = 0, sd = 50000)

# Create a data frame combining the sizes and corresponding prices
df <- data.frame(size, price)

# Correlation ----

# Calculate and print the correlation between size and price
correlation <- cor(df$size, df$price)
cat("Correlation between size and price: ", correlation, "\n")

# Scatter plot of the raw data
plot(
  df$size, df$price,
  main = "Scatter plot of size vs price",
  xlab = "Size",
  ylab = "Price",
  pch = 19
)

# Correlation matrix of all columns in df
corr_matrix <- cor(df)
print(corr_matrix)

# Heatmap of the correlation matrix
corrplot(
  corr_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black", # Add correlation coefficients on the heatmap
  tl.col = "black",      # Text label color
  tl.srt = 45            # Text label rotation
)

# Linear regression ----

# We use a linear regression model to analyze the relationship between house
# size (independent variable) and its price (dependent variable).
# Null Hypothesis (H0): There is no linear relationship between house size
#   and its price (the slope is zero).
# Alternative Hypothesis (H1): There is a linear relationship between house
#   size and its price (the slope is not zero).

# Fit the model: 'price' is the dependent variable and 'size' is the
# independent variable.
model <- lm(price ~ size, data = df)

# Scatter plot again, this time with the fitted regression line on top
plot(
  df$size, df$price,
  main = "Scatter plot of size vs price",
  xlab = "Size",
  ylab = "Price",
  pch = 19
)
abline(model, col = "red") # Add regression line to the plot

# Print a summary of the regression model, which includes the coefficients,
# the R-squared value, and the p-value. The explicit print() is needed because
# top-level values are not auto-printed under source().
print(summary(model))

# Interpretation hints:
# 1. Look at the 'Estimate' for 'size' in the 'Coefficients' table. This is
#    the slope of the regression line, and it indicates the change in house
#    price for each one-unit increase in house size. If the p-value associated
#    with this estimate is less than 0.05, then the relationship is
#    significant, and we can reject the null hypothesis.
# 2. The '(Intercept)' in the 'Coefficients' table is the y-intercept of the
#    regression line, which is the predicted price when the size is zero.
# 3. The 'Residuals' section gives you information about the distribution of
#    the residuals, which should ideally be normally distributed. Look for any
#    large deviations in these values.
# 4. The 'R-squared' value indicates the proportion of variance in the
#    dependent variable that can be explained by the independent variable(s).
#    The closer this value is to 1, the better the fit of the model.

# Predictions ----

# Predict prices for the observed sizes (computed once and reused below)
predicted_prices <- predict(model, newdata = df)
print("Predicted prices based on the model:")
print(predicted_prices)

# Data frame pairing each actual price with its predicted price
plot_df <- data.frame(Actual = df$price, Predicted = predicted_prices)

# Plot of actual price vs predicted price; the dashed red line is y = x,
# i.e. where points would fall if predictions matched the actual prices.
actual_vs_predicted_plot <- ggplot(
  data = plot_df,
  aes(x = Actual, y = Predicted)
) +
  geom_point() +
  geom_abline(
    intercept = 0,
    slope = 1,
    color = "red",
    linetype = "dashed"
  ) +
  labs(
    title = "Actual vs Predicted Prices",
    x = "Actual Price",
    y = "Predicted Price"
  ) +
  theme_minimal()

# ggplot objects are only drawn when printed, so print explicitly
print(actual_vs_predicted_plot)

# Residuals ----

# Show residuals of the model
residuals <- resid(model)
print("Residuals of the model:")
print(residuals)

# Plot residuals to check if they're normally distributed
hist(
  residuals,
  main = "Histogram of Residuals",
  xlab = "Residuals",
  col = "lightblue"
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement