Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Load necessary library
- library(ggplot2)
- library(corrplot)
- library(car) # for vif() function
# Inspect the first rows of the built-in mtcars dataset.
head(mtcars)

# Compute and display the pairwise correlation matrix for every column.
corr_matrix <- cor(mtcars)
print(corr_matrix)

# Visualize the correlations with corrplot (circle glyphs scaled by strength).
corrplot(corr_matrix, method = "circle")
# Alternative view: a correlation heatmap built with ggplot2.
# Reshape the matrix into long format (one row per variable pair).
corr_data <- reshape2::melt(corr_matrix)
names(corr_data) <- c("Variable 1", "Variable 2", "Correlation")

# BUG FIX: aes() must reference columns, not string literals. Quoting the
# names ('Variable 1') maps a constant string to each axis, collapsing the
# plot to a single tile. Backticks refer to the actual space-containing
# column names.
ggplot(corr_data, aes(x = `Variable 1`, y = `Variable 2`, fill = Correlation)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limit = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 12, hjust = 1)) +
  coord_fixed()
# Fit a multiple linear regression: predict mpg (miles per US gallon)
# from hp (gross horsepower) and wt (weight, in 1000 lbs).
model <- lm(mpg ~ hp + wt, data = mtcars)

# The summary reports a coefficient for each term (Intercept, hp, wt)
# with its significance level; each p-value tells us whether that
# predictor is significant after accounting for the other terms.
summary(model)

# Check multicollinearity via variance inflation factors.
print(vif(model))
# If VIF values are high (> 5, or > 10 by a looser rule), the predictors
# are collinear; consider dropping one of them or using regularization.
# Suppose the VIF for "hp" was high: refit the model without it.
model2 <- lm(mpg ~ wt, data = mtcars)

# BUG FIX: car::vif() stops with an error ("model contains fewer than 2
# terms") on a single-predictor model, so only compute VIF when there are
# at least two predictors besides the intercept.
if (length(coef(model2)) > 2) {
  print(vif(model2))
} else {
  message("VIF is not defined for a single-predictor model.")
}
# Generate in-sample predictions from the two-predictor model and store
# them alongside the observed values.
mtcars$predicted_mpg <- predict(model, newdata = mtcars)

# Scatter actual vs. predicted mpg. The dashed red line is the identity
# (y = x): the closer the points sit to it, the more accurate the
# predictions are.
ggplot(mtcars, aes(x = mpg, y = predicted_mpg)) +
  geom_point() +
  geom_abline(
    intercept = 0, slope = 1,
    color = "red", linetype = "dashed"
  ) +
  labs(
    title = "Actual vs Predicted MPG",
    x = "Actual MPG",
    y = "Predicted MPG"
  ) +
  theme_minimal()
# Check the linear-model assumptions by examining the residuals.
# FIX: name the local so it does not shadow stats::residuals().
model_residuals <- residuals(model)

# Residuals vs. observation index: ideally they scatter randomly around
# zero, suggesting the model's assumptions are met.
plot(model_residuals, main = "Residuals of the Model",
     ylab = "Residuals", xlab = "Index")
abline(h = 0, col = "red")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement