# overfitting_bagging.py

# %% [markdown]
# ## Title :
# Regression with Bagging
#
# ## Description :
# The aim of this exercise is to understand regression using Bagging.
#
# <img src="./fig3.png" style="width: 900px;">
#
# ## Instructions:
#
# - Read the dataset `airquality.csv` as a Pandas dataframe.
# - Take a quick look at the dataset.
# - Split the data into train and test sets.
# - Specify the number of bootstraps as 30 and a maximum depth of 3.
# - Define a Bagging Regression model that uses Decision Tree as its base estimator.
# - Fit the model on the train data.
# - Use the helper code to predict using the mean model and individual estimators. The plot will look similar to the one given above.
# - Predict on the test data using the first estimator and the mean model.
# - Compute and display the test MSEs.
#
# ## Hints:
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html" target="_blank">sklearn.model_selection.train_test_split()</a>
# Split arrays or matrices into random train and test subsets.
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html" target="_blank">BaggingRegressor()</a>
# Returns a Bagging regressor instance.
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html" target="_blank">DecisionTreeRegressor()</a>
# A decision tree regressor.
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor.fit" target="_blank">DecisionTreeRegressor.fit()</a>
# Build a decision tree regressor from the training set (X, y).
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor.predict" target="_blank">DecisionTreeRegressor.predict()</a>
# Predict the regression value for X.
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html" target="_blank">BaggingRegressor().estimators_ </a>
# The list of fitted base estimators. Use this to access any of the individual estimators.
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html" target="_blank">sklearn.metrics.mean_squared_error()</a>
# Mean squared error regression loss.

# %%
# Import necessary libraries
import itertools
import numpy as np
import pandas as pd
from numpy import std
from numpy import mean
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
%matplotlib inline


# %%
# Read the dataset; the first CSV column is used as the dataframe index
df = pd.read_csv("../DATA/airquality.csv",index_col=0)

# Take a quick look at the data: the first ten rows, then the (rows, columns) shape.
# NOTE(review): if this runs as a notebook cell with default settings, presumably
# only the last expression (df.shape) is echoed — confirm whether head(10) should
# be displayed explicitly.
df.head(10)
df.shape


# %%
# Use the column Ozone to drop any NaNs from the dataframe:
# keep only the rows whose Ozone reading is present
df = df[df.Ozone.notna()]

# Dataframe size after the incomplete rows were removed
df.shape

# %%
# Predictor: the Ozone column, selected with a list of labels so the result
# stays two-dimensional (one column) as scikit-learn estimators expect
x = df[['Ozone']].to_numpy()

# Response: the Temp column
y = df.Temp


# %%
# Partition the data: 80% of the samples go to the train set, the rest to the
# test set; random_state=102 fixes the shuffle so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=102,
)

# Size of the test predictors
x_test.shape

# %% [markdown]
# ### Bagging Regressor

# %%
# Number of bootstraps, i.e. how many trees the ensemble contains
num_bootstraps = 30

# Maximum depth allowed for each decision tree
max_depth = 3

# Base estimator: a decision tree limited to max_depth levels
base_tree = DecisionTreeRegressor(max_depth=max_depth)

# Bagging Regressor built from num_bootstraps estimators of the base tree
model = BaggingRegressor(estimator=base_tree, n_estimators=num_bootstraps)

# Fit the ensemble on the train data
model.fit(x_train, y_train)


# %%
# Helper code: overlay the prediction curve of every individual estimator
# and the prediction of the bagged model on top of the data
plt.figure(figsize=(10, 8))

# 80 evenly spaced Ozone values spanning the observed range, as a column vector
xrange = np.linspace(x.min(), x.max(), 80).reshape(-1, 1)

# Observations: train and test points in different colours
plt.plot(x_train, y_train, 'o', color='#EFAEA4', markersize=6, label="Train Data")
plt.plot(x_test, y_test, 'o', color='#F6345E', markersize=6, label="Test Data")

# Called without arguments, this only queries the current x-limits
plt.xlim()

# One thin, faint curve per fitted estimator
for tree in model.estimators_:
    y_pred1 = tree.predict(xrange)
    plt.plot(xrange, y_pred1, alpha=0.5, linewidth=0.5, color='#ABCCE3')

# Draw the last estimator's curve once more, labelled, so that the legend
# carries a single entry for the individual estimators
plt.plot(
    xrange,
    y_pred1,
    alpha=0.6,
    linewidth=1,
    color='#ABCCE3',
    label="Prediction of Individual Estimators",
)

# Prediction of the bagged model over the same grid
y_pred = model.predict(xrange)
plt.plot(xrange, y_pred, alpha=0.7, linewidth=3, color='#50AEA4', label='Model Prediction')

# Axis labels, tick sizes and legend
plt.xlabel("Ozone", fontsize=16)
plt.ylabel("Temperature", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(loc='best', fontsize=12)
plt.show()


# %%
# Compute the test MSE of a single estimator: predict with the first fitted
# tree only. (Averaging the predictions of all estimators would simply
# reproduce the bagged model's own prediction, not that of one estimator.)
y_pred1 = model.estimators_[0].predict(x_test)

# Print the test MSE
print("The test MSE of one estimator in the model is", round(mean_squared_error(y_test, y_pred1), 2))


# %%
### edTest(test_mse) ###
# Predict the test responses with the full bagged model
y_pred = model.predict(x_test)

# Test MSE of the model prediction, rounded to two decimals
model_test_mse = round(mean_squared_error(y_test, y_pred), 2)

# Print the test MSE
print("The test MSE of the model is", model_test_mse)