Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # %% [markdown]
- # ## Title :
- # Regression with Bagging
- #
- # ## Description :
- # The aim of this exercise is to understand regression using Bagging.
- #
- # <img src="./fig3.png" style="width: 900px;">
- #
- # ## Instructions:
- #
- # - Read the dataset `airquality.csv` as a Pandas dataframe.
- # - Take a quick look at the dataset.
- # - Split the data into train and test sets.
- # - Specify the number of bootstraps as 30 and a maximum depth of 3.
- # - Define a Bagging Regression model that uses Decision Tree as its base estimator.
- # - Fit the model on the train data.
- # - Use the helper code to predict using the mean model and individual estimators. The plot will look similar to the one given above.
- # - Predict on the test data using the first estimator and the mean model.
- # - Compute and display the test MSEs.
- #
- # ## Hints:
- #
- # <a href="" target="_blank">sklearn.model_selection.train_test_split()</a>
- # Split arrays or matrices into random train and test subsets.
- #
- # <a href="" target="_blank">BaggingRegressor()</a>
- # Returns a Bagging regressor instance.
- #
- # <a href="" target="_blank">DecisionTreeRegressor()</a>
- # A decision tree regressor.
- #
- # <a href="" target="_blank">DecisionTreeRegressor.fit()</a>
- # Build a decision tree regressor from the training set (X, y).
- #
- # <a href="" target="_blank">DecisionTreeRegressor.predict()</a>
- # Predict regression values for X.
- #
- # <a href="" target="_blank">BaggingRegressor().estimators_</a>
- # A list of estimators. Use this to access any of the estimators.
- #
- # <a href="" target="_blank">sklearn.metrics.mean_squared_error()</a>
- # Mean squared error regression loss.
- # %%
- # Import necessary libraries
- import itertools
- import numpy as np
- import pandas as pd
- from numpy import std
- from numpy import mean
- import matplotlib.pyplot as plt
- from sklearn.datasets import make_regression
- from sklearn.ensemble import BaggingRegressor
- from sklearn.tree import DecisionTreeRegressor
- from sklearn.metrics import mean_squared_error
- from sklearn.model_selection import train_test_split
- %matplotlib inline
# %%
# Read the dataset, using the first CSV column as the row index
df = pd.read_csv("../DATA/airquality.csv", index_col=0)
# Take a quick look at the data
df.head(10)
df.shape
# %%
# Drop every row whose Ozone value is missing; Ozone is the predictor,
# so rows without it cannot be used for fitting or prediction
df = df.dropna(subset=["Ozone"])
df.shape
# %%
# Predictor: the Ozone column, kept two-dimensional (one column) via the
# double-bracket selection
x = df[["Ozone"]].to_numpy()
# Response: the Temp column
y = df["Temp"]
# %%
# Hold out 20% of the rows for testing; random_state=102 fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=102,
)
x_test.shape
- # %% [markdown]
- # ### Bagging Regressor
# %%
# Number of bootstrap samples; one decision tree is trained per sample
num_bootstraps = 30
# Maximum depth allowed for each decision tree
max_depth = 3
# Bagging regressor whose base estimator is a depth-limited decision tree,
# with one estimator per bootstrap
model = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=max_depth),
    n_estimators=num_bootstraps,
)
# Fit the model on the train data; this must run before `model.estimators_`
# or `model.predict` is used in the cells below
model.fit(x_train, y_train)
# %%
# Helper code to plot the train/test data, the prediction of every individual
# estimator, and the prediction of the bagged model
plt.figure(figsize=(10, 8))
# 80 evenly spaced Ozone values spanning the observed range, as a column vector
xrange = np.linspace(x.min(), x.max(), 80).reshape(-1, 1)
plt.plot(x_train, y_train, 'o', color='#EFAEA4', markersize=6, label="Train Data")
plt.plot(x_test, y_test, 'o', color='#F6345E', markersize=6, label="Test Data")
# One thin, unlabeled line per fitted estimator
for estimator in model.estimators_:
    y_pred1 = estimator.predict(xrange)
    plt.plot(xrange, y_pred1, alpha=0.5, linewidth=0.5, color='#ABCCE3')
# Re-draw the last estimator's curve once with a label, so the legend gets a
# single entry for the individual estimators instead of one per tree
plt.plot(xrange, y_pred1, alpha=0.6, linewidth=1, color='#ABCCE3', label="Prediction of Individual Estimators")
# Prediction of the bagged model on the same grid
y_pred = model.predict(xrange)
plt.plot(xrange, y_pred, alpha=0.7, linewidth=3, color='#50AEA4', label='Model Prediction')
plt.xlabel("Ozone", fontsize=16)
plt.ylabel("Temperature", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(loc='best', fontsize=12)
# %%
# Predict on the test data using only the first fitted estimator.
# Averaging over all estimators here would just reproduce model.predict(),
# making this MSE identical to the model MSE reported in the next cell.
y_pred1 = model.estimators_[0].predict(x_test)
# Print the test MSE of that single estimator
print("The test MSE of one estimator in the model is", round(mean_squared_error(y_test, y_pred1), 2))
# %%
### edTest(test_mse) ###
# Predict on the test data with the full bagged model
y_pred = model.predict(x_test)
# Report the model's test MSE, rounded to two decimals
print(
    "The test MSE of the model is",
    round(mean_squared_error(y_true=y_test, y_pred=y_pred), 2),
)
Add Comment
Please, Sign In to add comment