import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
# *******************************************************************************************************
# *******************************************************************************************************
# 0. Function that returns the average cross-validation MAE - input: number of trees in the RFR model
# *******************************************************************************************************
# *******************************************************************************************************
def get_score(n_estimators):
    # Uses the global X and y defined in section 1 below
    my_pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=n_estimators, random_state=0))])
    # 3-fold CV; multiply by -1 since sklearn calculates *negative* MAE
    scores = -1 * cross_val_score(my_pipeline, X, y, cv=3, scoring='neg_mean_absolute_error')
    avg = scores.mean()
    print("Average MAE score:", avg)
    return avg
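# Example call (a sketch added here, not in the original): get_score can only run after
# X and y are built in section 1 below, e.g.
#   mae_150 = get_score(150)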
# *******************************************************************************************************
# *******************************************************************************************************
# 1. Basics
# *******************************************************************************************************
# *******************************************************************************************************
# 1a. Read the 2 datasets
train_data = pd.read_csv('../input/train.csv', index_col='Id')
test_data = pd.read_csv('../input/test.csv', index_col='Id')
# 1b. Remove rows with missing target, separate target from predictors
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = train_data.SalePrice
train_data.drop(['SalePrice'], axis=1, inplace=True)
# 1c. Select numeric columns only (no one-hot encoding of categorical columns here)
numeric_cols = [cname for cname in train_data.columns if train_data[cname].dtype in ['int64', 'float64']]
X = train_data[numeric_cols].copy()
X_test = test_data[numeric_cols].copy()
X.head()
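# Optional sanity check (a sketch, not part of the original workflow): the train_test_split
# import above is otherwise unused; it could be applied for a quick hold-out split like this.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
print(X_train.shape, X_valid.shape)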
# *******************************************************************************************************
# *******************************************************************************************************
# 2. Create a simple pipeline with a SimpleImputer for the numeric columns selected above
# *******************************************************************************************************
# *******************************************************************************************************
my_pipeline = Pipeline(steps=[
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=0))])
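# A small sketch (added): fitting the pipeline runs the imputer and then the model; once
# fitted, the underlying RandomForestRegressor can be inspected via named_steps, e.g.:
my_pipeline.fit(X, y)
print(my_pipeline.named_steps['model'].feature_importances_[:5])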
# *******************************************************************************************************
# *******************************************************************************************************
# 3. Split the data into 5 folds - run 5 experiments, one score each - then average the scores
# *******************************************************************************************************
# *******************************************************************************************************
# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
print("Average MAE score:", scores.mean())
# *******************************************************************************************************
# *******************************************************************************************************
# 4. Test different parameter values - Variable n_estimators
# *******************************************************************************************************
# *******************************************************************************************************
# 4a. Build a dictionary mapping each n_estimators value to its average MAE
n_estimators_list = list(range(50, 401, 50))
results = {n_estimators: get_score(n_estimators) for n_estimators in n_estimators_list}
print(results)
# 4b. Plot the results
%matplotlib inline
plt.plot(list(results.keys()), list(results.values()))
plt.show()
n_estimators_best = 200  # chosen from the plot (lowest average MAE)
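# A final-step sketch (not in the original paste): n_estimators_best is hard-coded above, but it
# could also be derived directly from the results dictionary, e.g.
#   n_estimators_best = min(results, key=results.get)
# Refit the pipeline on all training data with the chosen value and predict on the test set.
# The 'Id'/'SalePrice' submission columns are an assumption based on the competition format.
final_pipeline = Pipeline(steps=[
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(n_estimators=n_estimators_best, random_state=0))])
final_pipeline.fit(X, y)
preds_test = final_pipeline.predict(X_test)
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)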