Advertisement
makispaiktis

Kaggle - Intermediate ML - Cross Validation

Jun 26th, 2023
702
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.42 KB | None | 0 0
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split
  3. from sklearn.ensemble import RandomForestRegressor
  4. from sklearn.pipeline import Pipeline
  5. from sklearn.impute import SimpleImputer
  6. from sklearn.model_selection import cross_val_score
  7. import matplotlib.pyplot as plt
  8.  
  9.  
  10. # *******************************************************************************************************
  11. # *******************************************************************************************************
  12. # 0. Function that returns the average score - Input = number of trees examined in the RFR model
  13. # *******************************************************************************************************
  14. # *******************************************************************************************************
  15.  
  def get_score(n_estimators, cv=3):
      """Return the average cross-validated MAE of a random-forest pipeline.

      Builds an impute-then-random-forest pipeline and scores it with
      ``cross_val_score`` on the module-level ``X`` and ``y``.

      Args:
          n_estimators: Number of trees in the ``RandomForestRegressor``.
          cv: Cross-validation setting forwarded to ``cross_val_score``.
              Defaults to 3, the value that used to be hard-coded here.

      Returns:
          The mean of the per-fold mean absolute errors. The same value is
          also printed.
      """
      my_pipeline = Pipeline(steps=[
          ('preprocessor', SimpleImputer()),
          ('model', RandomForestRegressor(n_estimators=n_estimators, random_state=0)),
      ])
      # The scorer yields *negative* MAE, so multiply by -1 to get a plain MAE.
      scores = -1 * cross_val_score(my_pipeline, X, y, cv=cv, scoring='neg_mean_absolute_error')
      avg = scores.mean()
      print("Average MAE score:", avg)
      return avg
  24.  
  25.  
  26.  
  27.  
  28. # *******************************************************************************************************
  29. # *******************************************************************************************************
  30. # 1. Basics
  31. # *******************************************************************************************************
  32. # *******************************************************************************************************
  33.  
  # 1a. Load the training and test sets, using the 'Id' column as the index
  train_data = pd.read_csv('../input/train.csv', index_col='Id')
  test_data = pd.read_csv('../input/test.csv', index_col='Id')

  # 1b. Discard rows whose target is missing, then split the target off the predictors
  train_data = train_data.dropna(subset=['SalePrice'])
  y = train_data.pop('SalePrice')  # pop() returns the column and removes it from the frame

  # 1c. Keep only the int64/float64 columns (no one-hot encoding is done here)
  numeric_cols = [
      column
      for column, column_dtype in train_data.dtypes.items()
      if column_dtype in ('int64', 'float64')
  ]
  X = train_data.loc[:, numeric_cols].copy()
  X_test = test_data.loc[:, numeric_cols].copy()
  X.head()
  48.  
  49.  
  50.  
  51.  
  52.  
  53. # *******************************************************************************************************
  54. # *******************************************************************************************************
  55. # 2. Create a simple pipeline with a simple imputer for the numerical columns that were selected above
  56. # *******************************************************************************************************
  57. # *******************************************************************************************************
  58.  
  # Two-step pipeline: a default SimpleImputer followed by a 50-tree random forest
  # with a fixed random_state.
  my_pipeline = Pipeline(
      steps=[
          ('preprocessor', SimpleImputer()),
          ('model', RandomForestRegressor(n_estimators=50, random_state=0)),
      ]
  )
  61.  
  62.  
  63.  
  64.  
  65.  
  66. # *******************************************************************************************************
  67. # *******************************************************************************************************
  68. # 3. Create 5 folds in the dataset - Conduct 5 experiments with 5 different scores - Find the average
  69. # *******************************************************************************************************
  70. # *******************************************************************************************************
  71.  
  # The 'neg_mean_absolute_error' scorer returns *negative* MAE values,
  # so negate the result to obtain one positive MAE per fold.
  scores = -cross_val_score(
      estimator=my_pipeline,
      X=X,
      y=y,
      cv=5,
      scoring='neg_mean_absolute_error',
  )
  print("Average MAE score:", scores.mean())
  75.  
  76.  
  77.  
  78.  
  79. # *******************************************************************************************************
  80. # *******************************************************************************************************
  81. # 4. Test different parameter values - Variable n_estimators
  82. # *******************************************************************************************************
  83. # *******************************************************************************************************
  84.  
  # 4a. Map each candidate tree count (50, 100, ..., 400) to its average MAE
  n_estimators_list = [*range(50, 401, 50)]
  # map() is lazy, so get_score runs once per candidate, in list order, as dict() consumes the pairs.
  results = dict(zip(n_estimators_list, map(get_score, n_estimators_list)))
  print(results)
  89.  
  # 4b. Plot the results
  # `%matplotlib inline` is IPython-only syntax and a SyntaxError in plain Python.
  # Calling the magic through get_ipython() keeps the notebook behaviour and
  # lets the same code run as an ordinary script.
  try:
      get_ipython().run_line_magic('matplotlib', 'inline')
  except NameError:
      # Not running under IPython: there is no inline backend to enable.
      pass
  plt.plot(list(results.keys()), list(results.values()))
  plt.show()

  # Pick the tree count with the lowest average MAE from the computed results
  # instead of hard-coding a value read off the plot.
  n_estimators_best = min(results, key=results.get)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement