Advertisement
makispaiktis

Kaggle - Intermediate ML - Drop, Imputation

Jun 22nd, 2023
658
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.92 KB | None | 0 0
  1. # *******************************************************************************************
  2. # *******************************************************************************************
  3. # 0. Function that evaluates a RandomForestRegressor model (with 100 trees) using MAE
  4. # *******************************************************************************************
  5. # *******************************************************************************************
  6.  
  7. from sklearn.ensemble import RandomForestRegressor
  8. from sklearn.metrics import mean_absolute_error
  9. from sklearn.impute import SimpleImputer
  10.  
  11.  
  12. def score_dataset(X_train, X_valid, y_train, y_valid):
  13.     model = RandomForestRegressor(n_estimators=100, random_state=0)
  14.     model.fit(X_train, y_train)
  15.     preds = model.predict(X_valid)
  16.     return mean_absolute_error(y_valid, preds)
  17.  
  18.  
  19.  
  20.  
  21.  
  22. # *******************************************************************************************
  23. # *******************************************************************************************
  24. # 1. Training, validation (1st dataset) and testing dataset
  25. # *******************************************************************************************
  26. # *******************************************************************************************
  27.  
  28. import pandas as pd
  29. from sklearn.model_selection import train_test_split
  30.  
  31. # 1a. Read the data from the 2 datasets
  32. X_full = pd.read_csv('../input/train.csv', index_col='Id')
  33. X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
  34.  
  35. # 1b. Remove ROWS with MISSING TARGET, separate target from predictors
  36. X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
  37. y = X_full.SalePrice
  38. X_full.drop(['SalePrice'], axis=1, inplace=True)
  39.  
  40. # 1c. To keep things simple, we'll use only numerical predictors
  41. X = X_full.select_dtypes(exclude=['object'])
  42. X_test = X_test_full.select_dtypes(exclude=['object'])
  43.  
  44. # 1d. Break off validation set from training data
  45. X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
  46. X_train.head()
  47. # Shape of training data (num_rows, num_columns)
  48. print(X_train.shape)
  49.  
  50.  
  51. # *******************************************************************************************
  52. # *******************************************************************************************
  53. # 2. Preliminary investigation - Find which and how many columns contain missing data
  54. # *******************************************************************************************
  55. # *******************************************************************************************
  56.  
  57. # Number of missing values in each column of training data
  58. missing_val_count_by_column = (X_train.isnull().sum())
  59. print(missing_val_count_by_column[missing_val_count_by_column > 0])
  60.  
  61. num_rows = 1168
  62. num_cols_with_missing = 3
  63. tot_missing = 212 + 6 + 58
  64.  
  65.  
  66.  
  67. # *******************************************************************************************
  68. # *******************************************************************************************
  69. # 3a. Drop missing values
  70. # *******************************************************************************************
  71. # *******************************************************************************************
  72.  
  73. # Get names of columns with missing values - The same as above DataFrame
  74. cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
  75. print("Names of columns with missing data\n", cols_with_missing, '\n\n')
  76.  
  77. reduced_X_train = X_train.drop(cols_with_missing, axis=1)
  78. reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
  79.  
  80. print("MAE (Drop columns with missing values):")
  81. print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
  82.  
  83.  
  84.  
  85.  
  86. # *******************************************************************************************
  87. # *******************************************************************************************
  88. # 3b. Imputation - Default strategy = mean
  89. # *******************************************************************************************
  90. # *******************************************************************************************
  91.  
  92. my_imputer = SimpleImputer()
  93. imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
  94. imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
  95.  
  96. # Fill in the lines below: imputation removed column names; put them back
  97. imputed_X_train.columns = X_train.columns
  98. imputed_X_valid.columns = X_valid.columns
  99.  
  100. print("MAE (Imputation):")
  101. print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))
  102.  
  103. print('\n\n', "Given that thre are so few missing values in the dataset, we'd expect imputation to perform better than dropping columns entirely. However, we see that dropping columns performs slightly better! While this can probably partially be attributed to noise in the dataset, another potential explanation is that the imputation method is not a great match to this dataset. That is, maybe instead of filling in the mean value, it makes more sense to set every missing value to a value of 0, to fill in the most frequently encountered value, or to use some other method. For instance, consider the GarageYrBlt column (which indicates the year that the garage was built). It's likely that in some cases, a missing value could indicate a house that does not have a garage. Does it make more sense to fill in the median value along each column in this case? Or could we get better results by filling in the minimum value along each column? It's not quite clear what's best in this case, but perhaps we can rule out some options immediately - for instance, setting missing values in this column to 0 is likely to yield horrible results!", '\n\n')
  104.  
  105.  
  106. # *******************************************************************************************
  107. # *******************************************************************************************
  108. # 3c. Imputation - Custom strategy = median
  109. # *******************************************************************************************
  110. # *******************************************************************************************
  111.  
  112. final_imputer = SimpleImputer(strategy='median')
  113. final_X_train = pd.DataFrame(final_imputer.fit_transform(X_train))
  114. final_X_valid = pd.DataFrame(final_imputer.transform(X_valid))
  115. final_X_train.columns = X_train.columns
  116. final_X_valid.columns = X_valid.columns
  117.  
  118. print("MAE (Imputation 2):")
  119. print(score_dataset(final_X_train, final_X_valid, y_train, y_valid))
  120.  
  121.  
  122.  
  123. # *******************************************************************************************
  124. # *******************************************************************************************
  125. # 4. Define and fit model
  126. # *******************************************************************************************
  127. # *******************************************************************************************
  128.  
  129. model = RandomForestRegressor(n_estimators=100, random_state=0)
  130. model.fit(final_X_train, y_train)
  131. preds_valid = model.predict(final_X_valid)
  132.  
  133. print("MAE (Your approach):")
  134. print(mean_absolute_error(y_valid, preds_valid))
  135.  
  136.  
  137.  
  138. # *******************************************************************************************
  139. # *******************************************************************************************
  140. # 5. Apply the model in the 2nd dataset - for testing
  141. # *******************************************************************************************
  142. # *******************************************************************************************
  143.  
  144. # I have to also impute the columns of this DataFrame (X_test)
  145. test_imputer = SimpleImputer(strategy='median')
  146. final_X_test = pd.DataFrame(test_imputer.fit_transform(X_test))
  147. final_X_test.columns = X_test.columns
  148.  
  149. preds_test = model.predict(final_X_test)
  150.  
  151. # Save test predictions to file
  152. output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
  153. output.to_csv('submission.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement