Advertisement
makispaiktis

Kaggle - Intermediate ML - Pipelines

Jun 23rd, 2023
711
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.52 KB | None | 0 0
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split
  3. from sklearn.compose import ColumnTransformer
  4. from sklearn.pipeline import Pipeline
  5. from sklearn.impute import SimpleImputer
  6. from sklearn.preprocessing import OneHotEncoder
  7. from sklearn.ensemble import RandomForestRegressor
  8. from sklearn.metrics import mean_absolute_error
  9.  
  10.  
  11. # ******************************************************************************************************
  12. # ******************************************************************************************************
  13. # 1. Read the 2 datasets and split the dataset - Choose 10 as maximum cardinality to drop some columns
  14. # ******************************************************************************************************
  15. # ******************************************************************************************************
  16.  
  17. # 1a. Read
  18. X_full = pd.read_csv('../input/train.csv', index_col='Id')
  19. X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
  20.  
  21. # 1b. Remove rows with missing target, separate target from predictors
  22. X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
  23. y = X_full.SalePrice
  24. X_full.drop(['SalePrice'], axis=1, inplace=True)
  25.  
  26. # 1c. Break off validation set from training data
  27. X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)
  28.  
  29. # 1d. Cardinality means the number of unique values in a column - OH Encoder is coming
  30. categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and  X_train_full[cname].dtype == "object"]
  31.  
  32. # 1e. Select numerical columns
  33. numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
  34.  
  35. # 1f. Keep selected columns only
  36. my_cols = categorical_cols + numerical_cols
  37. X_train = X_train_full[my_cols].copy()
  38. X_valid = X_valid_full[my_cols].copy()
  39. X_test = X_test_full[my_cols].copy()
  40.  
  41. print(X_train.shape)
  42. X_train.head()
  43.  
  44.  
  45.  
  46.  
  47. # ******************************************************************************************************
  48. # ******************************************************************************************************
  49. # 2. Pipelines - Bundle a preprocessor and a model
  50. # ******************************************************************************************************
  51. # ******************************************************************************************************
  52.  
  53. # 2a. Preprocessing for numerical data
  54. numerical_transformer = SimpleImputer(strategy='constant')
  55.  
  56. # 2b. Preprocessing for categorical data
  57. categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
  58.     ('onehot', OneHotEncoder(handle_unknown='ignore'))])
  59.  
  60. # 2c. Bundle preprocessing for numerical and categorical data into a preprocessor
  61. preprocessor = ColumnTransformer(transformers=[
  62.         ('num', numerical_transformer, numerical_cols),
  63.         ('cat', categorical_transformer, categorical_cols)])
  64.  
  65. # 2d. Define model
  66. model = RandomForestRegressor(n_estimators=100, random_state=0)
  67.  
  68. # 2e. Bundle preprocessing and modeling code in a pipeline
  69. clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
  70.  
  71. # 2f. Preprocessing of training data, fit model, preprocessing of validation data, make predictions
  72. clf.fit(X_train, y_train)
  73. preds = clf.predict(X_valid)
  74. print('MAE:', mean_absolute_error(y_valid, preds))
  75.  
  76.  
  77.  
  78.  
  79.  
  80.  
  81. # ******************************************************************************************************
  82. # ******************************************************************************************************
  83. # 3. Improve performance - One solution here is to change the SimpleImputer of numerical columns and model
  84. # ******************************************************************************************************
  85. # ******************************************************************************************************
  86.  
  87. # 3a. Preprocessing for numerical data
  88. numerical_transformer = SimpleImputer()
  89.  
  90. # 3b. Preprocessing for categorical data
  91. categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
  92.     ('onehot', OneHotEncoder(handle_unknown='ignore'))])
  93.  
  94. # 3c. Bundle preprocessing for numerical and categorical data
  95. preprocessor = ColumnTransformer(transformers=[
  96.         ('num', numerical_transformer, numerical_cols),
  97.         ('cat', categorical_transformer, categorical_cols)])
  98.  
  99. # 3d. Define model
  100. model = RandomForestRegressor()
  101.  
  102. # 3e. Bundle preprocessing and modeling code in a pipeline
  103. my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
  104.  
  105. # 3f. Preprocessing of training data, fit model, preprocessing of validation data, make predictions
  106. my_pipeline.fit(X_train, y_train)
  107. preds = my_pipeline.predict(X_valid)
  108. score = mean_absolute_error(y_valid, preds)
  109. print('MAE:', score)
  110.  
  111.  
  112.  
  113.  
  114.  
  115. # ******************************************************************************************************
  116. # ******************************************************************************************************
  117. # 4. Generate test predictions from the 2nd dataset using X_test
  118. # ******************************************************************************************************
  119. # ******************************************************************************************************
  120.  
  121. preds_test = my_pipeline.predict(X_test)
  122. output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
  123. output.to_csv('submission.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement