Advertisement
makispaiktis

Kaggle - Intermediate ML - XGBoost

Jun 26th, 2023
609
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.16 KB | None | 0 0
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split
  3. from xgboost import XGBRegressor
  4. from sklearn.metrics import mean_absolute_error
  5.  
  6.  
  7.  
  8. # **********************************************************************************************************
  9. # **********************************************************************************************************
  10. # 1. Basics
  11. # **********************************************************************************************************
  12. # **********************************************************************************************************
  13.  
  14. # 1a. Read the 2 datasets
  15. X = pd.read_csv('../input/train.csv', index_col='Id')
  16. X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
  17.  
  18. # 1b. Remove rows with missing target, separate target from predictors
  19. X.dropna(axis=0, subset=['SalePrice'], inplace=True)
  20. y = X.SalePrice              
  21. X.drop(['SalePrice'], axis=1, inplace=True)
  22.  
  23. # 1c. Break off validation set from training data
  24. X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
  25.                                                                 random_state=0)
  26.  
  27. # 1d. Select categorical columns with relatively low cardinality (convenient but arbitrary)
  28. low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and
  29.                         X_train_full[cname].dtype == "object"]
  30.  
  31. # 1e. Select numeric columns
  32. numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
  33.  
  34. # 1f. Keep selected columns only
  35. my_cols = low_cardinality_cols + numeric_cols
  36. X_train = X_train_full[my_cols].copy()
  37. X_valid = X_valid_full[my_cols].copy()
  38. X_test = X_test_full[my_cols].copy()
  39.  
  40. # 1g. One-hot encode the data (to shorten the code, we use pandas)
  41. X_train = pd.get_dummies(X_train)
  42. X_valid = pd.get_dummies(X_valid)
  43. X_test = pd.get_dummies(X_test)
  44. X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
  45. X_train, X_test = X_train.align(X_test, join='left', axis=1)
  46.  
  47.  
  48.  
  49.  
  50.  
  51. # **********************************************************************************************************
  52. # **********************************************************************************************************
  53. # 2a. First Default Model
  54. # **********************************************************************************************************
  55. # **********************************************************************************************************
  56.  
  57. my_model_1 = XGBRegressor(random_state=0)
  58. my_model_1.fit(X_train, y_train)
  59.  
  60. predictions_1 = my_model_1.predict(X_valid)
  61. mae_1 = mean_absolute_error(predictions_1, y_valid)
  62. print("Mean Absolute Error:" , mae_1)
  63.  
  64.  
  65.  
  66. # **********************************************************************************************************
  67. # **********************************************************************************************************
  68. # 2b. A Better Model in terms of MAE
  69. # **********************************************************************************************************
  70. # **********************************************************************************************************
  71.  
  72. my_model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
  73. my_model_2.fit(X_train, y_train)
  74.  
  75. predictions_2 = my_model_2.predict(X_valid)
  76. mae_2 = mean_absolute_error(predictions_2, y_valid)
  77. print("Mean Absolute Error:" , mae_2)
  78.  
  79.  
  80.  
  81. # **********************************************************************************************************
  82. # **********************************************************************************************************
  83. # 2c. A Worse Model in terms of MAE
  84. # **********************************************************************************************************
  85. # **********************************************************************************************************
  86.  
  87. my_model_3 = XGBRegressor(n_estimators=100, learning_rate=0.5)
  88. my_model_3.fit(X_train, y_train)
  89.  
  90. predictions_3 = my_model_3.predict(X_valid)
  91. mae_3 = mean_absolute_error(predictions_3, y_valid)
  92. print("Mean Absolute Error:" , mae_3)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement