makispaiktis

5. Target Encoding

Sep 21st, 2024 (edited)
# 0a. Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from category_encoders import MEstimateEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# 0b. Set Matplotlib defaults
# Note: newer Matplotlib releases renamed this style to "seaborn-v0_8-whitegrid"
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
warnings.filterwarnings('ignore')


# 1. AUXILIARY FUNCTIONS
def score_dataset(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score


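# Illustration (not part of the original paste): a tiny hand-made example of the RMSLE
# metric that score_dataset returns, using sklearn's mean_squared_log_error. The numbers
# below are made up purely to show that RMSLE measures relative (log-scale) error.
from sklearn.metrics import mean_squared_log_error
y_true = np.array([100_000, 200_000])
y_pred = np.array([110_000, 180_000])
print(np.sqrt(mean_squared_log_error(y_true, y_pred)))  # ~0.10

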
# 2. Basics - Inspect the categorical columns
df = pd.read_csv("../input/fe-course-data/ames.csv")
print(df.select_dtypes(["object"]).nunique())
print(df["SaleType"].value_counts())


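# Sketch (not in the original paste): target encoding pays off mainly on high-cardinality
# categoricals, so a quick heuristic is to list the columns with many levels. The threshold
# of 10 below is an arbitrary choice for illustration.
high_cardinality = [col for col in df.select_dtypes(["object"]).columns
                    if df[col].nunique() >= 10]
print("Candidate columns for target encoding:", high_cardinality)

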
# 3. Target encoding - Split the dataset to AVOID OVERFITTING
# 3a. Encoding split
X_encode = df.sample(frac=0.20, random_state=0)
y_encode = X_encode.pop("SalePrice")
# 3b. Training split
X_pretrain = df.drop(X_encode.index)
y_train = X_pretrain.pop("SalePrice")


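# Sanity check (illustrative, not from the original paste): the encoding split and the
# training split must not share rows, otherwise the target values used to fit the encoder
# would leak into the training data.
assert X_encode.index.intersection(X_pretrain.index).empty
print(len(X_encode), "rows for encoding,", len(X_pretrain), "rows for training")

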
# 4. MEstimateEncoder
# 4a. Create the encoder (m controls how strongly each encoding is smoothed toward the overall mean)
features = ["Neighborhood", "SaleType"]
encoder = MEstimateEncoder(cols=features, m=5)
# 4b. Fit the encoder on the encoding split
encoder.fit(X_encode, y_encode)
# 4c. Encode the training split
X_train = encoder.transform(X_pretrain, y_train)


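# Sketch of what MEstimateEncoder computes (a manual re-derivation for one column, following
# the usual m-estimate blending formula; variable names here are illustrative and results may
# differ slightly from the library in edge cases such as unseen categories):
#     encoding = (n * category_mean + m * overall_mean) / (n + m)
m_smooth = 5
overall_mean = y_encode.mean()
stats = y_encode.groupby(X_encode["Neighborhood"]).agg(["mean", "count"])
manual_encoding = (stats["count"] * stats["mean"] + m_smooth * overall_mean) / (stats["count"] + m_smooth)
print(manual_encoding.head())

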
# 5. See how the encoded features compare to the target
encoder_cols = encoder.cols
print(encoder_cols)
plt.figure(dpi=90)
ax = sns.distplot(y_train, kde=True, hist=False)
# Overlay the distribution of each target-encoded column on the SalePrice distribution
for colname in encoder_cols:
    ax = sns.distplot(X_train[colname], color='r', ax=ax, hist=True, kde=False, norm_hist=True)
ax.set_xlabel("SalePrice")


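# Alternative sketch: sns.distplot is deprecated in recent seaborn releases, so on a newer
# environment the same comparison can be drawn with kdeplot / histplot instead (assumes
# seaborn >= 0.11). This block is purely illustrative and mirrors the plot above.
plt.figure(dpi=90)
ax = sns.kdeplot(y_train, color='b')
for colname in encoder_cols:
    sns.histplot(X_train[colname], stat="density", color='r', alpha=0.4, ax=ax)
ax.set_xlabel("SalePrice")

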
# 6. Compare scores
X = df.copy()
y = X.pop("SalePrice")
score_base = score_dataset(X, y)
score_new = score_dataset(X_train, y_train)
# Baseline: every categorical column is simply label-encoded inside score_dataset
print(f"Baseline Score: {score_base:.4f} RMSLE")
# Encoded run: the categoricals are still label-encoded by score_dataset, except that
# "Neighborhood" and "SaleType" were target-encoded first (fit on the separate 20% split).
# Even though the model trains on fewer rows, this yields a better RMSLE.
print(f"Score with Encoding: {score_new:.4f} RMSLE")


# 7. A nonsense feature seemingly yields a better RMSLE, but only because of OVERFITTING
#    (here the encoder is fit and applied on the same data, with no separate encoding split)
# 7a. We can try m = 0, 1, 5, 50 (see the loop sketch after this section)
m = 0
X = df.copy()
y = X.pop('SalePrice')
# 7b. Create an UNINFORMATIVE feature
X["Count"] = range(len(X))
X.loc[1, "Count"] = 0  # actually need one duplicate value to circumvent error-checking in MEstimateEncoder
# 7c. Fit and transform on the same dataset
encoder = MEstimateEncoder(cols="Count", m=m)
X = encoder.fit_transform(X, y)
score = score_dataset(X, y)
print(f"Score: {score:.4f} RMSLE")

plt.figure(dpi=90)
ax = sns.distplot(y, kde=True, hist=False)
ax = sns.distplot(X["Count"], color='r', ax=ax, hist=True, kde=False, norm_hist=True)
ax.set_xlabel("SalePrice");
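
# Sketch (illustrative, not from the original paste): repeat the experiment above for the m
# values suggested in 7a. Larger m pulls each encoding toward the overall mean, so the fake
# "Count" feature leaks less of the target and the (overfit) score drifts back toward the
# baseline.
for m_try in [0, 1, 5, 50]:
    X = df.copy()
    y = X.pop("SalePrice")
    X["Count"] = range(len(X))
    X.loc[1, "Count"] = 0  # duplicate value, as above
    X = MEstimateEncoder(cols="Count", m=m_try).fit_transform(X, y)
    print(f"m = {m_try:>2}: {score_dataset(X, y):.4f} RMSLE")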