# 0a. Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

from category_encoders import MEstimateEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# 0b. Set Matplotlib defaults
# ("seaborn-whitegrid" was renamed "seaborn-v0_8-whitegrid" in Matplotlib 3.6+)
plt.style.use("seaborn-v0_8-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
warnings.filterwarnings("ignore")
# 1. AUXILIARY FUNCTIONS
def score_dataset(X, y, model=XGBRegressor()):
    # Work on a copy so the caller's DataFrame is not mutated
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for the Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score
# 2. Basics - Inspect the categorical columns
df = pd.read_csv("../input/fe-course-data/ames.csv")
print(df.select_dtypes(["object"]).nunique())
print(df["SaleType"].value_counts())
# 3. Target encoding - Split the dataset to AVOID OVERFITTING
# 3a. Encoding split
X_encode = df.sample(frac=0.20, random_state=0)
y_encode = X_encode.pop("SalePrice")
# 3b. Training split
X_pretrain = df.drop(X_encode.index)
y_train = X_pretrain.pop("SalePrice")
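# (Added check, a minimal sanity test) The two splits must be disjoint;
# otherwise the target would leak from the encoding rows into training.
assert X_encode.index.intersection(X_pretrain.index).empty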
# 4. MEstimateEncoder
# 4a. Create the encoder (MEstimateEncoder was already imported in section 0a)
features = ["Neighborhood", "SaleType"]
encoder = MEstimateEncoder(cols=features, m=5)
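# (Added sketch) What the m-estimate computes, in plain pandas. This is a
# hand-rolled illustration of the blending formula, not the library internals:
#     encoding = (n * category_mean + m * global_mean) / (n + m)
# where n is the category's row count; larger m shrinks rare categories
# toward the global mean.
def m_estimate_sketch(series, target, m=5.0):
    global_mean = target.mean()
    grp = target.groupby(series).agg(["mean", "count"])
    smoothing = (grp["count"] * grp["mean"] + m * global_mean) / (grp["count"] + m)
    return series.map(smoothing)
# e.g. m_estimate_sketch(X_encode["Neighborhood"], y_encode, m=5)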
# 4b. Fit the encoder on the encoding split
encoder.fit(X_encode, y_encode)
# 4c. Encode the training split
X_train = encoder.transform(X_pretrain, y_train)
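# (Added) Peek at the encoded columns: each category is now a number on the
# scale of SalePrice.
print(X_train[features].head())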
# 5. See how the encoded features compare to the target
# (seaborn's distplot is deprecated in recent versions; kdeplot/histplot
# are its replacements and produce the same comparison)
encoder_cols = encoder.cols
print(encoder_cols)
plt.figure(dpi=90)
ax = sns.kdeplot(y_train)
for col in encoder_cols:
    sns.histplot(X_train[col], stat="density", color="r", ax=ax)
ax.set_xlabel("SalePrice")
# 6. Compare scores
X = df.copy()
y = X.pop("SalePrice")
score_base = score_dataset(X, y)
score_new = score_dataset(X_train, y_train)
print(f"Baseline Score: {score_base:.4f} RMSLE")
# Baseline: every categorical column was label-encoded inside score_dataset
print(f"Score with Encoding: {score_new:.4f} RMSLE")
# Encoded run: score_dataset still label-encodes the remaining categoricals,
# but "SaleType" and "Neighborhood" were target-encoded first. Even though
# this run trains on only 80% of the rows, the encoding typically improves
# the RMSLE.
# 7. A nonsense feature allegedly yields a better RMSLE, but only because of
# OVERFITTING: here we fit and score on the SAME data, with no encoding split
# 7a. m controls the smoothing; a sweep over 0, 1, 5, 50 appears at the end
m = 0
X = df.copy()
y = X.pop("SalePrice")
# 7b. Create an UNINFORMATIVE feature (a unique ID per row)
X["Count"] = range(len(X))
X.loc[1, "Count"] = 0  # need one duplicate value to circumvent error-checking in MEstimateEncoder
# 7c. Fit and transform on the same dataset
encoder = MEstimateEncoder(cols=["Count"], m=m)
X = encoder.fit_transform(X, y)
score = score_dataset(X, y)
print(f"Score: {score:.4f} RMSLE")
plt.figure(dpi=90)
ax = sns.kdeplot(y)
sns.histplot(X["Count"], stat="density", color="r", ax=ax)
ax.set_xlabel("SalePrice")