Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # 0. Basics
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import seaborn as sns
- from sklearn.cluster import KMeans
- from sklearn.model_selection import cross_val_score
- from xgboost import XGBRegressor
# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer ship the old names, so use the new name when this
# installation provides it and fall back to the legacy name otherwise.
_whitegrid_style = "seaborn-v0_8-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-whitegrid"
plt.style.use(_whitegrid_style)

# Let figures lay themselves out automatically.
plt.rc("figure", autolayout=True)
# Bold, slightly larger axis labels and titles.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
# 1. AUXILIARY FUNCTION ---->
def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of ``model`` on ``X`` and ``y``.

    Args:
        X: DataFrame of features to train on. Columns of dtype "category" or
            "object" are label-encoded with ``.factorize()``; the encoding is
            done on an internal copy, so the caller's frame is not modified.
        y: Series acting as the target; passed to ``cross_val_score`` as is.
        model: Regressor to evaluate. When omitted, a new ``XGBRegressor()``
            is created for this call.

    Returns:
        The square root of the mean (over the 5 folds) mean squared log error,
        i.e. the RMSLE (Root Mean Squared Log Error).
    """
    # Build the default per call rather than once at definition time, so no
    # estimator instance is shared between calls.
    if model is None:
        model = XGBRegressor()

    # Work on a copy: the encoding below overwrites columns.
    X = X.copy()

    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()

    # The scorer returns the *negated* MSLE per fold, hence the sign flip.
    neg_msle = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    return np.sqrt(-1 * neg_msle.mean())
# 2. Prepare data - Evaluate performance
# Read the dataset, then split it into the target column and everything else.
df = pd.read_csv("../input/fe-course-data/ames.csv")
y = df["SalePrice"].copy()
X = df.drop(columns=["SalePrice"])

# Score the untouched feature set as a baseline for the later comparisons.
baseline_score = score_dataset(X, y)
print(baseline_score)
# 3. k-means over 5 features ----> cluster labels stored in a new "Cluster" column
# 3a. The 5 features that get clustered
features = ["LotArea", "TotalBsmtSF", "FirstFlrSF", "SecondFlrSF", "GrLivArea"]

# 3b. Standardize them: subtract each column's mean and divide by its std
selected = X.loc[:, features]
column_means = selected.mean(axis=0)
column_stds = selected.std(axis=0)
X_scaled = (selected - column_means) / column_stds

# 3c. Fit k-means on the standardized columns and keep each row's cluster label
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(X_scaled)
X["Cluster"] = cluster_labels
X["Cluster"] = X["Cluster"].astype("category")  # labels are categories, not quantities

# Preview the clustered columns next to their label, then re-score with the new feature
preview_cols = [*features, "Cluster"]
print(X[preview_cols].head(8))
print(score_dataset(X, y))
# 4. Relplots: one panel per clustered feature, plotted against the target
# Build the plotting frame from a copy of X ("Cluster" is present, "SalePrice" is not).
X2 = X.copy()
X2["Cluster"] = X2["Cluster"].astype("category")
X2["SalePrice"] = y  # put the target back alongside the features

# Long format: one row per (sample, feature) pair, in "variable" / "value" columns.
long_form = X2.melt(
    id_vars=["SalePrice", "Cluster"],
    value_vars=features,
)

# Assigning the result also keeps a notebook from echoing the returned object.
cluster_grid = sns.relplot(
    data=long_form,
    x="value",
    y="SalePrice",
    hue="Cluster",
    col="variable",
    col_wrap=3,
    height=4,
    aspect=1,
    facet_kws={'sharex': False},
)
# 5. Cluster-distance features: k-means offers an alternative way of creating
# features. Instead of labelling each row with its nearest cluster centroid, it
# can measure the distance from each row to every centroid and return those
# distances as features. That is what "fit_transform" does (as opposed to
# "fit_predict", which returns the labels).
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
X_cd = kmeans.fit_transform(X_scaled)  # array: one row per sample, one column per centroid
centroid_cols = [f"Centroid_{i}" for i in range(X_cd.shape[1])]

# Give the distance frame X's own index: join() aligns on index, so a default
# 0..n-1 index would only line up when X happens to carry that same index.
X_cd = pd.DataFrame(X_cd, columns=centroid_cols, index=X.index)
X = X.join(X_cd)

print(X[features + centroid_cols].head(8))
print(score_dataset(X, y))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement