3. Kmeans clustering ("Cluster" feature-column)

# 0. Basics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# 1. AUXILIARY FUNCTION ---->
# Input1 = any dataframe "X" to be trained on (it may contain categorical columns; they will be encoded with .factorize())
# Input2 = any Series (single column) that acts as the target
# Input3 = the regressor model (default model = XGBRegressor)

def score_dataset(X, y, model=XGBRegressor()):
    # Work on a copy so the caller's dataframe is not modified in place
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for the Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score
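
# (Illustrative sketch, not part of the original paste.) A quick look at what .factorize()
# does on a hypothetical toy Series: each distinct value is mapped to an integer code, in
# order of first appearance, and the index of unique values is returned alongside the codes.
toy = pd.Series(["Pave", "Grvl", "Pave", "Grvl", "Pave"])
codes, uniques = toy.factorize()
print(codes)      # [0 1 0 1 0]
print(uniques)    # Index(['Pave', 'Grvl'], dtype='object')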


# 2. Prepare data - Evaluate performance
df = pd.read_csv("../input/fe-course-data/ames.csv")
X = df.copy()
y = X.pop("SalePrice")
print(score_dataset(X, y))
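
# (Illustrative sketch, not part of the original paste.) The "model" argument accepts any
# scikit-learn-compatible regressor, so other models can be benchmarked the same way;
# the hyperparameters below are arbitrary examples.
from sklearn.ensemble import RandomForestRegressor
print(score_dataset(X, y, model=RandomForestRegressor(n_estimators=100, random_state=0)))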


# 3. 5D k-means algorithm ----> Create a feature of cluster labels ----> New "Cluster" column

# 3a. Select the 5 features to be clustered
features = ["LotArea", "TotalBsmtSF", "FirstFlrSF", "SecondFlrSF", "GrLivArea"]
# 3b. Standardize them
X_scaled = X[features]                                  # Temporary dataframe containing only these 5 feature columns
X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)    # Column-wise mean and std standardize each feature
# 3c. Create and apply the k-means algorithm
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
X["Cluster"] = kmeans.fit_predict(X_scaled)                             # Array of cluster labels, stored as a new "Cluster" column
X["Cluster"] = X["Cluster"].astype("category")
print(X[features + ["Cluster"]].head(8))
print(score_dataset(X, y))
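
# (Illustrative sketch, not part of the original paste.) Inspect how many houses fall into
# each cluster and where the fitted centroids sit in standardized coordinates.
print(X["Cluster"].value_counts().sort_index())
print(pd.DataFrame(kmeans.cluster_centers_, columns=features).round(2))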


# 4. Create relplots to better see the relationship between the 5 features and the target
X2 = X.copy()                                   # X2 is a copy of X, so it contains "Cluster" but not "SalePrice"
X2["Cluster"] = X2.Cluster.astype("category")
X2["SalePrice"] = y                             # Merge the target column back in
sns.relplot(
    x="value", y="SalePrice", hue="Cluster", col="variable",
    height=4, aspect=1, facet_kws={'sharex': False}, col_wrap=3,
    data=X2.melt(
        value_vars=features, id_vars=["SalePrice", "Cluster"],
    ),
);
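
# (Illustrative sketch, not part of the original paste.) The target distribution within each
# cluster can also be compared directly, e.g. with one boxen plot per cluster.
sns.catplot(x="Cluster", y="SalePrice", data=X2, kind="boxen", height=6);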


# 5. Cluster-distance features: the k-means algorithm offers an alternative way of creating features. Instead of labelling
# each point with the nearest cluster centroid, it can measure the distance from a point to all the centroids and return
# those distances as features. We use the "fit_transform" method to do so, not "fit_predict".

kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
X_cd = kmeans.fit_transform(X_scaled)                             # Array of shape (n_samples, 10): distances to the 10 centroids
centroid_cols = [f"Centroid_{i}" for i in range(X_cd.shape[1])]
X_cd = pd.DataFrame(X_cd, columns=centroid_cols)
X = X.join(X_cd)
print(X[features + centroid_cols].head(8))
print(score_dataset(X, y))
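
# (Illustrative sketch, not part of the original paste.) Because this model was fitted with the
# same parameters, data, and random_state as the one in step 3, the nearest centroid in each row
# of distances should agree with the "Cluster" label assigned by fit_predict.
nearest = X[centroid_cols].to_numpy().argmin(axis=1)
print((nearest == X["Cluster"].astype(int).to_numpy()).all())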