Advertisement
makispaiktis

Encoding of categorical columns

Sep 22nd, 2024 (edited)
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.98 KB | None | 0 0
  1. # 0. Basics
  2. import pandas as pd
  3.  
  4. companies = ["AlfaRomeo", "Audi", "BMW", "Audi", "AlfaRomeo", "Mercedes", "Mercedes", "Audi", "BMW", "BMW"]
  5. prices = [30000, 45000, 55000, 48000, 32000, 60000, 64000, 51000, 56000, 57000]
  6. years = [2020, 2021, 2018, 2017, 2020, 2021, 2016, 2019, 2015, 2017]
  7. regions = ["Milan", "Warsaw", "Munich", "Frankfurt", "Athens", "Berlin", "Munich", "Frankfurt", "Munich", "Berlin"]
  8. dic = {"Company":companies, "Region":regions, "Year":years, "Price":prices}
  9. df = pd.DataFrame(dic)
  10. print(f"\n\n******** DataFrame Size = {df.shape} ********\n\n", df)
  11. print(df.select_dtypes(["object"]))
  12. print(df.select_dtypes(["object"]).nunique())
  13.  
  14.  
  15.  
  16. # 1. One-hot encoding - Creates N extra cols categorical per feature (N=unique values of this feature)
  17. X1 = df.copy()
  18. categorical_cols = [colname for colname in X1.columns if X1[colname].dtype in ["object", "category"]]
  19. print(f"Categorical cols = {categorical_cols}")
  20. for colname in categorical_cols:
  21.     n_unique = X1[colname].nunique()
  22.     unique_values = list(X1[colname].unique())
  23.     print(f"----> There are {n_unique} unique values in column named {colname}:\n{unique_values}")
  24.  
  25.  
  26. for colname in categorical_cols:
  27.     dummies_temp_df = pd.get_dummies(X1[colname], prefix=colname)
  28.     X1 = pd.concat([X1, dummies_temp_df], axis=1)
  29.  
  30. X1 = X1.drop(categorical_cols, axis=1)
  31. print(f"\n\n******** DataFrame Size = {X1.shape} ********\n\n", X1)
  32.  
  33.  
  34.  
  35. # 2. Label Encoding - Does not create any extra columns
  36. X2 = df.copy()
  37. for colname in X2.select_dtypes("object"):
  38.     X2[colname], _ = X2[colname].factorize()
  39. discrete_features = X2.dtypes == int
  40. print(f"******** DataFrame Size = {X2.shape} ********\n\n", X2, '\n\n', discrete_features)
  41.  
  42.  
  43.  
  44. # 3. Target Encoding - Creates 1 extra column per categorical feature
  45. # I will "connect" each company with its mean price
  46. X3 = df.copy()
  47. X3["MeanPrice (Label_for_Company)"] = X2.groupby("Company")["Price"].transform("mean")
  48. print(f"******** DataFrame Size = {X3.shape} ********\n\n", X3)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement