Advertisement
makispaiktis

2. Creating Features (groupby, split, count booleans)

Sep 17th, 2024 (edited)
54
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.37 KB | None | 0 0
  1. # 0a. Import
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn.model_selection import cross_val_score
  5. from xgboost import XGBRegressor
  6.  
  7.  
  8.  
  9. # 1. AUXILIARY FUNCTIONS
  def score_dataset(X, y, model=XGBRegressor()):
      """Return the 5-fold cross-validated RMSLE of `model` on (X, y).

      Parameters
      ----------
      X : pandas.DataFrame
          Feature matrix. Columns of dtype "category" or "object" are
          label-encoded before scoring. The encoding is applied to a private
          copy, so the caller's DataFrame is left unmodified.
      y : array-like
          Target values, passed straight to `cross_val_score`.
      model : estimator, optional
          Estimator handed to `cross_val_score`. The default instance is
          created once, when the function is defined.

      Returns
      -------
      float
          Square root of the mean squared log error averaged over the folds.
      """
      # Encode on a copy: assigning factorized codes into the caller's frame
      # would silently replace its categorical columns with integers.
      X = X.copy()
      # 1a. Label encoding for categoricals
      for colname in X.select_dtypes(["category", "object"]):
          X[colname], _ = X[colname].factorize()
      # 1b. Metric for Housing competition is RMSLE (Root Mean Squared Log Error).
      # The scorer returns the *negated* MSLE, hence the sign flip before sqrt.
      score = cross_val_score(
          model, X, y, cv=5, scoring="neg_mean_squared_log_error",
      )
      return np.sqrt(-1 * score.mean())
  21.  
  22.  
  # 2. Load the raw table, then separate the target column from the features
  df = pd.read_csv("../input/fe-course-data/ames.csv")
  y = df["SalePrice"].copy()            # target
  X = df.drop(columns="SalePrice")      # every other column, as an independent frame
  27.  
  28.  
  # 3. Derive three numeric features from existing columns:
  #    - LivLotRatio:    GrLivArea divided by LotArea
  #    - Spaciousness:   first + second floor area divided by TotRmsAbvGrd
  #    - TotalOutsideSF: sum of the five deck / porch area columns
  X_1 = pd.DataFrame(
      {
          "LivLotRatio": X["GrLivArea"] / X["LotArea"],
          "Spaciousness": (X["FirstFlrSF"] + X["SecondFlrSF"]) / X["TotRmsAbvGrd"],
          "TotalOutsideSF": (
              X["WoodDeckSF"]
              + X["OpenPorchSF"]
              + X["EnclosedPorch"]
              + X["Threeseasonporch"]
              + X["ScreenPorch"]
          ),
      }
  )
  34.  
  35.  
  36.  
  # 4. One-hot encoding combined with a numeric feature
  # 4a. Expand "BldgType" into indicator columns; `prefix="Bldg"` makes every
  #     new column name start with "Bldg" followed by the category value.
  X_2 = pd.get_dummies(df["BldgType"], prefix="Bldg")
  print(X_2.head(5), '\n\n')
  # 4b. X_2 is a DataFrame of indicators. `mul(..., axis=0)` matches the
  #     GrLivArea Series against the row index, so each row's indicators are
  #     multiplied by that same row's GrLivArea.
  living_area = df["GrLivArea"]
  X_2 = X_2.mul(living_area, axis=0)
  print(X_2.head(5))
  44.  
  45.  
  46.  
  # 5. For every row, count how many of the outdoor-area columns are > 0.0
  porch_cols = [
      "WoodDeckSF",
      "OpenPorchSF",
      "EnclosedPorch",
      "Threeseasonporch",
      "ScreenPorch",
  ]
  porch_areas = X[porch_cols]
  # `gt(0.0)` yields a boolean frame; summing across columns counts the Trues.
  X_3 = pd.DataFrame({"PorchTypes": porch_areas.gt(0.0).sum(axis=1)})
  print(porch_areas.head(5))
  print(X_3.head(5))
  52.  
  53.  
  54.  
  # 6. Break a categorical feature apart: keep only the text that precedes
  #    the first "_" in "MSSubClass"
  print(df["MSSubClass"].unique())
  # n=1 splits at most once; expand=True spreads the pieces over columns 0, 1
  ms_pieces = X["MSSubClass"].str.split("_", n=1, expand=True)
  X_4 = pd.DataFrame({"MSClass": ms_pieces[0]})
  59.  
  60.  
  61.  
  # 7. Grouped transform: give every row the median GrLivArea of its
  #    Neighborhood group (the result keeps X's row index)
  nhbd_median_area = X.groupby("Neighborhood")["GrLivArea"].transform("median")
  X_5 = nhbd_median_area.to_frame("MedNhbdArea")
  65.  
  66.  
  67.  
  # 8. Attach all engineered frames to the original features, then print the
  #    score of the baseline features followed by the score of the augmented set
  engineered_frames = [X_1, X_2, X_3, X_4, X_5]
  X_new = X.join(engineered_frames)
  for feature_set in (X, X_new):
      print(score_dataset(feature_set, y))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement