Advertisement
makispaiktis

Kaggle - Intermediate ML - Categorical Variables (Ordinal, OneHot encoding)

Jun 23rd, 2023
808
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.01 KB | None | 0 0
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split
  3. from sklearn.ensemble import RandomForestRegressor
  4. from sklearn.metrics import mean_absolute_error
  5. from sklearn.preprocessing import OrdinalEncoder
  6. from sklearn.preprocessing import OneHotEncoder
  7.  
  8.  
  9. # ********************************************************************************************************
  10. # ********************************************************************************************************
  11. # 0. Function - Compare scores between different models (train, validation datasets)
  12. # ********************************************************************************************************
  13. # ********************************************************************************************************
  14.  
  def score_dataset(X_train, X_valid, y_train, y_valid):
      """Return the validation mean absolute error of a random forest.

      A new ``RandomForestRegressor`` (100 trees, ``random_state=0``) is
      fitted on ``X_train`` / ``y_train``; its predictions for ``X_valid``
      are then compared against ``y_valid``.

      Args:
          X_train: Training features.
          X_valid: Validation features.
          y_train: Training targets.
          y_valid: Validation targets.

      Returns:
          The mean absolute error of the predictions on the validation data.
      """
      # The fixed seed keeps scores comparable between preprocessing approaches.
      forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
      return mean_absolute_error(y_valid, forest.predict(X_valid))
  20.  
  21.  
  22.  
  23.  
  24.  
  25. # ********************************************************************************************************
  26. # ********************************************************************************************************
  27. # 1. Basics - Read dataset and split it
  28. # ********************************************************************************************************
  29. # ********************************************************************************************************
  30.  
  # 1a. Load the training and test tables, indexed by their 'Id' column
  X = pd.read_csv('../input/train.csv', index_col='Id')
  X_test = pd.read_csv('../input/test.csv', index_col='Id')

  # 1b. Keep only the rows that actually have a target value
  X = X.dropna(axis=0, subset=['SalePrice'])

  # 1c. Split the target off from the predictors
  y = X['SalePrice']
  X = X.drop(columns=['SalePrice'])

  # 1d. Remove every predictor column that has a missing value in the
  #     training table; the same columns are removed from the test table
  cols_with_missing = [name for name, has_gap in X.isnull().any().items() if has_gap]
  X = X.drop(columns=cols_with_missing)
  X_test = X_test.drop(columns=cols_with_missing)

  # 1e. Hold out 20% of the rows for validation (fixed seed for repeatability)
  X_train, X_valid, y_train, y_valid = train_test_split(
      X,
      y,
      train_size=0.8,
      test_size=0.2,
      random_state=0,
  )

  # 1f. Preview the dataset. The two bare expressions below only display
  #     something in an interactive front-end (e.g. as the last line of a
  #     notebook cell); when run as a plain script their results are discarded.
  print(X_train.shape)
  X_train.head()
  X_train.describe()
  54.  
  55.  
  56.  
  57.  
  58.  
  59. # ********************************************************************************************************
  60. # ********************************************************************************************************
  61. # 2a. Drop columns with categorical data
  62. # ********************************************************************************************************
  63. # ********************************************************************************************************
  64.  
  # Approach 1: discard every object-dtype (categorical) column from both the
  # training and the validation features. Columns with missing values were
  # already removed in step 1d, so nothing else has to be dropped first.
  drop_X_train, drop_X_valid = (
      frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
  )

  print("MAE from Approach 1 (Drop categorical variables):")
  print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))
  70.  
  71.  
  72.  
  73.  
  74.  
  75. # ********************************************************************************************************
  76. # ********************************************************************************************************
  77. # 2b. Ordinal Encoding
  78. # ********************************************************************************************************
  79. # ********************************************************************************************************
  80.  
  # 2b1. Look at one categorical column in both splits to show that the
  #      validation data can contain categories the training data never saw
  print("Unique values in 'Condition2' column in training data:", X_train['Condition2'].unique())
  print("\nUnique values in 'Condition2' column in validation data:", X_valid['Condition2'].unique())
  print(
      "\nFitting an ordinal encoder to a column in the training data creates a "
      "corresponding integer-valued label for each unique value that appears in "
      "the training data. In the case that the validation data contains values "
      "that don't also appear in the training data, the encoder will throw an "
      "error, because these values won't have an integer assigned to them. "
      "Notice that the 'Condition2' column in the validation data contains the "
      "values 'RRAn' and 'RRNn', but these don't appear in the training data -- "
      "thus, if we try to use an ordinal encoder with scikit-learn, the code "
      "will throw an error.\n\n"
  )

  # 2b2. Categorical (object-dtype) columns in the training data
  object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

  # 2b3. Columns that can be safely ordinal encoded: every value found in the
  #      validation split also occurs in the training split
  good_label_cols = [col for col in object_cols if set(X_valid[col]).issubset(set(X_train[col]))]

  # 2b4. Problematic columns that will be dropped from the dataset.
  #      Derived in `object_cols` order (instead of via a set difference) so the
  #      list, and therefore the printed output, is the same on every run.
  good_label_set = set(good_label_cols)
  bad_label_cols = [col for col in object_cols if col not in good_label_set]

  print('Categorical columns (ALL OF THEM):', object_cols, '\n')
  print('Categorical columns that will be ordinal encoded:', good_label_cols, '\n')
  print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols, '\n')

  # 2b5. Drop the categorical columns that will not be encoded
  #      (`drop` returns new frames, so X_train / X_valid stay untouched)
  label_X_train = X_train.drop(bad_label_cols, axis=1)
  label_X_valid = X_valid.drop(bad_label_cols, axis=1)

  # 2b6. Fit the ordinal encoder on the training split only, then apply the
  #      same category-to-integer mapping to the validation split
  ordinal_encoder = OrdinalEncoder()
  label_X_train[good_label_cols] = ordinal_encoder.fit_transform(label_X_train[good_label_cols])
  label_X_valid[good_label_cols] = ordinal_encoder.transform(label_X_valid[good_label_cols])
  print("MAE from Approach 2 (Ordinal Encoding):")
  print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))
  110.  
  111.  
  112.  
  113.  
  114.  
  115.  
  116. # ********************************************************************************************************
  117. # ********************************************************************************************************
  118. # 2c. One Hot Encoding
  119. # ********************************************************************************************************
  120. # ********************************************************************************************************
  121.  
  # 2c0. Number of unique entries in each categorical column of the training data
  d = {col: X_train[col].nunique() for col in object_cols}
  object_nunique = list(d.values())

  # 2c1. Print number of unique entries by column, in ascending order
  #      (the sorted result used to be computed and thrown away)
  print(sorted(d.items(), key=lambda x: x[1]))

  # NOTE(review): the four constants below are not read anywhere else in this
  # script; they look like hand-entered answers to the course's cardinality
  # questions -- confirm before removing.
  high_cardinality_numcols = 3
  num_cols_neighborhood = 25
  OH_entries_added = 10000 * (100 - 1)
  label_entries_added = 0

  # 2c2. Low and high cardinality columns (threshold: fewer than 10 unique
  #      values). Both lists follow `object_cols` order so the printed output
  #      is identical from run to run.
  low_cardinality_cols = [col for col in object_cols if d[col] < 10]
  high_cardinality_cols = [col for col in object_cols if d[col] >= 10]
  print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
  print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

  # 2c3. One-hot encode the low cardinality columns. The dense-output keyword
  #      is `sparse_output` in scikit-learn >= 1.2 and `sparse` in older
  #      releases (the old name was later removed), so try the new spelling
  #      first and fall back to the old one.
  try:
      oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
  except TypeError:
      oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

  # 2c4. The encoder returns plain arrays, so the original row index is
  #      restored here to keep the rows aligned for the concatenation below
  OH_cols_train = pd.DataFrame(
      oh_encoder.fit_transform(X_train[low_cardinality_cols]),
      index=X_train.index,
  )
  OH_cols_valid = pd.DataFrame(
      oh_encoder.transform(X_valid[low_cardinality_cols]),
      index=X_valid.index,
  )

  # 2c5. Numerical columns only: every categorical column is removed (the low
  #      cardinality ones are replaced by their one-hot columns, the high
  #      cardinality ones are dropped altogether)
  number_X_train = X_train.drop(object_cols, axis=1)
  number_X_valid = X_valid.drop(object_cols, axis=1)

  # 2c6. Concatenate the one-hot columns with the numerical columns
  OH_X_train = pd.concat([OH_cols_train, number_X_train], axis=1)
  OH_X_valid = pd.concat([OH_cols_valid, number_X_valid], axis=1)

  # 2c7. The one-hot columns are labelled 0, 1, 2, ... while the numerical
  #      columns keep their string names; newer scikit-learn versions refuse
  #      frames whose column names have mixed types, so make them all strings
  OH_X_train.columns = [str(name) for name in OH_X_train.columns]
  OH_X_valid.columns = [str(name) for name in OH_X_valid.columns]

  print("MAE from Approach 3 (One-Hot Encoding):")
  print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement