makispaiktis

3. Houses' SalePrice Regressor

Feb 1st, 2024 (edited)
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# 1a. Read the dataset
houses_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
houses = pd.read_csv(houses_path)
print(houses.head(), '\n\n')

# 1b. Explore the dataset
shape = houses.shape
print(f"---- Dataset contains {shape[0]} rows-records with {shape[1]} columns-features ----")
all_columns = list(houses.columns)
print("Column names:", all_columns)
print('\n\n')

# 1c. Separate the target ('SalePrice') from the other features
target = 'SalePrice'
houses.dropna(axis=0, subset=[target], inplace=True)
y = houses[target]
houses.drop([target], axis=1, inplace=True)
shape = houses.shape
print(f"Now there are {len(houses.columns)} columns in the dataset (target = 'SalePrice' was removed)")



# 2a. Check for columns with missing data
# print("...Check for any columns with missing values....")
cols_with_missing = [col for col in houses.columns if houses[col].isnull().any()]
print(f"There are {len(cols_with_missing)}/{shape[1]} columns WITH MISSING VALUES:\n {cols_with_missing} \n\n")
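
# A per-column view of how much data is missing can be useful before deciding
# on imputation. A minimal sketch using only pandas (not part of the original run):
missing_counts = houses.isnull().sum()
print(missing_counts[missing_counts > 0].sort_values(ascending=False), '\n\n')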

# 2b. Find all the numerical cols AND categorical cols with low cardinality
numerical_cols = [cname for cname in houses.columns if houses[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in houses.columns if houses[cname].dtype == "object" and houses[cname].nunique() < 5]
my_cols = numerical_cols + categorical_cols

print(f"There are {len(numerical_cols)}/{shape[1]} NUMERICAL columns:")
print("Numerical cols =", numerical_cols, '\n\n')
print(f"There are {len(categorical_cols)}/{shape[1]} CATEGORICAL columns:")
print("Categorical cols =", categorical_cols, '\n\n')
print(f"I will SELECT {len(my_cols)}/{shape[1]} columns:")
print("My columns =", my_cols, '\n\n')
# print(f"Selected columns = {len(my_cols)} = {len(numerical_cols)} numerical + {len(categorical_cols)} categorical \n\n")
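
# The nunique() < 5 cutoff above keeps the one-hot encoding compact. A quick
# sketch (not part of the original run) to inspect the cardinality of every
# categorical column and see what the cutoff excludes:
cardinalities = {c: houses[c].nunique() for c in houses.columns if houses[c].dtype == "object"}
print(sorted(cardinalities.items(), key=lambda kv: kv[1]), '\n\n')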

# 2c. Find out if there are cols with missing data in my selected columns (my_cols)
selected_with_missing = [col for col in my_cols if col in cols_with_missing]
numerical_with_missing = [col for col in numerical_cols if col in cols_with_missing]
categorical_with_missing = [col for col in categorical_cols if col in cols_with_missing]

print("There are {}/{} SELECTED columns WITH MISSING DATA: {}".format(len(selected_with_missing), len(my_cols), selected_with_missing))
print(f"= {len(numerical_with_missing)} numerical with missing data ---> {numerical_with_missing}")
print(f"+ {len(categorical_with_missing)} categorical with missing data ---> {categorical_with_missing}")
print("...Need imputation...\n\n\n")

print("******** SUMMARY ********")
print(f"Selected columns = {len(my_cols)}/{shape[1]}")
print(f"Numerical columns = {len(numerical_cols)}/{len(my_cols)}")
print(f"Categorical columns = {len(categorical_cols)}/{len(my_cols)}")
print(f"Selected cols with missing data = {len(selected_with_missing)}/{len(my_cols)}")
print(f"Numerical with missing data = {len(numerical_with_missing)}/{len(selected_with_missing)}")
print(f"Categorical with missing data = {len(categorical_with_missing)}/{len(selected_with_missing)}")



# 3a. Work with the selected columns of the dataset (my_cols)
X = houses[my_cols].copy()
y = y.copy()

# Optionally work on a subset of the rows to keep the runs fast
SELECTED = 1000
if SELECTED < shape[0]:
    X = X.head(SELECTED)
    y = y.head(SELECTED)

# 3b. Train-test split
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
print("X_train = {}, X_valid = {}\ny_train = {}, y_valid = {}".format(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape))



# 4a. Imputer + OH encoder, since there are missing values and categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# For numeric columns, strategy='constant' fills missing values with 0 (the default fill_value)
numerical_transformer = SimpleImputer(strategy='constant')

# For categorical columns, impute with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])
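
# A quick sanity check of the preprocessor on its own (a sketch, not part of
# the original run): the one-hot step expands each low-cardinality categorical
# column into one column per observed level, so the width grows.
X_pre = preprocessor.fit_transform(X_train)
print("Shape before preprocessing:", X_train.shape)
print("Shape after preprocessing:", X_pre.shape, '\n\n')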



# 5. Model = Regressor (regr) ---> Pipeline

from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(n_estimators=100, random_state=0)
# Note: a scaler cannot come after the model inside a Pipeline (every step
# except the last must be a transformer), and tree ensembles do not need
# feature scaling anyway, so the pipeline is just preprocessor + model.
pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
print(pipeline.steps)



# 6. Evaluate the RandomForestRegressor

from sklearn.metrics import mean_absolute_error

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
validation_shape = X_valid.shape
avg_y_valid = y_valid.mean()
avg_error = mae / avg_y_valid

print(f"Validation dataset shape = {validation_shape[0]} rows x {validation_shape[1]} columns")
print(f"MAE with RFR = {mae}\n")
print(f"Average SalePrice in validation dataset = {avg_y_valid}, so avg_error = {100 * avg_error}%")
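
# Optionally also report RMSE (a sketch, not part of the original run): unlike
# MAE, RMSE penalizes large individual errors more heavily.
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_valid, preds))
print(f"RMSE with RFR = {rmse}\n")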



# 7. Evaluate with cross-validation

from sklearn.model_selection import cross_val_score

scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
score = scores.mean()

print("Average MAE with RFR =", score, '\n\n')
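
# cross_val_score only returns the scores; cross_validate (a sketch, same
# pipeline assumed, not part of the original run) also reports per-fold
# fit and score times:
from sklearn.model_selection import cross_validate
cv_results = cross_validate(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
print("Per-fold MAE =", -cv_results['test_score'])
print("Mean fit time (s) =", cv_results['fit_time'].mean(), '\n\n')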



# 8. Some more regressors

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

"""
# 8a. Polynomial Regressor (disabled: PolynomialFeatures would need the
# preprocessed, fully numeric matrix, not the raw X_train with categorical
# columns and missing values)
# Create polynomial features (degree=3)
poly_features = PolynomialFeatures(degree=3)
X_poly = poly_features.fit_transform(X_train)
# Create and fit the polynomial regression model
regr1 = LinearRegression()
regr1.fit(X_poly, y_train)
"""
# 8a. Support Vector Machine regressor
regr1 = SVR(kernel='rbf', C=1e3, gamma=0.1, epsilon=0.01)
# 8b. Support Vector Machine regressor (different gamma/epsilon)
regr2 = SVR(kernel='rbf', C=1e3, gamma='auto', epsilon=0.1)
# 8c. Decision Tree Regressor
regr3 = DecisionTreeRegressor()
# 8d. Random Forest Regressor
regr4 = RandomForestRegressor()
# 8e. Logistic regression (disabled: LogisticRegression is a classifier, not a regressor)
# regr5 = LogisticRegression(random_state=16)

regrs = [regr1, regr2, regr3, regr4]
scores_list = []

for regr in regrs:
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
    scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    score = scores.mean()
    scores_list.append(score)

    print('-------------------------------------------')
    print(str(regr))
    print("Score =", score)
    print('-------------------------------------------', '\n\n')



# 9. Summary of regressors - pipelines that I tried
best_regr = None
best_score = float('inf')
print("After cross-validation with 5 folds:")

for regr, score in zip(regrs, scores_list):
    print("{} ---> {}".format(regr, score))
    if score < best_score:
        best_score = score
        best_regr = regr

print("\n\n", "Best regressor = {} with average MAE = {}".format(best_regr, best_score))



# 10. Select the best regressor and tune it
# Since RFR is the best one, I can change the default values to find a better model

n_estimators_list = list(range(50, 301, 50))
random_state_list = [0, 1]
best_score = float('inf')
best_rs = -1
best_ne = -1

for random_state in random_state_list:
    for n_estimators in n_estimators_list:
        regr = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
        scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
        score = scores.mean()

        print('-------------------------------------------------------------')
        print(str(regr))
        print("Score =", score)
        print('-------------------------------------------------------------', '\n\n')

        if score < best_score:
            best_score = score
            best_rs = random_state
            best_ne = n_estimators
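
# The same sweep could be written with scikit-learn's GridSearchCV (a sketch,
# not part of the original run; the 'model__' prefix targets the pipeline
# step named 'model' above). Left unfitted here to avoid repeating the run:
from sklearn.model_selection import GridSearchCV
param_grid = {'model__n_estimators': n_estimators_list,
              'model__random_state': random_state_list}
grid = GridSearchCV(Pipeline([('preprocessor', preprocessor),
                              ('model', RandomForestRegressor())]),
                    param_grid, cv=5, scoring='neg_mean_absolute_error')
# grid.fit(X, y)
# print(grid.best_params_, -grid.best_score_)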



# 11. One last evaluation with the best hyperparameters found above
best_regr = RandomForestRegressor(n_estimators=best_ne, random_state=best_rs)
pipeline = Pipeline([('preprocessor', preprocessor), ('model', best_regr)])
best_scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
best_score = best_scores.mean()

print('-------------------------------------------------------------')
print("Best regressor = {} with:".format(str(best_regr)))
print("Best score MAE =", best_score)
print('-------------------------------------------------------------', '\n\n')
print("best_ne =", best_ne, ", best_rs =", best_rs)



# 12. Check the predicted values against the real ones
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)
comparison = pd.DataFrame({'Real values': y_valid,
                           'Predictions': preds,
                           'Absolute Error': abs(y_valid - preds),
                           'Error (%)': 100 * (abs(y_valid - preds) / y_valid)})
print(comparison)
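
# A compact summary of the percentage errors (a sketch, not part of the
# original run), to complement the row-by-row table above:
print(comparison['Error (%)'].describe())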