makispaiktis

2. Cars regressor

Jul 19th, 2023 (edited)
import numpy as np
import pandas as pd



# 1a. Read the dataset
cars_path = './autos.csv'
cars = pd.read_csv(cars_path)
print(cars.head(), '\n\n')

# 1b. Explore the dataset
shape = cars.shape
print("---- Dataset contains {} rows-records with {} columns-features ----".format(shape[0], shape[1]))
all_columns = list(cars.columns)
print("Column names:", all_columns)
print('\n\n')
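# (Added sketch, not in the original paste) A quick look at the dtypes and at
# the columns with the most missing values helps motivate the column
# selection done in step 2 below.
print(cars.dtypes.value_counts(), '\n')
print(cars.isnull().sum().sort_values(ascending=False).head(10), '\n\n')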

# 1c. Separate the target ('price') from the other features
target = 'price'
cars.dropna(axis=0, subset=[target], inplace=True)
y = cars[target]
cars.drop([target], axis=1, inplace=True)
shape = cars.shape
print("Now, there are", len(list(cars.columns)), "columns left in the dataset (the target 'price' was removed)")



# 2a. Check for columns with missing data
print("...Checking for any columns with missing values...")
cols_with_missing = [col for col in cars.columns if cars[col].isnull().any()]
print("There are {}/{} columns with missing values:".format(len(cols_with_missing), shape[1]), cols_with_missing, '\n\n')

# 2b. Find all the numerical cols AND the categorical cols with low cardinality
numerical_cols = [cname for cname in cars.columns if cars[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in cars.columns if cars[cname].dtype == "object" and cars[cname].nunique() < 10]
my_cols = numerical_cols + categorical_cols

print("  Numerical cols =", numerical_cols)
print("Categorical cols =", categorical_cols)
print(" All the columns =", my_cols)
print("Selected columns = {} = {} numerical + {} categorical".format(len(my_cols), len(numerical_cols), len(categorical_cols)), '\n\n')

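# (Added sketch, not in the original paste) The cardinality threshold of 10 is
# a judgment call; printing the unique-value counts of the object columns
# shows which ones the filter drops before one-hot encoding.
cardinalities = {col: cars[col].nunique() for col in cars.columns if cars[col].dtype == "object"}
print("Cardinality per categorical column:", cardinalities, '\n\n')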
# 2c. Find out if there are cols with missing data in my selected columns (my_cols)
selected_with_missing = [col for col in my_cols if col in cols_with_missing]
print("There are {}/{} selected columns with missing data: {}".format(len(selected_with_missing), len(my_cols), selected_with_missing))
print("...Need imputation...")

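# (Added sketch, not in the original paste) Per-column missing counts for the
# selected features, to see how much work the imputers will have to do.
print(cars[selected_with_missing].isnull().sum(), '\n\n')
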
# 3a. Work with the selected columns of the dataset (my_cols)
X = cars[my_cols].copy()

# Keep only the first 1000 rows to speed up the experiments below
SELECTED = 1000
X = X.head(SELECTED)
y = y.head(SELECTED)
print(X.head(), '\n\n')

# 3b. Train/test split
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
print("X_train = {}, X_valid = {}\ny_train = {}, y_valid = {}".format(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape))


# 4. Imputer + OH encoder, since there are missing values and categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing values with a constant (0 by default)
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])


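# (Added sketch, not in the original paste) A quick sanity check: fitting the
# preprocessor alone shows how many features the one-hot encoding produces.
_transformed = preprocessor.fit_transform(X_train)
print("Preprocessed training matrix shape:", _transformed.shape, '\n\n')
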
# 5. Model = Regressor (regr) ---> Pipeline

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# No StandardScaler step is needed here: tree ensembles are insensitive to
# feature scaling, and a scaler placed after the model would not be valid
# anyway (in a Pipeline, only the last step may be an estimator)
regr = RandomForestRegressor(n_estimators=100, random_state=0)
pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
print(pipeline.steps)


# 6. Evaluate the RandomForestRegressor

from sklearn.metrics import mean_absolute_error

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
# MAE is already an average error per prediction, so it can be compared
# directly against the mean price (no extra division by the number of rows)
mean_price = y_train.mean()

print("MAE with RFR =", mae, '\n\n')
print("Mean price =", mean_price)
print("This means that the model is off by {:.3f} $ on average, on prices with an average value of".format(mae))
print("{:.3f} $, so the error percentage is {:.2%}".format(mean_price, mae / mean_price))


# 7. Evaluate with cross-validation

from sklearn.model_selection import cross_val_score

scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
score = scores.mean()
mean_price = y_train.mean()

print("Average MAE with RFR =", score, '\n\n')
print("Mean price =", mean_price)
print("This means that the model is off by {:.3f} $ on average, on prices with an average value of".format(score))
print("{:.3f} $, so the error percentage is {:.2%}".format(mean_price, score / mean_price))


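# (Added sketch, not in the original paste) The per-fold scores and their
# standard deviation show how stable the estimate is across the 5 folds.
print("MAE per fold =", scores)
print("Std of MAE across folds =", scores.std(), '\n\n')
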
# 8. Some more regressors

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

"""
# Polynomial Regressor (disabled: PolynomialFeatures would need the
# preprocessed numeric matrix, not the raw X_train with categorical columns)
# Create polynomial features (degree=3)
poly_features = PolynomialFeatures(degree=3)
X_poly = poly_features.fit_transform(X_train)
# Create and fit the polynomial regression model
regr1 = LinearRegression()
regr1.fit(X_poly, y_train)
"""
# 8a. Support Vector Machine Regressor (fixed gamma)
regr1 = SVR(kernel='rbf', C=1e3, gamma=0.1, epsilon=0.01)
# 8b. Support Vector Machine Regressor (gamma='auto')
regr2 = SVR(kernel='rbf', C=1e3, gamma='auto', epsilon=0.1)
# 8c. Decision Tree Regressor
regr3 = DecisionTreeRegressor()
# 8d. Random Forest Regressor
regr4 = RandomForestRegressor()
regrs = [regr1, regr2, regr3, regr4]
scores_list = []

for regr in regrs:
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
    scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    score = scores.mean()
    scores_list.append(score)

    print('-----------------------------------------')
    print(str(regr))
    print("Score =", score)
    print('-----------------------------------------', '\n\n')


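# (Added sketch, not in the original paste) Unlike the tree models, SVR is
# sensitive to feature scaling, so its pipelines arguably deserve a
# StandardScaler between the preprocessor and the model, e.g.:
from sklearn.preprocessing import StandardScaler
scaled_svr = Pipeline([('preprocessor', preprocessor),
                       ('scaler', StandardScaler(with_mean=False)),
                       ('model', SVR(kernel='rbf', C=1e3, gamma=0.1, epsilon=0.01))])
# with_mean=False keeps the scaler compatible with the sparse matrix
# that the one-hot encoder may produce
print("Scaled SVR MAE =",
      (-1 * cross_val_score(scaled_svr, X, y, cv=5, scoring='neg_mean_absolute_error')).mean(), '\n\n')
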
# 9. Summary of the regressors - pipelines that I tried
best_regr = None
best_score = float('inf')
print("After cross-validation with 5 folds:")

for regr, score in zip(regrs, scores_list):
    print("{} ---> {}".format(regr, score))
    if score < best_score:
        best_score = score
        best_regr = regr

print("\n\n", "Best regressor = {} with average MAE = {}".format(best_regr, best_score))


# 10. Select the best regressor and evaluate it
# Since RFR is the best one, I can change the default values to find a better model
# (note: random_state is not a real hyperparameter; varying it mostly shows how
# sensitive the score is to the forest's randomness)

n_estimators_list = list(range(50, 301, 50))
random_state_list = [0, 1]
best_score = float('inf')
best_rs = -1
best_ne = -1

for random_state in random_state_list:
    for n_estimators in n_estimators_list:
        regr = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
        scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
        score = scores.mean()

        print('-------------------------------------------------------------')
        print(str(regr))
        print("Score =", score)
        print('-------------------------------------------------------------', '\n\n')

        if score < best_score:
            best_score = score
            best_rs = random_state
            best_ne = n_estimators


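# (Added sketch, not in the original paste) scikit-learn's GridSearchCV does
# the same double loop in one call, and with refit=True (the default) it
# retrains the best model on all of X, y automatically.
from sklearn.model_selection import GridSearchCV
param_grid = {'model__n_estimators': list(range(50, 301, 50)),
              'model__random_state': [0, 1]}
search = GridSearchCV(Pipeline([('preprocessor', preprocessor),
                                ('model', RandomForestRegressor())]),
                      param_grid, cv=5, scoring='neg_mean_absolute_error')
search.fit(X, y)
print("GridSearchCV best params =", search.best_params_)
print("GridSearchCV best MAE =", -search.best_score_, '\n\n')
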
# 11. One last evaluation
best_regr = RandomForestRegressor(n_estimators=best_ne, random_state=best_rs)
pipeline = Pipeline([('preprocessor', preprocessor), ('model', best_regr)])
best_scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
best_score = best_scores.mean()

print('-------------------------------------------------------------')
print("Best regressor = {} with:".format(str(best_regr)))
print("Best MAE score =", best_score)
print('-------------------------------------------------------------', '\n\n')
print("best n_estimators =", best_ne, "| best random_state =", best_rs)


# 12. Check the predicted values against the real ones
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)
comparison = pd.DataFrame({'Real values': y_valid, 'Predictions': preds})
print(comparison)
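# (Added sketch, not in the original paste) Adding the absolute and relative
# error per row makes the comparison table easier to scan (rows with a real
# price of 0 would show inf in the relative column).
comparison['Abs error'] = (comparison['Real values'] - comparison['Predictions']).abs()
comparison['Rel error %'] = 100 * comparison['Abs error'] / comparison['Real values']
print(comparison.head(10))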