Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
# 1a. Load the car listings into a DataFrame and preview the first rows.
cars_path = './autos.csv'
cars = pd.read_csv(cars_path)
print(cars.head(), '\n\n')

# 1b. Report the dataset dimensions and the column names.
shape = cars.shape
n_rows, n_cols = shape
print("---- Dataset contains {} rows-records with {} columns-features ----".format(n_rows, n_cols))
all_columns = cars.columns.tolist()
print("Columns names:", all_columns)
print('\n\n')

# 1c. Separate the target ('price') from the other features.
# Rows without a price are dropped first, then pop() removes the target
# column from the feature frame and hands it back as a Series.
target = 'price'
cars.dropna(axis=0, subset=[target], inplace=True)
y = cars.pop(target)
shape = cars.shape  # refreshed after the drop; reused by the missing-data report
print("Now, there are", shape[1], "columns (target = 'price') in the dataset")
# 2a. Find the columns that contain at least one missing value.
print("...Check for any columns with missing values....")
missing_flags = cars.isnull().any()  # one boolean per column
cols_with_missing = missing_flags[missing_flags].index.tolist()
print("There are {}/{} columns with missing values:".format(len(cols_with_missing), shape[1]), cols_with_missing, '\n\n')

# 2b. Keep every int64/float64 column, plus the object (string) columns with
# fewer than 10 distinct values, gathered in a single pass over the columns.
numerical_cols = []
categorical_cols = []
for cname in cars.columns:
    column = cars[cname]
    if column.dtype in ('int64', 'float64'):
        numerical_cols.append(cname)
    elif column.dtype == "object" and column.nunique() < 10:
        categorical_cols.append(cname)
my_cols = numerical_cols + categorical_cols
print(" Numerical cols =", numerical_cols)
print("Categorical cols =", categorical_cols)
print(" All the columns =", my_cols)
print("Selected columns = {} = {} numerical + {} categorical".format(len(my_cols), len(numerical_cols), len(categorical_cols)), '\n\n')

# 2c. Of the selected columns, list the ones that still have gaps.
missing_lookup = set(cols_with_missing)
selected_with_missing = [col for col in my_cols if col in missing_lookup]
print("There are {}/{} selected columns with missing data: {}".format(len(selected_with_missing), len(my_cols), selected_with_missing))
print("...Need imputation...")
# 3a. Restrict the features to the selected columns and keep only the first
# SELECTED rows of both the features and the target.
SELECTED = 1000
X = cars[my_cols].copy().head(SELECTED)
y = y.head(SELECTED)
print(X.head(), '\n\n')

# 3b. Hold out 20% of the rows for validation; random_state=0 keeps the
# split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
split_shapes = (X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)
print("X_train = {}, X_valid = {}\ny_train = {}, y_valid = {}".format(*split_shapes))
# 4a. Preprocessing: impute missing values everywhere and one-hot encode the
# categorical columns.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: gaps are replaced by the imputer's constant fill value.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent category, then
# one-hot encode. handle_unknown='ignore' lets categories that were not seen
# during fit pass through at predict time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)
# 5. Model = Regressor (regr) ---> Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

regr = RandomForestRegressor(n_estimators=100, random_state=0)

# The regressor must be the LAST step of the pipeline: every earlier step has
# to be a transformer. A previous variant placed a StandardScaler after the
# model, which is not a valid pipeline (depending on the scikit-learn version
# it fails at construction or at fit time) and was overwritten immediately,
# so it has been removed. Tree ensembles do not need feature scaling anyway.
pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
# 6. Evaluate the RandomForestRegressor on the hold-out validation split
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)

# mean_absolute_error takes (y_true, y_pred) and already returns the error
# averaged over the validation rows, i.e. it IS the per-row error. It must
# not be divided by the number of rows again, otherwise the reported
# percentage is too small by a factor of the row count.
mae = mean_absolute_error(y_valid, preds)
mae_per_row = mae
# Compare against the average true price of the same rows that were scored.
price_per_row = y_valid.mean()

print("MAE with RFR =", mae, '\n\n')
print("MAE per row =", mae_per_row)
print("Price per row =", price_per_row)
print("This means that the model outputs an error of {:.3} $ in prices with average value of".format(mae_per_row))
print("{} $, so the error percentage is {:%}".format(price_per_row, mae_per_row / price_per_row))
# 7. Evaluate with 5-fold cross validation
from sklearn.model_selection import cross_val_score

# scikit-learn scorers follow "greater is better", so the MAE comes back
# negated; multiply by -1 to recover the positive error of each fold.
scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
score = scores.mean()

# Each fold's MAE is already a per-row average, so the mean over the folds is
# the per-row error; dividing by a row count again would understate it.
mae_per_row = score
# The folds are drawn from the whole of (X, y), so use the mean price of y.
price_per_row = y.mean()

print("Average MAE with RFR =", score, '\n\n')
print("MAE per row =", mae_per_row)
print("Price per row =", price_per_row)
print("This means that the model outputs an error of {:.3} $ in prices with average value of".format(mae_per_row))
print("{} $, so the error percentage is {:%}".format(price_per_row, mae_per_row / price_per_row))
# 8. Compare several regressors behind the same preprocessing step
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Disabled experiment, kept for reference - polynomial regression:
#     # Create polynomial features (degree=3)
#     poly_features = PolynomialFeatures(degree=3)
#     X_poly = poly_features.fit_transform(X_train)
#     # Create and fit the polynomial regression model
#     regr1 = LinearRegression()
#     regr1.fit(X_poly, y_train)

# 8a. Support vector regressor, RBF kernel with a fixed gamma
regr1 = SVR(kernel='rbf', C=1e3, gamma=0.1, epsilon=0.01)
# 8b. Support vector regressor, RBF kernel with gamma='auto'
regr2 = SVR(kernel='rbf', C=1e3, gamma='auto', epsilon=0.1)
# 8c. Decision tree regressor with default settings
regr3 = DecisionTreeRegressor()
# 8d. Random forest regressor with default settings
regr4 = RandomForestRegressor()

regrs = [regr1, regr2, regr3, regr4]
scores_list = []
for regr in regrs:
    # Same preprocessing for every candidate; only the final estimator changes.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', regr)])
    # The scorer returns negated MAE values; flip the sign back.
    scores = -cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    score = scores.mean()
    scores_list.append(score)
    print('-----------------------------------------')
    print("{}. ".format(str(regr)))
    print("Score =", score)
    print('-------------------------------------------', '\n\n')
# 9. Summary of the regressors tried: print each mean MAE and remember the
# candidate with the lowest one (on ties the earliest candidate wins).
best_regr = None
best_score = 10**20  # sentinel larger than any expected MAE
print("After cross-validation with 5-folds:")
for regr, score in zip(regrs, scores_list):
    print("{} ---> {}".format(regr, score))
    if score < best_score:
        best_score, best_regr = score, regr
print("\n\n", "Best regressor = {} with average MAE = {}".format(best_regr, best_score))
# 10. Tune the RandomForestRegressor: try every (random_state, n_estimators)
# combination and keep the one with the lowest cross-validated MAE.
n_estimators_list = [50, 100, 150, 200, 250, 300]
random_state_list = [0, 1]
best_score = 10**20  # sentinel larger than any expected MAE
best_rs = -1
best_ne = -1

# Flattened grid, in the same order nested loops would visit it:
# random_state varies slowest, n_estimators fastest.
candidates = [(rs, ne) for rs in random_state_list for ne in n_estimators_list]
rule = '-------------------------------------------------------------'
for random_state, n_estimators in candidates:
    regr = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', regr)])
    # The scorer returns negated MAE values; flip the sign back.
    scores = -cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    score = scores.mean()
    print(rule)
    print("{}. ".format(str(regr)))
    print("Score =", score)
    print(rule, '\n\n')
    if score < best_score:
        best_score, best_rs, best_ne = score, random_state, n_estimators
# 11. Rebuild the winning configuration and cross-validate it once more.
best_regr = RandomForestRegressor(n_estimators=best_ne, random_state=best_rs)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', best_regr)])
best_scores = -cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
best_score = best_scores.mean()
rule = '-------------------------------------------------------------'
print(rule)
print("Best regressor = {} with:".format(str(best_regr)))
print("Best score MAE =", best_score)
print(rule, '\n\n')
print(best_ne, best_rs)

# 12. Fit on the training split and put the validation predictions next to
# the real prices. fit() returns the fitted pipeline, so the calls chain.
preds = pipeline.fit(X_train, y_train).predict(X_valid)
comparison = pd.DataFrame({'Real values': y_valid, 'Predictions': preds})
print(comparison)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement