# 2. Cars regressor

import numpy as np
import pandas as pd


# 1a. Load the raw table from the CSV file (path is relative to the working directory)
cars_path = './autos.csv'
cars = pd.read_csv(cars_path)
print(cars.head(), '\n\n')

# 1b. Report the size and the column names of the raw table
shape = cars.shape
n_rows, n_cols = shape
print("---- Dataset contains {} rows-records with {} columns-features ----".format(n_rows, n_cols))
all_columns = cars.columns.tolist()
print("Columns names:", all_columns)
print('\n\n')

# 1c. Split off the target ('price').
# Rows without a price have no label to learn from, so they are discarded first.
target = 'price'
cars = cars.dropna(subset=[target])
y = cars.pop(target)  # removes the column from `cars` and returns it as a Series
shape = cars.shape  # refreshed: later sections read shape[1] as the feature count
print("Now, there are", shape[1], "columns (target = 'price') in the dataset")


# 2a. List every feature column that holds at least one missing value
print("...Check for any columns with missing values....")
has_missing = cars.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()
print("There are {}/{} columns with missing values:".format(len(cols_with_missing), shape[1]), cols_with_missing, '\n\n')

# 2b. Sort columns into numerical ones and low-cardinality categorical ones.
# Object columns with 10 or more distinct values are left out entirely.
numeric_dtypes = ('int64', 'float64')
numerical_cols = []
categorical_cols = []
for cname, dtype in cars.dtypes.items():
    if dtype in numeric_dtypes:
        numerical_cols.append(cname)
    elif dtype == "object" and cars[cname].nunique() < 10:
        categorical_cols.append(cname)
my_cols = [*numerical_cols, *categorical_cols]

print("  Numerical cols =", numerical_cols)
print("Categorical cols =", categorical_cols)
print(" All the columns =", my_cols)
print("Selected columns = {} = {} numerical + {} categorical".format(len(my_cols), len(numerical_cols), len(categorical_cols)), '\n\n')

# 2c. Of the selected columns, which ones still contain missing data?
missing_lookup = set(cols_with_missing)
selected_with_missing = [col for col in my_cols if col in missing_lookup]
print("There are {}/{} selected columns with missing data: {}".format(len(selected_with_missing), len(my_cols), selected_with_missing))
print("...Need imputation...")


# 3a. Keep only the selected feature columns, limited to the first SELECTED rows.
# X and y are truncated the same way so features and targets stay aligned.
SELECTED = 1000
X = cars[my_cols].copy().iloc[:SELECTED]
y = y.iloc[:SELECTED]
print(X.head(), '\n\n')

# 3b. Train, test, split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
splits = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = splits
shapes = [part.shape for part in splits]
print("X_train = {}, X_valid = {}\ny_train = {}, y_valid = {}".format(*shapes))


# 4a. Imputer + OH encoder, since there are missing values and categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical features: fill gaps with a constant. No fill_value is passed, so
# SimpleImputer uses its default constant (documented as 0 for numerical data).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on categories
# that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)


# 5. Model = Regressor (regr) ---> Pipeline

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Baseline model: preprocessing followed by a seeded random forest.
# The regressor has to be the final step: scikit-learn only accepts transformers
# (objects implementing fit/transform) before the last step of a Pipeline, so a
# scaler cannot be chained after the model.
regr = RandomForestRegressor(n_estimators=100, random_state=0)
pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])


# 6. Evaluate the RandomForestRegressor

from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error

# Fit on the training split and score on the held-out validation split.
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)  # documented argument order is (y_true, y_pred)

# MAE is already the average absolute error of a single prediction (one row),
# so it must not be divided by the number of rows a second time.
mae_per_row = mae
# Baseline for the relative error: mean price of the same rows the MAE was measured on.
price_per_row = y_valid.mean()

print("MAE with RFR =", mae, '\n\n')
print("MAE per row =", mae_per_row)
print("Price per row =", price_per_row)
print("This means that the model outputs an error of {:.2f} $ in prices with average value of".format(mae_per_row))
print("{} $, so the error percentage is {:%}".format(price_per_row, mae_per_row / price_per_row))


# 7. Evaluate with cross validation

from sklearn.model_selection import cross_val_score

# The scorer returns negated MAE (so that higher is better); flip the sign back
# to get a positive error for each of the 5 folds.
scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
score = scores.mean()

# The fold-averaged MAE is already a per-row error, so it must not be divided
# by the number of rows again.
mae_per_row = score
# Cross-validation scores every row of X, so the baseline is the mean of all of y.
price_per_row = y.mean()

print("Average MAE with RFR =", score, '\n\n')
print("MAE per row =", mae_per_row)
print("Price per row =", price_per_row)
print("This means that the model outputs an error of {:.2f} $ in prices with average value of".format(mae_per_row))
print("{} $, so the error percentage is {:%}".format(price_per_row, mae_per_row / price_per_row))


# 8. Some more regressors

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# NOTE: a polynomial regression (PolynomialFeatures(degree=3) followed by
# LinearRegression) was sketched here but is disabled. It was fitted directly
# on X_train instead of going through `preprocessor`.

# Candidate regressors. The tree-based ones are seeded so that the comparison
# gives the same scores on every run, like the seeded forest used earlier.
# 8a. Support Vector Regressor (RBF kernel, gamma=0.1, epsilon=0.01)
regr1 = SVR(kernel='rbf', C=1e3, gamma=0.1, epsilon=0.01)
# 8b. Support Vector Regressor (RBF kernel, gamma='auto', epsilon=0.1)
regr2 = SVR(kernel='rbf', C=1e3, gamma='auto', epsilon=0.1)
# 8c. Decision Tree Regressor
regr3 = DecisionTreeRegressor(random_state=0)
# 8d. Random Forest Regressor
regr4 = RandomForestRegressor(random_state=0)
regrs = [regr1, regr2, regr3, regr4]
scores_list = []  # scores_list[i] is the mean cross-validated MAE of regrs[i]

for regr in regrs:
    # Every candidate shares the same preprocessing; only the final model differs.
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
    # The scorer returns negated MAE; multiply by -1 to get a positive error.
    scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    score = scores.mean()
    scores_list.append(score)

    print('-----------------------------------------')
    print("{}. ".format(str(regr)))
    print("Score =", score)
    print('-------------------------------------------', '\n\n')


# 9. Recap of every regressor tried above, keeping track of the lowest mean MAE
best_regr = None
best_score = 10**20  # sentinel far above any MAE expected here
print("After cross-validation with 5-folds:")

# regrs and scores_list are parallel lists, so walk them in lockstep.
for regr, score in zip(regrs, scores_list):
    print("{} ---> {}".format(regr, score))
    # Strict comparison: on a tie, the regressor seen first is kept.
    if score < best_score:
        best_score = score
        best_regr = regr

print("\n\n", "Best regressor = {} with average MAE  = {}".format(best_regr, best_score))


# 10. Select the best regressor and evaluate it
# Since RFR is the best one, I can change the default values to find a better model

# Grid to explore: forest sizes from 50 to 300 in steps of 50, for two seeds.
n_estimators_list = [50, 100, 150, 200, 250, 300]
random_state_list = [0, 1]
best_score = 10**20  # sentinel far above any MAE expected here
best_rs = -1
best_ne = -1

# Flattened grid; same visiting order as a seed-outer / size-inner nested loop.
grid = [(rs, ne) for rs in random_state_list for ne in n_estimators_list]
separator = '-------------------------------------------------------------'

for random_state, n_estimators in grid:
    regr = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
    # The scorer returns negated MAE; multiply by -1 to get a positive error.
    scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    score = scores.mean()

    print(separator)
    print("{}. ".format(str(regr)))
    print("Score =", score)
    print(separator, '\n\n')

    # Strict comparison: on a tie, the configuration seen first is kept.
    if score < best_score:
        best_score, best_rs, best_ne = score, random_state, n_estimators


# 11. One last evaluation
# Rebuild the best configuration found by the search and cross-validate it once more.
best_regr = RandomForestRegressor(n_estimators=best_ne, random_state=best_rs)
final_steps = [('preprocessor', preprocessor), ('model', best_regr)]
pipeline = Pipeline(final_steps)
# The scorer returns negated MAE; negate it to get a positive error per fold.
best_scores = -cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
best_score = best_scores.mean()

separator = '-------------------------------------------------------------'
print(separator)
print("Best regressor = {} with:".format(str(best_regr)))
print("Best score MAE =", best_score)
print(separator, '\n\n')
print(best_ne, best_rs)


# 12. Check the predicted values with the real ones
# Fit on the training split only, then predict the validation rows.
preds = pipeline.fit(X_train, y_train).predict(X_valid)
# Side-by-side table of the true prices and the model's predictions.
table_columns = {'Real values': y_valid, 'Predictions': preds}
comparison = pd.DataFrame(table_columns)
print(comparison)