# *******************************************************************************************
# *******************************************************************************************
# 0. Function that evaluates a RandomForestRegressor model (with 100 trees) using MAE
# *******************************************************************************************
# *******************************************************************************************
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)
# *******************************************************************************************
# *******************************************************************************************
# 1. Training, validation (1st dataset) and test datasets
# *******************************************************************************************
# *******************************************************************************************
import pandas as pd
from sklearn.model_selection import train_test_split

# 1a. Read the data from the 2 datasets
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# 1b. Remove rows with a missing target, then separate the target from the predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# 1c. To keep things simple, we'll use only numerical predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# 1d. Break off a validation set from the training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Preview the first rows of the training data
X_train.head()

# Shape of training data (num_rows, num_columns)
print(X_train.shape)
# *******************************************************************************************
# *******************************************************************************************
# 2. Preliminary investigation - Find which and how many columns contain missing data
# *******************************************************************************************
# *******************************************************************************************
# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

num_rows = 1168
num_cols_with_missing = 3
tot_missing = 212 + 6 + 58
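# A quick sanity check (illustrative, not part of the original exercise): the three
# values above can be derived from the training data instead of being hard-coded.
assert num_rows == X_train.shape[0]
assert num_cols_with_missing == (missing_val_count_by_column > 0).sum()
assert tot_missing == missing_val_count_by_column.sum()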
# *******************************************************************************************
# *******************************************************************************************
# 3a. Drop columns with missing values
# *******************************************************************************************
# *******************************************************************************************
# Get names of columns with missing values - the same columns as in the printout above
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
print("Names of columns with missing data\n", cols_with_missing, '\n\n')

reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
# *******************************************************************************************
# *******************************************************************************************
# 3b. Imputation - Default strategy = mean
# *******************************************************************************************
# *******************************************************************************************
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed the column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
print("MAE (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))
- print('\n\n', "Given that thre are so few missing values in the dataset, we'd expect imputation to perform better than dropping columns entirely. However, we see that dropping columns performs slightly better! While this can probably partially be attributed to noise in the dataset, another potential explanation is that the imputation method is not a great match to this dataset. That is, maybe instead of filling in the mean value, it makes more sense to set every missing value to a value of 0, to fill in the most frequently encountered value, or to use some other method. For instance, consider the GarageYrBlt column (which indicates the year that the garage was built). It's likely that in some cases, a missing value could indicate a house that does not have a garage. Does it make more sense to fill in the median value along each column in this case? Or could we get better results by filling in the minimum value along each column? It's not quite clear what's best in this case, but perhaps we can rule out some options immediately - for instance, setting missing values in this column to 0 is likely to yield horrible results!", '\n\n')
# *******************************************************************************************
# *******************************************************************************************
# 3c. Imputation - Custom strategy = median
# *******************************************************************************************
# *******************************************************************************************
final_imputer = SimpleImputer(strategy='median')
final_X_train = pd.DataFrame(final_imputer.fit_transform(X_train))
final_X_valid = pd.DataFrame(final_imputer.transform(X_valid))
final_X_train.columns = X_train.columns
final_X_valid.columns = X_valid.columns
print("MAE (Imputation 2):")
print(score_dataset(final_X_train, final_X_valid, y_train, y_valid))
# *******************************************************************************************
# *******************************************************************************************
# 4. Define and fit model
# *******************************************************************************************
# *******************************************************************************************
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)
preds_valid = model.predict(final_X_valid)
print("MAE (Your approach):")
print(mean_absolute_error(y_valid, preds_valid))
# *******************************************************************************************
# *******************************************************************************************
# 5. Apply the model to the 2nd dataset - for testing
# *******************************************************************************************
# *******************************************************************************************
# The test DataFrame (X_test) must be preprocessed the same way as the training data,
# so reuse the imputer fit on X_train rather than fitting a new one on X_test
final_X_test = pd.DataFrame(final_imputer.transform(X_test))
final_X_test.columns = X_test.columns
preds_test = model.predict(final_X_test)

# Save test predictions to file
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)
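# Optional check (illustrative only, not part of the original exercise): reload the
# submission file and confirm it has the expected 'Id' and 'SalePrice' columns.
print(pd.read_csv('submission.csv').head())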