Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #### 1. Read the dataset and separate target from predictors
- # Read the dataset
- X = pd.read_csv('../input/train.csv', index_col='Id')
- # Remove rows with missing target
- X.dropna(axis=0, subset=['SalePrice'], inplace=True)
- # Separate target from predictors
- y = X.SalePrice
- X.drop(['SalePrice'], axis=1, inplace=True)
- #### 2. Split the dataset
- X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
- #### 3. Separate into numerical and categorical columns
- X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
- numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
- categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10] # for OH encoding
- categorical_cols_ALL = [cname for cname in X_train_full.columns if X_train_full[cname].dtype == "object"]
- print('Numerical columns:', numerical_cols, '\nCategorical columns:', categorical_cols, '\n\n')
- my_cols = categorical_cols + numerical_cols
- X_train = X_train_full[my_cols].copy()
- X_valid = X_valid_full[my_cols].copy()
- #### 4. Cleaning - Drop cols with missing data - Φτιάχνω μεταβλητή με τα ονόματα των στηλών αυτών
- cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
- X.drop(cols_with_missing, axis=1, inplace=True)
- X_test.drop(cols_with_missing, axis=1, inplace=True)
- #### 5. Preprocessing - Drop columns with categorical data
- drop_X_train = X_train.select_dtypes(exclude=['object'])
- drop_X_valid = X_valid.select_dtypes(exclude=['object'])
- print("MAE from Approach 1 (Drop categorical variables):")
- print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))
- #### 6. Preprocessing - Ordinal encoding ----> (object_cols = good_label_cols + bad_label_cols)
- #### If I have some categorical variables in which there is the 'order attribute' (like "Beginner", "Amateur", "Advanced"), I can
- #### make ordinal encoding. But first, I have to check if the values of these features are consistent through both training and
- #### validation dataset. I can do so, by creating the following 2 variables named: good_label_cols and bad_label_cols
- object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
- good_label_cols = [col for col in object_cols if set(X_valid[col]).issubset(set(X_train[col]))]
- bad_label_cols = list(set(object_cols)-set(good_label_cols))
- print('Categorical columns (ALL OF THEM):', object_cols, '\n')
- print('Categorical columns that will be ordinal encoded:', good_label_cols, '\n')
- print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols, '\n')
- label_X_train = X_train.drop(bad_label_cols, axis=1)
- label_X_valid = X_valid.drop(bad_label_cols, axis=1)
- ordinal_encoder = OrdinalEncoder()
- label_X_train[good_label_cols] = ordinal_encoder.fit_transform(label_X_train[good_label_cols])
- label_X_valid[good_label_cols] = ordinal_encoder.transform(label_X_valid[good_label_cols])
- #### 7. Preprocessing - One Hot encoding ----> (object_cols = low_cardinality_cols + high_cardinality_cols)
- #### For features that do not have the 'order-attribute' (Company: 'Microsoft', 'Amazon', 'Google'). OH Encoding creates new columns in
- #### the dataset. The number of the columns created are equal to the number this categorical variable can take. In the above example, #### there are 3 companies in the 'Company' column, so OH Encoder will create 3 columns. But after that, we have to delete the old
- #### column that contained this information. So, the new dataset has 2 more columns than the original one. Ofc, if there are more than
- #### one columns that have this kind of values, we do the same thing.
- object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
- object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
- d = dict(zip(object_cols, object_nunique))
- sorted(d.items(), key=lambda x: x[1])
- print(d)
- low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
- high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))
- print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
- print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)
- OH_X_train = X_train.drop(high_cardinality_cols, axis=1)
- OH_X_valid = X_valid.drop(high_cardinality_cols, axis=1)
- oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
- OH_cols_train = pd.DataFrame(oh_encoder.fit_transform(X_train[low_cardinality_cols]))
- OH_cols_valid = pd.DataFrame(oh_encoder.transform(X_valid[low_cardinality_cols]))
- OH_cols_train.index = X_train.index
- OH_cols_valid.index = X_valid.index
- number_X_train = X_train.drop(object_cols, axis=1)
- number_X_valid = X_valid.drop(object_cols, axis=1)
- OH_X_train = pd.concat([OH_cols_train, number_X_train], axis=1)
- OH_X_valid = pd.concat([OH_cols_valid, number_X_valid], axis=1)
- #### 8. Pipelines - Preprocessor and Model
- numerical_transformer = SimpleImputer(strategy='constant')
- categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
- ('onehot', OneHotEncoder(handle_unknown='ignore'))])
- preprocessor = ColumnTransformer(transformers=[
- ('num', numerical_transformer, numerical_cols),
- ('cat', categorical_transformer, categorical_cols)])
- model = RandomForestRegressor(n_estimators=100, random_state=0)
- clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
- clf.fit(X_train, y_train)
- preds = clf.predict(X_valid)
- print('MAE:', mean_absolute_error(y_valid, preds))
- #### 9. Output to CSV file - Requires a 2nd dataset ONLY FOR TESTING after your having validated your model
- preds_test = my_pipeline.predict(X_test)
- output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
- output.to_csv('submission.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement