# 0a. Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
# 0b. Set Matplotlib defaults
# (the "seaborn-whitegrid" style was renamed "seaborn-v0_8-whitegrid" in Matplotlib 3.6)
plt.style.use("seaborn-v0_8-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
# 2. AUXILIARY FUNCTIONS
def apply_pca(X, standardize=True):
    # Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    # Convert to dataframe
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)
    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings
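
# Quick sanity check of apply_pca on synthetic data (an illustrative addition,
# not part of the original paste): two strongly correlated columns should put
# nearly all of the variance on the first component.
rng = np.random.default_rng(0)
a = rng.normal(size=200)
demo = pd.DataFrame({"a": a, "b": 2 * a + rng.normal(scale=0.1, size=200)})
demo_pca, demo_X_pca, demo_loadings = apply_pca(demo)
print(demo_pca.explained_variance_ratio_)  # first entry should be close to 1.0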
def plot_variance(pca, width=8, dpi=100):
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure (use the parameters rather than hard-coded values)
    fig.set(figwidth=width, dpi=dpi)
    return axs
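
# Exercise plot_variance on the synthetic fit from above (an illustrative
# addition; the original paste defines this helper but never calls it).
plot_variance(demo_pca)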
def make_mi_scores(X, y):
    X = X.copy()
    # Label Encoding for categorical columns
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores
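
# For reference, factorize maps each unique category to an integer code
# (a hypothetical illustration, not part of the original paste):
codes, uniques = pd.Series(["a", "b", "a"]).factorize()
print(codes)    # [0 1 0]
print(uniques)  # Index(['a', 'b'], dtype='object')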
def score_dataset(X, y, model=XGBRegressor()):
    X = X.copy()  # avoid mutating the caller's dataframe below
    # Label Encoding for categorical columns
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score
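
# Note on the design: the helper averages MSLE across the five folds and then
# takes the square root, which is not identical to averaging per-fold RMSLE
# values. A per-fold variant would look like this (illustrative sketch only):
#   rmsle_per_fold = np.sqrt(-cross_val_score(
#       model, X, y, cv=5, scoring="neg_mean_squared_log_error"))
#   rmsle_per_fold.mean()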
# 3. Basics
df = pd.read_csv("../input/fe-course-data/ames.csv")
features = ["GarageArea", "YearRemodAdd", "TotalBsmtSF", "GrLivArea"]
print("Correlation with SalePrice:\n")
print(df[features].corrwith(df.SalePrice))
X = df.copy()
y = X.pop("SalePrice")
X = X.loc[:, features]
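
# make_mi_scores is defined above but never invoked in this paste; as a quick
# check on the same four features (an illustrative addition):
print(make_mi_scores(X, y))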
# 4. Apply PCA to "X" DataFrame (4 columns)
pca, X_pca, loadings = apply_pca(X)
print(loadings)
X = df.copy()
y = X.pop("SalePrice")
X = X.join(X_pca)  # add the four principal components as new features
score = score_dataset(X, y)
print(f"Your score: {score:.5f} RMSLE")
# 5. Detect and inspect outliers
sns.catplot(
    y="value",
    col="variable",
    data=X_pca.melt(),
    kind="boxen",
    sharey=False,
    col_wrap=2,
);
component = "PC1"
idx = X_pca[component].sort_values(ascending=False).index
df.loc[idx, ["SalePrice", "Neighborhood", "SaleCondition"] + features]
# Notice that several dwellings listed as Partial sales in the Edwards
# neighborhood stand out. A partial sale occurs when there are multiple owners
# of a property and one or more of them sell their "partial" ownership of the
# property. These kinds of sales often happen during the settlement of a family
# estate or the dissolution of a business and aren't advertised publicly. If
# you were trying to predict the value of a house on the open market, you would
# probably be justified in removing sales like these from your dataset -- they
# are truly outliers.
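
# A minimal sketch of acting on that observation (a hypothetical follow-up, not
# in the original paste; assumes the category labels "Edwards" and "Partial"
# match the commentary above):
mask = (df.Neighborhood == "Edwards") & (df.SaleCondition == "Partial")
print(f"Dropping {mask.sum()} suspected outliers")
X_clean = df.loc[~mask].copy()
y_clean = X_clean.pop("SalePrice")
print(f"Score without them: {score_dataset(X_clean, y_clean):.5f} RMSLE")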