4. PCA

# 0a. Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# 0b. Set Matplotlib defaults
plt.style.use("seaborn-v0_8-whitegrid")  # use "seaborn-whitegrid" on Matplotlib < 3.6
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)


# 2. AUXILIARY FUNCTIONS
def apply_pca(X, standardize=True):
    # Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    # Convert to dataframe
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)
    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings


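# A quick sanity check of apply_pca on toy data (a sketch; the random frame
# below is illustrative, not part of the Ames data): with all components
# kept, the explained-variance ratios sum to 1 and the loading columns are
# orthonormal, since PCA is just a rotation of the standardized features.
toy = pd.DataFrame(
    np.random.default_rng(0).normal(size=(100, 3)), columns=["a", "b", "c"]
)
toy_pca, toy_components, toy_loadings = apply_pca(toy)
assert np.isclose(toy_pca.explained_variance_ratio_.sum(), 1.0)
assert np.allclose(toy_loadings.values.T @ toy_loadings.values, np.eye(3), atol=1e-8)

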
def plot_variance(pca, width=8, dpi=100):
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure (use the arguments rather than hard-coded values)
    fig.set(figwidth=width, dpi=dpi)
    return axs


def make_mi_scores(X, y):
    X = X.copy()
    # Label Encoding for categorical columns
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=0
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def score_dataset(X, y, model=XGBRegressor()):
    X = X.copy()  # avoid mutating the caller's dataframe
    # Label Encoding for categorical columns
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for the Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score



# 3. Basics
df = pd.read_csv("../input/fe-course-data/ames.csv")
features = ["GarageArea", "YearRemodAdd", "TotalBsmtSF", "GrLivArea"]
print("Correlation with SalePrice:\n")
print(df[features].corrwith(df.SalePrice))

X = df.copy()
y = X.pop("SalePrice")
X = X.loc[:, features]


# 4. Apply PCA to "X" DataFrame (4 columns)
pca, X_pca, loadings = apply_pca(X)
print(loadings)
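
# plot_variance (defined above) visualizes the decomposition; a usage sketch:
# the left panel shows each component's share of the variance, the right the
# cumulative share.
plot_variance(pca);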

X = df.copy()
y = X.pop("SalePrice")
X = X.join(X_pca)  # add the 4 principal components to the original dataframe as new features

score = score_dataset(X, y)
print(f"Your score: {score:.5f} RMSLE")
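
# make_mi_scores (defined above) can rank the components alongside the
# original features by mutual information with SalePrice; a usage sketch:
mi_scores = make_mi_scores(X, y)
print(mi_scores.head(10))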


# 5. Detect and inspect outliers
sns.catplot(
    y="value",
    col="variable",
    data=X_pca.melt(),
    kind="boxen",
    sharey=False,
    col_wrap=2,
);

component = "PC1"
idx = X_pca[component].sort_values(ascending=False).index
df.loc[idx, ["SalePrice", "Neighborhood", "SaleCondition"] + features]  # displays the homes sorted by PC1 (notebook-style output)

# Notice that several dwellings listed as Partial sales in the Edwards
# neighborhood stand out. A partial sale is what occurs when there are
# multiple owners of a property and one or more of them sell their "partial"
# ownership of the property. These kinds of sales often happen during the
# settlement of a family estate or the dissolution of a business and aren't
# advertised publicly. If you were trying to predict the value of a house on
# the open market, you would probably be justified in removing sales like
# these from your dataset -- they are truly outliers.
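
# A sketch of acting on that observation (the exact filter below is an
# assumption reconstructed from the note above, not part of the original
# script): drop the Edwards "Partial" sales and re-score on the cleaned data.
outlier_idx = df[
    (df.Neighborhood == "Edwards") & (df.SaleCondition == "Partial")
].index
X_clean = X.drop(outlier_idx)
y_clean = y.drop(outlier_idx)
print(f"Score without partial sales: {score_dataset(X_clean, y_clean):.5f} RMSLE")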