Advertisement
makispaiktis

1. Mutual Information (MI)

Sep 17th, 2024
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.45 KB | None | 0 0
  1. # 0a. Import
  2. import matplotlib.pyplot as plt
  3. import numpy as np
  4. import pandas as pd
  5. import seaborn as sns
  6. from sklearn.feature_selection import mutual_info_regression
  7.  
  8. # 0b. Set Matplotlib defaults
  9. plt.style.use("seaborn-whitegrid")
  10. plt.rc("figure", autolayout=True)
  11. plt.rc(
  12.     "axes",
  13.     labelweight="bold",
  14.     labelsize="large",
  15.     titleweight="bold",
  16.     titlesize=14,
  17.     titlepad=10,
  18. )
  19.  
  20.  
  21.  
  22. # 1. AUXILIARY FUNCTIONS
  23. def make_mi_scores(X, y):
  24.  
  25.     X = X.copy()
  26.     for colname in X.select_dtypes(["object", "category"]):
  27.         X[colname], _ = X[colname].factorize()
  28.     # All discrete features should now have integer dtypes
  29.     discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
  30.  
  31.     mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
  32.     mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
  33.     mi_scores = mi_scores.sort_values(ascending=False)
  34.     return mi_scores
  35.  
  36.  
  37. def plot_mi_scores(scores):
  38.     scores = scores.sort_values(ascending=True)
  39.     width = np.arange(len(scores))
  40.     ticks = list(scores.index)
  41.     plt.barh(width, scores)
  42.     plt.yticks(width, ticks)
  43.     plt.title("Mutual Information Scores")
  44.  
  45.  
  46.  
  47.  
  48. # 2. Load data and make some "rel" plots inline
  49. df = pd.read_csv("../input/fe-course-data/ames.csv")
  50. features = ["YearBuilt", "MoSold", "ScreenPorch"]
  51. sns.relplot(
  52.     x="value", y="SalePrice", col="variable", data=df.melt(id_vars="SalePrice", value_vars=features), facet_kws=dict(sharex=False),
  53. );
  54. # sns.relplot(data=df, x="YearBuilt", y="SalePrice")
  55. # plt.show()
  56.  
  57.  
  58.  
  59. # 3. Discover the MI scores
  60. X = df.copy()
  61. y = X.pop('SalePrice')
  62. mi_scores = make_mi_scores(X, y)
  63. print(mi_scores.head(10))
  64. print(mi_scores.tail(10))
  65. plt.figure(dpi=100, figsize=(8, 5))
  66. plot_mi_scores(mi_scores.head(10))
  67. plot_mi_scores(mi_scores.tail(10))
  68.  
  69.  
  70.  
  71.  
  72. # 4. An "outsider" feature named "BldgType" - with not a high value of MI - seems to interact well and have some type
  73. # of relation with a variable ("GrLivArea") that is very high in MI scores
  74.  
  75. sns.catplot(x="BldgType", y="SalePrice", data=df, kind="boxen");
  76.  
  77.  
  78. feature = "GrLivArea"
  79. sns.lmplot(
  80.     x=feature, y="SalePrice", hue="BldgType", col="BldgType",
  81.     data=df, scatter_kws={"edgecolor": 'w'}, col_wrap=3, height=4,
  82. );
  83.  
  84.  
  85. feature = "MoSold"
  86. sns.lmplot(
  87.     x=feature, y="SalePrice", hue="BldgType", col="BldgType",
  88.     data=df, scatter_kws={"edgecolor": 'w'}, col_wrap=3, height=4,
  89. );
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement