# get_dummies.py
# %%
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# %%
# Load the credit data.
df = pd.read_csv('credit.csv')
df.head()

# %%
# The response variable will be 'Balance'.
x = df.drop('Balance', axis=1)
y = df['Balance']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# %%
# Trying to fit on all features in their current representation throws an error.
try:
    test_model = LinearRegression().fit(x_train, y_train)
except Exception as e:
    print('Error!:', e)

# %% [markdown]
# ⏸️ Given this error and what you've seen of the data so far, what do you think the problem is?
#
# A. We are trying to fit on too many features/columns\
# B. Some columns are strings\
# C. The column names contain capital letters\
# D. The features are on different scales

# %%
### edTest(test_chow1) ###
# Submit an answer choice as a string below.
answer1 = 'B'

# %%
# Inspect the data types of the DataFrame's columns.
df.dtypes

# %%
### edTest(test_model1) ###
# Fit a linear model using only the numeric features in the dataframe.
# select_dtypes picks out the numeric columns directly, which is more robust
# than assuming the categorical columns sit at fixed positions in the frame.
numeric_features = x_train.select_dtypes(include=np.number).columns
model1 = LinearRegression().fit(x_train[numeric_features], y_train)

# Report train and test R2 scores.
train_score = model1.score(x_train[numeric_features], y_train)
test_score = model1.score(x_test[numeric_features], y_test)
print('Train R2:', train_score)
print('Test R2:', test_score)

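# %%
# mean_squared_error is imported above but never used. As a small addition
# (not part of the original exercise), here is one way to report test RMSE in
# the units of Balance; np.sqrt keeps this compatible across sklearn versions.
test_rmse = np.sqrt(mean_squared_error(y_test, model1.predict(x_test[numeric_features])))
print('Test RMSE:', test_rmse)
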
# %%
# Look at unique values of the Ethnicity feature.
print('In the train data, Ethnicity takes on the values:', list(x_train['Ethnicity'].unique()))

# %% [markdown]
# ⏸️ From the output above, how many binary variables will be required to encode the `Ethnicity` feature?
#
# A. 1\
# B. 2\
# C. 3\
# D. 4

# %%
### edTest(test_chow2) ###
# Submit an answer choice as a string below.
answer2 = 'B'

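# %%
# A quick illustration (an addition, not part of the original exercise):
# with three Ethnicity levels, drop_first=True leaves two dummy columns; the
# dropped level becomes the baseline, encoded as all zeros.
pd.get_dummies(x_train[['Ethnicity']], drop_first=True).head()
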
# %%
### edTest(test_design) ###
# Create x train and test design matrices, creating dummy variables for the categorical features.
# Hint: use pd.get_dummies() with the drop_first parameter for this.
x_train_design = pd.get_dummies(x_train, drop_first=True)
x_test_design = pd.get_dummies(x_test, drop_first=True)
x_train_design.head()

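# %%
# Caveat (an addition, not part of the original exercise): calling
# pd.get_dummies separately on train and test can yield mismatched columns if
# a category is absent from one split. Reindexing the test design matrix to
# the train columns is one safeguard.
x_test_design = x_test_design.reindex(columns=x_train_design.columns, fill_value=0)
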
# %%
# Confirm that all data types are now numeric.
x_train_design.dtypes

# %%
### edTest(test_model2) ###
# Fit model2 on the design matrix.
model2 = LinearRegression().fit(x_train_design, y_train)

# Report train and test R2 scores.
train_score = model2.score(x_train_design, y_train)
test_score = model2.score(x_test_design, y_test)
print('Train R2:', train_score)
print('Test R2:', test_score)

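# %%
# Optional context (an addition, not part of the original exercise): train R2
# can only increase as predictors are added, so adjusted R2, which penalizes
# the predictor count p, is a fairer train-set comparison.
def adjusted_r2(r2, n, p):
    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

n, p = x_train_design.shape
print('Adjusted train R2:', adjusted_r2(train_score, n, p))
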
# %% [markdown]
# ⏸️ How do the $R^2$ scores of the two models compare?
#
# A. The numeric-only model performs better on both train and test.\
# B. The numeric-only model performs better on train but worse on test.\
# C. The full model performs better on both train and test.\
# D. The full model performs better on train but worse on test.

# %%
### edTest(test_chow3) ###
# Submit an answer choice as a string below.
answer3 = 'C'

# %%
# Note that the intercept is not a part of .coef_ but is instead stored in .intercept_.
coefs = pd.DataFrame(model2.coef_, index=x_train_design.columns, columns=['beta_value'])
coefs

# %%
# Visualize a crude measure of feature importance.
sns.barplot(data=coefs.T, orient='h').set(title='Model Coefficients');

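# %%
# Caveat (an addition, not part of the original exercise): raw coefficients
# depend on each feature's scale, so the plot above is only a rough guide.
# One alternative sketch: scale each coefficient by its feature's standard
# deviation before comparing.
scaled_coefs = coefs['beta_value'] * x_train_design.std()
scaled_coefs.sort_values().plot(kind='barh', title='Std-scaled coefficients');
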
# %% [markdown]
# ⏸️ Based on the plot above, which categorical feature has the strongest relationship with `Balance`?
#
# A. Cards\
# B. Gender_Female\
# C. Student_Yes\
# D. Ethnicity_Caucasian

# %%
### edTest(test_chow4) ###
# Submit an answer choice as a string below.
answer4 = 'C'

# %% [markdown]
# Fit a model to predict `Balance` from 2 predictors: `Income` and the best categorical feature from your answer.

# %%
### edTest(test_model3) ###
# Specify the best categorical feature.
best_cat_feature = 'Student_Yes'

# Define the model.
features = ['Income', best_cat_feature]
model3 = LinearRegression()
model3.fit(x_train_design[features], y_train)

# Collect betas from the fitted model.
beta0 = model3.intercept_
beta1 = model3.coef_[features.index('Income')]
beta2 = model3.coef_[features.index(best_cat_feature)]

# Display betas in a DataFrame.
coefs = pd.DataFrame([beta0, beta1, beta2], index=['Intercept']+features, columns=['beta_value'])
coefs

# %%
# Visualize a crude measure of feature importance.
sns.barplot(data=coefs.T, orient='h').set(title='Model Coefficients');

# %%
### edTest(test_prediction_lines) ###
# Create a space of x values to predict on.
x_space = np.linspace(x['Income'].min(), x['Income'].max(), 1000)

# Generate 2 sets of predictions based on the best categorical feature's value.
# When the categorical feature is true/present (1):
y_hat_yes = beta0 + beta1 * x_space + beta2 * 1
# When the categorical feature is false/absent (0):
y_hat_no = beta0 + beta1 * x_space

# Plot the 2 prediction lines for students and non-students.
ax = sns.scatterplot(data=pd.concat([x_train_design, y_train], axis=1),
                     x='Income', y='Balance', hue=best_cat_feature, alpha=0.8)
ax.plot(x_space, y_hat_no)
ax.plot(x_space, y_hat_yes);

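# %%
# Sanity check (an addition, not part of the original exercise): the manual
# beta arithmetic above should agree with model3.predict on the same inputs.
check = pd.DataFrame({'Income': x_space, best_cat_feature: 1})
assert np.allclose(model3.predict(check[features]), y_hat_yes)
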
# %% [markdown]
# ⏸️ What is the effect of student status on the regression line?
#
# A. Non-students' balances increase faster with rising income (i.e., steeper slope)\
# B. Students' balances increase faster with rising income (i.e., steeper slope)\
# C. Non-students have higher balances on average (i.e., higher intercept)\
# D. Students have higher balances on average (i.e., higher intercept)

# %%
### edTest(test_chow5) ###
# Submit an answer choice as a string below.
answer5 = 'D'