Advertisement
jules0707

decision tree classifier

Dec 13th, 2024 (edited)
15
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.67 KB | None | 0 0
  1. # %% [markdown]
  2. # ## Title :
  3. # Classification using Decision Tree
  4. #
  5. # ## Description :
  6. # The goal of this exercise is to get comfortable using Decision Trees for classification in sklearn.  Eventually, you will produce a plot similar to the one given below:
  7. #
  8. # <img src="fig1.png" style="width: 1000px;">
  9. #
  10. #
  11. # ## Instructions:
  12. #
  13. # - Read the train and test datafile as Pandas data frame.
  14. # - Use `minority` and `bachelor` as the predictor variables and `won` as the response.
  15. # - Fit a decision tree of depth 2 and another of depth 10 on the training data.
  16. # - Call the function `plot_boundary` to visualise the decision boundary of these 2 classifiers.
  17. # - Increase the number of predictor variables as mentioned in scaffold.
  18. # - Initialize a decision tree classifier of depth 2, 10 and 15.
  19. # - Fit the model on the train data.
  20. # - Compute the train and test accuracy scores for each classifier.
  21. # - Use the helper code to look at the feature importance of the predictors from the decision tree of depth 15.
  22. #
  23. # ## Hints:
  24. #
  25. # <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html" target="_blank">sklearn.DecisionTreeClassifier()</a>
# Generates a Decision Tree classifier
  27. #
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier.score" target="_blank">classifier.score()</a>
  29. # Accuracy classification score.
  30. #
  31. # <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier.fit" target="_blank">classifier.fit()</a>
  32. # Build a decision tree classifier from the training set (X, y).
  33. #
  34. # **Note: This exercise is auto-graded and you can try multiple attempts.**
  35.  
  36. # %%
  37. # Import necessary libraries
  38. import numpy as np
  39. import pandas as pd
  40. import sklearn as sk
  41. import seaborn as sns
  42. from sklearn import tree
  43. import matplotlib.pyplot as plt
  44. from helper import plot_boundary
  45. from prettytable import PrettyTable
  46. from sklearn.tree import DecisionTreeClassifier
  47. from sklearn.model_selection import cross_val_score
  48.  
  49. pd.set_option('display.width', 100)
  50. pd.set_option('display.max_columns', 20)
  51. plt.rcParams["figure.figsize"] = (12,8)
  52.  
  53.  
  54. # %%
  55. # Read the data file "election_train.csv" as a Pandas dataframe
  56. elect_train = pd.read_csv("election_train.csv")
  57.  
  58. # Read the data file "election_test.csv" as a Pandas dataframe
  59. elect_test = pd.read_csv("election_test.csv")
  60.  
  61. # Take a quick look at the train data
  62. elect_train.head()
  63.  
  64.  
  65. # %%
  66. # Set the columns minority and bachelor as train data predictors
  67. X_train = elect_train[['minority','bachelor']]
  68.  
  69. # Set the columns minority and bachelor as test data predictors
  70. X_test = elect_test[['minority','bachelor']]
  71.  
  72. # Set the column "won" as the train response variable
  73. y_train = elect_train['won']
  74.  
  75. # Set the column "won" as the test response variable
  76. y_test = elect_test['won']
  77.  
  78.  
  79. # %%
  80. # Initialize a Decision Tree classifier with a depth of 2
  81. dt1 = DecisionTreeClassifier(max_depth=2)
  82.  
  83. # Fit the classifier on the train data
  84. dt1.fit(X_train, y_train)
  85.  
  86. # Initialize a Decision Tree classifier with a depth of 10
  87. dt2 = DecisionTreeClassifier(max_depth=10)
  88.  
  89. # Fit the classifier on the train data
  90. dt2.fit(X_train, y_train)
  91.  
  92.  
  93.  
  94.  
# %%
# Call the function plot_boundary from the helper file to get
# the decision boundaries of both the classifiers
# NOTE(review): plot_boundary is defined in the project-local helper.py;
# assumed signature is (dataframe, model_a, model_b) — confirm there.
plot_boundary(elect_train, dt1, dt2)
  99.  
  100.  
  101. # %%
  102. # Set of predictor columns
  103. pred_cols = ['minority','bachelor','hispanic','female','unemployed','income','nodegree','obesity','cancer']
  104.  
  105. # Use the columns above as the predictor data from the train data
  106. X_train = elect_train[pred_cols]
  107.  
  108. # Use the columns above as the predictor data from the test data
  109. X_test = elect_test[pred_cols]
  110.  
  111. # Initialize a Decision Tree classifier with a depth of 2
  112. dt1 = DecisionTreeClassifier(max_depth=2)
  113.  
  114. # Initialize a Decision Tree classifier with a depth of 10
  115. dt2 = DecisionTreeClassifier(max_depth=10)
  116.  
  117. # Initialize a Decision Tree classifier with a depth of 15
  118. dt3 = DecisionTreeClassifier(max_depth=15)
  119.  
  120. # Fit all the classifier on the train data
  121. dt1.fit(X_train, y_train)
  122. dt2.fit(X_train, y_train)
  123. dt3.fit(X_train, y_train)
  124.  
  125.  
  126. # %%
  127. ### edTest(test_accuracy) ###
  128.  
  129. # Compute the train and test accuracy for the first decision tree classifier of depth 2
  130. dt1_train_acc = dt1.score(X_train,y_train)
  131. dt1_test_acc = dt1.score(X_test,y_test)
  132.  
  133. # Compute the train and test accuracy for the second decision tree classifier of depth 10
  134. dt2_train_acc = dt2.score(X_train,y_train)
  135. dt2_test_acc = dt2.score(X_test, y_test)
  136.  
  137. # Compute the train and test accuracy for the third decision tree classifier of depth 15
  138. dt3_train_acc = dt3.score(X_train, y_train)
  139. dt3_test_acc = dt3.score(X_test,y_test)
  140.  
  141.  
  142. # %%
  143. # Helper code to plot the scores of each classifier as a table
  144. pt = PrettyTable()
  145. pt.field_names = ['Max Depth', 'Number of Features', 'Train Accuracy', 'Test Accuracy']
  146. pt.add_row([2, 2, round(dt1_train_acc, 4), round(dt1_test_acc,4)])
  147. pt.add_row([10, 2, round(dt2_train_acc,4), round(dt2_test_acc,4)])
  148. pt.add_row([15, len(pred_cols), round(dt3_train_acc,4), round(dt3_test_acc,4)])
  149. print(pt)
  150.  
  151.  
  152. # %%
  153.  
  154. +-----------+--------------------+----------------+---------------+
  155. | Max Depth | Number of Features | Train Accuracy | Test Accuracy |
  156. +-----------+--------------------+----------------+---------------+
  157. |     2     |         2          |     0.8924     |     0.8862    |
  158. |     10    |         2          |     0.9836     |     0.9024    |
  159. |     15    |         9          |     0.9981     |     0.8821    |
  160. +-----------+--------------------+----------------+---------------+
  161.  
  162.  
  163.  
  164.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement