# Decision tree classifier

# %% [markdown]
# ## Title :
# Classification using Decision Tree
#
# ## Description :
# The goal of this exercise is to get comfortable using Decision Trees for classification in sklearn.  Eventually, you will produce a plot similar to the one given below:
#
# <img src="fig1.png" style="width: 1000px;">
#
#
# ## Instructions:
#
# - Read the train and test data files as Pandas DataFrames.
# - Use `minority` and `bachelor` as the predictor variables and `won` as the response.
# - Fit a decision tree of depth 2 and another of depth 10 on the training data.
# - Call the function `plot_boundary` to visualise the decision boundary of these 2 classifiers.
# - Increase the number of predictor variables as mentioned in the scaffold.
# - Initialize decision tree classifiers of depth 2, 10 and 15.
# - Fit each model on the train data.
# - Compute the train and test accuracy scores for each classifier.
# - Use the helper code to look at the feature importance of the predictors from the decision tree of depth 15.
#
# ## Hints:
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html" target="_blank">sklearn.DecisionTreeClassifier()</a>
# Generates a Decision Tree classifier
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier.score" target="_blank">classifier.score()</a>
# Returns the mean accuracy of the classifier on the given data and labels.
#
# <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier.fit" target="_blank">classifier.fit()</a>
# Build a decision tree classifier from the training set (X, y).
#
# **Note: This exercise is auto-graded and you can try multiple attempts.**

# %%
# Import necessary libraries
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
from sklearn import tree
import matplotlib.pyplot as plt
from helper import plot_boundary
from prettytable import PrettyTable
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Notebook-wide display defaults: wider pandas console output, more visible
# columns, and a larger default matplotlib figure.
pd.options.display.width = 100
pd.options.display.max_columns = 20
plt.rcParams.update({"figure.figsize": (12, 8)})


# %%
# Load the train and test splits ("election_train.csv" / "election_test.csv")
# as Pandas dataframes, in that order
elect_train, elect_test = map(
    pd.read_csv,
    ("election_train.csv", "election_test.csv"),
)

# Preview the first five rows of the train data
elect_train.head(n=5)


# %%
# Predictors: the columns minority and bachelor, taken from the train and
# test dataframes respectively
X_train, X_test = (
    frame[['minority', 'bachelor']] for frame in (elect_train, elect_test)
)

# Response: the column "won", taken from the train and test dataframes
# respectively
y_train, y_test = (frame['won'] for frame in (elect_train, elect_test))


# %%
# Create two Decision Tree classifiers: a shallow one (depth 2) and a
# deep one (depth 10)
dt1, dt2 = DecisionTreeClassifier(max_depth=2), DecisionTreeClassifier(max_depth=10)

# Fit the depth-2 classifier on the train data
dt1.fit(X_train, y_train)

# Fit the depth-10 classifier on the train data
dt2.fit(X_train, y_train)


# %%
# Call the function plot_boundary from the helper file to get
# the decision boundaries of both the classifiers.
# At this point dt1 (max_depth=2) and dt2 (max_depth=10) have been fit on
# only the `minority` and `bachelor` columns of the train data.
# NOTE(review): plot_boundary lives in helper.py, which is not visible here;
# presumably it draws each tree's decision regions over the train data - confirm.
plot_boundary(elect_train, dt1, dt2)


# %%
# Expanded set of predictor columns
pred_cols = [
    'minority', 'bachelor', 'hispanic', 'female', 'unemployed',
    'income', 'nodegree', 'obesity', 'cancer',
]

# Rebuild the train and test predictor data from the columns above
X_train, X_test = elect_train[pred_cols], elect_test[pred_cols]

# Create three fresh Decision Tree classifiers with depths 2, 10 and 15
dt1, dt2, dt3 = (DecisionTreeClassifier(max_depth=depth) for depth in (2, 10, 15))

# Fit every classifier on the (expanded) train data
dt1.fit(X_train, y_train)
dt2.fit(X_train, y_train)
dt3.fit(X_train, y_train)


# %%
### edTest(test_accuracy) ###

# Train / test accuracy of the depth-2 decision tree classifier
dt1_train_acc, dt1_test_acc = dt1.score(X_train, y_train), dt1.score(X_test, y_test)

# Train / test accuracy of the depth-10 decision tree classifier
dt2_train_acc, dt2_test_acc = dt2.score(X_train, y_train), dt2.score(X_test, y_test)

# Train / test accuracy of the depth-15 decision tree classifier
dt3_train_acc, dt3_test_acc = dt3.score(X_train, y_train), dt3.score(X_test, y_test)


# %%
# Helper code to print the scores of each classifier as a table.
# The three classifiers scored above were all refit on the full `pred_cols`
# feature set, so every row reports len(pred_cols) features (the depth-2 and
# depth-10 rows previously hard-coded 2, which described the earlier
# two-feature models rather than the ones whose accuracies are shown).
pt = PrettyTable()
pt.field_names = ['Max Depth', 'Number of Features', 'Train Accuracy', 'Test Accuracy']
pt.add_row([2, len(pred_cols), round(dt1_train_acc, 4), round(dt1_test_acc, 4)])
pt.add_row([10, len(pred_cols), round(dt2_train_acc, 4), round(dt2_test_acc, 4)])
pt.add_row([15, len(pred_cols), round(dt3_train_acc, 4), round(dt3_test_acc, 4)])
print(pt)


# %%

# +-----------+--------------------+----------------+---------------+
# | Max Depth | Number of Features | Train Accuracy | Test Accuracy |
# +-----------+--------------------+----------------+---------------+
# |     2     |         2          |     0.8924     |     0.8862    |
# |     10    |         2          |     0.9836     |     0.9024    |
# |     15    |         9          |     0.9981     |     0.8821    |
# +-----------+--------------------+----------------+---------------+