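# Decision tree classifier implemented from scratch (binary splits, Gini
# impurity) on the Iris dataset, with its train/test accuracy compared
# against sklearn's DecisionTreeClassifier. Expects an 'iris.csv' file
# whose last column is the class label ('species').
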
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def gini_impurity(dataframe):
    # Gini impurity of a node: sum over classes of p * (1 - p),
    # which equals 1 - sum(p^2). The class label is the last column.
    class_values = dataframe.iloc[:, -1].unique()
    total_count = len(dataframe)
    gini = 0.0
    for class_value in class_values:
        class_count = len(dataframe[dataframe.iloc[:, -1] == class_value])
        class_probability = class_count / total_count
        gini += class_probability * (1 - class_probability)
    return gini


def find_first_split(sorted_dataframe, index_col):
    # Scan the rows (already sorted by the feature in column index_col) and
    # return the feature value at the first class-label change; return None
    # when the node is pure, i.e. no split point exists for this feature.
    for i in range(1, len(sorted_dataframe)):
        if sorted_dataframe.iloc[i, -1] != sorted_dataframe.iloc[i - 1, -1]:
            return sorted_dataframe.iloc[i, index_col]
    return None


def information_gain(dataframe, feature_name, first_split_value):
    # Gain = Gini(parent) - weighted average Gini of the two child subsets.
    total_gini_impurity = gini_impurity(dataframe)
    left_subset = dataframe[dataframe[feature_name] <= first_split_value]
    right_subset = dataframe[dataframe[feature_name] > first_split_value]
    left_gini_impurity = gini_impurity(left_subset)
    right_gini_impurity = gini_impurity(right_subset)
    return (total_gini_impurity
            - (len(left_subset) / len(dataframe)) * left_gini_impurity
            - (len(right_subset) / len(dataframe)) * right_gini_impurity)


class Node:
    def __init__(self, data, depth, col, val, class_):
        self.data = data        # subset of rows reaching this node
        self.depth = depth
        self.col = col          # split column name (NaN at leaves)
        self.val = val          # split threshold (NaN at leaves)
        self.class_ = class_    # majority class at this node
        self.left = None
        self.right = None

    @classmethod
    def build_binary_tree(cls, dataframe, min_samples_split, max_depth,
                          current_depth=0, current_col=np.nan,
                          current_val=np.nan, current_class=np.nan):
        # Stop and make a leaf when the node is too small or too deep.
        if len(dataframe) <= min_samples_split or current_depth >= max_depth:
            if len(dataframe) > 0:
                current_class = dataframe.iloc[:, -1].value_counts().idxmax()
            return cls(dataframe, current_depth, current_col, current_val, current_class)
        # Evaluate one candidate split per feature and keep the best gain.
        gain_rows = []
        for i in range(len(dataframe.columns) - 1):
            sorted_gini_df = dataframe.sort_values(by=dataframe.columns[i])
            first_split_value = find_first_split(sorted_gini_df, i)
            if first_split_value is None:  # node is pure along this feature
                continue
            feature_name = sorted_gini_df.columns[i]
            information_gain_value = information_gain(sorted_gini_df, feature_name, first_split_value)
            gain_rows.append([information_gain_value, feature_name, first_split_value])
        most_common_class = dataframe.iloc[:, -1].value_counts().idxmax()
        if not gain_rows:  # no valid split on any feature: make a leaf
            return cls(dataframe, current_depth, current_col, current_val, most_common_class)
        gain_df = pd.DataFrame(gain_rows, columns=['Gain_value', 'Column_name', 'Split_value'])
        max_gain_row_df = gain_df.loc[gain_df['Gain_value'].idxmax()]
        left_subset = dataframe[dataframe[max_gain_row_df['Column_name']] <= max_gain_row_df['Split_value']]
        right_subset = dataframe[dataframe[max_gain_row_df['Column_name']] > max_gain_row_df['Split_value']]
        node = cls(dataframe, current_depth, max_gain_row_df['Column_name'],
                   max_gain_row_df['Split_value'], most_common_class)
        node.left = cls.build_binary_tree(left_subset, min_samples_split, max_depth, current_depth + 1)
        node.right = cls.build_binary_tree(right_subset, min_samples_split, max_depth, current_depth + 1)
        return node

    def train(self, min_samples_split, max_depth):
        # Write each leaf's majority class into the global final_df so the
        # training predictions can be compared with the true labels later.
        if not self.data.empty:
            if len(self.data) <= min_samples_split or self.depth >= max_depth:
                self.data = self.data.copy()  # avoid mutating a shared slice
                self.data.iloc[:, -1] = self.class_
                common_index = final_df.index.intersection(self.data.index)
                final_df.loc[common_index] = self.data.loc[common_index]
            if (self.left is not None) and (not self.left.data.empty):
                self.left.train(min_samples_split, max_depth)
            if (self.right is not None) and (not self.right.data.empty):
                self.right.train(min_samples_split, max_depth)
        return self

    def predict(self, test_element):
        # Walk down the tree; at a leaf (or an empty branch) return the
        # majority class stored at this node.
        if self.left is None and self.right is None:
            return self.class_
        if test_element[self.col] <= self.val:
            branch = self.left
        else:
            branch = self.right
        if branch is None or branch.data.empty:
            return self.class_
        return branch.predict(test_element)

    def print_binary_tree(self, indent=""):
        if not self.data.empty:
            print(indent + "Depth:", self.depth)
            print(indent + "Data:", len(self.data), "samples")
            print(indent + "Column:", self.col, "; Split_Value: <=", self.val, "(Left=True | Right=False)")
            print(indent + "Predicted Class:", self.class_)
            if (self.left is not None) and (not self.left.data.empty):
                print(indent + "  Left:")
                self.left.print_binary_tree(indent + "    ")
            if (self.right is not None) and (not self.right.data.empty):
                print(indent + "  Right:")
                self.right.print_binary_tree(indent + "    ")


df = pd.read_csv('iris.csv')
X = df.drop(labels=df.columns[-1], axis=1)
Y = df[df.columns[-1]]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=10)

gini_df = pd.concat([x_train, y_train], axis=1)
final_df = gini_df.copy()  # receives the leaf predictions during train()
min_samples_split = 5
max_depth = 5

root_node = Node.build_binary_tree(gini_df, min_samples_split, max_depth)
root_node.train(min_samples_split, max_depth)
root_node.print_binary_tree()

# Training accuracy: compare the true labels with the leaf predictions.
gini_df['species_test'] = final_df['species']
gini_df['Matching'] = gini_df.species == gini_df.species_test
acc = round(gini_df['Matching'].sum() / len(gini_df['Matching']) * 100, 2)
gini_df.loc['Accuracy, %'] = ['' for _ in range(len(gini_df.columns))]
gini_df.iloc[-1, -1] = acc
print(gini_df)

# Test accuracy: route every test row through the tree.
test_df = pd.concat([x_test, y_test], axis=1)
species_test = test_df['species'].copy()
for i in range(len(x_test)):
    species_test.iloc[i] = root_node.predict(x_test.iloc[i])
test_df['species_test'] = species_test
test_df['Matching'] = test_df.species == species_test
acc1 = round(test_df['Matching'].sum() / len(test_df['Matching']) * 100, 2)
test_df.loc['Accuracy, %'] = ['' for _ in range(len(test_df.columns))]
test_df.iloc[-1, -1] = acc1

# Reference model: sklearn's DecisionTreeClassifier on the same split.
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
accuracy_train = round(clf.score(x_train, y_train) * 100, 2)
accuracy_test = round(clf.score(x_test, y_test) * 100, 2)
predict_y = clf.predict(x_test)
sk_match_y = predict_y == y_test
predict_y = np.append(predict_y, '')  # pad to match the extra accuracy row
sk_match_y = np.append(sk_match_y, accuracy_test)
test_df['sklearn_species'] = predict_y
test_df['sklearn_matching'] = sk_match_y
test_df['sklearn_matching'] = test_df['sklearn_matching'].replace(0.0, False)
test_df['sklearn_matching'] = test_df['sklearn_matching'].replace(1.0, True)

info_df = pd.DataFrame({'Decision Tree': [acc, acc1],
                        'sklearn Decision Tree': [accuracy_train, accuracy_test]},
                       index=['Model accuracy on training data, %',
                              'Model accuracy on test data, %'])
print(test_df)
print(info_df)