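# Decision tree classifier implemented from scratch (binary splits, Gini
# impurity) on the Iris dataset, with its train/test accuracy compared
# against sklearn's DecisionTreeClassifier. Expects an 'iris.csv' file
# whose last column is the class label ('species').
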
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def gini_impurity(dataframe):
    # Gini impurity of a node: sum over classes of p * (1 - p),
    # which equals 1 - sum(p^2). The class label is the last column.
    class_values = dataframe.iloc[:, -1].unique()
    total_count = len(dataframe)
    gini = 0.0
    for class_value in class_values:
        class_count = len(dataframe[dataframe.iloc[:, -1] == class_value])
        class_probability = class_count / total_count
        gini += class_probability * (1 - class_probability)
    return gini


def find_first_split(sorted_dataframe, index_col):
    # Scan the rows (already sorted by the feature in column index_col) and
    # return the feature value at the first class-label change; return None
    # when the node is pure, i.e. no split point exists for this feature.
    for i in range(1, len(sorted_dataframe)):
        if sorted_dataframe.iloc[i, -1] != sorted_dataframe.iloc[i - 1, -1]:
            return sorted_dataframe.iloc[i, index_col]
    return None


def information_gain(dataframe, feature_name, first_split_value):
    # Gain = Gini(parent) - weighted average Gini of the two child subsets.
    total_gini_impurity = gini_impurity(dataframe)
    left_subset = dataframe[dataframe[feature_name] <= first_split_value]
    right_subset = dataframe[dataframe[feature_name] > first_split_value]
    left_gini_impurity = gini_impurity(left_subset)
    right_gini_impurity = gini_impurity(right_subset)
    return (total_gini_impurity
            - (len(left_subset) / len(dataframe)) * left_gini_impurity
            - (len(right_subset) / len(dataframe)) * right_gini_impurity)


class Node:
    def __init__(self, data, depth, col, val, class_):
        self.data = data        # subset of rows reaching this node
        self.depth = depth
        self.col = col          # split column name (NaN at leaves)
        self.val = val          # split threshold (NaN at leaves)
        self.class_ = class_    # majority class at this node
        self.left = None
        self.right = None

    @classmethod
    def build_binary_tree(cls, dataframe, min_samples_split, max_depth,
                          current_depth=0, current_col=np.nan,
                          current_val=np.nan, current_class=np.nan):
        # Stop and make a leaf when the node is too small or too deep.
        if len(dataframe) <= min_samples_split or current_depth >= max_depth:
            if len(dataframe) > 0:
                current_class = dataframe.iloc[:, -1].value_counts().idxmax()
            return cls(dataframe, current_depth, current_col, current_val, current_class)
        # Evaluate one candidate split per feature and keep the best gain.
        gain_rows = []
        for i in range(len(dataframe.columns) - 1):
            sorted_gini_df = dataframe.sort_values(by=dataframe.columns[i])
            first_split_value = find_first_split(sorted_gini_df, i)
            if first_split_value is None:  # node is pure along this feature
                continue
            feature_name = sorted_gini_df.columns[i]
            information_gain_value = information_gain(sorted_gini_df, feature_name, first_split_value)
            gain_rows.append([information_gain_value, feature_name, first_split_value])
        most_common_class = dataframe.iloc[:, -1].value_counts().idxmax()
        if not gain_rows:  # no valid split on any feature: make a leaf
            return cls(dataframe, current_depth, current_col, current_val, most_common_class)
        gain_df = pd.DataFrame(gain_rows, columns=['Gain_value', 'Column_name', 'Split_value'])
        max_gain_row_df = gain_df.loc[gain_df['Gain_value'].idxmax()]
        left_subset = dataframe[dataframe[max_gain_row_df['Column_name']] <= max_gain_row_df['Split_value']]
        right_subset = dataframe[dataframe[max_gain_row_df['Column_name']] > max_gain_row_df['Split_value']]
        node = cls(dataframe, current_depth, max_gain_row_df['Column_name'],
                   max_gain_row_df['Split_value'], most_common_class)
        node.left = cls.build_binary_tree(left_subset, min_samples_split, max_depth, current_depth + 1)
        node.right = cls.build_binary_tree(right_subset, min_samples_split, max_depth, current_depth + 1)
        return node

    def train(self, min_samples_split, max_depth):
        # Write each leaf's majority class into the global final_df so the
        # training predictions can be compared with the true labels later.
        if not self.data.empty:
            if len(self.data) <= min_samples_split or self.depth >= max_depth:
                self.data = self.data.copy()  # avoid mutating a shared slice
                self.data.iloc[:, -1] = self.class_
                common_index = final_df.index.intersection(self.data.index)
                final_df.loc[common_index] = self.data.loc[common_index]
            if (self.left is not None) and (not self.left.data.empty):
                self.left.train(min_samples_split, max_depth)
            if (self.right is not None) and (not self.right.data.empty):
                self.right.train(min_samples_split, max_depth)
        return self

    def predict(self, test_element):
        # Walk down the tree; at a leaf (or an empty branch) return the
        # majority class stored at this node.
        if self.left is None and self.right is None:
            return self.class_
        if test_element[self.col] <= self.val:
            branch = self.left
        else:
            branch = self.right
        if branch is None or branch.data.empty:
            return self.class_
        return branch.predict(test_element)

    def print_binary_tree(self, indent=""):
        if not self.data.empty:
            print(indent + "Depth:", self.depth)
            print(indent + "Data:", len(self.data), "samples")
            print(indent + "Column:", self.col, "; Split_Value: <=", self.val, "(Left=True | Right=False)")
            print(indent + "Predicted Class:", self.class_)
            if (self.left is not None) and (not self.left.data.empty):
                print(indent + "  Left:")
                self.left.print_binary_tree(indent + "    ")
            if (self.right is not None) and (not self.right.data.empty):
                print(indent + "  Right:")
                self.right.print_binary_tree(indent + "    ")


df = pd.read_csv('iris.csv')
X = df.drop(labels=df.columns[-1], axis=1)
Y = df[df.columns[-1]]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=10)

gini_df = pd.concat([x_train, y_train], axis=1)
final_df = gini_df.copy()  # receives the leaf predictions during train()
min_samples_split = 5
max_depth = 5

root_node = Node.build_binary_tree(gini_df, min_samples_split, max_depth)
root_node.train(min_samples_split, max_depth)
root_node.print_binary_tree()

# Training accuracy: compare the true labels with the leaf predictions.
gini_df['species_test'] = final_df['species']
gini_df['Matching'] = gini_df.species == gini_df.species_test
acc = round(gini_df['Matching'].sum() / len(gini_df['Matching']) * 100, 2)
gini_df.loc['Accuracy, %'] = ['' for _ in range(len(gini_df.columns))]
gini_df.iloc[-1, -1] = acc
print(gini_df)

# Test accuracy: route every test row through the tree.
test_df = pd.concat([x_test, y_test], axis=1)
species_test = test_df['species'].copy()
for i in range(len(x_test)):
    species_test.iloc[i] = root_node.predict(x_test.iloc[i])
test_df['species_test'] = species_test
test_df['Matching'] = test_df.species == species_test
acc1 = round(test_df['Matching'].sum() / len(test_df['Matching']) * 100, 2)
test_df.loc['Accuracy, %'] = ['' for _ in range(len(test_df.columns))]
test_df.iloc[-1, -1] = acc1

# Reference model: sklearn's DecisionTreeClassifier on the same split.
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
accuracy_train = round(clf.score(x_train, y_train) * 100, 2)
accuracy_test = round(clf.score(x_test, y_test) * 100, 2)
predict_y = clf.predict(x_test)
sk_match_y = predict_y == y_test
predict_y = np.append(predict_y, '')  # pad to match the extra accuracy row
sk_match_y = np.append(sk_match_y, accuracy_test)
test_df['sklearn_species'] = predict_y
test_df['sklearn_matching'] = sk_match_y
test_df['sklearn_matching'] = test_df['sklearn_matching'].replace(0.0, False)
test_df['sklearn_matching'] = test_df['sklearn_matching'].replace(1.0, True)

info_df = pd.DataFrame({'Decision Tree': [acc, acc1],
                        'sklearn Decision Tree': [accuracy_train, accuracy_test]},
                       index=['Model accuracy on training data, %',
                              'Model accuracy on test data, %'])
print(test_df)
print(info_df)