import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score
from joblib import Parallel, delayed
from IPython.display import Latex, display
class RandomForestClassifierParallel:
    """Random forest built from bootstrapped decision trees, fitted in parallel with joblib."""

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, n_jobs=-1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_jobs = n_jobs
        self.estimators_ = []

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)

        def fit_tree(X, y):
            # Bootstrap sample: draw len(X) rows with replacement (bagging).
            idx = np.random.choice(len(X), len(X), replace=True)
            X_subset, y_subset = X[idx], y[idx]
            tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split)
            tree.fit(X_subset, y_subset)
            return tree

        # Train the trees in parallel; each worker fits on its own bootstrap sample.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(fit_tree)(X, y) for _ in range(self.n_estimators))
        return self

    def predict(self, X):
        X = np.asarray(X)

        def predict_tree(tree, X):
            return tree.predict(X)

        # Collect per-tree predictions in parallel: shape (n_estimators, n_samples).
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(predict_tree)(tree, X) for tree in self.estimators_)
        # Transpose so each row holds all trees' votes for one sample, then take the majority vote.
        y_pred = np.array(predictions).T
        majority_votes = []
        for row in y_pred:
            unique_elements, counts = np.unique(row, return_counts=True)
            majority_votes.append(unique_elements[np.argmax(counts)])
        return np.array(majority_votes)
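
# Quick sanity check of the parallel ensemble on synthetic two-moon data.
# This is only an illustrative sketch: sklearn's make_moons stands in for the
# Moons.csv file used below, which is assumed to hold similar 2-D data.
from sklearn.datasets import make_moons

X_demo, y_demo = make_moons(n_samples=500, noise=0.3, random_state=0)
demo_rf = RandomForestClassifierParallel(n_estimators=25, max_depth=5, n_jobs=-1)
demo_rf.fit(X_demo, y_demo)
demo_pred = demo_rf.predict(X_demo)
assert demo_pred.shape == y_demo.shape
print('Sanity-check accuracy:', accuracy_score(y_demo, demo_pred))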
def visualize_data(X, Y, target, y_pred=None, model=None, title=r'$Initial\ Data$'):
    color_ = ['b', 'orange', 'green']
    marker_ = ["o", "s", "D"]
    df = pd.concat([X, pd.DataFrame(Y, columns=['target'], index=X.index)], axis=1)
    plt.figure(figsize=(10, 6))
    classes = np.unique(Y)
    # Scatter each class with its own colour and marker.
    for i, value in enumerate(classes):
        plt.scatter(X.iloc[Y == value, 0], X.iloc[Y == value, 1],
                    c=color_[i], marker=marker_[i], label=fr'${target[i]}$')
    if y_pred is not None:
        # Highlight misclassified points: marker of the true class, edge colour of the predicted class.
        df = pd.concat([df, pd.DataFrame(y_pred, columns=['pred'], index=X.index)], axis=1)
        df_miss = df.loc[df['pred'] != df['target']]
        for t_i, t in enumerate(classes):
            for p_i, p in enumerate(classes):
                if t == p:
                    continue
                df_miss_i = df_miss.loc[(df_miss['target'] == t) & (df_miss['pred'] == p)]
                if len(df_miss_i) > 0:
                    plt.scatter(df_miss_i.iloc[:, 0], df_miss_i.iloc[:, 1],
                                c='black', marker=marker_[t_i], edgecolors=color_[p_i],
                                label=fr'$Misclassified\ {target[t_i]}\ as\ {target[p_i]}$')
    if model is not None:
        # Draw the model's decision boundary on a dense grid around the data.
        n = 1
        x_min, x_max = X.iloc[:, 0].min() - n, X.iloc[:, 0].max() + n
        y_min, y_max = X.iloc[:, 1].min() - n, X.iloc[:, 1].max() + n
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, alpha=0.35, cmap=plt.cm.coolwarm)
    plt.xlabel(r'$x$')
    plt.ylabel(r'$y$')
    plt.title(title)
    plt.grid(True)
    plt.legend()
    plt.show()
def quality_diagram(n_estimators, accuracy, title=''):
    plt.figure(figsize=(10, 6))
    plt.plot(n_estimators, accuracy, c='red', marker='o',
             markeredgecolor='blue', markerfacecolor='blue')
    plt.xlabel(r'$Number\ of\ base\ estimators$')
    plt.ylabel(r'$Ensemble\ accuracy,\ \%$')
    plt.title(fr'${title}$')
    plt.grid(True)
    plt.show()
# Load the two-class "moons" dataset; the first CSV column is just a row index.
df = pd.read_csv('Moons.csv')
df = df.iloc[:, 1:]
df.columns = ['x', 'y', 'target']
df.loc[df.target == 0, 'target'] = -1  # relabel classes as {-1, 1}

X = df.drop(labels=df.columns[-1], axis=1)
Y = df[df.columns[-1]].values
Y_unique = np.unique(Y)
visualize_data(X, Y, Y_unique)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

n_estimators = [1, 5, 25, 50, 100, 250, 500, 750, 1000]
train_accuracy, train_accuracy_sk = [], []
test_accuracy, test_accuracy_sk = [], []

for n in n_estimators:
    # Custom parallel random forest
    rf = RandomForestClassifierParallel(n_estimators=n, max_depth=5, min_samples_split=2, n_jobs=-1)
    rf.fit(x_train, y_train)

    y_pred = rf.predict(x_train)
    visualize_data(x_train, y_train, Y_unique, y_pred, rf, fr'$Train\ Data\ (n\_estimators={n})$')
    accuracy = accuracy_score(y_train, y_pred) * 100
    train_accuracy.append(accuracy)
    display(Latex(fr'$RandomForestClassifier\ Train\ Accuracy: {round(accuracy, 2)}\%$'))

    y_pred = rf.predict(x_test)
    visualize_data(x_test, y_test, Y_unique, y_pred, rf, fr'$Test\ Data\ (n\_estimators={n})$')
    accuracy = accuracy_score(y_test, y_pred) * 100
    test_accuracy.append(accuracy)
    display(Latex(fr'$RandomForestClassifier\ Test\ Accuracy: {round(accuracy, 2)}\%$'))

    # Sklearn reference implementation
    rf_classifier = RFC(n_estimators=n, max_depth=5, min_samples_split=2, n_jobs=-1)
    rf_classifier.fit(x_train, y_train)

    y_pred = rf_classifier.predict(x_train)
    visualize_data(x_train, y_train, Y_unique, y_pred, rf_classifier, fr'$Sklearn\ Train\ Data\ (n\_estimators={n})$')
    accuracy = accuracy_score(y_train, y_pred) * 100
    train_accuracy_sk.append(accuracy)
    display(Latex(fr'$Sklearn\ RandomForestClassifier\ Train\ Accuracy: {round(accuracy, 2)}\%$'))

    y_pred = rf_classifier.predict(x_test)
    visualize_data(x_test, y_test, Y_unique, y_pred, rf_classifier, fr'$Sklearn\ Test\ Data\ (n\_estimators={n})$')
    accuracy = accuracy_score(y_test, y_pred) * 100
    test_accuracy_sk.append(accuracy)
    display(Latex(fr'$Sklearn\ RandomForestClassifier\ Test\ Accuracy: {round(accuracy, 2)}\%$'))

quality_diagram(n_estimators, train_accuracy,
                title=r'Ensemble\ accuracy\ vs.\ number\ of\ base\ estimators\ (Train\ Data)')
quality_diagram(n_estimators, test_accuracy,
                title=r'Ensemble\ accuracy\ vs.\ number\ of\ base\ estimators\ (Test\ Data)')
quality_diagram(n_estimators, train_accuracy_sk,
                title=r'Ensemble\ accuracy\ vs.\ number\ of\ base\ estimators\ (Sklearn\ Train\ Data)')
quality_diagram(n_estimators, test_accuracy_sk,
                title=r'Ensemble\ accuracy\ vs.\ number\ of\ base\ estimators\ (Sklearn\ Test\ Data)')
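
# Illustrative timing sketch: compares fitting the custom forest with all cores
# (n_jobs=-1) against a single worker (n_jobs=1) to show what the joblib
# parallelism buys; the chosen ensemble size is an arbitrary example value.
from time import perf_counter

for jobs in (1, -1):
    rf_timed = RandomForestClassifierParallel(n_estimators=250, max_depth=5, n_jobs=jobs)
    start = perf_counter()
    rf_timed.fit(x_train, y_train)
    print(f'n_jobs={jobs}: fit took {perf_counter() - start:.2f} s')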