Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- from sklearn.model_selection import train_test_split
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.metrics import accuracy_score
def dist_df(x_train, x, k):
    """Return the index labels of the ``k`` rows of ``x_train`` nearest to ``x``.

    Distances are Euclidean and are always computed in floating point, so
    integer-typed feature columns are handled without relying on an implicit
    dtype upcast.

    Args:
        x_train: DataFrame of training features, one row per sample.
        x: Series holding a single query sample; its labels are aligned with
            the columns of ``x_train``.
        k: Number of nearest rows to keep.

    Returns:
        The ``pandas.Index`` of the (at most) ``k`` closest rows of
        ``x_train``, ordered by increasing distance.
    """
    # Subtract the query from every row in one step; ``axis='columns'``
    # aligns the query's labels with the feature columns, which is what the
    # row-by-row ``x - x_train.iloc[i]`` subtraction did.
    diffs = x_train.sub(x, axis='columns').to_numpy(dtype=float)
    distances = pd.Series(np.linalg.norm(diffs, axis=1), index=x_train.index)
    return distances.sort_values().iloc[:k].index
def kNN(X, Y, k=5):
    """Classify a held-out 30 % split of ``X`` with a k-nearest-neighbours vote.

    The data is split with ``train_test_split(test_size=0.30,
    random_state=10)``. Each test row is assigned the most frequent label
    among its ``k`` nearest training rows (as returned by ``dist_df``). When
    the top vote count is shared by several labels, the plain counts are
    replaced by inverse-squared-distance weights before picking the winner.

    Args:
        X: DataFrame of feature columns.
        Y: Series of class labels sharing the index of ``X``.
        k: Number of neighbours that vote for each test row.

    Returns:
        A copy of the test features with the added columns ``'y_test'``,
        ``'y_pred'`` and ``'Matching'``, followed by one extra row labelled
        ``'Accuracy, %'`` whose last cell holds the accuracy in percent
        (rounded to two decimals) and whose other cells are empty strings.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.30, random_state=10)

    predictions = []
    for _, query in x_test.iterrows():
        neighbour_idx = dist_df(x_train, query, k=k)
        # Look the neighbours up in the training split itself rather than in
        # a module-level frame, so the function only depends on X and Y.
        neighbour_labels = y_train.loc[neighbour_idx]
        votes = neighbour_labels.value_counts()

        if (votes == votes.max()).sum() > 1:
            # Tie for first place: weight every neighbour by
            # 1 / (distance**2 + 1e-5); the small constant avoids division
            # by zero when a neighbour coincides with the query.
            diffs = x_train.loc[neighbour_idx].sub(query, axis='columns')
            sq_dist = np.linalg.norm(diffs.to_numpy(dtype=float), axis=1) ** 2
            weights = 1 / (sq_dist + 10 ** -5)
            label_values = neighbour_labels.to_numpy()
            # Keep the order of ``votes`` so idxmax resolves exact ties
            # between weights the same way as it does for plain counts.
            votes = pd.Series(
                [weights[label_values == label].sum() for label in votes.index],
                index=votes.index,
            )

        predictions.append(votes.idxmax())

    y_pred = pd.Series(predictions, index=y_test.index)

    res_df = x_test.copy()
    res_df['y_test'] = y_test
    res_df['y_pred'] = y_pred
    res_df['Matching'] = res_df['y_test'] == res_df['y_pred']
    acc = round(res_df['Matching'].sum() / len(res_df['Matching']) * 100, 2)
    # Callers read the accuracy from the bottom-right cell of the result.
    res_df.loc['Accuracy, %'] = ['' for _ in range(len(res_df.columns))]
    res_df.iloc[-1, -1] = acc
    return res_df
# Load the data set and keep the first two feature columns plus the last
# column, which is used as the class label below.
iris = pd.read_csv('iris.csv')
df = iris.drop(columns=[iris.columns[2], iris.columns[3]])
# Alternative feature sets: drop only iris.columns[3], or use iris.copy()
# to keep every column.
X = df.drop(labels=df.columns[-1], axis=1)
Y = df[df.columns[-1]]

# Single run with k = 4 as a first look at the result table.
sol = kNN(X, Y, 4)
print(sol)

# Evaluate every k in [a, b]. The full result of each run is kept so the
# best one can be reused instead of being recomputed.
a = 1
b = 20
k_values = range(a, b + 1)
runs = {k: kNN(X, Y, k=k) for k in k_values}
df_best_k = pd.DataFrame(
    {'Accuracy, %': [runs[k].iloc[-1, -1] for k in k_values]},
    index=list(k_values),
)
df_best_k.index.names = ['k=']
print(df_best_k)

# Select by column label; positional ``[0]`` on the label-indexed result of
# DataFrame.idxmax() relies on a deprecated fallback.
best_k = df_best_k['Accuracy, %'].idxmax()
print(f'Найкраще підходе k = {best_k}')
best_sol = runs[best_k]
print(best_sol)
colors = ['b', 'g', 'orange']
species_names = np.unique(df.species)
# One fixed colour per species, shared by both figures, so a predicted class
# is always drawn in the colour of the species it names.
species_color = {name: colors[pos % len(colors)]
                 for pos, name in enumerate(species_names)}


def _scatter_known_species():
    """Draw every row of ``df`` coloured and labelled by its species."""
    for name in species_names:
        members = df[df.species == name]
        plt.scatter(members[df.columns[0]], members[df.columns[1]],
                    c=species_color[name], marker='o', label=f'{name}')


# Figure 1: all samples by species, with the test samples greyed out.
_scatter_known_species()
test_rows = df.loc[best_sol.index[:-1]]  # last row is the accuracy summary
plt.scatter(test_rows[df.columns[0]], test_rows[df.columns[1]],
            c='grey', marker='o', label='Вид ?')
plt.xlabel(f'{df.columns[0]}')
plt.ylabel(f'{df.columns[1]}')
plt.legend()
plt.show()

# Figure 2: the same samples, with the test samples coloured by prediction.
_scatter_known_species()
best_sol = best_sol.drop(best_sol.index[-1])  # remove the accuracy row
for name in species_names:
    predicted = best_sol[best_sol.y_pred == name]
    if predicted.empty:
        continue
    # The accuracy row turned the feature columns into object dtype, so
    # convert back to float before plotting.
    plt.scatter(predicted[best_sol.columns[0]].astype(float),
                predicted[best_sol.columns[1]].astype(float),
                c=species_color[name], marker='o')
plt.xlabel(f'{df.columns[0]}')
plt.ylabel(f'{df.columns[1]}')
plt.legend()
plt.show()
# Reference run: scikit-learn's classifier on the same split (same
# test_size and random_state as used inside kNN).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=10)

sk_accuracy = []
for n_neighbors in range(a, b + 1):
    model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(x_train, y_train)
    score = accuracy_score(y_test, model.predict(x_test))
    sk_accuracy.append(round(score * 100, 2))

df_best_k_sk = pd.DataFrame({'Accuracy sklearn, %': sk_accuracy},
                            index=list(range(a, b + 1)))
df_best_k_sk.index.names = ['k=']
print(df_best_k_sk)

# Select by column label; positional ``[0]`` on the label-indexed result of
# DataFrame.idxmax() relies on a deprecated fallback.
best_k_sk = df_best_k_sk['Accuracy sklearn, %'].idxmax()
print(f'Найкраще підходе k = {best_k_sk}')

# Put both accuracy columns side by side and plot them against k.
df_best_k = pd.concat([df_best_k, df_best_k_sk], axis=1)
print(df_best_k)
plt.plot(df_best_k.index, df_best_k[df_best_k.columns[0]], label='my kNN', lw=2)
plt.plot(df_best_k.index, df_best_k[df_best_k.columns[1]], label='sklearn kNN')
plt.xlabel('k')
plt.ylabel('Accuracy, %')
plt.legend()
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement