mirosh111000

The kNN Method

Oct 1st, 2023
Python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


def dist_df(x_train, x, k):
    """Return the indices of the k training points closest to x (Euclidean distance)."""

    info_df = x_train.copy()
    info_df['Dist'] = np.zeros_like(x_train[x_train.columns[0]])

    # Euclidean distance from x to every training point
    for i in range(len(x_train)):
        info_df.iloc[i, -1] = np.linalg.norm(x - x_train.iloc[i])

    # keep only the k nearest rows and return their original indices
    info_df = info_df.sort_values(by='Dist')
    info_df = info_df.iloc[:k]

    return info_df.index


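# A vectorized alternative to the row-by-row loop in dist_df (a sketch only,
# not called anywhere below): np.linalg.norm with axis=1 computes every
# training distance in one pass and argsort picks the k smallest.
def dist_df_vectorized(x_train, x, k):
    dists = np.linalg.norm(x_train.values - np.asarray(x, dtype=float), axis=1)
    return x_train.index[np.argsort(dists)[:k]]

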
def kNN(X, Y, k=5):
    """k-nearest-neighbour classification with an inverse-squared-distance
    tie-break; neighbour labels are looked up in the global df."""

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=10)
    y_pred = y_test.copy()
    y_pred[:] = np.nan

    for i in range(len(y_test)):

        # k nearest training points and the class counts among them
        indexes_to_select = dist_df(x_train, x_test.iloc[i], k=k)
        selected_rows = df.loc[indexes_to_select]
        counts = selected_rows[selected_rows.columns[-1]].value_counts()

        # two or more classes tie on the raw count: re-score every class present
        # among the neighbours by the sum of inverse squared distances to its
        # points, so that closer neighbours weigh more
        if (counts == counts.max()).sum() > 1:

            counts = counts.astype(float)
            idxs = counts.index

            for j in range(len(idxs)):

                filtered_df = selected_rows[selected_rows[selected_rows.columns[-1]] == idxs[j]]
                counts.iloc[j] = 0

                for l in range(len(filtered_df)):
                    counts.iloc[j] += 1 / ((np.linalg.norm(x_test.iloc[i] - filtered_df.iloc[l, :-1]))**2 + 10**-5)

        y_pred.iloc[i] = counts.idxmax()

    # assemble a result table: test features, true and predicted labels,
    # a match flag and an overall accuracy row at the bottom
    res_df = x_test.copy()
    res_df['y_test'] = y_test
    res_df['y_pred'] = y_pred

    res_df['Matching'] = res_df['y_test'] == res_df['y_pred']
    acc = round(res_df['Matching'].sum() / len(res_df['Matching']) * 100, 2)
    res_df.loc['Accuracy, %'] = ['' for i in range(len(res_df.columns))]
    res_df.iloc[-1, -1] = acc

    return res_df

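# The tie-break above is close in spirit to distance-weighted voting; the
# scikit-learn analogue is the weights='distance' option. Sketch only - this
# classifier is never used below, where the comparison keeps uniform weights.
weighted_clf_sketch = KNeighborsClassifier(n_neighbors=5, weights='distance')
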
iris = pd.read_csv('iris.csv')

# keep only the first two feature columns (plus the species label);
# the commented lines switch to three or all four features
df = iris.drop(columns=[iris.columns[2], iris.columns[3]])
# df = iris.drop(columns=[iris.columns[3]])
# df = iris.copy()

X = df.drop(labels=df.columns[-1], axis=1)
Y = df[df.columns[-1]]

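# The CSV is assumed to hold four numeric feature columns followed by a
# 'species' column (the attribute access df.species below relies on that name).
# If iris.csv is not at hand, a roughly equivalent frame can be built from
# scikit-learn (sketch only, not called; integer targets stand in for the
# species strings):
def load_iris_fallback():
    from sklearn.datasets import load_iris
    data = load_iris(as_frame=True)
    return data.frame.rename(columns={'target': 'species'})
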
sol = kNN(X, Y, 4)

print(sol)

# accuracy of the custom kNN for every k in [a, b]
a = 1
b = 20
df_best_k = pd.DataFrame({'Accuracy, %': [kNN(X, Y, k=i).iloc[-1, -1] for i in range(a, b+1)]},
                         index=[i for i in range(a, b+1)])
df_best_k.index.names = ['k=']

print(df_best_k)

best_k = df_best_k.idxmax().iloc[0]
print(f'Best k = {best_k}')

best_sol = kNN(X, Y, k=best_k)

print(best_sol)



colors = ['b', 'g', 'orange']

# all points coloured by their true species, with the test points
# (everything the best model was asked to predict) overlaid in grey
for i in range(len(np.unique(df.species))):
    plt.scatter(df[df.species == (np.unique(df.species))[i]][df.columns[0]],
                df[df.species == (np.unique(df.species))[i]][df.columns[1]],
                c=colors[i], marker='o', label=f'{(np.unique(df.species))[i]}')

plt.scatter(df.loc[best_sol.index[:-1]][df.columns[0]], df.loc[best_sol.index[:-1]][df.columns[1]],
            c='grey', marker='o', label='Species ?')
plt.xlabel(f'{df.columns[0]}')
plt.ylabel(f'{df.columns[1]}')
plt.legend()
plt.show()


# the same scatter, but now the test points are overplotted in the colour
# of the species the best model predicted for them
for i in range(len(np.unique(df.species))):
    plt.scatter(df[df.species == (np.unique(df.species))[i]][df.columns[0]],
                df[df.species == (np.unique(df.species))[i]][df.columns[1]],
                c=colors[i], marker='o', label=f'{(np.unique(df.species))[i]}')

# drop the 'Accuracy, %' summary row before plotting the predictions
best_sol = best_sol.drop(best_sol.index[-1])
for i in range(len(np.unique(best_sol.y_pred))):
    plt.scatter(best_sol[best_sol.y_pred == (np.unique(best_sol.y_pred))[i]][best_sol.columns[0]],
                best_sol[best_sol.y_pred == (np.unique(best_sol.y_pred))[i]][best_sol.columns[1]],
                c=colors[i], marker='o')

plt.xlabel(f'{df.columns[0]}')
plt.ylabel(f'{df.columns[1]}')
plt.legend()
plt.show()



# the same accuracy-vs-k sweep with scikit-learn's KNeighborsClassifier
# on an identical 70/30 split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=10)

sk_accuracies = [
    round(accuracy_score(y_test,
                         KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train).predict(x_test)) * 100, 2)
    for i in range(a, b+1)
]
df_best_k_sk = pd.DataFrame({'Accuracy sklearn, %': sk_accuracies},
                            index=[i for i in range(a, b+1)])
df_best_k_sk.index.names = ['k=']

print(df_best_k_sk)

best_k_sk = df_best_k_sk.idxmax().iloc[0]
print(f'Best k = {best_k_sk}')

# side-by-side table and plot of both accuracy curves
df_best_k = pd.concat([df_best_k, df_best_k_sk], axis=1)

print(df_best_k)

plt.plot(df_best_k.index, df_best_k[df_best_k.columns[0]], label='my kNN', lw=2)
plt.plot(df_best_k.index, df_best_k[df_best_k.columns[1]], label='sklearn kNN')
plt.xlabel('k')
plt.ylabel('Accuracy, %')
plt.legend()
plt.show()
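

# A single 70/30 split makes both curves above somewhat noisy; a sketch of a
# cross-validated accuracy estimate for a given k (not part of the comparison
# above, defined here only as an illustration):
def cv_accuracy_for_k(k, folds=5):
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, Y, cv=folds)
    return round(scores.mean() * 100, 2)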