mirosh111000

K-Means

Feb 4th, 2024
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, v_measure_score, silhouette_score, accuracy_score

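# K-Means clustering written from scratch (NumPy only), applied to PCA-reduced
# make_blobs data and compared against sklearn's KMeans on a train/test split.
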
class K_Means:
    # A from-scratch K-Means (Lloyd's algorithm): repeatedly assign each point to its
    # nearest centroid and move each centroid to the mean of its points, until the
    # centroids stop moving (shift below `tol`) or `max_iters` is reached.

    def __init__(self, k=3, max_iters=10000, tol=1e-4):
        self.k = k                  # number of clusters
        self.max_iters = max_iters  # upper bound on the assign/update iterations
        self.tol = tol              # convergence threshold on the centroid shift
        self.centroids = None
        self.sse = 0

    def fit_predict(self, X):
        X = np.copy(X)
        np.random.seed(42)  # fixed seed so the random initialization is reproducible

        # initialize the centroids with k distinct points drawn from the data
        self.centroids = X[np.random.choice(len(X), self.k, replace=False)]

        for _ in range(self.max_iters):

            labels = self.assign_clusters(X)
            new_centroids = self.calculate_centroids(X, labels)

            # stop once the centroids barely move between iterations
            if np.linalg.norm(new_centroids - self.centroids) < self.tol:
                break

            self.centroids = new_centroids

        self.sse = self.calculate_sse(X, labels)

        return labels

    def assign_clusters(self, X):
        # Euclidean distance from every point to every centroid;
        # each point gets the index of its nearest centroid
        distances = np.linalg.norm(X[:, np.newaxis, :] - self.centroids, axis=2)
        labels = np.argmin(distances, axis=1)

        return labels

    def calculate_centroids(self, X, labels):
        # new centroid = mean of the points assigned to that cluster
        # (note: an empty cluster would produce a NaN centroid)
        centroids = np.array([X[labels == i].mean(axis=0) for i in range(self.k)])

        return centroids

    def calculate_sse(self, X, labels):
        # sum of squared distances of every point to its own cluster centroid
        sse = 0

        for i in range(self.k):

            cluster_points = X[labels == i]
            centroid = self.centroids[i]
            sse += np.sum((cluster_points - centroid) ** 2)

        return sse

    def predict(self, X):
        # assign new points to the nearest of the already fitted centroids
        X = np.copy(X)
        distances = np.linalg.norm(X[:, np.newaxis, :] - self.centroids, axis=2)
        labels = np.argmin(distances, axis=1)

        return labels

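# A minimal usage sketch of the class above (illustrative only; the `toy_*` names
# and values are made up and are not part of the original pipeline):
#
#   toy = np.array([[0.0, 0.1], [0.2, 0.0], [5.0, 5.2], [5.1, 4.9]])
#   toy_km = K_Means(k=2)
#   toy_labels = toy_km.fit_predict(toy)        # two clusters; id order may vary
#   toy_km.predict(np.array([[5.0, 5.0]]))      # nearest-centroid label for a new point
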
def reformat(df1, pred):
    # Build a mapping from predicted cluster ids to temporary names 'tr0', 'tr1', ...
    # Clusters are ranked by the mean of the true 'target' value of their members,
    # so cluster ids can later be matched to class ids before computing accuracy.
    df = df1.copy()
    df['pred'] = pred
    n = len(np.unique(df['pred']))
    arr = []
    idx = []

    for i in range(n):

        # mean true label of the points assigned to cluster i
        num = df[df['pred'] == i].describe().loc['mean', 'target']
        arr.append(num)
        idx.append(i)

    calc = pd.DataFrame({'i': idx, 'num': arr})
    calc = calc.sort_values(by='num')
    calc['replace'] = [f'tr{i}' for i in range(n)]
    calc.index = [i for i in range(n)]
    zero = calc.loc[(calc['i'] == 0), 'replace'].values[0]
    calc = calc.sort_values(by='i')
    my_dict = {0: zero}

    for i in range(1, n):

        zero = calc.loc[(calc['i'] == i), 'replace'].values[0]
        my_dict[i] = zero

    return my_dict

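# Illustration with hypothetical numbers: if three clusters have mean 'target'
# values of 2.1, 0.3 and 1.2, reformat(...) returns {0: 'tr2', 1: 'tr0', 2: 'tr1'}.
# The intermediate 'tr*' strings exist only to avoid collisions when the ids are
# mapped back to integers in the second replace() step used below.
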
def plot_clusters(X, centroids, labels, true_labels, title='K-means Clustering'):
    # Overlay the true classes (large squares), the predicted clusters (small circles)
    # and the fitted centroids (red crosses) in the 2-D PCA space.
    X = np.copy(X)
    true_labels = np.copy(true_labels)

    plt.figure(figsize=(10, 6))
    plt.scatter(X[:, 0], X[:, 1], c=true_labels, cmap='plasma', marker='s', s=100, edgecolors='black', label='true')
    plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='plasma', alpha=1, edgecolors='w', label='predict')
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='X', s=150, edgecolors='black', label='Centroids')
    plt.title(title)
    plt.xlabel(f'{data.columns[0]}')  # note: relies on the global `data` DataFrame for axis names
    plt.ylabel(f'{data.columns[1]}')
    plt.grid()
    plt.legend()
    plt.show()

def find_best_k(data, a=2, b=15):
    # Fit K_Means for k = a..b and record SSE (elbow method), V-measure (needs the
    # true labels) and the silhouette score for each k.
    inertia = []
    vmeasure_scores = []
    silhouette_scores = []

    k_values = range(a, b + 1)
    for k in k_values:
        kmeans = K_Means(k=k, max_iters=100, tol=1e-4)
        labels = kmeans.fit_predict(data[data.columns[0:2]])
        inertia.append(kmeans.sse)
        score = v_measure_score(data['target'], labels)
        vmeasure_scores.append(score)
        silhouette_avg = silhouette_score(data[data.columns[0:2]], labels)
        silhouette_scores.append(silhouette_avg)

    SSE_df = pd.DataFrame({'SSE': inertia, 'V_measure': vmeasure_scores, 'Silhouette': silhouette_scores}, index=k_values)
    SSE_df.index.names = ['k']
    # the best k is taken as the one that maximizes the (supervised) V-measure
    best_k = SSE_df['V_measure'].idxmax()

    plt.figure(figsize=(10, 7))
    plt.plot(k_values, inertia, marker='o')
    plt.title('Elbow method for choosing the optimal k')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('SSE')
    plt.grid()
    plt.show()

    plt.figure(figsize=(10, 7))
    plt.plot(k_values, vmeasure_scores, marker='o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('V_measure_score')
    plt.title('V-measure score for choosing the optimal k')
    plt.grid(True)
    plt.show()

    plt.figure(figsize=(10, 7))
    plt.plot(k_values, silhouette_scores, marker='o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Silhouette')
    plt.title('Clustering quality assessed with the silhouette score')
    plt.grid()
    plt.show()

    print(SSE_df, f'\n\nbest_k: {best_k}\n\n')

    return best_k

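# The three curves summarize the same fits from different angles: SSE is the
# K-means objective J = sum_i sum_{x in cluster i} ||x - mu_i||^2, which always
# shrinks as k grows (look for the "elbow"); V-measure compares the clusters with
# the known targets, so it needs labels; the silhouette uses only cluster geometry.
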

# Generate 6 Gaussian blobs in 5 dimensions and project them to 2-D with PCA so
# the clusters can be plotted.
X, y = make_blobs(
    n_samples=250,
    n_features=5,
    centers=6,
    cluster_std=2,
    random_state=42
)

pca = PCA(n_components=2)
X = pca.fit_transform(X)
data = pd.DataFrame(X)
data['target'] = y
sns.pairplot(data, hue='target', palette='dark')
plt.show()
print(data)

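# `data` now holds three columns: 0 and 1 (the two principal components) and
# 'target' (the true blob id). As an aside not present in the original script, the
# variance kept by the 2-D projection could be checked via pca.explained_variance_ratio_.
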

x_train, x_test, y_train, y_test = train_test_split(data[data.columns[0:2]], data['target'], test_size=0.2, random_state=42)
x_train['target'] = y_train
x_test['target'] = y_test

# fit the from-scratch model with the k suggested by find_best_k on the training set
kmeans = K_Means(k=find_best_k(x_train), max_iters=10000, tol=1e-4)
labels = kmeans.fit_predict(x_train[x_train.columns[0:2]])

# align predicted cluster ids with the class ids (via the temporary 'tr*' names)
# so that accuracy can be computed
df1 = x_train.copy()
pred_df = pd.DataFrame(labels)
pred_df = pred_df.replace(reformat(df1, labels))
dict_new = {'tr0': 0}
for i in range(1, len(np.unique(labels))):
    dict_new[f'tr{i}'] = i
pred = pred_df.replace(dict_new)
x_train['pred'] = pred.values.ravel()  # flatten the (n, 1) frame to a 1-D column
print("Centroids:")
print(kmeans.centroids)

plot_clusters(x_train, kmeans.centroids, x_train['pred'], y_train, title='Training set')

# cluster the held-out points with the fitted centroids and repeat the relabeling
test_labels = kmeans.predict(x_test[x_test.columns[0:2]])
df2 = x_test.copy()

pred_df = pd.DataFrame(test_labels)
pred_df = pred_df.replace(reformat(df2, test_labels))
dict_new = {'tr0': 0}
for i in range(1, len(np.unique(test_labels))):
    dict_new[f'tr{i}'] = i
pred = pred_df.replace(dict_new)
x_test['pred'] = pred.values.ravel()

plot_clusters(x_test, kmeans.centroids, x_test['pred'], y_test, title='Test set')

print(f"Adjusted Rand Index train: {round(adjusted_rand_score(x_train.target, labels) * 100, 2)}%")
print(f'Accuracy train: {round(accuracy_score(x_train.target, x_train.pred) * 100, 2)}%')
print(f"Adjusted Rand Index test: {round(adjusted_rand_score(x_test.target, x_test.pred) * 100, 2)}%")
print(f'Accuracy test: {round(accuracy_score(x_test.target, x_test.pred) * 100, 2)}%')

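# Comparison with sklearn follows. Note on the metrics above: the Adjusted Rand
# Index is invariant to how cluster ids are numbered, while accuracy is not, which
# is why the reformat()/dict_new relabeling is applied before accuracy_score.
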
# sklearn baseline, fitted with the known number of blobs (6)
kmeans_sk = KMeans(n_clusters=6, random_state=42, n_init=10)  # n_init set explicitly; its default changed in newer sklearn
kmeans_sk.fit(np.copy(x_train[x_train.columns[0:2]]))
labels_sk = kmeans_sk.predict(np.copy(x_train[x_train.columns[0:2]]))
test_labels_sk = kmeans_sk.predict(np.copy(x_test[x_test.columns[0:2]]))

# same relabeling step as for the from-scratch model
df1 = x_train.copy()

pred_df = pd.DataFrame(labels_sk)
pred_df = pred_df.replace(reformat(df1, labels_sk))
dict_new = {'tr0': 0}
for i in range(1, len(np.unique(labels_sk))):
    dict_new[f'tr{i}'] = i
pred = pred_df.replace(dict_new)
x_train['pred_sk'] = pred.values.ravel()

plot_clusters(x_train, kmeans_sk.cluster_centers_, x_train['pred_sk'], y_train, title='Sklearn, training set')

df2 = x_test.copy()

pred_df = pd.DataFrame(test_labels_sk)
pred_df = pred_df.replace(reformat(df2, test_labels_sk))
dict_new = {'tr0': 0}
for i in range(1, len(np.unique(test_labels_sk))):
    dict_new[f'tr{i}'] = i
pred = pred_df.replace(dict_new)
x_test['pred_sk'] = pred.values.ravel()

plot_clusters(x_test, kmeans_sk.cluster_centers_, x_test['pred_sk'], y_test, title='Sklearn, test set')

print(f"Sklearn Adjusted Rand Index train: {round(adjusted_rand_score(x_train.target, labels_sk) * 100, 2)}%")
print(f'Sklearn train Accuracy: {round(accuracy_score(x_train.target, x_train.pred_sk) * 100, 2)}%')
print(f"Sklearn test Adjusted Rand Index: {round(adjusted_rand_score(x_test.target, x_test.pred_sk) * 100, 2)}%")
print(f'Sklearn test Accuracy: {round(accuracy_score(x_test.target, x_test.pred_sk) * 100, 2)}%')