import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, v_measure_score, silhouette_score, accuracy_score
class K_Means:
    """Simple K-means clustering implemented from scratch with NumPy."""

    def __init__(self, k=3, max_iters=10000, tol=1e-4):
        self.k = k                  # number of clusters
        self.max_iters = max_iters  # iteration limit
        self.tol = tol              # convergence threshold on centroid movement
        self.centroids = None
        self.sse = 0

    def fit_predict(self, X):
        X = np.asarray(X, dtype=float)
        np.random.seed(42)
        # Initialise centroids with k distinct random points from X.
        self.centroids = X[np.random.choice(len(X), self.k, replace=False)]
        for _ in range(self.max_iters):
            labels = self.assign_clusters(X)
            new_centroids = self.calculate_centroids(X, labels)
            # Stop when the centroids barely move between iterations.
            if np.linalg.norm(new_centroids - self.centroids) < self.tol:
                break
            self.centroids = new_centroids
        self.sse = self.calculate_sse(X, labels)
        return labels

    def assign_clusters(self, X):
        # Euclidean distance from every point to every centroid, shape (n_samples, k).
        distances = np.linalg.norm(X[:, np.newaxis, :] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

    def calculate_centroids(self, X, labels):
        # Mean of the points in each cluster; keep the old centroid if a cluster is empty.
        centroids = np.array([
            X[labels == i].mean(axis=0) if np.any(labels == i) else self.centroids[i]
            for i in range(self.k)
        ])
        return centroids

    def calculate_sse(self, X, labels):
        # Sum of squared distances from each point to its assigned centroid.
        sse = 0
        for i in range(self.k):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - self.centroids[i]) ** 2)
        return sse

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        distances = np.linalg.norm(X[:, np.newaxis, :] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)
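# Minimal sanity check of the class above (added sketch, not part of the original
# paste; the points are made up). Two tight hand-made groups of 2-D points should
# end up in two different clusters. Uses only NumPy, which is already imported.
_demo_points = np.array([[0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
                         [5.0, 5.0], [5.1, 4.9], [4.9, 5.2]])
_demo_km = K_Means(k=2, max_iters=100, tol=1e-4)
_demo_labels = _demo_km.fit_predict(_demo_points)
# Each group is internally consistent: one cluster id per group.
assert len(np.unique(_demo_labels[:3])) == 1 and len(np.unique(_demo_labels[3:])) == 1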
def reformat(df1, pred):
    """Map raw cluster ids to temporary rank labels ('tr0', 'tr1', ...) ordered by the
    mean true target inside each predicted cluster, so that predicted cluster numbers
    can later be aligned with the true class numbers."""
    df = df1.copy()
    df['pred'] = pred
    n = len(np.unique(df['pred']))
    arr = []
    idx = []
    for i in range(n):
        # Mean true target value of the points assigned to cluster i.
        num = df[df['pred'] == i].describe().loc['mean', 'target']
        arr.append(num)
        idx.append(i)
    calc = pd.DataFrame({'i': idx, 'num': arr})
    # Rank clusters by mean target: the lowest mean gets 'tr0', the next 'tr1', ...
    calc = calc.sort_values(by='num')
    calc['replace'] = [f'tr{i}' for i in range(n)]
    calc.index = range(n)
    calc = calc.sort_values(by='i')
    # Build the raw-cluster-id -> rank-label dictionary.
    my_dict = {}
    for i in range(n):
        my_dict[i] = calc.loc[calc['i'] == i, 'replace'].values[0]
    return my_dict
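# Worked example of the cluster-to-class alignment above (added sketch, not in the
# original paste; the values are made up). Cluster 0 contains only target 2 and
# cluster 1 only target 0, so cluster 1 gets the lowest rank label 'tr0' and
# cluster 0 the highest one 'tr2'.
_demo_df = pd.DataFrame({'target': [2, 2, 2, 0, 0, 1, 1]})
_demo_pred = np.array([0, 0, 0, 1, 1, 2, 2])
print(reformat(_demo_df, _demo_pred))  # expected: {0: 'tr2', 1: 'tr0', 2: 'tr1'}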
def plot_clusters(X, centroids, labels, true_labels, title='K-means Clustering'):
    """Scatter plot of true classes (squares) with predicted clusters and centroids on top.
    Axis labels are taken from the global `data` DataFrame defined below."""
    X = np.copy(X)
    true_labels = np.copy(true_labels)
    plt.figure(figsize=(10, 6))
    plt.scatter(X[:, 0], X[:, 1], c=true_labels, cmap='plasma', marker='s', s=100,
                edgecolors='black', label='true')
    plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='plasma', alpha=1, edgecolors='w', label='predict')
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='X', s=150,
                edgecolors='black', label='Centroids')
    plt.title(title)
    plt.xlabel(f'{data.columns[0]}')
    plt.ylabel(f'{data.columns[1]}')
    plt.grid()
    plt.legend()
    plt.show()
def find_best_k(data, a=2, b=15):
    """Evaluate k = a..b with the custom K_Means and pick the k with the highest V-measure.
    Also plots the SSE (elbow method) and silhouette curves for reference."""
    inertia = []
    vmeasure_scores = []
    silhouette_scores = []
    k_values = range(a, b + 1)
    for k in k_values:
        kmeans = K_Means(k=k, max_iters=100, tol=1e-4)
        labels = kmeans.fit_predict(data[data.columns[0:2]])
        inertia.append(kmeans.sse)
        vmeasure_scores.append(v_measure_score(data['target'], labels))
        silhouette_scores.append(silhouette_score(data[data.columns[0:2]], labels))
    SSE_df = pd.DataFrame({'SSE': inertia, 'V_measure': vmeasure_scores,
                           'Silhouette': silhouette_scores}, index=k_values)
    SSE_df.index.names = ['k']
    # The final choice is driven by V-measure, not by the elbow plot.
    best_k = SSE_df['V_measure'].idxmax()
    plt.figure(figsize=(10, 7))
    plt.plot(k_values, inertia, marker='o')
    plt.title('Elbow method for choosing the optimal k')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('SSE')
    plt.grid()
    plt.show()
    plt.figure(figsize=(10, 7))
    plt.plot(k_values, vmeasure_scores, marker='o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('V_measure_score')
    plt.title('V-measure score for choosing the optimal k')
    plt.grid(True)
    plt.show()
    plt.figure(figsize=(10, 7))
    plt.plot(k_values, silhouette_scores, marker='o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Silhouette')
    plt.title('Clustering quality assessed with the silhouette score')
    plt.grid()
    plt.show()
    print(SSE_df, f'\n\nbest_k: {best_k}\n\n')
    return best_k
# Generate a synthetic 5-feature dataset with 6 blobs, then project it to 2-D with PCA.
X, y = make_blobs(
    n_samples=250,
    n_features=5,
    centers=6,
    cluster_std=2,
    random_state=42
)
pca = PCA(n_components=2)
X = pca.fit_transform(X)
data = pd.DataFrame(X)
data['target'] = y
sns.pairplot(data, hue='target', palette='dark')
plt.show()
print(data)
# Hold out 20% of the points; keep the true labels alongside the features.
x_train, x_test, y_train, y_test = train_test_split(data[data.columns[0:2]], data['target'],
                                                    test_size=0.2, random_state=42)
# Explicit copies avoid pandas SettingWithCopyWarning when adding columns below.
x_train = x_train.copy()
x_test = x_test.copy()
x_train['target'] = y_train
x_test['target'] = y_test
# Fit the custom K-means with the k selected on the training set.
kmeans = K_Means(k=find_best_k(x_train), max_iters=10000, tol=1e-4)
labels = kmeans.fit_predict(x_train[x_train.columns[0:2]])
sse = kmeans.sse
# Renumber predicted clusters so they line up with the true class ids:
# first map raw ids to rank labels 'tr0', 'tr1', ... via reformat(), then map those back to integers.
df1 = x_train.copy()
pred_df = pd.DataFrame(labels)
pred_df = pred_df.replace(reformat(df1, labels))
dict_new = {f'tr{i}': i for i in range(len(np.unique(labels)))}
pred = pred_df.replace(dict_new)
x_train['pred'] = pred.values
print("Centroids:")
print(kmeans.centroids)
plot_clusters(x_train, kmeans.centroids, x_train['pred'], y_train, title='Training set')
# Assign test points to the centroids learned on the training set and align the labels the same way.
test_labels = kmeans.predict(x_test[x_test.columns[0:2]])
df2 = x_test.copy()
pred_df = pd.DataFrame(test_labels)
pred_df = pred_df.replace(reformat(df2, test_labels))
dict_new = {f'tr{i}': i for i in range(len(np.unique(test_labels)))}
pred = pred_df.replace(dict_new)
x_test['pred'] = pred.values
plot_clusters(x_test, kmeans.centroids, x_test['pred'], y_test, title='Test set')
print(f"Adjusted Rand Index train: {round(adjusted_rand_score(x_train.target, labels) * 100, 2)}%")
print(f"Accuracy train: {round(accuracy_score(x_train.target, x_train.pred) * 100, 2)}%")
print(f"Adjusted Rand Index test: {round(adjusted_rand_score(x_test.target, x_test.pred) * 100, 2)}%")
print(f"Accuracy test: {round(accuracy_score(x_test.target, x_test.pred) * 100, 2)}%")
# Reference run with scikit-learn's KMeans for comparison.
# n_init is set explicitly because its default changed across sklearn versions.
kmeans_sk = KMeans(n_clusters=6, random_state=42, n_init=10)
kmeans_sk.fit(np.copy(x_train[x_train.columns[0:2]]))
labels_sk = kmeans_sk.predict(np.copy(x_train[x_train.columns[0:2]]))
test_labels_sk = kmeans_sk.predict(np.copy(x_test[x_test.columns[0:2]]))
# Align the sklearn cluster ids with the true class ids, exactly as above.
df1 = x_train.copy()
pred_df = pd.DataFrame(labels_sk)
pred_df = pred_df.replace(reformat(df1, labels_sk))
dict_new = {f'tr{i}': i for i in range(len(np.unique(labels_sk)))}
pred = pred_df.replace(dict_new)
x_train['pred_sk'] = pred.values
plot_clusters(x_train, kmeans_sk.cluster_centers_, x_train['pred_sk'], y_train, title='Sklearn training set')
df2 = x_test.copy()
pred_df = pd.DataFrame(test_labels_sk)
pred_df = pred_df.replace(reformat(df2, test_labels_sk))
dict_new = {f'tr{i}': i for i in range(len(np.unique(test_labels_sk)))}
pred = pred_df.replace(dict_new)
x_test['pred_sk'] = pred.values
plot_clusters(x_test, kmeans_sk.cluster_centers_, x_test['pred_sk'], y_test, title='Sklearn test set')
print(f"Sklearn Adjusted Rand Index train: {round(adjusted_rand_score(x_train.target, labels_sk) * 100, 2)}%")
print(f"Sklearn Accuracy train: {round(accuracy_score(x_train.target, x_train.pred_sk) * 100, 2)}%")
print(f"Sklearn Adjusted Rand Index test: {round(adjusted_rand_score(x_test.target, x_test.pred_sk) * 100, 2)}%")
print(f"Sklearn Accuracy test: {round(accuracy_score(x_test.target, x_test.pred_sk) * 100, 2)}%")
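# Note on the metrics above (added comment, not in the original paste):
# adjusted_rand_score is invariant to how clusters are numbered, so it can be
# computed on the raw labels, while accuracy_score is only meaningful because
# reformat() first renumbers the predicted clusters to line up with the true
# class ids by mean target value.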