Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN as sk_DBSCAN
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, accuracy_score, adjusted_rand_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
def best_k_distance_graph(X, k=9):
    """Plot sorted k-distance curves, a visual aid for picking DBSCAN's eps.

    A ``NearestNeighbors`` model with ``n_neighbors=k`` is fitted on ``X`` and
    queried with the same ``X``. For every neighbour column except the first,
    the per-point distances are sorted in ascending order and drawn as one
    curve on a shared figure.

    Args:
        X: Sample matrix, one row per point.
        k: Number of neighbours requested from ``NearestNeighbors``; curves
            are drawn for columns ``1 .. k-1`` and labelled ``k=2 .. k=k``.
    """
    plt.figure(figsize=(10, 6))
    plt.title('K-Distance Graph')
    plt.xlabel('Points sorted by distance')
    plt.ylabel('k-distance')
    plt.grid(True)

    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(X)
    neighbor_distances, _ = knn.kneighbors(X)

    ranks = np.arange(len(X))
    # Column 0 is skipped; when the training set is queried against itself it
    # is presumably each point's zero distance to itself.
    for column in range(1, k):
        curve = np.sort(neighbor_distances[:, column])
        plt.plot(ranks, curve, marker='.', label=f'k={column + 1}')

    plt.legend()
    plt.show()
class DBSCAN:
    """Minimal DBSCAN (density-based clustering) using Euclidean distance.

    After ``fit`` the attribute ``labels_`` holds one integer per sample:
    ``-1`` marks noise and ``1, 2, ...`` are cluster ids in order of
    discovery. ``0`` is used internally for "not yet visited" and never
    remains once ``fit`` has finished.

    Args:
        eps: Neighbourhood radius. A point ``q`` is a neighbour of ``p`` when
            ``dist(p, q) <= eps`` (inclusive, matching the usual DBSCAN
            definition); every point is a neighbour of itself.
        min_samples: Minimum neighbourhood size (the point itself included)
            for a point to be treated as a core point.
    """

    _UNVISITED = 0
    _NOISE = -1

    def __init__(self, eps=0.5, min_samples=5):
        self.eps = eps
        self.min_samples = min_samples
        self.labels_ = None

    def fit(self, X):
        """Cluster the rows of ``X`` and store the result in ``labels_``.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``. A 1-D input
                is treated as ``n_samples`` points with a single feature.

        Returns:
            ``self``, so calls can be chained.
        """
        X = np.array(X, dtype=float)  # private copy; the caller's data is untouched
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        self.labels_ = np.full(len(X), self._UNVISITED, dtype=int)
        cluster_label = 0
        for i in range(len(X)):
            if self.labels_[i] != self._UNVISITED:
                continue
            neighbors = self._get_neighbors(X, i)
            if len(neighbors) < self.min_samples:
                # Provisional: a later cluster may still claim it as a border point.
                self.labels_[i] = self._NOISE
                continue
            cluster_label += 1
            self._expand_cluster(X, i, neighbors, cluster_label)
        return self

    def predict(self, X):
        """Return the labels computed by the last ``fit`` call.

        ``X`` is ignored: this implementation cannot label unseen points, it
        only hands back ``labels_`` (``None`` if ``fit`` was never called).
        """
        return self.labels_

    def _expand_cluster(self, X, point_index, neighbors, cluster_label):
        """Grow cluster ``cluster_label`` outward from core point ``point_index``.

        ``neighbors`` is the eps-neighbourhood of the seed point. Points are
        processed breadth-first; only core points contribute their own
        neighbourhood to the frontier.
        """
        labels = self.labels_
        labels[point_index] = cluster_label
        frontier = deque(neighbors)
        while frontier:
            candidate = frontier.popleft()
            if labels[candidate] == self._NOISE:
                # Former noise becomes a border point; it is not expanded.
                labels[candidate] = cluster_label
            elif labels[candidate] == self._UNVISITED:
                labels[candidate] = cluster_label
                reachable = self._get_neighbors(X, candidate)
                if len(reachable) >= self.min_samples:
                    # Points already owned by a cluster would be skipped
                    # anyway, so keep them out of the queue.
                    frontier.extend(
                        q for q in reachable
                        if labels[q] in (self._UNVISITED, self._NOISE)
                    )

    def _get_neighbors(self, X, point_index):
        """Return indices of all rows of ``X`` within ``eps`` of row ``point_index``.

        The result is a plain list of ints in ascending order and always
        contains ``point_index`` itself.
        """
        distances = np.linalg.norm(X - X[point_index], axis=1)
        return np.flatnonzero(distances <= self.eps).tolist()
def visualize_clusters(X, labels, title='DBSCAN Clustering'):
    """Scatter-plot the first two columns of ``X``, one series per label.

    Args:
        X: Array-like sample matrix; only columns 0 and 1 are drawn.
        labels: One cluster label per row of ``X``. The label ``-1`` is drawn
            in black and listed as "Noise"; every other label gets its own
            series named "Cluster <label>".
        title: Figure title.
    """
    points = np.copy(X)
    assignments = np.copy(labels)

    plt.figure(figsize=(10, 6))
    plt.grid()

    for cluster_id in np.unique(assignments):
        members = points[assignments == cluster_id]
        if cluster_id == -1:
            series_style = {'color': 'k', 'label': 'Noise'}
        else:
            series_style = {'label': f'Cluster {cluster_id}'}
        plt.scatter(members[:, 0], members[:, 1], edgecolors='black', **series_style)

    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()
def _compare_dbscan_implementations(X, eps, min_samples, y_true=None):
    """Cluster ``X`` with the local DBSCAN and scikit-learn's, plotting both.

    Args:
        X: Feature matrix to cluster (must not contain the target column).
        eps: Neighbourhood radius passed to both implementations.
        min_samples: Core-point threshold passed to both implementations.
        y_true: Optional ground-truth labels. When given, the Adjusted Rand
            Index of each clustering is printed as a percentage.
    """
    own = DBSCAN(eps=eps, min_samples=min_samples)
    own.fit(X)
    own_labels = own.predict(X)
    visualize_clusters(X, own_labels)
    if y_true is not None:
        own_score = round(adjusted_rand_score(y_true, own_labels) * 100, 2)
        print(f"Adjusted Rand Index: {own_score}%")

    reference = sk_DBSCAN(eps=eps, min_samples=min_samples)
    reference.fit(X)
    reference_labels = reference.labels_
    visualize_clusters(X, reference_labels, title='Sklearn DBSCAN')
    if y_true is not None:
        reference_score = round(adjusted_rand_score(y_true, reference_labels) * 100, 2)
        print(f"Sklearn Adjusted Rand Index: {reference_score}%")


# --- Experiment 1: synthetic blobs projected to 2-D with PCA ----------------
X, y = make_blobs(
    n_samples=250,
    n_features=5,
    centers=6,
    cluster_std=2,
    random_state=42,
)
pca = PCA(n_components=2)
X = pca.fit_transform(X)
data = pd.DataFrame(X)
data['target'] = y
sns.pairplot(data, hue='target', palette='dark')
plt.show()
best_k_distance_graph(np.copy(X))
_compare_dbscan_implementations(X, eps=2.3, min_samples=5, y_true=data.target)

# --- Experiment 2: moons.csv (no ground truth is used for this file) --------
data = pd.read_csv('moons.csv')
sns.pairplot(data)
plt.show()
X = data.to_numpy()
best_k_distance_graph(np.copy(X))
_compare_dbscan_implementations(X, eps=0.35, min_samples=3)

# --- Experiment 3: sklearn_moons.csv with ground-truth labels ---------------
data = pd.read_csv('sklearn_moons.csv')
data.columns = ['x', 'y', 'target']
sns.pairplot(data, hue='target', palette='dark')
plt.show()
# Cluster on the coordinates only: leaving 'target' in the feature matrix
# leaks the ground truth into the distance computation and inflates the ARI.
X = data.drop(columns='target').to_numpy()
best_k_distance_graph(np.copy(X))
_compare_dbscan_implementations(X, eps=0.125, min_samples=3, y_true=data.target)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement