Advertisement
makispaiktis

ML - Lab 7 - kmeans: metrics, elbow method, silhouette, heatmap

Oct 22nd, 2022 (edited)
1,001
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.10 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib.pyplot as plt
  4. import sklearn
  5. import math
  6.  
  7.  
  8. # Read Data - Split into target and features = cdata
  9. cdata = pd.read_csv("./cdata.txt")
  10. print("cdata summary = ")
  11. print(cdata.describe())
  12. print()
  13. print()
  14. target = cdata.loc[:, "Y"]
  15. cdata = cdata.loc[:, ["X1", "X2"]]
  16.  
  17.  
  18. # Draw the different classes
  19. plt.figure()
  20. plt.scatter(cdata[target == 1].X1, cdata[target == 1].X2, color="red", marker="o", label="1")
  21. plt.scatter(cdata[target == 2].X1, cdata[target == 2].X2, color="blue", marker="o", label="2")
  22. plt.scatter(cdata[target == 3].X1, cdata[target == 3].X2, color="green", marker="o", label="3")
  23. plt.title("Initial Data")
  24. plt.xlabel("X1")
  25. plt.ylabel("X2")
  26. plt.legend()
  27. plt.show()
  28.  
  29.  
  30. # Elbow method - Cohesion = SSE
  31. from sklearn.cluster import KMeans
  32. sse = []
  33. RANGE = range(1, 11)
  34. for i in RANGE:
  35.     # sse list will contain the inertia_ of KMeans or cohesion or SSE
  36.     sse.append(KMeans(n_clusters=i, init=cdata.loc[0:i-1, :]).fit(cdata).inertia_)
  37.  
  38. plt.figure()
  39. plt.plot(RANGE, sse)                        # Solid line
  40. plt.scatter(RANGE, sse, marker="o")         # Points
  41. plt.title("SSE = Cohesion for every K")
  42. plt.xlabel("K")
  43. plt.ylabel("SSE")
  44. plt.show()
  45.  
  46.  
  47. # Select K = 3 for kmeans and calculate metrics
  48. K = 3
  49. kmeans = KMeans(n_clusters=K, init=cdata.loc[0:K-1, :])
  50. kmeans = kmeans.fit(cdata)
  51. cohesion = kmeans.inertia_
  52. print("K = " + str(K))
  53. print("Centroids = ")
  54. print(kmeans.cluster_centers_)
  55. print("Labels = " + str(kmeans.labels_))
  56. print("Cohesion = " + str(cohesion))
  57.  
  58. separation = 0
  59. distance = lambda x1, x2: math.sqrt(((x1.X1 - x2.X1) ** 2) + ((x1.X2 - x2.X2) ** 2))
  60. m = cdata.mean()
  61. for i in list(set(kmeans.labels_)):
  62.     mi = cdata.loc[kmeans.labels_ == i, :].mean()
  63.     Ci = len(cdata.loc[kmeans.labels_ == i, :].index)
  64.     separation += Ci * (distance(m, mi) ** 2)
  65. print("Separation = " + str(separation))
  66. print("CSS + BSS = " + str(cohesion + separation))
  67. print()
  68. print()
  69.  
  70.  
  71. # Draw Centroids and Points with the same color
  72. plt.figure()
  73. plt.scatter(cdata.X1, cdata.X2, c=kmeans.labels_)
  74. plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker="+", s=169, c="black")
  75. plt.title("Clustering with  K = " + str(K))
  76. plt.xlabel("X1")
  77. plt.ylabel("X2")
  78. plt.show()
  79.  
  80.  
  81. # Silhouette and Silhouette Plot
  82. from sklearn.metrics import silhouette_samples, silhouette_score
  83. print("Average Silhouette for each cluster: ")
  84. for i in range(K):
  85.     print(np.mean(silhouette_samples(cdata, kmeans.labels_)[kmeans.labels_ == i]))
  86. print("Average Silhouette for the whole clustering = " + str(silhouette_score(cdata, kmeans.labels_)))
  87. print()
  88.  
  89.  
  90. # Silhouette Visualizer - Yellowbrick Library
  91. import yellowbrick
  92. from yellowbrick.cluster import SilhouetteVisualizer
  93. visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
  94. visualizer.fit(cdata)
  95. visualizer.show()
  96.  
  97.  
  98. # Heatmap
  99. cdata["cluster"] = kmeans.labels_
  100. cdata = cdata.sort_values("cluster").drop("cluster", axis=1)
  101. from scipy.spatial import distance_matrix
  102. dist = distance_matrix(cdata, cdata)
  103. plt.imshow(dist, cmap='hot')
  104. plt.colorbar()
  105. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement