# ML - Lab 8 - Hierarchical Clustering: Dendrograms and Silhouette

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn


# Read Data
# Load the dataset from the working directory, then echo the raw table and
# its descriptive statistics, each followed by a blank separator line.
europe = pd.read_csv("./europe.txt")
print("Data = ", europe, "", sep="\n")
summary = europe.describe()
print("Data Summary = ", summary, "", sep="\n")


# Preprocessing
from sklearn.preprocessing import StandardScaler
# Standardise every column, then wrap the resulting array back into a
# DataFrame so the original row and column labels are preserved.
scaler = StandardScaler().fit(europe)
scaled_values = scaler.transform(europe)
europe = pd.DataFrame(scaled_values, index=europe.index, columns=europe.columns)
print("Data processed = ", europe, "", sep="\n")


# Hierarchical Clustering with complete link - Dendrogram
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram


def _linkage_from_merges(children, distances):
    """Build a SciPy-style linkage matrix from a full agglomerative merge tree.

    Args:
        children: Array of shape (n_samples - 1, 2) holding the two node ids
            joined at each merge (the layout of
            ``AgglomerativeClustering.children_``). Ids below ``n_samples``
            are leaves; id ``n_samples + i`` is the cluster created by merge
            ``i``.
        distances: Merge distances aligned row-for-row with ``children``.

    Returns:
        Float array with columns ``[child_a, child_b, distance, count]``,
        where ``count`` is the number of original observations contained in
        the cluster formed by that merge.
    """
    children = np.asarray(children)
    n_samples = children.shape[0] + 1
    counts = np.zeros(children.shape[0])
    for merge_idx, pair in enumerate(children):
        # A merged node always refers to an earlier row, so its count is
        # already filled in by the time it is needed here.
        counts[merge_idx] = sum(
            1 if node < n_samples else counts[node - n_samples] for node in pair
        )
    return np.column_stack([children, distances, counts]).astype(float)


# n_clusters=None with distance_threshold=0 makes the estimator build the
# complete merge tree and populate distances_, which the dendrogram needs.
clustering = AgglomerativeClustering(n_clusters=None, linkage="complete", distance_threshold=0).fit(europe)
linkage_matrix = _linkage_from_merges(clustering.children_, clustering.distances_)
# Pass the leaf labels as a plain list: it is accepted by every SciPy release,
# whereas array-likes have tripped truthiness checks in some older versions.
dendrogram(linkage_matrix, labels=europe.index.tolist())
plt.title("Complete link (No clusters)")
plt.show()


# Silhouette score for the whole cluster
from sklearn.metrics import silhouette_score
# Candidate cluster counts are 2..20, clamped to the dataset size:
# silhouette_score only accepts between 2 and n_samples - 1 distinct labels,
# so a smaller dataset would otherwise raise part-way through the scan.
cluster_counts = range(2, min(21, len(europe.index)))
# One silhouette score per candidate count, using complete-link clustering.
slc = [
    silhouette_score(
        europe,
        AgglomerativeClustering(n_clusters=k, linkage="complete").fit(europe).labels_,
    )
    for k in cluster_counts
]

plt.plot(cluster_counts, slc)
plt.xticks(cluster_counts, cluster_counts)
plt.title("Silhouette score with complete link")
plt.xlabel("# of clusters")
plt.ylabel("Silhouette Score")
plt.show()


# Max silhouette at n = 7
# NOTE: the value is hard-coded from the author's reading of the silhouette
# plot; re-check it if the data or the linkage changes.
n_clusters = 7
clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage="complete").fit(europe)
# 3-D plot of clustering
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
gdp = europe["GDP"]
inflation = europe["Inflation"]
unemployment = europe["Unemployment"]
ax.scatter(gdp, inflation, unemployment, c=clustering.labels_, cmap="bwr")
# Annotate each point with its row label. Iterating positionally avoids
# repeated label lookups and still works if index labels are duplicated.
for label, x, y, z in zip(europe.index, gdp, inflation, unemployment):
    ax.text(x, y, z, str(label), size=5, zorder=1)
ax.set_xlabel('GDP')
ax.set_ylabel('Inflation')
ax.set_zlabel('Unemployment')
plt.title("Clustering in " + str(n_clusters) + " clusters")
plt.show()
print("n = " + str(n_clusters))
print("Silhouette score = " + str(silhouette_score(europe, clustering.labels_)))