Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Complete-link hierarchical clustering of the table stored in ./europe.txt.
#
# Steps: load the table, standardize every column, draw the full dendrogram,
# plot the silhouette score for 2..20 clusters, then fit the chosen clustering
# and show it in the GDP / Inflation / Unemployment space.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn  # not referenced directly below; retained from the original import list
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Read Data
# NOTE(review): every column is fed to StandardScaler and the row index is
# used as the point label further down, so the file presumably yields an
# all-numeric frame indexed by the row names - verify against europe.txt.
europe = pd.read_csv("./europe.txt")
print("Data = ")
print(europe)
print()
print("Data Summary = ")
print(europe.describe())
print()

# Preprocessing
# Standardize each column, then rebuild the DataFrame so the column names and
# the row index survive the conversion to a bare array.
scaler = StandardScaler()
europe = pd.DataFrame(
    scaler.fit_transform(europe),
    columns=europe.columns,
    index=europe.index,
)
print("Data processed = ")
print(europe)
print()

# Hierarchical Clustering with complete link - Dendrogram
# distance_threshold=0 with n_clusters=None builds the whole merge tree and
# makes the estimator expose `distances_`.
clustering = AgglomerativeClustering(
    n_clusters=None, linkage="complete", distance_threshold=0
).fit(europe)

# The 4th column of a SciPy linkage matrix is the number of original samples
# under each merged node. In `children_`, ids below n_samples are leaves and
# id >= n_samples refers to the node created at merge step (id - n_samples).
n_samples = len(europe.index)
merge_sizes = np.zeros(clustering.children_.shape[0])
for step, children in enumerate(clustering.children_):
    merge_sizes[step] = sum(
        1 if child < n_samples else merge_sizes[child - n_samples]
        for child in children
    )
linkage_matrix = np.column_stack(
    [clustering.children_, clustering.distances_, merge_sizes]
).astype(float)

# Labels are handed over as a plain list rather than a pandas Index.
# NOTE(review): some SciPy releases reportedly truth-test `labels`, which is
# ambiguous for array-likes - confirm whether this still matters.
dendrogram(linkage_matrix, labels=europe.index.tolist())
plt.title("Complete link (No clusters)")
plt.show()

# Silhouette score for the whole cluster
# One complete-link fit per candidate cluster count; the scores are plotted
# against the count so the best value can be read off the curve.
candidate_counts = range(2, 21)
slc = [
    silhouette_score(
        europe,
        AgglomerativeClustering(n_clusters=k, linkage="complete").fit(europe).labels_,
    )
    for k in candidate_counts
]
plt.plot(candidate_counts, slc)
plt.xticks(candidate_counts, candidate_counts)
plt.title("Silhouette score with complete link")
plt.xlabel("# of clusters")
plt.ylabel("Silhouette Score")
plt.show()

# Max silhouette at n = 7
# (value taken from the silhouette plot above; re-check it if the data changes)
n_clusters = 7
clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage="complete").fit(europe)

# 3-D plot of clustering
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(europe.GDP, europe.Inflation, europe.Unemployment, c=clustering.labels_, cmap="bwr")
# Annotate every point with its row label.
for name, gdp, inflation, unemployment in zip(
    europe.index, europe["GDP"], europe["Inflation"], europe["Unemployment"]
):
    ax.text(gdp, inflation, unemployment, str(name), size=5, zorder=1)
ax.set_xlabel('GDP')
ax.set_ylabel('Inflation')
ax.set_zlabel('Unemployment')
plt.title("Clustering in " + str(n_clusters) + " clusters")
plt.show()
print("n = " + str(n_clusters))
print("Silhouette score = " + str(silhouette_score(europe, clustering.labels_)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement