SHOW:
|
|
- or go back to the newest paste.
1 | import click | |
2 | ||
3 | from sklearn.cluster import KMeans | |
4 | from sklearn import metrics | |
5 | ||
6 | from scipy.linalg import svd | |
7 | ||
8 | import numpy as np | |
9 | import pandas as pd | |
10 | ||
11 | ||
def _reduce_with_pca(X, min_variance, verbose):
    """Project X onto its leading principal components using pca().

    This lives outside run_clustering because that command's boolean ``pca``
    parameter (bound by click from the ``--pca`` flag) shadows the
    module-level ``pca`` function inside the command body, which made the
    original ``pca(X, ...)`` call invoke a bool.
    """
    if verbose:
        print('Fitting PCA')
    return pca(X, min_variance, verbose)


@click.command()
# click arguments do not accept ``help``; the description of INPUT_FILE is
# carried by the command docstring instead.
@click.argument('input-file')
@click.option('--output-file', default=None, help='Output results to a file instead of printing results')
@click.option('--n-clusters', default=3, type=int, help='The number of KMeans clusters')
@click.option('--pca', is_flag=True, help='Should we use PCA precprocessing of the data?')
@click.option('--pca-min-variance', default=1.0, type=float,
              help='The amount of variance to perserve if using PCA. Ignored if not used with --pca flag')
@click.option('--evaluate-clusters', is_flag=True, help='Should we evaluate cluster quality after clustering?')
@click.option('--verbose', is_flag=True, help='Print progress')
def run_clustering(input_file, output_file, n_clusters, pca, pca_min_variance, evaluate_clusters, verbose):
    """Run KMeans Clustering on an input TSV File with Optional PCA

    INPUT_FILE is the name of the input file. Should be stored as a Tab
    Separated File.

    The file is read with pandas, every column is used as a feature, and one
    cluster label per row is either written to --output-file (as a
    single-column TSV with header ``clusters``) or printed to stdout.
    """
    input_data = pd.read_table(input_file)

    X = np.array(input_data)
    if pca:
        # ``pca`` here is the CLI flag, not the function; see _reduce_with_pca.
        X = _reduce_with_pca(X, pca_min_variance, verbose)

    if verbose:
        print('Fitting K-Means with %s clusters' % n_clusters)

    clusters = KMeans(n_clusters=n_clusters).fit(X).labels_

    if evaluate_clusters:
        if verbose:
            print('Calculating Calinski-Harabaz Index')

        # Newer scikit-learn releases spell this metric
        # ``calinski_harabasz_score``; fall back to the older spelling so the
        # script keeps working on installations that only have that one.
        score_fn = getattr(metrics, 'calinski_harabasz_score', None)
        if score_fn is None:
            score_fn = metrics.calinski_harabaz_score

        print('Calinski-Harabaz Index: %s' % score_fn(X, clusters))

    if output_file is not None:
        input_data['clusters'] = clusters
        # ``columns`` must be a list of labels; a bare string is not a valid
        # column selection.
        input_data.to_csv(output_file, index=False, columns=['clusters'], sep='\t')
    else:
        print('clusters')
        for label in clusters:
            print(label)
53 | ||
54 | ||
def pca(X, variance_explained=1., verbose=True):
    """Compute PCA on data matrix X and return the projected data.

    X is converted to a 2-D array of shape (m, n), centred column-wise, and
    projected onto the leading singular vectors of its covariance matrix.

    Parameters:
        X: 2-D array-like; rows are samples, columns are features.
        variance_explained: target fraction of total variance to preserve.
            The smallest number of leading components whose cumulative
            explained-variance ratio reaches this value is kept (at least
            one, at most n).
        verbose: when true, print progress messages.

    Returns:
        Array of shape (m, n_comp) holding the centred data expressed in the
        selected components.
    """
    X = np.array(X)
    m, n = X.shape
    Xn = X - X.mean(0)

    if verbose:
        print('Fitting covariance matrix')

    XtX = Xn.T.dot(Xn) / m

    if verbose:
        print('Computing SVD')

    U, s, _ = svd(XtX)

    total = s.sum()
    if total > 0:
        cumulative_ratio = (s / total).cumsum()
    else:
        # Constant data has no variance to apportion; treat every component
        # as already explaining everything rather than dividing by zero.
        cumulative_ratio = np.ones_like(s)

    # The count of ratios strictly below the target is the *index* of the
    # first component that reaches it, so one more component is needed to
    # actually meet the target. Cap at n because rounding can leave the final
    # cumulative ratio marginally below 1.0.
    n_below = int((cumulative_ratio < variance_explained).sum())
    n_comp = min(n_below + 1, n)

    if verbose:
        print('Selecting %s components with an explained variance of %s'
              % (n_comp, cumulative_ratio[n_comp - 1]))

    return Xn.dot(U[:, :n_comp])
80 | ||
81 | ||
# Script entry point: invoke the click command, which parses the command line
# itself. Nothing runs when the module is merely imported.
if __name__ == "__main__":
    run_clustering()