Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ### VERSION 0.2 created Jan 26 2015
- from __future__ import division
- import sys
- import matplotlib.pyplot as plt
- import numpy as np
- import argparse
- from sklearn.cluster import KMeans as kmeans
- from pylab import savefig
- from scipy.optimize import minimize as fminsearch
- from sklearn.linear_model import Ridge
- def funRidge(x,X,D,p_train,p_test):
- R = Ridge(alpha=x)
- R.fit(D[p_train,:], X[p_train])
- return np.linalg.norm(X[p_test] - np.asmatrix(R.predict(D[p_test,:])))
- def diffex_FormatData(lfile, O, n, chrs=None, metric='mean'):
- M = []
- with open(lfile) as F:
- for line in F:
- if len(line)>1: # Use only lines with nontrivial text.
- line = line.split('\n')[0].split('\r')[0] # Remove EOL characters
- parse = line.split('\t')
- tmp = buildColumn(parse[0], chrs, O, n, metric)
- tmp = tmp.transpose().astype(np.float64).tolist()[0]
- M.append(tmp)
- M = np.matrix(M).transpose()
- return M
- def make_xval_partition(n,n_folds):
- K = np.random.permutation(n)%n_folds
- return K
- def buildColumn(fname, chrs, O, n, metric):
- print(fname)
- col = np.zeros((n,len(metric)))
- with open(fname) as F:
- for line in F:
- if line[:3] == 'chr':
- line = line.split('\n')[0].split('\r')[0] # Remove EOL characters
- tmp = line.split('\t')
- if tmp[0] in chrs:
- for i,row in O[tmp[0]]:
- if int(row[0]) > int(tmp[1]):
- pass
- elif int(row[1]) >= int(tmp[2]):
- nmin = max(int(row[0]),int(tmp[1]))
- nmax = min(int(row[1]),int(tmp[2]))
- for j,M in enumerate(metric):
- if M=='mean':
- col[i,j] += (nmax-nmin+1)*float(tmp[3])/(int(row[1])-int(row[0])+1)
- elif M=='max':
- col[i,j] = max(col[i,j],float(tmp[3]))
- else:
- raise ValueError('Invalid metric: %s' % metric)
- else:
- pass
- return col
- def getPCs(P, numPCs = -1):
- C = np.cov(P.transpose())
- a,v = np.linalg.eig(C)
- D = (P - np.mean(P,0))*v[:,:numPCs]
- return D,v
- parser = argparse.ArgumentParser(description='DiffEx 0.2 (C) 2015 Scott Norton (Univ of Pennsylvania School of Medicine, Dept of Biomedical Graduate Studies, Graduate Group in Genomics and Computational Biology), Lyle Ungar, Ph.D (Univ of Pennsylvania, Department of Computer and Information Sciences)',prog='DIFFEX')
- parser.add_argument('-1',dest='l1file',help='list of reference file paths for cell type 1')
- parser.add_argument('-2',dest='l2file',help='list of reference file paths for cell type 2')
- parser.add_argument('-e',dest='expfile',help='path to differential expression file')
- parser.add_argument('-m','--metric',dest='metric',type=str,default='mean',help='metric to use for computing the data matrix. Options: mean, max. Comma-separate multiple metrics.')
- parser.add_argument('-c','--chrs',default=None,dest='chrs',help='list of chromosomes to use, comma-separated i.e. "chr1,chr3,chr7_random". Default: the whole genome')
- parser.add_argument('-o','--out',default='./DiffEx.out',dest='ofile',help='output file path. Default: DiffEx.out')
- parser.add_argument('-k',default=3,dest='k',help='k to use for k-means. Default: 3',type=int)
- parser.add_argument('-s','--save-matrix',action='store_true',dest='saveMatrix',help='choose whether to save the intermediate data matrices. Ignored for any file loaded using --restore and --restore_path. Default: False')
- parser.add_argument('-r','--restore',dest='restore',default=0,type=int,help='choose the restore point: 0 = start from beginning; 1 = restore matrix; 2 = restore princpal component scores. 1 and 2 require that the --restore_path option be set. Default: 0.')
- parser.add_argument('-p','--restore-path',dest='rfile',default='',help='path to restore file. Default: same as --out')
- parser.add_argument('-n','--num-pcs',dest='numPCs',type=int,default=-1,help='number of principal components to use. Default: all of them')
- parser.add_argument('-t','--threshold',dest='thresh',type=np.float64,default=1,help='threshold for expression data. Default: 1')
- parser.add_argument('-g','--genome',dest='genome',type=str,default='./mm9',help='Path to genome data folder. Default: ./mm9')
- parser.add_argument('-f','--feature',dest='feature',type=str,default='isol',help='Which features to send to PCA. Options: isol (default), diff')
- parsed = parser.parse_args()
- if parsed.l1file is None or parsed.l2file is None or parsed.expfile is None:
- parser.print_help()
- exit()
- if parsed.rfile=='':
- parsed.rfile = parsed.ofile
- print('Loading transcript coordinates and padding by 2kb...')
- geneList = []
- geneData = []
- if parsed.chrs != None:
- parsed.chrs = parsed.chrs.split(',')
- empty=False
- else:
- parsed.chrs = []
- empty=True
- with open(parsed.genome+'/refGene.bed') as C:
- for line in C:
- n=1
- if len(line)>1: # Use only lines with nontrivial text.
- line = line.split('\n')[0].split('\r')[0] # Remove EOL characters
- tmp = line.split('\t')
- if empty or tmp[0] in parsed.chrs:
- tmp[1] = int(tmp[1])-2000
- tmp[2] = int(tmp[2])+2000
- geneList.append(tmp[:4])
- geneList = np.array(geneList)
- O = {}
- for i,row in enumerate(geneList):
- if row[0] in O.keys():
- O[row[0]].append((i, row[1:]))
- else:
- O[row[0]] = [(i, row[1:])]
- n = sum([len(O[key]) for key in O.keys()])
- dtype = np.dtype("i8,i8,|S16")
- for key in O.keys():
- O[key].sort()
- if parsed.restore>=1:
- print('Restoring saved data matrix...')
- M = np.asmatrix(np.genfromtxt(parsed.rfile+'.data.txt', delimiter='\t'))
- X = np.asmatrix(np.genfromtxt(parsed.rfile+'.expression.txt', delimiter='\t')).transpose()
- geneList = np.genfromtxt(parsed.rfile+'.key.txt', delimiter='\t', dtype=str)
- if parsed.restore>=2:
- print('Restoring saved principal component scores...')
- D = np.asmatrix(np.genfromtxt(parsed.rfile+'.pcs.txt', delimiter='\t'))
- V = np.asmatrix(np.genfromtxt(parsed.rfile+'.pcv.txt', delimiter='\t'))
- else:
- print('Running principal component analysis...')
- D,V = getPCs(M, parsed.numPCs)
- if parsed.saveMatrix:
- print('Saving principal component scores...')
- np.savetxt(parsed.ofile+'.pcs.txt', D, delimiter='\t', fmt='%.6f')
- np.savetxt(parsed.ofile+'.pcv.txt', V, delimiter='\t')
- else:
- print('Loading differential expression data...')
- parsed.metric = parsed.metric.split(',')
- X = buildColumn(parsed.expfile,parsed.chrs,O,n,parsed.metric)
- print('Loading and restructuring epigenomic data from cell type 1...')
- M1 = diffex_FormatData(parsed.l1file, O, n, parsed.chrs, parsed.metric)
- print('Loading and restructuring epigenomic data from cell type 2...')
- M2 = diffex_FormatData(parsed.l2file, O, n, parsed.chrs, parsed.metric)
- if parsed.feature=='isol':
- M = np.concatenate((M1, M2), axis=1)
- elif parsed.feature=='diff':
- M = M1 - M2
- print('Removing all-zero and infinite entries...')
- LP = np.concatenate((M,X),axis=1)
- X = np.asarray([X[k,:].tolist()[0] for k,x in enumerate(LP) if not (np.linalg.norm(x) == 0 or np.isinf(np.linalg.norm(x)) or np.iscomplex(np.linalg.norm(x)) or np.isnan(np.linalg.norm(x)))])
- M = np.asmatrix([M[k,:].tolist()[0] for k,x in enumerate(LP) if not (np.linalg.norm(x) == 0 or np.isinf(np.linalg.norm(x)) or np.iscomplex(np.linalg.norm(x)) or np.isnan(np.linalg.norm(x)))])
- geneList = np.asarray([geneList[k,:].tolist() for k,x in enumerate(LP) if not (np.linalg.norm(x) == 0 or np.isinf(np.linalg.norm(x)) or np.iscomplex(np.linalg.norm(x)) or np.isnan(np.linalg.norm(x)))])
- if parsed.saveMatrix:
- print('Saving data matrix...')
- np.savetxt(parsed.ofile+'.data.txt', M, delimiter='\t', fmt='%.3f')
- np.savetxt(parsed.ofile+'.expression.txt', X, delimiter='\n', fmt='%.6f')
- np.savetxt(parsed.ofile+'.key.txt', geneList, delimiter='\t', fmt='%s')
- print('Running principal component analysis...')
- D,V = getPCs(M, parsed.numPCs)
- if parsed.saveMatrix:
- np.savetxt(parsed.ofile+'.pcs.txt', D, delimiter='\t', fmt='%.6f')
- X = np.asmatrix(X)
- if len(X) != X.size:
- X = X.transpose()
- N = len(X)
- print('Ridge regression on PCA data...')
- D = D/np.linalg.norm(D, axis=0)
- # D = np.concatenate((np.ones((len(D),1)), D), axis=1)
- P = parsed.numPCs
- # Use 5-fold cross-validation with gradient descent to choose lambda
- part = make_xval_partition(N,5)
- lambdas = []
- for i in range(5):
- print('\tPartition %d...' % (i+1))
- p_train = [j for j,k in enumerate(part) if k != i]
- p_test = [j for j,k in enumerate(part) if k == i]
- fun_ridge = lambda x: funRidge(x,X,D,p_train,p_test)
- RES = fminsearch(fun_ridge, x0=0)
- lambdas.append(abs(RES.x))
- l = np.mean(lambdas)
- print('Optimal regularization parameter: %.8f' % l)
- R = Ridge(alpha=l)
- R.fit(D,X)
- w = np.asmatrix(R.coef_).transpose()
- resids = X - np.asmatrix(R.predict(D))
- trn_err = np.linalg.norm(resids)/np.sqrt(N)
- print('Training error: %f' % trn_err)
- if parsed.saveMatrix:
- np.savetxt(parsed.ofile+'.ridge.txt', w, delimiter='\n', fmt='%.8f')
- np.savetxt(parsed.ofile+'.resid.txt', resids, delimiter='\t', fmt='%.8f')
- print('K-means clustering using %d clusters...' % parsed.k)
- KM = kmeans(n_clusters=parsed.k)
- labels = KM.fit_predict(M)
- labels = np.array(labels, dtype=int)
- centroids = np.zeros((parsed.k,M.size/len(M)))
- E = np.zeros((parsed.k,3))
- F = np.zeros((parsed.k,2))
- print('Computing cluster composition...')
- for i in range(parsed.k):
- D_tmp = np.asmatrix([v.tolist()[0] for k,v in enumerate(M) if int(labels[k]) == i])
- centroids[i,:] = np.mean(D_tmp,0)
- X_tmp = [int(np.sign(v)*(np.abs(v)>parsed.thresh)) for k,v in enumerate(X) if int(labels[k]) == i]
- Bot = len(X_tmp)
- if Bot>0:
- for j in range(3):
- Top = len([v for v in X_tmp if v == j-1])
- E[i,j] = float(Top)/float(Bot)
- F[i,0] = np.mean(X_tmp)
- F[i,1] = float(Bot)/float(len(labels))
- print(np.concatenate((E,F),axis=1))
- if parsed.saveMatrix:
- np.savetxt(parsed.ofile+'.distrib.txt', E, delimiter='\t', fmt='%.6f')
- np.savetxt(parsed.ofile+'.centroids.txt', centroids, delimiter='\t', fmt='%.6f')
- np.savetxt(parsed.ofile+'.labels.txt', labels, delimiter='\t', fmt='%d')
- np.savetxt(parsed.ofile+'.means.txt', F, delimiter='\t', fmt='%d')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement