Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #Polygenic risk score calculation pipeline
- #Wrtten by: Matthew Jensen, Girirajan lab, Penn State University (May 2021)
- #Pipeline modified by MJ for Gerstein lab, Yale University (June 2022)
- #Code adapted from: https://choishingwan.github.io/PRS-Tutorial/ldpred/
- library(bigsnpr)
- options(bigstatsr.check.parallel.blas = FALSE)
- options(default.nproc.blas = NULL)
- library(data.table)
- library(magrittr)
- #Step 1: Load phenotype + summary statistics files (not needed for auto model)
- #Load combination phenotype + covariate file
- #pheno <- fread('phenotype_covariate.txt')
- #Include in this file: rows for sample; columns for phenotype (binary or continuous) and covariates (i.e. sex, age, population PCA)
- #Load HapMap3 SNP dataset (download from: https://figshare.com/ndownloader/files/37802721)
- info <- readRDS("hm3_variants.rds")
- #Load non-transformed summary statistics files
- #Modify columns based on naming and order in summary stats file; these *must* be renamed to names below
- #I.e. a0=main allele, a1=alt. allele, OR=odds ratio, SE=standard error
- sumstats <- fread("../summary_stats/asd_filter_transformed.txt", fill=T) %>%
- .[,c("CHR"):=list(strtoi(CHR))]
- names(sumstats) <-c("chr","rsid","pos","a0","a1","INFO","OR","SE","p") #Change based on header
- sumstats$beta <- sumstats$OR #Use in-place log-transformed OR to generate betas
- sumstats$beta_se <- sumstats$SE #Use unmodified standard errors for betas
- sumstats <- sumstats[sumstats$rsid%in% info$rsid,] #Filter out SNPs not in HapMap3 dataset
- #Calculate effective sample size from literature if not already present in summary statistics. Formula for binary traits: 4/(1/18382 + 1/27969)
- sumstats$n_eff <- 44368
- sumstats <- sumstats[sumstats$rsid%in% info$rsid,]
- #Step 2: Calculate LD matrix
- # Get maximum amount of cores
- NCORES <- nb_cores()
- # Open a temporary file
- tmp <- tempfile(tmpdir = "tmp-data")
- on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
- # Initialize variables for storing the LD score and LD matrix
- corr <- NULL
- ld <- NULL
- # We want to know the ordering of samples in the bed file
- fam.order <- NULL
- # preprocess the bed file (only need to do once for each data set)
- snp_readBed("capstone4.asd.hg38.final.flip.bed")
- # now attach the genotype object
- obj.bigSNP <- snp_attach("capstone4.asd.hg38.final.flip.rds")
- # extract the SNP information from the genotype
- map <- obj.bigSNP$map[-3]
- names(map) <- c("chr", "rsid", "pos", "a1", "a0")
- map <- subset(map,chr>0) #Remove SNPs w/o location information
- map<-subset(map,chr<23) #Keep autosomal SNPs only (chr1-22)
- # perform SNP matching
- info_snp <- snp_match(sumstats, map, join_by_pos=FALSE,match.min.prop=0)
- # Assign the genotype to a variable for easier downstream analysis
- genotype <- obj.bigSNP$genotypes
- # Rename the data structures
- CHR <- map$chr
- POS <- map$pos
- # get the CM information from 1000 Genome
- # will download the 1000G file to the current directory (".")
- POS2 <- snp_asGeneticPos(CHR, POS, dir = ".",ncores = NCORES)
- # calculate LD
- for (chr in 1:22) {
- # Extract SNPs that are included in the chromosome
- ind.chr <- which(info_snp$chr == chr)
- ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
- # Calculate the LD
- corr0 <- snp_cor(genotype, ind.col = ind.chr2, ncores = NCORES, infos.pos = POS2[ind.chr2], size = 3 / 1000)
- if (chr == 1) {
- ld <- Matrix::colSums(corr0^2)
- corr <- as_SFBM(corr0, tmp)
- } else {
- ld <- c(ld, Matrix::colSums(corr0^2))
- corr$add_columns(corr0, nrow(corr))
- }
- }
- # We assume the fam order is the same across different chromosomes
- fam.order <- as.data.table(obj.bigSNP$fam)
- # Rename fam order
- setnames(fam.order,c("family.ID", "sample.ID"),c("FID", "IID"))
- #Step 3: Perform LD score regression on betas
- df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUM_ID_")]
- ldsc <- snp_ldsc(ld, length(ld), chi2 = (df_beta$beta / df_beta$beta_se)^2, sample_size = df_beta$n_eff, blocks = NULL)
- h2_est <- ldsc[["h2"]]
- #Create results DF for *.bimbam output
- results<-data.frame()
- #Step 4: Calculate null R^2 for binary trait
- library(rms)
- # Reformat the phenotype file such that y is of the same order as the sample ordering in the genotype file
- y <- pheno[fam.order, on = c("FID", "IID")]
- # use glm for binary trait (will also need the fmsb package to calculate the pseudo R2)
- #Modify this based on the covariates and phenotypes included above (i.e. PCA, phenotype, sex, age, etc.)
- null.formula <- paste("PCA", 1:6, sep = "", collapse = "+") %>% paste0("phenotype~Sex+Age+Carrier+", .) %>% as.formula
- null.model<- lrm(null.formula, data = y)
- null.r2 <- null.model$stats["R2"]
- #Step 5: Calculate PRS using infinitesemal model
- beta_inf <- snp_ldpred2_inf(corr, df_beta, h2 = h2_est)
- # calculate PRS for all samples
- ind.test <- 1:nrow(genotype)
- pred_inf <- big_prodVec(genotype,beta_inf,ind.row = ind.test,ind.col = info_snp$`_NUM_ID_`)
- #Test performance with regression model
- reg.formula <- paste("PCA", 1:6, sep = "", collapse = "+") %>% paste0("phenotype~PRS+Sex+Age+Carrier+", .) %>% as.formula
- reg.dat <- y
- reg.dat$PRS <- pred_inf
- inf.model <- lrm(reg.formula, dat=reg.dat)
- inf.r2 <- inf.model$stats["R2"]
- #Scale PRS values (mean=0, SD=1) and reformat PRS to *.bimbam format for GEMMA
- prs_gemma<-scale(pred_inf)
- prs_gemma<-t(prs_gemma)
- prs_gemma<-cbind("PRS_inf",0,0,prs_gemma)
- results<-rbind(results, prs_gemma)
- ##Step 6: Calculate PRS using grid model
- ## Prepare data for grid model
- p_seq <- signif(seq_log(1e-4, 1, length.out = 17), 2)
- h2_seq <- round(h2_est * c(0.7, 1, 1.4), 4)
- grid.param <-expand.grid(p = p_seq,h2 = h2_seq,sparse = c(FALSE, TRUE))
- # Get adjusted beta from grid model
- beta_grid <-snp_ldpred2_grid(corr, df_beta, grid.param, ncores = NCORES)
- # calculate PRS for all samples
- ind.test <- 1:nrow(genotype)
- pred_grid <- big_prodMat(genotype, beta_grid, ind.col = info_snp$`_NUM_ID_`)
- #Loop through columns, scale PRS, and output in *.bimbam format
- for(i in 1:ncol(pred_grid)){
- prs_grid <- pred_grid[,i]
- prs_grid<-scale(pred_inf)+1
- prs_grid<-t(prs_grid)#
- prs_grid<-cbind(paste0("PRS_grid_",i),0,0,prs_grid)
- results<-rbind(results, prs_grid)
- }
- #Test performance with regression model
- reg.formula <- paste("PCA", 1:6, sep = "", collapse = "+") %>% paste0("phenotype~PRS+Sex+Age+Carrier+", .) %>% as.formula
- reg.dat <- y
- #Generate r^2 value for overal heritability measure
- #For below analyses, skip any columns of the grid that fail with singularity error when generating LRM
- grid.r2<-vector()
- index.pass<-vector()
- for(i in 1:ncol(pred_grid)){
- flag<-TRUE
- reg.dat$PRS <- pred_grid[,i]
- grid.model <- tryCatch(lrm(reg.formula, dat=reg.dat),warning=function(w) flag<<-FALSE)
- if (!flag) next
- grid.r2 <- c(grid.r2, grid.model$stats["R2"])
- index.pass <- c(index.pass, i)
- }
- max.r2 <- max(grid.r2)
- #Step 7: Calculate PRS using auto model
- # Get adjusted beta from the auto model (test range of 50 p-values to find optimum)
- multi_auto <- snp_ldpred2_auto(corr,df_beta,h2_init = h2_est,vec_p_init = seq_log(1e-4, 0.9, length.out = 50),ncores = NCORES)
- beta_auto <- sapply(multi_auto, function(auto) auto$beta_est)
- # calculate PRS for all samples
- ind.test <- 1:nrow(genotype)
- pred_auto <-big_prodMat(genotype,beta_auto,ind.row = ind.test,ind.col = info_snp$`_NUM_ID_`)
- # scale the PRS generated from AUTO
- pred_scaled <- apply(pred_auto, 2, sd)
- final_beta_auto <-rowMeans(beta_auto[,abs(pred_scaled -median(pred_scaled)) < 3 * mad(pred_scaled)])
- pred_auto <-big_prodVec(genotype,final_beta_auto,ind.row = ind.test,ind.col = info_snp$`_NUM_ID_`)
- #Scale PRS values (mean=0, SD=1) and reformat PRS to *.bimbam format for GEMMA
- prs_gemma<-scale(pred_auto)
- prs_gemma<-t(prs_gemma)
- prs_gemma<-cbind("auto",0,0,prs_gemma)
- results<-rbind(results, prs_gemma)
- prs_auto<-t(pred_auto)
- prs_auto<-cbind("autism_auto_noscale",0,0,prs_auto)
- results<-rbind(results, prs_auto)
- #Test performance with regression model
- reg.formula <- paste("PCA", 1:6, sep = "", collapse = "+") %>% paste0("phenotype~PRS+Sex+Age+Carrier+", .) %>% as.formula
- reg.dat <- y
- reg.dat$PRS <- pred_auto
- auto.model <- lrm(reg.formula, dat=reg.dat)
- auto.r2 <- auto.model$stats["R2"]
- #Print results
- result <- data.table(infinitesimal = inf.r2 - null.r2, grid = max.r2 - null.r2, auto = auto.r2 - null.r2, null = null.r2)
- #Output results file as *.bimbam format
- write.table(results,file="ldpred2_prs_final.bimbam",sep="\t",row.names=FALSE,col.names=FALSE,quote=FALSE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement