Advertisement
matthew_jensen_bx

ldpred2_example.R

Nov 9th, 2023
188
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 8.41 KB | Science | 0 0
  1. #Polygenic risk score calculation pipeline
  2. #Wrtten by: Matthew Jensen, Girirajan lab, Penn State University (May 2021)
  3. #Pipeline modified by MJ for Gerstein lab, Yale University (June 2022)
  4. #Code adapted from: https://choishingwan.github.io/PRS-Tutorial/ldpred/
  5.  
  6. library(bigsnpr)
  7. options(bigstatsr.check.parallel.blas = FALSE)
  8. options(default.nproc.blas = NULL)
  9.  
  10. library(data.table)
  11. library(magrittr)
  12.  
  13.  
  14. #Step 1: Load phenotype + summary statistics files (not needed for auto model)
  15. #Load combination phenotype + covariate file
  16. #pheno <- fread('phenotype_covariate.txt')
  17. #Include in this file: rows for sample; columns for phenotype (binary or continuous) and covariates (i.e. sex, age, population PCA)
  18.  
  19. #Load HapMap3 SNP dataset (download from: https://figshare.com/ndownloader/files/37802721)
  20. info <- readRDS("hm3_variants.rds")
  21.  
  22. #Load non-transformed summary statistics files
  23. #Modify columns based on naming and order in summary stats file; these *must* be renamed to names below
  24. #I.e. a0=main allele, a1=alt. allele, OR=odds ratio, SE=standard error
  25. sumstats <- fread("../summary_stats/asd_filter_transformed.txt", fill=T) %>%
  26.     .[,c("CHR"):=list(strtoi(CHR))]
  27. names(sumstats) <-c("chr","rsid","pos","a0","a1","INFO","OR","SE","p") #Change based on header
  28. sumstats$beta <- sumstats$OR #Use in-place log-transformed OR to generate betas
  29. sumstats$beta_se <- sumstats$SE #Use unmodified standard errors for betas
  30. sumstats <- sumstats[sumstats$rsid%in% info$rsid,] #Filter out SNPs not in HapMap3 dataset
  31. #Calculate effective sample size from literature if not already present in summary statistics. Formula for binary traits: 4/(1/18382 + 1/27969)
  32. sumstats$n_eff <- 44368
  33. sumstats <- sumstats[sumstats$rsid%in% info$rsid,]
  34.  
  35.  
  36. #Step 2: Calculate LD matrix
  37. # Get maximum amount of cores
  38. NCORES <- nb_cores()
  39. # Open a temporary file
  40. tmp <- tempfile(tmpdir = "tmp-data")
  41. on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
  42. # Initialize variables for storing the LD score and LD matrix
  43. corr <- NULL
  44. ld <- NULL
  45. # We want to know the ordering of samples in the bed file
  46. fam.order <- NULL
  47.  
  48. # preprocess the bed file (only need to do once for each data set)
  49. snp_readBed("capstone4.asd.hg38.final.flip.bed")
  50. # now attach the genotype object
  51. obj.bigSNP <- snp_attach("capstone4.asd.hg38.final.flip.rds")
  52. # extract the SNP information from the genotype
  53. map <- obj.bigSNP$map[-3]
  54. names(map) <- c("chr", "rsid", "pos", "a1", "a0")
  55. map <- subset(map,chr>0) #Remove SNPs w/o location information
  56. map<-subset(map,chr<23) #Keep autosomal SNPs only (chr1-22)
  57. # perform SNP matching
  58. info_snp <- snp_match(sumstats, map, join_by_pos=FALSE,match.min.prop=0)
  59. # Assign the genotype to a variable for easier downstream analysis
  60. genotype <- obj.bigSNP$genotypes
  61.  
  62. # Rename the data structures
  63. CHR <- map$chr
  64. POS <- map$pos
  65.  
  66. # get the CM information from 1000 Genome
  67. # will download the 1000G file to the current directory (".")
  68. POS2 <- snp_asGeneticPos(CHR, POS, dir = ".",ncores = NCORES)
  69. # calculate LD
  70. for (chr in 1:22) {
  71.     # Extract SNPs that are included in the chromosome
  72.     ind.chr <- which(info_snp$chr == chr)
  73.     ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
  74.     # Calculate the LD
  75.     corr0 <- snp_cor(genotype, ind.col = ind.chr2, ncores = NCORES, infos.pos = POS2[ind.chr2], size = 3 / 1000)
  76.     if (chr == 1) {
  77.         ld <- Matrix::colSums(corr0^2)
  78.         corr <- as_SFBM(corr0, tmp)
  79.     } else {
  80.         ld <- c(ld, Matrix::colSums(corr0^2))
  81.         corr$add_columns(corr0, nrow(corr))
  82.     }
  83. }
  84.  
  85. # We assume the fam order is the same across different chromosomes
  86. fam.order <- as.data.table(obj.bigSNP$fam)
  87. # Rename fam order
  88. setnames(fam.order,c("family.ID", "sample.ID"),c("FID", "IID"))
  89.  
  90.  
  91. #Step 3: Perform LD score regression on betas
  92. df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUM_ID_")]
  93. ldsc <- snp_ldsc(ld,  length(ld), chi2 = (df_beta$beta / df_beta$beta_se)^2, sample_size = df_beta$n_eff, blocks = NULL)
  94. h2_est <- ldsc[["h2"]]
  95.  
  96. #Create results DF for *.bimbam output
  97. results<-data.frame()
  98.  
  99.  
  100. #Step 4: Calculate null R^2 for binary trait
  101. library(rms)
  102. # Reformat the phenotype file such that y is of the same order as the sample ordering in the genotype file
  103. y <- pheno[fam.order, on = c("FID", "IID")]
  104. # use glm for binary trait (will also need the fmsb package to calculate the pseudo R2)
  105. #Modify this based on the covariates and phenotypes included above (i.e. PCA, phenotype, sex, age, etc.)
  106. null.formula <- paste("PCA", 1:6, sep = "", collapse = "+") %>% paste0("phenotype~Sex+Age+Carrier+", .) %>% as.formula
  107. null.model<- lrm(null.formula, data = y)
  108. null.r2 <- null.model$stats["R2"]
  109.  
  110.  
  111. #Step 5: Calculate PRS using infinitesemal model
  112. beta_inf <- snp_ldpred2_inf(corr, df_beta, h2 = h2_est)
  113. # calculate PRS for all samples
  114. ind.test <- 1:nrow(genotype)
  115. pred_inf <- big_prodVec(genotype,beta_inf,ind.row = ind.test,ind.col = info_snp$`_NUM_ID_`)
  116.  
  117. #Test performance with regression model
  118. reg.formula <- paste("PCA", 1:6, sep = "", collapse = "+") %>% paste0("phenotype~PRS+Sex+Age+Carrier+", .) %>% as.formula
  119. reg.dat <- y
  120. reg.dat$PRS <- pred_inf
  121. inf.model <- lrm(reg.formula, dat=reg.dat)
  122. inf.r2 <- inf.model$stats["R2"]
  123.  
  124. #Scale PRS values (mean=0, SD=1) and reformat PRS to *.bimbam format for GEMMA
  125. prs_gemma<-scale(pred_inf)
  126. prs_gemma<-t(prs_gemma)
  127. prs_gemma<-cbind("PRS_inf",0,0,prs_gemma)
  128. results<-rbind(results, prs_gemma)
  129.  
  130.  
  131. ##Step 6: Calculate PRS using grid model
  132. ## Prepare data for grid model
  133. p_seq <- signif(seq_log(1e-4, 1, length.out = 17), 2)
  134. h2_seq <- round(h2_est * c(0.7, 1, 1.4), 4)
  135. grid.param <-expand.grid(p = p_seq,h2 = h2_seq,sparse = c(FALSE, TRUE))
  136. # Get adjusted beta from grid model
  137. beta_grid <-snp_ldpred2_grid(corr, df_beta, grid.param, ncores = NCORES)
  138.  
  139. # calculate PRS for all samples
  140. ind.test <- 1:nrow(genotype)
  141. pred_grid <- big_prodMat(genotype, beta_grid, ind.col = info_snp$`_NUM_ID_`)
  142.  
  143. #Loop through columns, scale PRS, and output in *.bimbam format
  144. for(i in 1:ncol(pred_grid)){
  145.     prs_grid <- pred_grid[,i]
  146.     prs_grid<-scale(pred_inf)+1
  147.     prs_grid<-t(prs_grid)#
  148.     prs_grid<-cbind(paste0("PRS_grid_",i),0,0,prs_grid)
  149.     results<-rbind(results, prs_grid)
  150. }
  151.  
  152. #Test performance with regression model
  153. reg.formula <- paste("PCA", 1:6, sep = "", collapse = "+") %>% paste0("phenotype~PRS+Sex+Age+Carrier+", .) %>% as.formula
  154. reg.dat <- y
  155.  
  156. #Generate r^2 value for overal heritability measure
  157. #For below analyses, skip any columns of the grid that fail with singularity error when generating LRM
  158. grid.r2<-vector()
  159. index.pass<-vector()
  160. for(i in 1:ncol(pred_grid)){
  161.     flag<-TRUE
  162.     reg.dat$PRS <- pred_grid[,i]
  163.     grid.model <- tryCatch(lrm(reg.formula, dat=reg.dat),warning=function(w) flag<<-FALSE)
  164.     if (!flag) next
  165.     grid.r2 <- c(grid.r2, grid.model$stats["R2"])  
  166.     index.pass <- c(index.pass, i)
  167.     }
  168.  
  169. max.r2 <- max(grid.r2)
  170.  
  171.  
  172. #Step 7: Calculate PRS using auto model
  173. # Get adjusted beta from the auto model (test range of 50 p-values to find optimum)
  174. multi_auto <- snp_ldpred2_auto(corr,df_beta,h2_init = h2_est,vec_p_init = seq_log(1e-4, 0.9, length.out = 50),ncores = NCORES)
  175. beta_auto <- sapply(multi_auto, function(auto) auto$beta_est)
  176.  
  177. # calculate PRS for all samples
  178. ind.test <- 1:nrow(genotype)
  179. pred_auto <-big_prodMat(genotype,beta_auto,ind.row = ind.test,ind.col = info_snp$`_NUM_ID_`)
  180. # scale the PRS generated from AUTO
  181. pred_scaled <- apply(pred_auto, 2, sd)
  182. final_beta_auto <-rowMeans(beta_auto[,abs(pred_scaled -median(pred_scaled)) < 3 * mad(pred_scaled)])
  183. pred_auto <-big_prodVec(genotype,final_beta_auto,ind.row = ind.test,ind.col = info_snp$`_NUM_ID_`)
  184.  
  185. #Scale PRS values (mean=0, SD=1) and reformat PRS to *.bimbam format for GEMMA
  186. prs_gemma<-scale(pred_auto)
  187. prs_gemma<-t(prs_gemma)
  188. prs_gemma<-cbind("auto",0,0,prs_gemma)
  189. results<-rbind(results, prs_gemma)
  190.  
  191. prs_auto<-t(pred_auto)
  192. prs_auto<-cbind("autism_auto_noscale",0,0,prs_auto)
  193. results<-rbind(results, prs_auto)
  194.  
  195. #Test performance with regression model
  196. reg.formula <- paste("PCA", 1:6, sep = "", collapse = "+") %>% paste0("phenotype~PRS+Sex+Age+Carrier+", .) %>% as.formula
  197. reg.dat <- y
  198. reg.dat$PRS <- pred_auto
  199. auto.model <- lrm(reg.formula, dat=reg.dat)
  200. auto.r2 <- auto.model$stats["R2"]
  201.  
  202. #Print results
  203. result <- data.table(infinitesimal = inf.r2 - null.r2, grid = max.r2 - null.r2, auto = auto.r2 - null.r2, null = null.r2)
  204.  
  205. #Output results file as *.bimbam format
  206. write.table(results,file="ldpred2_prs_final.bimbam",sep="\t",row.names=FALSE,col.names=FALSE,quote=FALSE)
  207.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement