Advertisement
matthew_jensen_bx

PRS_preprocessing_pipeline.sh

Nov 9th, 2023
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 8.76 KB | Science | 0 0
  1. #Polygenic risk score calculation pipeline
  2. #Wrtten by: Matthew Jensen, Girirajan lab, Penn State University (May 2021)
  3. #Pipeline modified by MJ for Gerstein lab, Yale University (June 2022)
  4. #Pipeline steps adapted from: https://choishingwan.github.io/PRS-Tutorial/base/
  5.  
  6.  
  7. #Step 1: Quality control of GWAS summary statistics files
  8. #Filter for imputation INFO score >0.8 (when available); skip MAF>0.01 filter, as most GWAS statistics don't have minor allele frequencies available due to privacy concerns
  9. gunzip -c sumstats_sum_ctg_format.txt.gz |awk 'NR==1 || ($14 > 0.8) {print}' |gzip  > sumstats_info_filter.txt.gz
  10.  
  11. #Filter out ambiguous SNPs (SNPs where the alternate allele is complementary to the reference allele)
  12. gunzip -c sumstats_info_filter.txt.gz | awk '!( ($3=="A" && $4=="T") || ($3=="T" && $4=="A") || ($3=="G" && $4=="C") \
  13. || ($3=="C" && $4=="G")) {print}' | gzip > sumstats_ambig_snp_filter.txt.gz
  14.  
  15. #Remove duplicate SNPs (if any) from datasets
  16. gunzip -c sumstats_ambig_snp_filter.txt.gz |awk '{ print $2}' |sort | uniq -d > dup_snp.txt
  17. gunzip -c sumstats_ambig_snp_filter.txt.gz |grep -vf dup_snp.txt |gzip - > sumstats_final.txt.gz
  18.  
  19. #Log-transform ORs of summary statistics in R (skip this step for continuous traits, such as educational attainment or height)
  20. Rscript log_transform_stats.R
  21. #Contents of log_transform_stats.R:
  22. #   library(data.table)
  23. #   dat <- fread("sumstats_final.txt.gz", fill=T)
  24. #   fwrite(dat[,OR:=log(OR)], "sumstats_final_transformed.txt.gz", sep="\t")
  25.  
  26.  
  27. #Step 2: QC filtering of imputed genotypes
  28. #Start with imputed genotypes, in form of VCF file, derived from microarray data (i.e. TOPMED or UMich server)
  29.  
  30. #Create BED/BIM/FAM file from imputed VCF
  31. plink --vcf microarray_TOPMED_impute.vcf.gz --out microarray_TOPMED_impute
  32.  
  33. #PLINK QC filters: Cohort MAF <0.05, Hardy-Weinberg equilibrium <1e-6 (under selection), geno<0.01 (SNPs missing in >1% of subjects), mind<0.01 (Individuals with >1% missing genotypes)
  34. plink --bfile microarray_TOPMED_impute --maf 0.05 --hwe 1e-6 --geno 0.01 --make-bed --out microarray_final_QC
  35. plink --bfile microarray_final_QC --mind 0.01 --write-snplist --make-bed --out microarray_final_FAM_QC
  36.  
  37. #Remove samples with extremely high heterozygosity
  38. #Select SNPs in 200kb/50 variant windows that have LD r^2 scores of >0.25
  39. plink --bfile microarray_final_FAM_QC --keep microarray_final_FAM_QC.fam --extract microarray_final_FAM_QC.snplist \
  40.  --indep-pairwise 200 50 0.25 --out microarray_final_FAM_QC
  41. #Calculate F coefficient samples to estimate heterozygosity rate in each sample
  42. plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --keep microarray_final_FAM_QC.fam \
  43.  --het --out microarray_final_FAM_QC
  44.  
  45. #Remove samples with >3SD heterozygosity rate, using the following R script:
  46. Rscript sample_heterozygosity_check.R
  47. #   library(data.table)
  48. #   dat <- fread("microarray_final_FAM_QC.het")
  49. #   # Get samples with F coefficient within 3 SD of the population mean
  50. #   valid <- dat[F<=mean(F)+3*sd(F) & F>=mean(F)-3*sd(F)]
  51. #   invalid <- dat[F>mean(F)+3*sd(F) | F<mean(F)-3*sd(F)]
  52. #   # print FID and IID for valid samples
  53. #   fwrite(valid[,c("FID","IID")], "microarray_final_FAM_QC.valid.sample", sep="\t")
  54. #   fwrite(invalid[,c("FID","IID","F")], "microarray_final_FAM_QC.invalid.sample", sep="\t")
  55.  
  56. #Check sex of samples: Generate sexcheck file in PLINK
  57. plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --keep microarray_final_FAM_QC.valid.sample --check-sex --out microarray_final_FAM_QC
  58. #If ant samples do not have mtahcing sexes, add sample to microarray_final_FAM_QC.invalid.sample and remove from microarray_final_FAM_QC.valid.sample
  59.  
  60. #Calculate 6 principal component eigenvectors for population stratification (downstream analysis)
  61. plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --pca 6 --out microarray_final_FAM_QC
  62.  
  63. #Remove duplicate SNPs from PLINK files
  64. cut -f 2 microarray_final_FAM_QC.bim | sort | uniq -d > dup_snps.txt
  65. #If any duplicate SNPs, add this flag to following PLINK command: --exclude dup_snps.txt
  66.  
  67. #Output final set of QC-passing genotypes/samples
  68. plink --bfile microarray_final_FAM_QC --make-bed --keep microarray_final_FAM_QC.valid.sample \
  69. --out microarray_final --extract microarray_final_FAM_QC.snplist --write-snplist
  70.  
  71.  
  72. #Step 3: Perform strand-flipping analysis for aligning SNPs to GWAS statistics
  73.  
  74. #Copy BIM file to retain original for downstream strand-flipping analysis
  75. cp microarray_final.bim microarray_final_original.bim
  76.  
  77. #Adjust mismatched SNPs using strand-flipping in R
  78. Rscript strand_flipping.R
  79.  
  80. ##NOTE: Modify the following script as needed to account for column order and names within GWAS summary ststistic file
  81. #Also, this step should be re-run for each GWAS dataset to be used
  82.  
  83. #Contents of strand_flipping.R:
  84.  
  85. # library(data.table)
  86. # library(magrittr)
  87. # bim <- fread("microarray_final_original.bim") %>%
  88. #     setnames(., colnames(.), c("CHR", "SNP", "CM", "BP", "B.A1", "B.A2")) %>%
  89. #     #Change alleles to upper cases
  90. #     .[,c("B.A1","B.A2"):=list(toupper(B.A1), toupper(B.A2))]
  91. # # Read in GWAS summary statistic data (require data.table v1.12.0+)
  92. # # NOTE: Modify the following line as needed to account for column order and names within GWAS summary ststistic file
  93. # gwas <- fread("sumstats_final_transformed.txt.gz", fill=T) %>%
  94. #     #setnames(., colnames(.), c("CHR", "SNP", "A1", "A2", "BP", "INFO", "OR", "SE", "P", "NGT")) %>%
  95. #     .[,c("CHR"):=list(strtoi(CHR))] #%>% #Include if column needs recast
  96. #     #.[,c("A1","A2"):=list(toupper(A1), toupper(A2))]
  97. # qc <- fread("microarray_final_FAM_QC.snplist", header=F, fill=T)
  98. # #Merge data and filter for QC SNPs
  99. # info <- merge(bim, gwas, by=c("SNP", "CHR", "BP")) %>%
  100. #     .[SNP %in% qc[,V1]]
  101. # # Function for finding the complementary allele
  102. # complement <- function(x) {switch (x, "A" = "T", "C" = "G", "T" = "A", "G" = "C", return(NA)) }
  103. # # Get SNPs that have the same alleles across base and target
  104. # info.match <- info[A1 == B.A1 & A2 == B.A2, SNP]
  105. # # Identify SNPs that are complementary between base and target, and update BIM file
  106. # com.snps <- info[sapply(B.A1, complement) == A1 & sapply(B.A2, complement) == A2, SNP]
  107. # bim[SNP %in% com.snps, c("B.A1", "B.A2") := list(sapply(B.A1, complement), sapply(B.A2, complement))]
  108. # # Identify SNPs that need recoding and update BIM file
  109. # recode.snps <- info[B.A1==A2 & B.A2==A1, SNP]
  110. # bim[SNP %in% recode.snps, c("B.A1", "B.A2") := list(B.A2, B.A1)]
  111. # # identify SNPs that need recoding & complement, and update BIM file
  112. # com.recode <- info[sapply(B.A1, complement) == A2 & sapply(B.A2, complement) == A1, SNP]
  113. # bim[SNP %in% com.recode, c("B.A1", "B.A2") :=list(sapply(B.A2, complement), sapply(B.A1, complement))]
  114. # # Write the updated bim file
  115. # fwrite(bim, "microarray_final.bim", col.names=F, sep="\t")
  116. # #Output SNPs that do not match with target file
  117. # mismatch <- bim[!(SNP %in% info.match | SNP %in% com.snps | SNP %in% recode.snps | SNP %in% com.recode), SNP]
  118. # write.table(mismatch, "microarray_gwas_mismatch.txt", quote=F, row.names=F, col.names=F)
  119.  
  120. #Output final file with flipped SNPs
  121. plink --bfile microarray_final --make-bed --keep microarray_final_FAM_QC.valid.sample \
  122. --out microarray_final_flip --extract microarray_final.snplist
  123.  
  124.  
  125. #Step 4A: Calculate PRS using PLINK-Clump method
  126.  
  127. #Perform clump step within PLINK to cluster SNPs that are in linkage disequilibrium and are associated with the phenotype of interest
  128. #Parameters: p1=select all SNPs for clumping, r2=remove SNPs with r2 index (haplotype MLE) >0.1, clump SNPs within 250kbp of target SNP
  129. plink --bfile microarray_final_flip --clump-p1 1 --clump-r2 0.1 --clump-kb 250 --clump sumstats_final_transformed.txt.gz \
  130. --clump-snp-field SNP --clump-field P --out microarray_final_clump
  131. #Extract index SNPs selected for downstream analysis
  132. awk 'NR!=1{print $3}' microarray_final_clump.clumped > microarray_final_clump.valid.snp
  133.  
  134. #Generate file for ranges of p-values to test
  135. echo "0.001 0 0.001" > range_list.txt
  136. echo "0.05 0 0.05" >> range_list.txt
  137. echo "0.1 0 0.1" >> range_list.txt
  138. echo "0.2 0 0.2" >> range_list.txt
  139. echo "0.3 0 0.3" >> range_list.txt
  140. echo "0.4 0 0.4" >> range_list.txt
  141. echo "0.5 0 0.5" >> range_list.txt
  142.  
  143. #Generate input SNP list file for PRS calc (SNP ID and p-value)
  144. zcat sumstats_final_transformed.txt.gz | awk '{print $2,$9}' > sumstats.SNP.pvalue.txt
  145.  
  146. #Perform PRS calculation
  147. #Denote SNP ID, effective allele (A1), and effect size of GWAS statistic in --score parameter
  148. plink --bfile microarray_final_clump --score sumstats_final_transformed.txt.gz 2 4 6 header
  149. --q-score-range range_list.txt sumstats.SNP.pvalue.txt --extract microarray_final_clump.valid.snp --out microarray_PRS
  150.  
  151.  
  152. #Step 4B: Calculate PRS using LDPred2 method
  153.  
  154. Rscript ldpred2_prs.R #See script file for more details
  155.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement