Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #Polygenic risk score calculation pipeline
- #Wrtten by: Matthew Jensen, Girirajan lab, Penn State University (May 2021)
- #Pipeline modified by MJ for Gerstein lab, Yale University (June 2022)
- #Pipeline steps adapted from: https://choishingwan.github.io/PRS-Tutorial/base/
- #Step 1: Quality control of GWAS summary statistics files
- #Filter for imputation INFO score >0.8 (when available); skip MAF>0.01 filter, as most GWAS statistics don't have minor allele frequencies available due to privacy concerns
- gunzip -c sumstats_sum_ctg_format.txt.gz |awk 'NR==1 || ($14 > 0.8) {print}' |gzip > sumstats_info_filter.txt.gz
- #Filter out ambiguous SNPs (SNPs where the alternate allele is complementary to the reference allele)
- gunzip -c sumstats_info_filter.txt.gz | awk '!( ($3=="A" && $4=="T") || ($3=="T" && $4=="A") || ($3=="G" && $4=="C") \
- || ($3=="C" && $4=="G")) {print}' | gzip > sumstats_ambig_snp_filter.txt.gz
- #Remove duplicate SNPs (if any) from datasets
- gunzip -c sumstats_ambig_snp_filter.txt.gz |awk '{ print $2}' |sort | uniq -d > dup_snp.txt
- gunzip -c sumstats_ambig_snp_filter.txt.gz |grep -vf dup_snp.txt |gzip - > sumstats_final.txt.gz
- #Log-transform ORs of summary statistics in R (skip this step for continuous traits, such as educational attainment or height)
- Rscript log_transform_stats.R
- #Contents of log_transform_stats.R:
- # library(data.table)
- # dat <- fread("sumstats_final.txt.gz", fill=T)
- # fwrite(dat[,OR:=log(OR)], "sumstats_final_transformed.txt.gz", sep="\t")
- #Step 2: QC filtering of imputed genotypes
- #Start with imputed genotypes, in form of VCF file, derived from microarray data (i.e. TOPMED or UMich server)
- #Create BED/BIM/FAM file from imputed VCF
- plink --vcf microarray_TOPMED_impute.vcf.gz --out microarray_TOPMED_impute
- #PLINK QC filters: Cohort MAF <0.05, Hardy-Weinberg equilibrium <1e-6 (under selection), geno<0.01 (SNPs missing in >1% of subjects), mind<0.01 (Individuals with >1% missing genotypes)
- plink --bfile microarray_TOPMED_impute --maf 0.05 --hwe 1e-6 --geno 0.01 --make-bed --out microarray_final_QC
- plink --bfile microarray_final_QC --mind 0.01 --write-snplist --make-bed --out microarray_final_FAM_QC
- #Remove samples with extremely high heterozygosity
- #Select SNPs in 200kb/50 variant windows that have LD r^2 scores of >0.25
- plink --bfile microarray_final_FAM_QC --keep microarray_final_FAM_QC.fam --extract microarray_final_FAM_QC.snplist \
- --indep-pairwise 200 50 0.25 --out microarray_final_FAM_QC
- #Calculate F coefficient samples to estimate heterozygosity rate in each sample
- plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --keep microarray_final_FAM_QC.fam \
- --het --out microarray_final_FAM_QC
- #Remove samples with >3SD heterozygosity rate, using the following R script:
- Rscript sample_heterozygosity_check.R
- # library(data.table)
- # dat <- fread("microarray_final_FAM_QC.het")
- # # Get samples with F coefficient within 3 SD of the population mean
- # valid <- dat[F<=mean(F)+3*sd(F) & F>=mean(F)-3*sd(F)]
- # invalid <- dat[F>mean(F)+3*sd(F) | F<mean(F)-3*sd(F)]
- # # print FID and IID for valid samples
- # fwrite(valid[,c("FID","IID")], "microarray_final_FAM_QC.valid.sample", sep="\t")
- # fwrite(invalid[,c("FID","IID","F")], "microarray_final_FAM_QC.invalid.sample", sep="\t")
- #Check sex of samples: Generate sexcheck file in PLINK
- plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --keep microarray_final_FAM_QC.valid.sample --check-sex --out microarray_final_FAM_QC
- #If ant samples do not have mtahcing sexes, add sample to microarray_final_FAM_QC.invalid.sample and remove from microarray_final_FAM_QC.valid.sample
- #Calculate 6 principal component eigenvectors for population stratification (downstream analysis)
- plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --pca 6 --out microarray_final_FAM_QC
- #Remove duplicate SNPs from PLINK files
- cut -f 2 microarray_final_FAM_QC.bim | sort | uniq -d > dup_snps.txt
- #If any duplicate SNPs, add this flag to following PLINK command: --exclude dup_snps.txt
- #Output final set of QC-passing genotypes/samples
- plink --bfile microarray_final_FAM_QC --make-bed --keep microarray_final_FAM_QC.valid.sample \
- --out microarray_final --extract microarray_final_FAM_QC.snplist --write-snplist
- #Step 3: Perform strand-flipping analysis for aligning SNPs to GWAS statistics
- #Copy BIM file to retain original for downstream strand-flipping analysis
- cp microarray_final.bim microarray_final_original.bim
- #Adjust mismatched SNPs using strand-flipping in R
- Rscript strand_flipping.R
- ##NOTE: Modify the following script as needed to account for column order and names within GWAS summary ststistic file
- #Also, this step should be re-run for each GWAS dataset to be used
- #Contents of strand_flipping.R:
- # library(data.table)
- # library(magrittr)
- # bim <- fread("microarray_final_original.bim") %>%
- # setnames(., colnames(.), c("CHR", "SNP", "CM", "BP", "B.A1", "B.A2")) %>%
- # #Change alleles to upper cases
- # .[,c("B.A1","B.A2"):=list(toupper(B.A1), toupper(B.A2))]
- # # Read in GWAS summary statistic data (require data.table v1.12.0+)
- # # NOTE: Modify the following line as needed to account for column order and names within GWAS summary ststistic file
- # gwas <- fread("sumstats_final_transformed.txt.gz", fill=T) %>%
- # #setnames(., colnames(.), c("CHR", "SNP", "A1", "A2", "BP", "INFO", "OR", "SE", "P", "NGT")) %>%
- # .[,c("CHR"):=list(strtoi(CHR))] #%>% #Include if column needs recast
- # #.[,c("A1","A2"):=list(toupper(A1), toupper(A2))]
- # qc <- fread("microarray_final_FAM_QC.snplist", header=F, fill=T)
- # #Merge data and filter for QC SNPs
- # info <- merge(bim, gwas, by=c("SNP", "CHR", "BP")) %>%
- # .[SNP %in% qc[,V1]]
- # # Function for finding the complementary allele
- # complement <- function(x) {switch (x, "A" = "T", "C" = "G", "T" = "A", "G" = "C", return(NA)) }
- # # Get SNPs that have the same alleles across base and target
- # info.match <- info[A1 == B.A1 & A2 == B.A2, SNP]
- # # Identify SNPs that are complementary between base and target, and update BIM file
- # com.snps <- info[sapply(B.A1, complement) == A1 & sapply(B.A2, complement) == A2, SNP]
- # bim[SNP %in% com.snps, c("B.A1", "B.A2") := list(sapply(B.A1, complement), sapply(B.A2, complement))]
- # # Identify SNPs that need recoding and update BIM file
- # recode.snps <- info[B.A1==A2 & B.A2==A1, SNP]
- # bim[SNP %in% recode.snps, c("B.A1", "B.A2") := list(B.A2, B.A1)]
- # # identify SNPs that need recoding & complement, and update BIM file
- # com.recode <- info[sapply(B.A1, complement) == A2 & sapply(B.A2, complement) == A1, SNP]
- # bim[SNP %in% com.recode, c("B.A1", "B.A2") :=list(sapply(B.A2, complement), sapply(B.A1, complement))]
- # # Write the updated bim file
- # fwrite(bim, "microarray_final.bim", col.names=F, sep="\t")
- # #Output SNPs that do not match with target file
- # mismatch <- bim[!(SNP %in% info.match | SNP %in% com.snps | SNP %in% recode.snps | SNP %in% com.recode), SNP]
- # write.table(mismatch, "microarray_gwas_mismatch.txt", quote=F, row.names=F, col.names=F)
- #Output final file with flipped SNPs
- plink --bfile microarray_final --make-bed --keep microarray_final_FAM_QC.valid.sample \
- --out microarray_final_flip --extract microarray_final.snplist
- #Step 4A: Calculate PRS using PLINK-Clump method
- #Perform clump step within PLINK to cluster SNPs that are in linkage disequilibrium and are associated with the phenotype of interest
- #Parameters: p1=select all SNPs for clumping, r2=remove SNPs with r2 index (haplotype MLE) >0.1, clump SNPs within 250kbp of target SNP
- plink --bfile microarray_final_flip --clump-p1 1 --clump-r2 0.1 --clump-kb 250 --clump sumstats_final_transformed.txt.gz \
- --clump-snp-field SNP --clump-field P --out microarray_final_clump
- #Extract index SNPs selected for downstream analysis
- awk 'NR!=1{print $3}' microarray_final_clump.clumped > microarray_final_clump.valid.snp
- #Generate file for ranges of p-values to test
- echo "0.001 0 0.001" > range_list.txt
- echo "0.05 0 0.05" >> range_list.txt
- echo "0.1 0 0.1" >> range_list.txt
- echo "0.2 0 0.2" >> range_list.txt
- echo "0.3 0 0.3" >> range_list.txt
- echo "0.4 0 0.4" >> range_list.txt
- echo "0.5 0 0.5" >> range_list.txt
- #Generate input SNP list file for PRS calc (SNP ID and p-value)
- zcat sumstats_final_transformed.txt.gz | awk '{print $2,$9}' > sumstats.SNP.pvalue.txt
- #Perform PRS calculation
- #Denote SNP ID, effective allele (A1), and effect size of GWAS statistic in --score parameter
- plink --bfile microarray_final_clump --score sumstats_final_transformed.txt.gz 2 4 6 header
- --q-score-range range_list.txt sumstats.SNP.pvalue.txt --extract microarray_final_clump.valid.snp --out microarray_PRS
- #Step 4B: Calculate PRS using LDPred2 method
- Rscript ldpred2_prs.R #See script file for more details
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement