PRS_preprocessing_pipeline.sh

#Polygenic risk score calculation pipeline
#Wrtten by: Matthew Jensen, Girirajan lab, Penn State University (May 2021)
#Pipeline modified by MJ for Gerstein lab, Yale University (June 2022)
#Pipeline steps adapted from: https://choishingwan.github.io/PRS-Tutorial/base/


#Step 1: Quality control of GWAS summary statistics files
#Filter for imputation INFO score >0.8 (when available); skip MAF>0.01 filter, as most GWAS statistics don't have minor allele frequencies available due to privacy concerns
gunzip -c sumstats_sum_ctg_format.txt.gz |awk 'NR==1 || ($14 > 0.8) {print}' |gzip  > sumstats_info_filter.txt.gz

#Filter out ambiguous SNPs (SNPs where the alternate allele is complementary to the reference allele)
gunzip -c sumstats_info_filter.txt.gz | awk '!( ($3=="A" && $4=="T") || ($3=="T" && $4=="A") || ($3=="G" && $4=="C") \
|| ($3=="C" && $4=="G")) {print}' | gzip > sumstats_ambig_snp_filter.txt.gz

#Remove duplicate SNPs (if any) from datasets
gunzip -c sumstats_ambig_snp_filter.txt.gz |awk '{ print $2}' |sort | uniq -d > dup_snp.txt
gunzip -c sumstats_ambig_snp_filter.txt.gz |grep -vf dup_snp.txt |gzip - > sumstats_final.txt.gz

#Log-transform ORs of summary statistics in R (skip this step for continuous traits, such as educational attainment or height)
Rscript log_transform_stats.R
#Contents of log_transform_stats.R:
#   library(data.table)
#   dat <- fread("sumstats_final.txt.gz", fill=T)
#   fwrite(dat[,OR:=log(OR)], "sumstats_final_transformed.txt.gz", sep="\t")


#Step 2: QC filtering of imputed genotypes
#Start with imputed genotypes, in form of VCF file, derived from microarray data (i.e. TOPMED or UMich server)

#Create BED/BIM/FAM file from imputed VCF
plink --vcf microarray_TOPMED_impute.vcf.gz --out microarray_TOPMED_impute

#PLINK QC filters: Cohort MAF <0.05, Hardy-Weinberg equilibrium <1e-6 (under selection), geno<0.01 (SNPs missing in >1% of subjects), mind<0.01 (Individuals with >1% missing genotypes)
plink --bfile microarray_TOPMED_impute --maf 0.05 --hwe 1e-6 --geno 0.01 --make-bed --out microarray_final_QC
plink --bfile microarray_final_QC --mind 0.01 --write-snplist --make-bed --out microarray_final_FAM_QC

#Remove samples with extremely high heterozygosity
#Select SNPs in 200kb/50 variant windows that have LD r^2 scores of >0.25
plink --bfile microarray_final_FAM_QC --keep microarray_final_FAM_QC.fam --extract microarray_final_FAM_QC.snplist \
 --indep-pairwise 200 50 0.25 --out microarray_final_FAM_QC
#Calculate F coefficient samples to estimate heterozygosity rate in each sample
plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --keep microarray_final_FAM_QC.fam \
 --het --out microarray_final_FAM_QC

#Remove samples with >3SD heterozygosity rate, using the following R script:
Rscript sample_heterozygosity_check.R
#   library(data.table)
#   dat <- fread("microarray_final_FAM_QC.het")
#   # Get samples with F coefficient within 3 SD of the population mean
#   valid <- dat[F<=mean(F)+3*sd(F) & F>=mean(F)-3*sd(F)]
#   invalid <- dat[F>mean(F)+3*sd(F) | F<mean(F)-3*sd(F)]
#   # print FID and IID for valid samples
#   fwrite(valid[,c("FID","IID")], "microarray_final_FAM_QC.valid.sample", sep="\t")
#   fwrite(invalid[,c("FID","IID","F")], "microarray_final_FAM_QC.invalid.sample", sep="\t")

#Check sex of samples: Generate sexcheck file in PLINK
plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --keep microarray_final_FAM_QC.valid.sample --check-sex --out microarray_final_FAM_QC
#If ant samples do not have mtahcing sexes, add sample to microarray_final_FAM_QC.invalid.sample and remove from microarray_final_FAM_QC.valid.sample

#Calculate 6 principal component eigenvectors for population stratification (downstream analysis)
plink --bfile microarray_final_FAM_QC --extract microarray_final_FAM_QC.prune.in --pca 6 --out microarray_final_FAM_QC

#Remove duplicate SNPs from PLINK files
cut -f 2 microarray_final_FAM_QC.bim | sort | uniq -d > dup_snps.txt
#If any duplicate SNPs, add this flag to following PLINK command: --exclude dup_snps.txt

#Output final set of QC-passing genotypes/samples
plink --bfile microarray_final_FAM_QC --make-bed --keep microarray_final_FAM_QC.valid.sample \
--out microarray_final --extract microarray_final_FAM_QC.snplist --write-snplist


#Step 3: Perform strand-flipping analysis for aligning SNPs to GWAS statistics

#Copy BIM file to retain original for downstream strand-flipping analysis
cp microarray_final.bim microarray_final_original.bim

#Adjust mismatched SNPs using strand-flipping in R
Rscript strand_flipping.R

##NOTE: Modify the following script as needed to account for column order and names within GWAS summary ststistic file
#Also, this step should be re-run for each GWAS dataset to be used

#Contents of strand_flipping.R:

# library(data.table)
# library(magrittr)
# bim <- fread("microarray_final_original.bim") %>%
#     setnames(., colnames(.), c("CHR", "SNP", "CM", "BP", "B.A1", "B.A2")) %>%
#     #Change alleles to upper cases
#     .[,c("B.A1","B.A2"):=list(toupper(B.A1), toupper(B.A2))]
# # Read in GWAS summary statistic data (require data.table v1.12.0+)
# # NOTE: Modify the following line as needed to account for column order and names within GWAS summary ststistic file
# gwas <- fread("sumstats_final_transformed.txt.gz", fill=T) %>%
#     #setnames(., colnames(.), c("CHR", "SNP", "A1", "A2", "BP", "INFO", "OR", "SE", "P", "NGT")) %>%
#     .[,c("CHR"):=list(strtoi(CHR))] #%>% #Include if column needs recast
#     #.[,c("A1","A2"):=list(toupper(A1), toupper(A2))]
# qc <- fread("microarray_final_FAM_QC.snplist", header=F, fill=T)
# #Merge data and filter for QC SNPs
# info <- merge(bim, gwas, by=c("SNP", "CHR", "BP")) %>%
#     .[SNP %in% qc[,V1]]
# # Function for finding the complementary allele
# complement <- function(x) {switch (x, "A" = "T", "C" = "G", "T" = "A", "G" = "C", return(NA)) }
# # Get SNPs that have the same alleles across base and target
# info.match <- info[A1 == B.A1 & A2 == B.A2, SNP]
# # Identify SNPs that are complementary between base and target, and update BIM file
# com.snps <- info[sapply(B.A1, complement) == A1 & sapply(B.A2, complement) == A2, SNP]
# bim[SNP %in% com.snps, c("B.A1", "B.A2") := list(sapply(B.A1, complement), sapply(B.A2, complement))]
# # Identify SNPs that need recoding and update BIM file
# recode.snps <- info[B.A1==A2 & B.A2==A1, SNP]
# bim[SNP %in% recode.snps, c("B.A1", "B.A2") := list(B.A2, B.A1)]
# # identify SNPs that need recoding & complement, and update BIM file
# com.recode <- info[sapply(B.A1, complement) == A2 & sapply(B.A2, complement) == A1, SNP]
# bim[SNP %in% com.recode, c("B.A1", "B.A2") :=list(sapply(B.A2, complement), sapply(B.A1, complement))]
# # Write the updated bim file
# fwrite(bim, "microarray_final.bim", col.names=F, sep="\t")
# #Output SNPs that do not match with target file
# mismatch <- bim[!(SNP %in% info.match | SNP %in% com.snps | SNP %in% recode.snps | SNP %in% com.recode), SNP]
# write.table(mismatch, "microarray_gwas_mismatch.txt", quote=F, row.names=F, col.names=F)

#Output final file with flipped SNPs
plink --bfile microarray_final --make-bed --keep microarray_final_FAM_QC.valid.sample \
--out microarray_final_flip --extract microarray_final.snplist


#Step 4A: Calculate PRS using PLINK-Clump method

#Perform clump step within PLINK to cluster SNPs that are in linkage disequilibrium and are associated with the phenotype of interest
#Parameters: p1=select all SNPs for clumping, r2=remove SNPs with r2 index (haplotype MLE) >0.1, clump SNPs within 250kbp of target SNP
plink --bfile microarray_final_flip --clump-p1 1 --clump-r2 0.1 --clump-kb 250 --clump sumstats_final_transformed.txt.gz \
--clump-snp-field SNP --clump-field P --out microarray_final_clump
#Extract index SNPs selected for downstream analysis
awk 'NR!=1{print $3}' microarray_final_clump.clumped > microarray_final_clump.valid.snp

#Generate file for ranges of p-values to test
echo "0.001 0 0.001" > range_list.txt
echo "0.05 0 0.05" >> range_list.txt
echo "0.1 0 0.1" >> range_list.txt
echo "0.2 0 0.2" >> range_list.txt
echo "0.3 0 0.3" >> range_list.txt
echo "0.4 0 0.4" >> range_list.txt
echo "0.5 0 0.5" >> range_list.txt

#Generate input SNP list file for PRS calc (SNP ID and p-value)
zcat sumstats_final_transformed.txt.gz | awk '{print $2,$9}' > sumstats.SNP.pvalue.txt

#Perform PRS calculation
#Denote SNP ID, effective allele (A1), and effect size of GWAS statistic in --score parameter
plink --bfile microarray_final_clump --score sumstats_final_transformed.txt.gz 2 4 6 header
--q-score-range range_list.txt sumstats.SNP.pvalue.txt --extract microarray_final_clump.valid.snp --out microarray_PRS


#Step 4B: Calculate PRS using LDPred2 method

Rscript ldpred2_prs.R #See script file for more details