Advertisement
jhangyu

Batch_all_RNA_Seq.sh

Apr 19th, 2018
43
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 7.44 KB | None | 0 0
  1. fastqDir=/data
  2. genomeDir=/data
  3. genomeFile=tilapia.fa
  4. genomeName=${genomeFile%.*}
  5. gffFile=tilapia_20171005_with_yp_genes.gff3
  6. gffName=${gffFile%.*}
  7. project_species=tilapia
  8. R1_Append=_R1.fastq.gz
  9. R2_Append=_R2.fastq.gz
  10.  
  11.  
  12. cpu_threads=$(nproc)
  13. gatk_path=$(echo $(which gatk)/$(readlink $(which gatk)) | sed 's/bin\/gatk\/\.\.\///g' | sed 's/gatk$//g')$(ls $(echo $(which gatk)/$(readlink $(which gatk)) | sed 's/bin\/gatk\/\.\.\///g' | sed 's/gatk$//g') | grep -P -o '.*.jar$')
  14. picard_path=$(echo $(which picard)/$(readlink $(which picard)) | sed 's/bin\/picard\/\.\.\///g').jar
  15. java_path=$(which java)
  16. trimmomatic_path=$(echo $(which trimmomatic)/$(readlink $(which trimmomatic)) | sed 's/bin\/trimmomatic\/\.\.\///g' | sed 's/\/trimmomatic$//g')
  17.  
  18. cd ${fastqDir}
  19. for ID in $(ls | grep -P -o '.*(?=(_R1.fastq.gz))')
  20. do
  21.     if [ ! -f ${ID}_R1_paired.fastq ] && [ ! -f ${ID}_sambamba.bam ];then
  22.         echo process trimming ${ID}
  23.         nohup trimmomatic PE -threads ${cpu_threads} ${ID}${R1_Append} ${ID}${R2_Append} ${ID}_R1_paired.fastq.gz ${ID}_R1_unpaired.fastq.gz ${ID}_R2_paired.fastq.gz ${ID}_R2_unpaired.fastq.gz ILLUMINACLIP:${trimmomatic_path}/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:30 &
  24.         pids="$pids $!"
  25.         echo
  26.         echo start waitting for finish ${ID} trimming
  27.         wait $pids
  28.     fi
  29. done
  30.  
  31. echo ====== Finish Trimming ======
  32. echo
  33. echo ====== Start Mapping ======
  34.  
  35. for ID in $(ls | grep -P -o '.*(?=(_R1_paired.fastq.gz))')
  36. do
  37.     if [ ! -f ${ID}.sam ] && [ ! -f ${ID}_rg_added_sorted.bam ];then
  38.  
  39.         ### Check if splicesites file exist or not ###
  40.         if [ ! -f ${genomeDir}/${gffName}_splicesites.txt ]; then
  41.             cd ${genomeDir}
  42.             if [ ! ${gffName}.gtf ]; then
  43.                 gffread ${gffFile} -T -o ${gffName}.gtf
  44.                 pids="$pids $!"
  45.                 echo
  46.                 echo start waitting Transform GFF to GTF
  47.                 wait $pids
  48.             fi
  49.  
  50.             hisat2_extract_splice_sites.py ${gffName}.gtf > ${gffName}_splicesites.txt
  51.             pids="$pids $!"
  52.             echo
  53.             echo start waitting for creating Splicesites
  54.             wait $pids
  55.  
  56.             cd ${fastqDir}
  57.         fi
  58.  
  59.         ### Create HISAT2 index ###
  60.         if [ ! -f ${genomeDir}/${genomeFile}.1.ht2 ]; then
  61.             cd ${genomeDir}
  62.             hisat2-build -p ${cpu_threads} ${genomeFile} ${genomeFile}
  63.             pids="$pids $!"
  64.             echo
  65.             echo start waitting for Building HISAT2 index
  66.             wait $pids
  67.             cd ${fastqDir}
  68.         fi
  69.  
  70.         echo process mapping ${ID}
  71.         nohup hisat2 -p ${cpu_threads} -x ${genomeDir}/${genomeFile} -1 ${ID}_R1_paired.fastq.gz -2 ${ID}_R2_paired.fastq.gz --known-splicesite-infile ${genomeDir}/${gffName}_splicesites.txt -S ${ID}.sam &
  72.         pids="$pids $!"
  73.         echo
  74.         echo start waitting for finish ${ID} mapping
  75.         wait $pids
  76.         if [ -f ${ID}_R1.fastq ];then
  77.             echo ====== Deleting Ori FASTQ File ======
  78.             rm ${ID}${R1_Append}
  79.             rm ${ID}${R2_Append}
  80.             rm ${ID}_trim.log
  81.         fi
  82.     fi
  83.     if [ ! -f ${ID}_sambamba.bam ];then
  84.         echo process Sambamba ${ID}
  85.         nohup sambamba view -S ${ID}.sam -t ${cpu_threads} -f bam -o ${ID}_sambamba.bam &
  86.         pids="$pids $!"
  87.         echo
  88.         echo start waitting for finish ${ID} Sambamba Convert
  89.         wait $pids
  90.  
  91.         echo process Sambamba Sort ${ID}
  92.         nohup sambamba sort ${ID}_sambamba.bam -t ${cpu_threads} --tmpdir ./tmp &
  93.         pids="$pids $!"
  94.         echo
  95.         echo start waitting for finish ${ID} Sambamba Sort
  96.         wait $pids
  97.         if [ -f ${ID}_R1_paired.fastq.gz ];then
  98.             echo ====== Deleting FASTQ File ======
  99.         #   rm ${ID}_R1_paired.fastq
  100.         #   rm ${ID}_R2_paired.fastq
  101.             rm ${ID}_R1_unpaired.fastq.gz
  102.             rm ${ID}_R2_unpaired.fastq.gz
  103.         fi
  104.     fi
  105. done
  106. for ID in $(ls | grep -P -o '.*(?=(_sambamba.sorted.bam))')
  107. do
  108.     if [ ! -d ${ID}_Ballgown_Table ];then
  109.         echo process StringTie ${ID}
  110.         nohup stringtie ${ID}_sambamba.sorted.bam -eG ${genomeDir}/${gffFile} -p ${cpu_threads} -b ${ID}_Ballgown_Table > ${ID}_stringtie.log &
  111.         pids="$pids $!"
  112.     fi
  113. done
  114.     echo
  115.     echo start waitting for finish ${ID} StringTie
  116.     wait $pids
  117.     echo finish ${ID} StringTie
  118.  
  119. echo ====== Finish Sambamba-StringTie Pipeline ======
  120. echo
  121. echo ====== Start Variant Calling =======
  122.  
  123. for ID in $(ls | grep -P -o '.*(?=(.sam))')
  124. do
  125.     ### Check fai index exist ###
  126.     if [ ! -f ${genomeDir}/${genomeFile}.fai ];then
  127.         cd ${genomeDir}/
  128.         samtools faidx ${genomeFile}
  129.         cd ${fastqDir}/
  130.     fi
  131.     ### Check dict exist ###
  132.     if [ ! -f ${genomeDir}/${genomeName}.dict ];then
  133.         cd ${genomeDir}
  134.         picard CreateSequenceDictionary R= ${genomeFile} O= ${genomeName}.dict
  135.         cd ${fastqDir}
  136.     fi
  137.     if [ ! -f ${ID}_rg_added_sorted.bam ];then
  138.         echo process AddOrReplaceReadGroups ${ID}
  139.         RGID=$(grep -v @ ${ID}.sam | head -1 | cut -f 1 | cut -d ':' -f 3 | cut -c -5)
  140.         RGPU=$(grep -v @ ${ID}.sam | head -1 | cut -f 1 | cut -d ':' -f 3)
  141.         RGLB=$(echo ${ID} | cut -d "_" -f 1)
  142.         nohup ${java_path} -Djava.io.tmpdir=/data/tmp -Xms1g -Xmx4g -jar ${picard_path} AddOrReplaceReadGroups I=${ID}.sam O=${ID}_rg_added_sorted.bam SO=coordinate RGID=${RGID} RGLB=${RGLB} RGPL=illumina RGPU=${RGPU} RGSM=${ID} &
  143.         pids="$pids $!"
  144.         echo
  145.         echo start waitting for finish ${ID} AddOrReplaceReadGroups
  146.         wait $pids
  147.     fi
  148. done
  149.  
  150. for ID in $(ls | grep -P -o '.*(?=(_rg_added_sorted.bam))')
  151. do
  152.     if [ ! -f ${ID}_dedupped.bam ];then
  153.         echo process MarkDuplicates ${ID}
  154.         nohup ${java_path} -Djava.io.tmpdir=/data/tmp -Xms1g -Xmx4g -jar ${picard_path} MarkDuplicates I=${ID}_rg_added_sorted.bam O=${ID}_dedupped.bam  CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics &
  155.         pids="$pids $!"
  156.         echo
  157.         echo start waitting for finish ${ID} MarkDuplicates
  158.         wait $pids
  159.         if [ -f ${ID}.sam ];then
  160.             echo ====== Deleting SAM File ======
  161.             rm ${ID}.sam
  162.         fi
  163.     fi
  164. done
  165.  
  166. for ID in $(ls | grep -P -o '.*(?=(_dedupped.bam))')
  167. do
  168.     if [ ! -f ${ID}_split.bam ];then
  169.         echo process SplitNCigarReads ${ID}
  170.         nohup gatk --java-options "-Djava.io.tmpdir=/data/tmp -Xmx20g" SplitNCigarReads -R ${genomeDir}/${genomeFile} -I ${ID}_dedupped.bam -O ${ID}_split.bam &
  171.         pids="$pids $!"
  172.         echo
  173.         echo start waitting for finish ${ID} SplitNCigarReads
  174.         wait $pids
  175.     fi
  176. done
  177.  
  178. for ID in $(ls | grep -P -o '.*(?=(_split.bam))')
  179. do
  180.     if [ ! -f ${ID}.vcf ];then
  181.         echo process Variant Calling ${ID}
  182.         nohup gatk --java-options "-Djava.io.tmpdir=/data/tmp -Xms1g -Xmx20g" HaplotypeCaller -R ${genomeDir}/${genomeFile} -I ${ID}_split.bam -stand-call-conf 20.0 -O ${ID}.vcf -ERC GVCF &
  183.         pids="$pids $!"
  184.         echo
  185.         echo start waitting for finish ${ID} Variant Calling
  186.         wait $pids
  187.     fi
  188. done
  189. echo ====== Finish Variant Calling ======
  190. echo
  191. echo ====== Start VEP Discovering ======
  192.  
  193. for ID in $(ls | grep -P -o '.*(?=(.vcf))')
  194. do
  195.     if [ ! -f ${ID}_variant_effect.txt ];then
  196.         ### Check GFF index exist ###
  197.         if [ ! -f ${genomeDir}/${gffName}.gff.gz.tbi ];then
  198.             cd ${genomeDir}
  199.             ### Check GFF gz file exist ###
  200.             if [ ! -f ${genomeDir}/${gffName}.gff.gz ];then
  201.                 grep -v "#" ${genomeDir}/${gffFile} | sort -k1,1 -k4,4n -k5,5n | bgzip -c >${genomeDir}/${gffName}.gff.gz
  202.                 pids="$pids $!"
  203.                 wait $pids
  204.             fi
  205.             tabix -p gff ${genomeDir}/${gffName}.gff.gz
  206.             pids="$pids $!"
  207.             wait $pids
  208.             cd ${fastqDir}
  209.         fi
  210.         ### Check fasta index exist ###
  211.         if [ ! -f ${genomeDir}/${genomeName}.gz ];then
  212.             cd ${genomeDir}
  213.             bgzip ${genomeDir}/${genomeFile}
  214.             pids="$pids $!"
  215.             wait $pids
  216.             cd ${fastqDir}
  217.         fi
  218.         echo process Variant Discovering ${ID}
  219.         vep -i ${ID}.vcf -gff ${genomeDir}/${gffName}.gff.gz -fasta ${genomeDir}/${genomeFile}.gz -o ${ID}_variant_effect.txt
  220.         echo
  221.         echo start waitting for finish Variant ${ID} Discovering
  222.         pids="$pids $!"
  223.         wait $pids
  224.         echo
  225.     fi
  226. done
  227.  
  228. echo ====== Finish VEP Discovering ======
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement