Difference between revisions of "Tassel 5 GBS v2 Pipeline sample script for calling SNPs using reference genome"

From Poland Lab Wiki
Jump to: navigation, search
 
(32 intermediate revisions by the same user not shown)
Line 1: Line 1:
 
==== For this script to work properly, you should have <code>gbs</code> folder in your home directory, and <code>jobs</code> and <code>projects</code> folders inside <code>gbs</code> ====
 
==== For this script to work properly, you should have <code>gbs</code> folder in your home directory, and <code>jobs</code> and <code>projects</code> folders inside <code>gbs</code> ====
  
Once you have the full script copied in a simple text file with extension <code>.sh</code>, the <code>name</code> variable should be replaced with something more explicit, such as your project name.
+
* Once you have the full script copied in a simple text file with extension <code>.sh</code>, the <code>name</code> variable should be replaced with something more explicit, such as your project name.
  
Keyfile should have these 4 mandatory headers:  'Flowcell'  'Lane'  'Barcode'  'FullSampleName'
+
* Keyfile should have these 4 mandatory headers:  'Flowcell'  'Lane'  'Barcode'  'FullSampleName'
  
 
<blockquote>
 
<blockquote>
 
'''''To keep everything organized, I recommend using the same base name for the project and the files, for example'''''
 
'''''To keep everything organized, I recommend using the same base name for the project and the files, for example'''''
  
<code>name</code> - projectName
+
<code>name</code> = projectName
  
Keyfile name - projectName.txt
+
Keyfile name = projectName.txt
  
Shell script name - projectName.sh
+
Shell script name = projectName.sh
 
</blockquote>
 
</blockquote>
  
Once the Keyfile and shell script are ready, put them in <code>jobs</code> folder and run <code> qsub projectName.sh </code>
+
* Once the Keyfile and shell script are ready, put them in <code>jobs</code> folder and run <code> sbatch projectName.sh </code>
  
  
 
<syntaxhighlight lang="bash">
 
<syntaxhighlight lang="bash">
#!/bin/bash
+
#!/bin/bash -l
  
# Update the user and name variables, user is your K-State eID
+
# Update beocat resources request
 +
#SBATCH --mem-per-cpu=64G  # Memory per core, use --mem= for memory per node
 +
#SBATCH --time=02-00:00:00  # Use the form DD-HH:MM:SS
 +
#SBATCH --mail-user=eID@ksu.edu --mail-type=ALL  # same as =BEGIN,FAIL,END
 +
#SBATCH --job-name=projectName
 +
 
 +
# Update the 'user' and 'name' variables, 'eID' is your K-State eID.
 
user=eID
 
user=eID
 
name=projectName
 
name=projectName
  
# Update beocat resources request
+
## #######################################
#$ -l h_rt=72:00:00 -l mem=64G -cwd
+
## NO NEED TO CHANGE ANYTHING FROM HERE ON
 
+
## #######################################
  
## NO NEED TO CHANGE ANYTHING FROM HERE ON ##
+
# Load Java module and set JAVA VM Version to 1.8 (if default is different)
 +
module load Java
  
 
keyFile=/homes/$user/gbs/jobs/${name}.txt
 
keyFile=/homes/$user/gbs/jobs/${name}.txt
  
 
seqDir=/bulk/jpoland/sequence
 
seqDir=/bulk/jpoland/sequence
dbPath=/bulk/jpoland/genome/ChineseSpring/pseudomolecules_v1.0/161010_Chinese_Spring_v1.0_pseudomolecules_parts
+
dbPath=/bulk/nss470/genomes/CS_NRGene/pseudomolecules_v1.0/161010_Chinese_Spring_v1.0_pseudomolecules_parts
 
tasselPath=/homes/nss470/softwares/tassel5/run_pipeline.pl
 
tasselPath=/homes/nss470/softwares/tassel5/run_pipeline.pl
  
Line 43: Line 50:
 
# Path for required software
 
# Path for required software
 
export PATH=$PATH:/homes/nss470/usr/bin:/homes/nss470/usr/bin/bin
 
export PATH=$PATH:/homes/nss470/usr/bin:/homes/nss470/usr/bin/bin
 
# Set JAVA VM Version
 
eselect java-vm set user 5
 
  
 
## GBSSeqToTagDBPlugin - RUN Tags to DB
 
## GBSSeqToTagDBPlugin - RUN Tags to DB
Line 58: Line 62:
 
$tasselPath -fork1 -TagExportToFastqPlugin \
 
$tasselPath -fork1 -TagExportToFastqPlugin \
 
     -db ${name}.db \
 
     -db ${name}.db \
     -o ${name}_tagsForAlign.fq -c 10 \
+
     -o ${name}_tagsForAlign.fa.gz -c 10 \
 
     -endPlugin -runfork1 >> z_pipeline.out
 
     -endPlugin -runfork1 >> z_pipeline.out
  
 
## RUN BOWTIE
 
## RUN BOWTIE
bowtie2 -p 20 --very-sensitive-local \
+
bowtie2 -p 20 --end-to-end \
 
     -x $dbPath \
 
     -x $dbPath \
     -U ${name}_tagsForAlign.fq \
+
     -U ${name}_tagsForAlign.fa.gz \
 
     -S ${name}.sam >> z_pipeline.out
 
     -S ${name}.sam >> z_pipeline.out
  
Line 92: Line 96:
 
     -endPlugin -runfork1 >> z_pipeline.out
 
     -endPlugin -runfork1 >> z_pipeline.out
  
## ProductionSNPCallerPluginV2 - RUN PRODUCTION PIPELINE - output .h5
+
## ProductionSNPCallerPluginV2 - RUN PRODUCTION PIPELINE - output .vcf
 
$tasselPath -Xms64G -Xmx64G -fork1 -ProductionSNPCallerPluginV2 \
 
$tasselPath -Xms64G -Xmx64G -fork1 -ProductionSNPCallerPluginV2 \
 
     -db ${name}.db \
 
     -db ${name}.db \
 
     -i $seqDir \
 
     -i $seqDir \
 
     -k $keyFile \
 
     -k $keyFile \
     -o ${name}.h5 \
+
     -o ${name}.vcf \
 
     -e PstI-MspI -kmerLength 64 \
 
     -e PstI-MspI -kmerLength 64 \
    -minPosQS 90 -ko false \
+
     -endPlugin -runfork1 >> z_pipeline.out
     -endPlugin -runfork1 z_pipeline.out
+
  
 
## Convert to Hapmap format
 
## Convert to Hapmap format
$tasselPath -Xms64G -Xmx64G -fork1 -h5 ${name}.h5 \
+
$tasselPath -Xms64G -Xmx64G -fork1 -vcf ${name}.vcf \
 
     -export ${name} -exportType Hapmap
 
     -export ${name} -exportType Hapmap
  
mv /homes/$user/gbs/jobs/name.* keyFileSh/
+
mv /homes/$user/gbs/jobs/${name}.* keyFileSh/
 
</syntaxhighlight>
 
</syntaxhighlight>
  
  
If everything ran correctly, you should get a folder named '''hessian_F3''' in your <code>gbs/projects</code> folder with all the output files including SNPs containing '''.vcf''' and '''.hmp.txt''' files that can be used for further analyses
+
If everything ran correctly, you should get a folder named '''projectName''' in your <code>gbs/projects</code> folder with all the output files including SNPs containing '''projectName.vcf''' and '''projectName.hmp.txt''' files that can be used for further analyses
 +
 
  
==== End ====
+
If you have any question or it doesn't work, please contact me [mailto:nss470@ksu.edu?Subject=TASSEL5%20GBS%20v2%20pipeline%20question here].

Latest revision as of 20:01, 14 March 2018

For this script to work properly, you should have gbs folder in your home directory, and jobs and projects folders inside gbs

  • Once you have the full script copied in a simple text file with extension .sh, the name variable should be replaced with something more explicit, such as your project name.
  • Keyfile should have these 4 mandatory headers: 'Flowcell' 'Lane' 'Barcode' 'FullSampleName'

To keep everything organized, I recommend using the same base name for the project and the files, for example

name = projectName

Keyfile name = projectName.txt

Shell script name = projectName.sh

  • Once the Keyfile and shell script are ready, put them in jobs folder and run sbatch projectName.sh


#!/bin/bash -l
 
# Update beocat resources request
#SBATCH --mem-per-cpu=64G   # Memory per core, use --mem= for memory per node
#SBATCH --time=02-00:00:00   # Use the form DD-HH:MM:SS
#SBATCH --mail-user=eID@ksu.edu --mail-type=ALL   # same as =BEGIN,FAIL,END
#SBATCH --job-name=projectName
 
# Update the 'user' and 'name' variables, 'eID' is your K-State eID.
user=eID
name=projectName
 
## #######################################
## NO NEED TO CHANGE ANYTHING FROM HERE ON
## #######################################
 
# Load Java module and set JAVA VM Version to 1.8 (if default is different)
module load Java
 
keyFile=/homes/$user/gbs/jobs/${name}.txt
 
seqDir=/bulk/jpoland/sequence
dbPath=/bulk/nss470/genomes/CS_NRGene/pseudomolecules_v1.0/161010_Chinese_Spring_v1.0_pseudomolecules_parts
tasselPath=/homes/nss470/softwares/tassel5/run_pipeline.pl
 
mkdir /homes/$user/gbs/projects/${name}
mkdir /homes/$user/gbs/projects/${name}/keyFileSh
cd /homes/$user/gbs/projects/${name}
 
# Path for required software
export PATH=$PATH:/homes/nss470/usr/bin:/homes/nss470/usr/bin/bin
 
## GBSSeqToTagDBPlugin - RUN Tags to DB
$tasselPath -Xms64G -Xmx64G -fork1 -GBSSeqToTagDBPlugin -e PstI-MspI \
    -i $seqDir \
    -db ${name}.db \
    -k $keyFile \
    -kmerLength 64 -minKmerL 20 -mnQS 20 -mxKmerNum 250000000 \
    -endPlugin -runfork1 >> z_pipeline.out
 
## TagExportToFastqPlugin - export Tags
$tasselPath -fork1 -TagExportToFastqPlugin \
    -db ${name}.db \
    -o ${name}_tagsForAlign.fa.gz -c 10 \
    -endPlugin -runfork1 >> z_pipeline.out
 
## RUN BOWTIE
bowtie2 -p 20 --end-to-end \
    -x $dbPath \
    -U ${name}_tagsForAlign.fa.gz \
    -S ${name}.sam >> z_pipeline.out
 
## SAMToGBSdbPlugin - SAM to DB
$tasselPath -Xms64G -Xmx64G -fork1 -SAMToGBSdbPlugin \
    -i ${name}.sam \
    -db ${name}.db \
    -aProp 0.0 -aLen 0 \
    -endPlugin -runfork1 >> z_pipeline.out
 
## DiscoverySNPCallerPluginV2 - RUN DISCOVERY SNP CALLER
$tasselPath -Xms64G -Xmx64G -fork1 -DiscoverySNPCallerPluginV2 \
    -db ${name}.db \
    -mnLCov 0.1 -mnMAF 0.01 -deleteOldData true \
    -endPlugin -runfork1 >> z_pipeline.out
 
## SNPQualityProfilerPlugin - RUN QUALITY PROFILER
$tasselPath -Xms64G -Xmx64G -fork1 -SNPQualityProfilerPlugin \
    -db ${name}.db \
    -statFile ${name}_SNPqual_stats.txt \
    -endPlugin -runfork1 >> z_pipeline.out
 
## UpdateSNPPositionQualityPlugin - UPDATE DATABASE WITH QUALITY SCORE
$tasselPath -Xms64G -Xmx64G -fork1 -UpdateSNPPositionQualityPlugin \
    -db ${name}.db \
    -qsFile ${name}_SNPqual_stats.txt \
    -endPlugin -runfork1 >> z_pipeline.out
 
## ProductionSNPCallerPluginV2 - RUN PRODUCTION PIPELINE - output .vcf
$tasselPath -Xms64G -Xmx64G -fork1 -ProductionSNPCallerPluginV2 \
    -db ${name}.db \
    -i $seqDir \
    -k $keyFile \
    -o ${name}.vcf \
    -e PstI-MspI -kmerLength 64 \
    -endPlugin -runfork1 >> z_pipeline.out
 
## Convert to Hapmap format
$tasselPath -Xms64G -Xmx64G -fork1 -vcf ${name}.vcf \
    -export ${name} -exportType Hapmap
 
mv /homes/$user/gbs/jobs/${name}.* keyFileSh/


If everything ran correctly, you should get a folder named projectName in your gbs/projects folder with all the output files including SNPs containing projectName.vcf and projectName.hmp.txt files that can be used for further analyses


If you have any question or it doesn't work, please contact me here.