Genomic structural variation identification and genotyping, from next-gen sequencing data
Software: Genomestrip http://www.broadinstitute.org/software/genomestrip/
Usage on SGE requires login to active node, pre-loading of SGE and genomestrip, then job submission.
Full code, logs and output data available at https://zenodo.org/record/159472
Settings for use on Linux cluster
#!/bin/sh#$ -N log_lhm_genomestrip#$ -pe openmp 10#$ -S /bin/sh#$ -cwd#$ -j y#$ -q bioinf.qFull logging, exit upon error, record date and time, start timer.
set -execho "`date +%y/%m/%d_%H:%M:%S`"START=$(date +%s.%N)uname -saSet environment, load software
. /etc/profile.d/modules.shmodule load sgemodule load genomestrip/2.0module load jre/1.7.0_25Set input variables
SV_TMPDIR=./tmpdirSV_DIR=/cm/shared/apps/svtoolkit/2.0.1602/project=lhm_gsbams=gstrip_lhm_RG_bams.listref_seq=reference_data/dm6.famy_config=reference_data/params_dm6_gstrip.txtgenome_mask=reference_data/dm6.svmask.fastadepth_mask=reference_data/dm6.rdmask.bedploidy=reference_data/ploidy_dm6.mapgender_map=reference_data/gstrip_lhm_rg_gender.mapmy_intervals=reference_data/dm6_majorChroms.listSet program versions
which java > /dev/null || exit 1which Rscript > /dev/null || exit 1which samtools > /dev/null || exit 1export PATH=${SV_DIR}/bwa:${PATH}export LD_LIBRARY_PATH=${SV_DIR}/bwa:${LD_LIBRARY_PATH}mx="-Xmx4g"classpath="${SV_DIR}/lib/SVToolkit.jar:${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar:${SV_DIR}/lib/gatk/Queue.jar"
State program version and build date.
java -cp ${classpath} ${mx} -jar ${SV_DIR}/lib/SVToolkit.jarmkdir -p preprocessing || exit 1mkdir -p preprocessing/logs || exit 1mkdir -p preprocessing/metadata || exit 1java -cp ${classpath} ${mx} org.broadinstitute.gatk.queue.QCommandLine \ -S ${SV_DIR}/qscript/SVPreprocess.q \ -S ${SV_DIR}/qscript/SVQScript.q \ -gatk ${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar \ -jobRunner Drmaa \ -gatkJobRunner Drmaa \ -jobNative '-V -pe openmp 10 -q bioinf.q' \ -cp ${classpath} \ -tempDir ${SV_TMPDIR} \ -configFile ${my_config} \ -R ${ref_seq} \ -genomeMaskFile ${genome_mask} \ -readDepthMaskFile ${depth_mask} \ -ploidyMapFile ${ploidy} \ -genderMapFile ${gender_map} \ -I ${bams} \ -md preprocessing/metadata \ -disableGATKTraversal \ -useMultiStep \ -bamFilesAreDisjoint true \ -computeGCProfiles true \ -reduceInsertSizeDistributions true \ -computeReadCounts true \ -jobLogDir preprocessing/logs \ -run || exit 1Loop through chromosomes (as multiple intervals option not available)
mkdir -p deletion_pipeline || exit 1for i in chr2L chr2R chr3L chr3R chr4 chrX; do mkdir -p deletion_pipeline/deletions_${i} || exit 1 mkdir -p deletion_pipeline/deletions_${i}/logs || exit 1 mkdir -p deletion_pipeline/deletions_${i}/metadata || exit 1java -cp ${classpath} ${mx} org.broadinstitute.gatk.queue.QCommandLine \ -S ${SV_DIR}/qscript/SVDiscovery.q \ -S ${SV_DIR}/qscript/SVQScript.q \ -gatk ${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar \ -jobRunner Drmaa \ -gatkJobRunner Drmaa \ -jobNative '-V -pe openmp 10 -q bioinf.q' \ -cp ${classpath} \ -tempDir ${SV_TMPDIR} \ -configFile ${my_config} \ -R ${ref_seq} \ -genomeMaskFile ${genome_mask} \ -readDepthMaskFile ${depth_mask} \ -I ${bams} \ -md preprocessing/metadata \ -runDirectory deletion_pipeline/deletions_${i} \ -L ${i} \ -disableGATKTraversal \ -maximumSize 1000000 \ -minimumSize 200 \ -O deletion_pipeline/deletions_${i}/${project}.${i}.disco.vcf \ -debug true -run || exit 1java -cp ${classpath} ${mx} org.broadinstitute.gatk.queue.QCommandLine \ -S ${SV_DIR}/qscript/SVGenotyper.q \ -S ${SV_DIR}/qscript/SVQScript.q \ -gatk ${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar \ -jobRunner Drmaa \ -gatkJobRunner Drmaa \ -jobNative '-V -pe openmp 10 -q bioinf.q' \ -cp ${classpath} \ -tempDir ${SV_TMPDIR} \ -disableGATKTraversal \ -jobLogDir deletion_pipeline/deletions_${i}/logs \ -parallelJobs 10 \ -configFile ${my_config} \ -R ${ref_seq} \ -genomeMaskFile ${genome_mask} \ -readDepthMaskFile ${depth_mask} \ -ploidyMapFile ${ploidy} \ -genderMapFile ${gender_map} \ -runDirectory deletion_pipeline/deletions_${i} \ -md preprocessing/metadata \ -I ${bams} \ -vcf deletion_pipeline/deletions_${i}/${project}.${i}.disco.vcf \ -O deletion_pipeline/deletions_${i}/${project}.${i}.vcf \ -debug true -run || exit 1 done; ## End of deletion genotyping chromosome looping.mkdir -p cnv_pipeline || exit 1mkdir -p cnv_pipeline/logs || exit 1mkdir -p cnv_pipeline/metadata || exit 1java -cp ${classpath} ${mx} org.broadinstitute.gatk.queue.QCommandLine \ -S ${SV_DIR}/qscript/discovery/cnv/CNVDiscoveryPipeline.q \ -S ${SV_DIR}/qscript/SVQScript.q \ -jobRunner Drmaa \ -gatkJobRunner Drmaa \ -jobNative '-V -pe openmp 10 -q bioinf.q' \ -cp ${classpath} \ -gatk ${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar \ -configFile ${my_config} \ -R ${ref_seq} \ -I ${bams} \ -genderMapFile ${gender_map} \ -runDirectory cnv_pipeline \ -md preprocessing/metadata \ -jobLogDir cnv_pipeline/logs \ -genomeMaskFile ${genome_mask} \ -readDepthMaskFile ${depth_mask} \ -ploidyMapFile ${ploidy} \ -intervalList ${my_intervals} \ -tempDir ${SV_TMPDIR} \ -disableGATKTraversal \ -minimumRefinedLength 500 \ -tilingWindowSize 1000 \ -tilingWindowOverlap 500 \ -maximumReferenceGapLength 1000 \ -boundaryPrecision 100 \ -debug true --verbose true -run || exit 1java -Xmx4g -cp ${classpath} ${mx} org.broadinstitute.sv.apps.GenerateHaploidCNVGenotypes \ -R ${ref_seq} \ -vcf cnv_pipeline/results/gs_cnv.genotypes.vcf.gz \ -O cnv_pipeline/${project}.cnvs.raw.vcf \ -ploidyMapFile ${ploidy} \ -genderMapFile ${gender_map} \ -estimateAlleleFrequencies true \ -genotypeLikelihoodThreshold 0.001 \ -debug true \ --verbose true || exitUnload software, list directories, Timer end, end script.
module unload genomestrip/2.0module unload jre/1.7.0_25module unload sgeuname -sals -laEND=$(date +%s.%N)Duration=$(echo "$END - $START" | bc)echo "`date +%y/%m/%d_%H:%M:%S`"echo $DurationexitDeletion viewed alongside wild-type (both bam files), and coloured genotypes for the population sample (vcf file), all viewed in the Integrated Genomics Viewed. Mapped sequencing reads are in grey. The deletion is indicated by the reduction in coverage, and also by distant read-pair mapping coloured red. Short deletions are shown in black.
Likely in vivo amplification in one of the samples sequenced (top). Bam files viewed using Integrated Genomics Viewer.
Perfecting structural variant detection on the the Sussex server cluster while waiting for a plane on the other side of the world.