Post date: Sep 13, 2017 8:41:26 PM
BWA version 0.7.10-r789 was used for alignment.
We first indexed the reference sequence:
bwa index -a is PSSP_reference_con90.fasta
We then submitted a series of batch jobs to run bwa aln and bwa samse for each individual fastq file.
From the parsed sub-directory, run:
perl ../scripts/wrap_qsub_slurm_bwa.pl PSSP_*fastq
This Perl script is a wrapper that generates a set of SLURM batch submission scripts (62 in this case) that actually run the alignments. Here is the most relevant chunk of the wrapper script:
#!/usr/bin/perl
### Version 1.0 -- 19 August 2012 -- Zach Gompert
### a perl script to submit many serial slurm jobs to the queue using the
### sbatch command.
###
### This chunk builds the shell text for each job: for every fastq file given
### on the command line it emits a "bwa aln" command followed by a
### "bwa samse" command, and groups the commands into batches of $per_job
### samples. Each batch is prefixed with a "cd" into $dir, appended to
### @jobarray and echoed to STDOUT.
use warnings;
use strict;

### ------------------ JOB CONFIGURATION -----------------------
### You will probably need to modify some of these variables
my $walltime = '8:00:00'; ## Format: hh:mm:ss
# Specify the maximum wall clock time for a single job. The
# wall clock time should take possible queue waiting time into
# account. Format: hhhh:mm:ss hours:minutes:seconds Be sure to
# specify a reasonable value here. If the job does not finish by the
# time reached, the job is terminated.
my $nodes = '1'; # Request number of compute nodes per job.
my $ppn = '12'; # Number of tasks per node
my $accnt = 'gompert'; # PI account
my $partition = 'kingspeak'; # Specify compute cluster, options include
# kingspeak, ember, kingspeak-freecycle, ember-freecycle
my $jobname = 'bwa';
# Set the name of the job (up to 15 characters,
# no blank spaces, start with alphanumeric character)
my $email = 'zach.gompert@usu.edu'; ## e-mail this address if the job fails
my $dir = '/uufs/chpc.utah.edu/common/home/u6000989/data/grasses/parsed/';

my @jobarray;  # one element per job: the "cd" line plus that batch's commands
my $aln;       # "bwa aln" command line for the current sample
my $samse;     # "bwa samse" command line for the current sample
my $ind;       # sample identifier captured from the fastq file name
my $genome = "/uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta";
my $cd = "cd $dir\n";
my @job;       # commands accumulated for the batch currently being filled
my $job;       # text of the most recently completed batch
my $cnt = 0;   # number of samples in the batch currently being filled
my $per_job = 10; # every $per_job samples gets a node of its own

foreach my $file (@ARGV){
    ## The sample ID is the "PSSP_..." stem in front of ".fastq"; it names the
    ## output files and fills the read-group fields.
    if ($file =~ m/(PSSP_[A-Za-z0-9_]+)\.fastq/){
        $ind = $1;
    }
    else {
        die "Failed to match $file\n";
    }

    ## NOTE(review): "-t 12" is hard-coded and currently equals $ppn; keep the
    ## two in sync if either is changed.
    ## ${ind} is written with braces so that a following "'" or "." can never
    ## be read as part of the variable name.
    $aln = "bwa aln -n 5 -l 20 -k 2 -t 12 -q 10 -f aln${ind}.sai $genome $file\n";

    ## The read-group string is single-quoted for the shell and carries a
    ## literal backslash-t ("\\t" in Perl source) between fields.
    $samse = "bwa samse -n 1 -r '\@RG\\tID:${ind}\\tPL:ILLUMINA\\tLB:${ind}\\tSM:${ind}'"
           . " -f aln${ind}.sam $genome aln${ind}.sai $file\n";

    $cnt++;
    push(@job, $aln . $samse);

    if ($cnt == $per_job){ ## batch is full: close it and start a new one
        $job = join("", @job);
        push(@jobarray, $cd . $job);
        print $cd . $job;
        $cnt = 0;
        @job = ();
    }
}

## last job: whatever is left over when the number of files is not a multiple
## of $per_job. Skipped when nothing is left; otherwise a job holding only the
## "cd" line would be queued (file count a multiple of $per_job, or no files).
if (@job){
    $job = join("", @job);
    push(@jobarray, $cd . $job);
    print $cd . $job;
}
This generates one job per 10 samples. One set of alignments (the amount sent to one job/node; the example below lists 7 samples) looks like this:
cd /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/parsed/
bwa aln -n 5 -l 20 -k 2 -t 12 -q 10 -f alnPSSP_P7E1_Control_NA_P7E1Q4_1336.sai /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta PSSP_P7E1_Control_NA_P7E1Q4_1336.fastq
bwa samse -n 1 -r '@RG\tID:PSSP_P7E1_Control_NA_P7E1Q4_1336\tPL:ILLUMINA\tLB:PSSP_P7E1_Control_NA_P7E1Q4_1336\tSM:PSSP_P7E1_Control_NA_P7E1Q4_1336' -f alnPSSP_P7E1_Control_NA_P7E1Q4_1336.sam /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta alnPSSP_P7E1_Control_NA_P7E1Q4_1336.sai PSSP_P7E1_Control_NA_P7E1Q4_1336.fastq
bwa aln -n 5 -l 20 -k 2 -t 12 -q 10 -f alnPSSP_P7E1_Control_NA_P7E1Q4_1340.sai /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta PSSP_P7E1_Control_NA_P7E1Q4_1340.fastq
bwa samse -n 1 -r '@RG\tID:PSSP_P7E1_Control_NA_P7E1Q4_1340\tPL:ILLUMINA\tLB:PSSP_P7E1_Control_NA_P7E1Q4_1340\tSM:PSSP_P7E1_Control_NA_P7E1Q4_1340' -f alnPSSP_P7E1_Control_NA_P7E1Q4_1340.sam /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta alnPSSP_P7E1_Control_NA_P7E1Q4_1340.sai PSSP_P7E1_Control_NA_P7E1Q4_1340.fastq
bwa aln -n 5 -l 20 -k 2 -t 12 -q 10 -f alnPSSP_P7E1_Control_NA_P7E1Q4_1341.sai /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta PSSP_P7E1_Control_NA_P7E1Q4_1341.fastq
bwa samse -n 1 -r '@RG\tID:PSSP_P7E1_Control_NA_P7E1Q4_1341\tPL:ILLUMINA\tLB:PSSP_P7E1_Control_NA_P7E1Q4_1341\tSM:PSSP_P7E1_Control_NA_P7E1Q4_1341' -f alnPSSP_P7E1_Control_NA_P7E1Q4_1341.sam /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta alnPSSP_P7E1_Control_NA_P7E1Q4_1341.sai PSSP_P7E1_Control_NA_P7E1Q4_1341.fastq
bwa aln -n 5 -l 20 -k 2 -t 12 -q 10 -f alnPSSP_P7E1_Control_NA_P7E1Q4_1344.sai /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta PSSP_P7E1_Control_NA_P7E1Q4_1344.fastq
bwa samse -n 1 -r '@RG\tID:PSSP_P7E1_Control_NA_P7E1Q4_1344\tPL:ILLUMINA\tLB:PSSP_P7E1_Control_NA_P7E1Q4_1344\tSM:PSSP_P7E1_Control_NA_P7E1Q4_1344' -f alnPSSP_P7E1_Control_NA_P7E1Q4_1344.sam /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta alnPSSP_P7E1_Control_NA_P7E1Q4_1344.sai PSSP_P7E1_Control_NA_P7E1Q4_1344.fastq
bwa aln -n 5 -l 20 -k 2 -t 12 -q 10 -f alnPSSP_P7E1_Control_NA_P7E1Q4_1345.sai /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta PSSP_P7E1_Control_NA_P7E1Q4_1345.fastq
bwa samse -n 1 -r '@RG\tID:PSSP_P7E1_Control_NA_P7E1Q4_1345\tPL:ILLUMINA\tLB:PSSP_P7E1_Control_NA_P7E1Q4_1345\tSM:PSSP_P7E1_Control_NA_P7E1Q4_1345' -f alnPSSP_P7E1_Control_NA_P7E1Q4_1345.sam /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta alnPSSP_P7E1_Control_NA_P7E1Q4_1345.sai PSSP_P7E1_Control_NA_P7E1Q4_1345.fastq
bwa aln -n 5 -l 20 -k 2 -t 12 -q 10 -f alnPSSP_P7E1_Control_NA_P7E1Q4_1347.sai /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta PSSP_P7E1_Control_NA_P7E1Q4_1347.fastq
bwa samse -n 1 -r '@RG\tID:PSSP_P7E1_Control_NA_P7E1Q4_1347\tPL:ILLUMINA\tLB:PSSP_P7E1_Control_NA_P7E1Q4_1347\tSM:PSSP_P7E1_Control_NA_P7E1Q4_1347' -f alnPSSP_P7E1_Control_NA_P7E1Q4_1347.sam /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta alnPSSP_P7E1_Control_NA_P7E1Q4_1347.sai PSSP_P7E1_Control_NA_P7E1Q4_1347.fastq
bwa aln -n 5 -l 20 -k 2 -t 12 -q 10 -f alnPSSP_P7E1_Control_NA_P7E1Q4_1348.sai /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta PSSP_P7E1_Control_NA_P7E1Q4_1348.fastq
bwa samse -n 1 -r '@RG\tID:PSSP_P7E1_Control_NA_P7E1Q4_1348\tPL:ILLUMINA\tLB:PSSP_P7E1_Control_NA_P7E1Q4_1348\tSM:PSSP_P7E1_Control_NA_P7E1Q4_1348' -f alnPSSP_P7E1_Control_NA_P7E1Q4_1348.sam /uufs/chpc.utah.edu/common/home/u6000989/data/grasses/clustering/PSSP_reference_con90.fasta alnPSSP_P7E1_Control_NA_P7E1Q4_1348.sai PSSP_P7E1_Control_NA_P7E1Q4_1348.fastq
A series of SAM alignment files will be produced, one per individual, which will then be moved (manually) to the alignment sub-directory. NOTE: a quick check suggests that only about 11% of reads are aligning. This is low. I might need to do something about this (i.e., figure out how to improve the reference or reduce the stringency of the alignment; maybe try bowtie2).