Post date: Mar 23, 2017 8:33:55 PM
I am generating a set of variants for Romain from the 84 Timema genomes that were on the sperm donor lane but not part of the experiment (this includes a mix of species).
Initial alignments and the start of variant calling is described here.
The final script I ran for g.vcf file generation is /uufs/chpc.utah.edu/common/home/u6000989/data/timema/combind_wgs_dovetailV3/alignments_tsp/runGatkFork.pl
#!/usr/bin/perl
open(IN, "bamlists.txt") or die "failed to read IN\n";
while (<IN>){
chomp;
$file = $_;
$out = $file;
$out =~ s/list$/g.vcf/ or die "failed sub\n";
$pid = fork;
if($pid){
$forks++;
system "java -Xmx44g -jar ~/bin/GenomeAnalysisTK.jar -T HaplotypeCaller -R /uufs/chpc.utah.edu/common/home/u6000989/data/timema/tcrDovetail/version3/mod_map_timema_06Jun2016_RvNkF702.fasta -I /uufs/chpc.utah.edu/common/home/u6000989/data/timema/combind_wgs_dovetailV3/alignments_tsp/$file -o /scratch/general/lustre/timemasGatk/$out -gt_mode DISCOVERY -hets 0.001 -mbq 30 -out_mode EMIT_VARIANTS_ONLY -ploidy 2 -stand_call_conf 50 -pcrModel AGGRESSIVE --emitRefConfidence GVCF --variant_index_type LINEAR --variant_index_parameter 128000\n";
system "cp /scratch/general/lustre/timemasGatk/$out /uufs/chpc.utah.edu/common/home/u6000989/data/timema/combind_wgs_dovetailV3/variants_tsp/$out\n";
exit;
}
push (@pids,$pid);
}
for $pid (@pids){
waitpid $pid, 0;
}
Joint variant calling was done with jointVarCall.sh in /uufs/chpc.utah.edu/common/home/u6000989/data/timema/combind_wgs_dovetailV3/variants_tsp/
#!/bin/sh
#SBATCH --time=168:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=20
#SBATCH --account=gompert-kp
#SBATCH --partition=gompert-kp
#SBATCH --job-name=gatk
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=zach.gompert@usu.edu
cd /uufs/chpc.utah.edu/common/home/gompert-group1/data/timema/combind_wgs_dovetailV3/variants_tsp/
java -Xmx460g -jar ~/bin/GenomeAnalysisTK.jar -T GenotypeGVCFs -R /uufs/chpc.utah.edu/common/home/gompert-group1/data/timema/tcrDovetail/version3/mod_map_timema_06Jun2016_RvNkF702.fasta --variant bart_JL_IC_2320.g.vcf --variant bart_JL_IC_2321.g.vcf --variant bart_JL_IC_2322.g.vcf --variant bart_JL_IC_2323.g.vcf --variant bart_JL_IC_2324.g.vcf --variant bart_JL_PP_2315.g.vcf --variant bart_JL_PP_2316.g.vcf --variant bart_JL_PP_2317.g.vcf --variant bart_JL_PP_2318.g.vcf --variant bart_JL_PP_2319.g.vcf --variant bart_JL_WF_2307.g.vcf --variant bart_JL_WF_2308.g.vcf --variant bart_JL_WF_2309.g.vcf --variant bart_JL_WF_2310.g.vcf --variant bart_JL_WF_2311.g.vcf --variant bart_JL_WP_870.g.vcf --variant bart_JL_WP_871.g.vcf --variant bart_JL_WP_872.g.vcf --variant bart_JL_WP_873.g.vcf --variant bart_JL_WP_874.g.vcf --variant bart_PCT_JP_2886.g.vcf --variant bart_PCT_JP_2887.g.vcf --variant bart_PCT_JP_2888.g.vcf --variant bart_PCT_JP_2889.g.vcf --variant bart_PCT_JP_2890.g.vcf --variant bart_PCT_WF_370.g.vcf --variant bart_PCT_WF_371.g.vcf --variant bart_PCT_WF_372.g.vcf --variant bart_PCT_WF_373.g.vcf --variant bart_PCT_WF_374.g.vcf --variant bart_PCT_WP_2906.g.vcf --variant bart_PCT_WP_2907.g.vcf --variant bart_PCT_WP_2908.g.vcf --variant bart_PCT_WP_2909.g.vcf --variant bart_PCT_WP_2910.g.vcf --variant chum_BS_C_447.g.vcf --variant chum_BS_C_448.g.vcf --variant chum_BS_C_449.g.vcf --variant chum_BS_C_450.g.vcf --variant chum_BS_C_451.g.vcf --variant chum_GR806_MM_1501.g.vcf --variant chum_GR806_MM_1502.g.vcf --variant chum_GR806_MM_1503.g.vcf --variant chum_GR806_MM_1504.g.vcf --variant chum_GR806_MM_1505.g.vcf --variant chum_GR806_Q_1514.g.vcf --variant chum_GR806_Q_1515.g.vcf --variant chum_GR806_Q_1516.g.vcf --variant chum_GR806_Q_1517.g.vcf --variant chum_GR806_Q_1518.g.vcf --variant curi_CRH_C_996.g.vcf --variant curi_CRH_C_997.g.vcf --variant curi_CRH_C_998.g.vcf --variant curi_CRH_M_995.g.vcf --variant curi_CRL_A_961.g.vcf --variant curi_CRL_A_962.g.vcf --variant curi_CRL_A_963.g.vcf --variant curi_CRL_A_964.g.vcf --variant curi_CRL_A_965.g.vcf --variant curi_CRL_M_967.g.vcf --variant curi_CRL_M_968.g.vcf --variant curi_CRL_M_969.g.vcf --variant curi_CRL_M_970.g.vcf --variant curi_CRL_M_971.g.vcf --variant curi_CRL_MM_984.g.vcf --variant curi_CRL_MM_985.g.vcf --variant curi_CRL_MM_986.g.vcf --variant curi_CRL_MM_987.g.vcf --variant curi_CRL_MM_988.g.vcf --variant land_BCHC_M_609.g.vcf --variant land_BCHC_M_610.g.vcf --variant land_BCHC_M_611.g.vcf --variant land_BCHC_M_612.g.vcf --variant land_BCHC_M_613.g.vcf --variant land_BCHC_Q_617.g.vcf --variant land_BCHC_Q_618.g.vcf --variant land_BCHC_Q_619.g.vcf --variant land_BCHC_Q_620.g.vcf --variant land_BCHC_Q_621.g.vcf --variant podu_BS_C_454.g.vcf --variant podu_BS_C_455.g.vcf --variant podu_BS_C_456.g.vcf --variant podu_BS_C_458.g.vcf --variant podu_PCT_WP_2934.g.vcf -ploidy 2 -o tcrSpWgsVariants.vcf
I then filtered the variant file with vcfFilter.pl
#### stringency variables, edits as desired
my $minCoverage = 168; # minimum number of sequences; DP
my $minCount = 4; ## minmum number of alternative alleles
my $bqrs = -8; # minimum absolute value of the base quality rank sum test; BaseQRankSum
my $mqrs = -12.5; # minimum absolute value of the mapping quality rank sum test; MQRankSum
my $rprs = -8; # minimum absolute value of the read position rank sum test; ReadPosRankSum
my $qd = 2; # minimum ratio of variant confidenct to non reference read depth; QD
my $mq = 40; # minimum mapping quality; MQ
my $fish = 60; #Phred-scaled p-value using Fisher’s Exact Test to detect strand bias (the variation being seen on only the forward or only the reverse strand) in the reads. More bias is indicative of false positive calls.
##### this set is for whole genome shotgun data
my $ac;
my $d;
my $dp;
my $minacov;
my @line;
my @fail;
my $in = shift(@ARGV);
open (IN, $in) or die "Could not read the infile = $in\n";
$in =~ m/^([a-zA-Z_]+\.vcf)$/ or die "Failed to match the variant file\n";
open (OUT, "> filtered1X_$1") or die "Could not write the outfile\n";
my $flag = 0;
my $cnt = 0;
foreach my $i (0..13){
$fail[$i]=0;
}
while (<IN>){
chomp;
$flag = 1;
if (m/^\#/){ ## header row, always write
$flag = 1;
}
elsif (m/^Sc/){ ## this is a sequence line, you migh need to edit this reg. expr.
$flag = 1;
if (m/[ACTGN]\,[ACTGN]/){ ## two alternative alleles identified
$flag = 0;
# print "fail allele : ";
$fail[0]++;
}
@line = split(/\s+/,$_);
if(length($line[3]) > 1 or length($line[4]) > 1){
$flag = 0;
# print "fail INDEL : ";
$fail[1]++;
}
m/DP=(\d+)/ or die "Syntax error, DP not found\n";
$dp = $1;
if ($dp < $minCoverage){
$flag = 0;
$fail[2]++;
# print "fail DP : ";
}
m/BaseQRankSum=([0-9e\-\.\+]*)/;
if ($1 < $bqrs){
$flag = 0;
$fail[3]++;
# print "fail BQRS : ";
}
m/MQRankSum=([0-9\-\.e\+]*)/;
if ($1 < $mqrs){
$flag = 0;
$fail[4]++;
# print "fail MQRS : ";
}
m/ReadPosRankSum=([0-9\-\.e\+]*)/;
if ($1 < $rprs){
$flag = 0;
$fail[5]++;
# print "fail RPRS : ";
}
if(m/QD=([0-9\.]+)/){
if ($1 < $qd){
$flag = 0;
$fail[6]++;
# print "fail QD : ";
}
}
if(m/MQ=([0-9\.]+)/){
if ($1 < $mq){
$flag = 0;
$fail[7]++;
# print "fail MQ : ";
}
}
m/FS=([0-9e\-\.\+]*)/;
if ($1 > $fish){
$flag = 0;
$fail[8]++;
# print "fail BQRS : ";
}
if (m/LG-NA/){
$flag = 0;
$fail[9]++;
}
m/MLEAC=(\d+)/ or die "Syntax error, MLEAC not found\n";
$ac = $1;
if ($ac < $minCount){
$flag = 0;
$fail[10]++;
}
m/MLEAF=(\d+)/ or die "Syntax error, MLEAF not found\n";
if ($1 > 0.999){
$flag = 0;
$fail[11]++;
}
if (m/LG-0/){
$flag = 0;
$fail[12]++;
}
if ($flag == 1){
$cnt++; ## this is a good SNV
}
else{
$fail[13]++;
}
}
else{
print "Warning, failed to match the chromosome or scaffold name regular expression for this line\n$_\n";
$flag = 0;
}
if ($flag == 1){
print OUT "$_\n";
}
}
close (IN);
close (OUT);
print "Finished filtering $in\nRetained $cnt variable loci\n";
foreach my $i (0..13){
print "failed $i : $fail[$i]\n";
}
Here is the fail output:
failed 0 : 6940510
failed 1 : 25911175
failed 2 : 86484516
failed 3 : 2
failed 4 : 2
failed 5 : 8
failed 6 : 62157
failed 7 : 87119292
failed 8 : 6639
failed 9 : 0
failed 10 : 36884770
failed 11 : 17314715
failed 12 : 13083316
failed 13 : 103886474
Retained 3437520 variable loci in filtered1X_tcrSpWgsVariants.vcf, which I moved to diogenes for Romain.