Post date: Jun 19, 2018 8:30:53 PM
1. Convert to genotype likelihoods (gl format); no additional MAF filter.
perl vcf2gl.pl 0.0 morefilter_filtered2x_lycTimeSeriesVariants.vcf
1,537 individuals, 30,513 loci.
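As a quick sanity check on those counts (an added note, not part of the original pipeline), the locus total can be recovered from the filtered VCF in base R:
## count non-header lines in the filtered VCF; should match the locus total above
vcf <- "morefilter_filtered2x_lycTimeSeriesVariants.vcf"
sum(!grepl("^#", readLines(vcf)))  ## expect 30513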
2. Split by population and year.
perl splitPops.pl lyc_timeseries_2018.gl
We have 33 population x year samples:
BCR = 2013, 2015, 2017
BNP = 2013, 2015, 2017
BTB = 2013, 2014, 2015, 2017
GNP = 2013, 2015, 2017
HNV = 2013, 2014, 2015, 2017
MRF = 2013, 2015, 2017
PSP = 2013, 2015, 2017
RNV = 2013, 2014, 2017
SKI = 2013, 2014, 2015, 2017
USL = 2013, 2015, 2017
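For reference, a tally like the one below reproduces this breakdown from the individual IDs; ids.txt and the POP-YEAR prefix in the IDs are assumptions for illustration (the prefix matches the pattern RunEstpEMFork.pl greps for in step 3).
## illustrative only: count individuals per population x year from an assumed ids.txt
ids <- readLines("ids.txt")                        ## hypothetical file of individual IDs
popyr <- sub("^([A-Z]+-[0-9]{4}).*$", "\\1", ids)  ## assumes IDs start with POP-YEAR
table(popyr)                                       ## should recover the 33 samples listed above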
3. Estimate allele frequencies (run from the AlleleFreqs subdirectory).
perl RunEstpEMFork.pl lyctimeseries_2018_*gl
#!/usr/bin/perl
#
# run estpEM on each population x year gl file as an interactive job,
# with up to $max jobs forked at once
#
use strict;
use warnings;
use Parallel::ForkManager;

my $max = 20; ## maximum number of concurrent estpEM jobs
my $pm = Parallel::ForkManager->new($max);

FILES:
foreach my $file (@ARGV){
	$pm->start and next FILES; ## fork
	## pull the population x year code (e.g., BCR-2013) out of the file name
	$file =~ m/2018_([A-Z]+\-\d+)/ or die "failed to match $file\n";
	my $out = "p_$1".".txt";
	system "~/bin/estpEM -i $file -o $out -m 50 -h 2\n";
	$pm->finish;
}
$pm->wait_all_children;
3b. Explored the allele frequency results a bit.
See AfreqPCA.R. There is some evidence of spatial and temporal effects on allele frequencies.
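As a rough illustration of the kind of analysis in AfreqPCA.R (a sketch, not the actual script), a PCA across the population x year allele frequency vectors could be run as below; it assumes the p_*.txt files are space-delimited with no header and with the allele frequency in column 3, consistent with the cut command in step 4.
## sketch: PCA on per-sample allele frequencies from the estpEM output files
files <- list.files(pattern = "^p_[A-Z]+-[0-9]+\\.txt$")
P <- sapply(files, function(f) read.table(f, header = FALSE)[, 3])  ## loci x samples
colnames(P) <- sub("^p_(.+)\\.txt$", "\\1", files)
pc <- prcomp(t(P), center = TRUE, scale. = FALSE)
plot(pc$x[, 1:2], type = "n", xlab = "PC1", ylab = "PC2")
text(pc$x[, 1:2], labels = colnames(P), cex = 0.7)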
4. Convert to genotype estimates (genest format) to use as initial values for entropy.
~/bin/estpEM -i lyc_timeseries_2018.gl -o p_combined.txt -m 50 -h 2
cut -f 3 -d " " p_combined.txt > afreqs.txt
perl gl2genest.pl afreqs.txt lyc_timeseries_2018.gl
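For context (an added toy example, not the gl2genest.pl code itself): the idea behind the genotype point estimates is to weight the genotype likelihoods by a prior based on the estpEM allele frequencies and take the posterior mean genotype, i.e., g = 0*Pr(g=0) + 1*Pr(g=1) + 2*Pr(g=2); the Hardy-Weinberg form of the prior below is an assumption.
## toy example for one individual at one locus
gl <- c(0.05, 0.90, 0.05)                    ## genotype likelihoods for g = 0, 1, 2
p <- 0.3                                     ## allele frequency from afreqs.txt
prior <- c((1 - p)^2, 2 * p * (1 - p), p^2)  ## assumed Hardy-Weinberg genotype prior
post <- gl * prior / sum(gl * prior)         ## posterior genotype probabilities
sum(0:2 * post)                              ## posterior mean genotype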
5. Run entropy (use genotypes for mapping); run from /uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_diversity/Gemma/Entropy.
Generate initial values with initq.R
## read genotype estimates, i.e., g = 0 * Pr(g=0) + 1 * Pr(g=1) + 2 * Pr(g=2)
## row = locus, column = individual
L<-30513
N<-1536
g<-matrix(scan("pntest_lyc_timeseries_2018.txt",n=N*L,sep=" "),nrow=L,ncol=N,byrow=T)
## pca on the genotype covariance matrix
pcgcov<-prcomp(x=t(g),center=TRUE,scale=FALSE)
## kmeans and lda
library(MASS)
k2<-kmeans(pcgcov$x[,1:5],2,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k3<-kmeans(pcgcov$x[,1:5],3,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
ldak2<-lda(x=pcgcov$x[,1:5],grouping=k2$cluster,CV=TRUE)
ldak3<-lda(x=pcgcov$x[,1:5],grouping=k3$cluster,CV=TRUE)
write.table(round(ldak2$posterior,5),file="ldak2.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak3$posterior,5),file="ldak3.txt",quote=F,row.names=F,col.names=F)
save(list=ls(),file="init.rdat")
## when you run entropy, provide these values as input with, e.g., -q ldak2.txt
## also set -s to something like 50
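A quick added check on the starting values written by initq.R (reading them back with base R defaults):
## ldak2.txt should have one row per individual with rows summing to ~1
q2 <- as.matrix(read.table("ldak2.txt", header = FALSE))
dim(q2)             ## expect 1536 x 2
range(rowSums(q2))  ## expect values very close to 1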
Run entropy, k = {2,3} with 2 chains.
sbatch RunEntropy.sh
RunEntropy.sh runs RunEntropyFork.pl:
#!/usr/bin/perl
#
# run the entropy MCMC chains in parallel, up to $max at once
#
use strict;
use warnings;
use Parallel::ForkManager;

my $max = 20;
my $pm = Parallel::ForkManager->new($max);

my $in = "/uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_diversity/Gemma/Entropy/lyc_timeseries_2018.gl";
my $k2 = "/uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_diversity/Gemma/Entropy/ldak2.txt";
my $k3 = "/uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_diversity/Gemma/Entropy/ldak3.txt";
my $odir = "/uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_diversity/Gemma/Entropy/";

## one entropy command per chain: two chains each for k = 2 and k = 3;
## -q supplies the starting admixture proportions from initq.R
my @C = ("entropy -i $in -l 10000 -b 5000 -t 5 -k 2 -Q 0 -s 50 -q $k2 -o $odir"."out_k2_ch1.hdf5 -w 0 -m 1\n",
	"entropy -i $in -l 10000 -b 5000 -t 5 -k 2 -Q 0 -s 50 -q $k2 -o $odir"."out_k2_ch2.hdf5 -w 0 -m 1\n",
	"entropy -i $in -l 10000 -b 5000 -t 5 -k 3 -Q 0 -s 50 -q $k3 -o $odir"."out_k3_ch1.hdf5 -w 0 -m 1\n",
	"entropy -i $in -l 10000 -b 5000 -t 5 -k 3 -Q 0 -s 50 -q $k3 -o $odir"."out_k3_ch2.hdf5 -w 0 -m 1\n");

COMMANDS:
foreach my $command (@C){
	$pm->start and next COMMANDS; ## fork
	system $command;
	$pm->finish;
}
$pm->wait_all_children;
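Once the chains finish, the hdf5 output can be summarized (e.g., with entropy's post-processing utilities, or directly in R). Below is a hedged sketch using the rhdf5 package; the dataset name "q" and the dimension order (MCMC steps x k x individuals) are assumptions about the file layout, so check the h5ls() listing before using the summary.
## inspect one chain and, if a "q" dataset is present, average it over the retained steps
library(rhdf5)
h5ls("out_k2_ch1.hdf5")              ## list what the file actually contains
q <- h5read("out_k2_ch1.hdf5", "q")  ## assumed dataset name for admixture proportions
qbar <- apply(q, c(2, 3), mean)      ## posterior mean q, assuming dim 1 indexes MCMC steps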