Post date: May 12, 2017 3:42:23 AM
1. Conver the vcf file to a genotype likelihood file
perl ../Scripts/vcf2gl.pl 0.0 morefilter_filtered2x-70_varsLycGwa.vcf
Creates lyc_gwas.gl with 75,476 SNPs and 1895 individuals
2. Generate starting values for Entropy based on LDA assignment probs.
In 'Entropy' sub-directory, ran the following to generate rough genotype estimates with a flat prior:
perl gl2genest.pl lyc_gwas.gl
R CMD BATCH initq.R
Here is the R script initq.R
########################################################
## read genotype estimates, i.e., g = 0 * Pr(g=0) + 1 * Pr(g=1) + 2 * Pr(g=2)
## row = locus, column = individual
L<-75476
N<-1895
g<-matrix(scan("pntest_lyc_gwas.txt",n=N*L,sep=" "),nrow=L,ncol=N,byrow=T)
## calculate N x N genotype covariance matrix
gmn<-apply(g,1,mean)
gmnmat<-matrix(gmn,nrow=L,ncol=N)
gprime<-g-gmnmat ## remove mean
gcovarmat<-matrix(NA,nrow=N,ncol=N)
for(i in 1:N){
for(j in i:N){
if (i==j){
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
}
else{
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
gcovarmat[j,i]<-gcovarmat[i,j]
}
}
}
## pca on the genotype covariance matrix
pcgcov<-prcomp(x=gcovarmat,center=TRUE,scale=FALSE)
## kmeans and lda
library(MASS)
k2<-kmeans(pcgcov$x[,1:5],2,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k3<-kmeans(pcgcov$x[,1:5],3,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k4<-kmeans(pcgcov$x[,1:5],4,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k5<-kmeans(pcgcov$x[,1:5],5,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k6<-kmeans(pcgcov$x[,1:5],6,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
ldak2<-lda(x=pcgcov$x[,1:5],grouping=k2$cluster,CV=TRUE)
ldak3<-lda(x=pcgcov$x[,1:5],grouping=k3$cluster,CV=TRUE)
ldak4<-lda(x=pcgcov$x[,1:5],grouping=k4$cluster,CV=TRUE)
ldak5<-lda(x=pcgcov$x[,1:5],grouping=k5$cluster,CV=TRUE)
ldak6<-lda(x=pcgcov$x[,1:5],grouping=k6$cluster,CV=TRUE)
write.table(round(ldak2$posterior,5),file="ldak2.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak3$posterior,5),file="ldak3.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak4$posterior,5),file="ldak4.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak5$posterior,5),file="ldak5.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak6$posterior,5),file="ldak6.txt",quote=F,row.names=F,col.names=F)
save(list=ls(),file="init.rdat")
#########################################################
3. Run entropy, two chains for k = 2 ... 6
batch runentropy.sh
This calls my.conf
0-4 /uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_wings/Entropy/entropy1.sh %t
which has to be run twice, once with entropy1.sh and once with entropy2.sh
That is where the real bits are (the rest is just stuff making the submissions efficient). Here is the code, where $K is 2, 3, 4, 5 or 6
#!/bin/bash
module load gsl
module load hdf5
K=$(($1+2))
/uufs/chpc.utah.edu/common/home/u6000989/bin/entropy -i /uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_wings/Entropy/lyc_gwas.gl -l 15000 -b 5000 -t 5 -k $K -Q 0 -s 50 -q /uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_wings/Entropy/startingvals/ldak$K.txt -o /scratch/general/lustre/entropy/ento_lycGwaCh1K$K.hdf5 -w 0 -m 1
mv /scratch/general/lustre/entropy/ento_lycGwaCh1K$K.hdf5 /uufs/chpc.utah.edu/common/home/u6000989/projects/lycaeides_wings/Entropy/mcmc/
Key paramters here are that we are running 15,000 MCMC steps, with a 5000 step burnin and a thinning interval of 5.