Post date: May 15, 2014 7:27:55 PM
#See script vcf2genotype.pl
#Instead of estimating allele frequencies separately for each population like we did for the other spring endemic project, we will estimate them all together; we'll use a global allele frequency as a prior. We will convert gprob to a point estimate (a number between 0 and 2; 0 = homozygous for ref allele, 2 = homozygous for alternative allele).
#minmaf (min. minor allele frequency): 0.05
#I kept 13999 loci; the file name is filtered_varEsosorum.genest
#The header in the file are individual names, each row is a locus, the first thing in each row is contig#:position, then genest
#File without the header and locus stuff: filtered_varEsosorumNoHdr.genest
#File with just the header info: indIds.txt
#Hand-made a file with pop id's and ind id's separated: sosorum_pops.csv
#PCA in R
sosopopinfo<-read.csv("sosorum_pops.csv",header=F)
sosogendata<-read.table("filtered_varEsosorumNoHdr.genest",header=F)
sosopcaout<-prcomp(t(as.matrix(sosogendata)),center=T,scale.=F)
summary(sosopcaout)
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7
#Standard deviation 5.54409 3.77391 3.20111 3.08046 2.79084 2.77712 2.75591
#Proportion of Variance 0.02321 0.01075 0.00774 0.00717 0.00588 0.00582 0.00574
#Cumulative Proportion 0.02321 0.03396 0.04170 0.04887 0.05475 0.06057 0.06631
##compare to pca on NxN genotype covariance matrix
g<-(as.matrix(sosogendata))
gmn<-apply(g,1,mean)
nloci<-13999
nind<-258
gmnmat<-matrix(gmn,nrow=nloci,ncol=nind)
gprime<-g-gmnmat
gcovarmat<-matrix(NA,nrow=nind,ncol=nind)
for(i in 1:nind){
for(j in i:nind){
if (i==j){
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
}
else{
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
gcovarmat[j,i]<-gcovarmat[i,j]
}
}
}
pcgcov<-prcomp(x=gcovarmat,center=TRUE,scale=FALSE)
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6
#Standard deviation 0.02537 0.01593 0.01129 0.01079 0.008915 0.008829
#Proportion of Variance 0.06415 0.02530 0.01269 0.01160 0.007920 0.007770
#Cumulative Proportion 0.06415 0.08944 0.10213 0.11374 0.121660 0.129420
pdf("sosorum_gcov_PCA.pdf",width=6,height=6)
plot(pcgcov$x[,1],pcgcov$x[,2],type="n",ylab="PC2 (6.4%)",xlab="PC1 (2.5%)")
title(main="A) E. sosorum",adj=0)
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-C-EL',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-C-EL',2],col="purple",labels=("C-EL"))
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-C-F1',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-C-F1',2],col="blue",labels=("C-F1"))
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-C-F2',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-C-F2',2],col="gray",labels=("C-F2"))
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-C-F3',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-C-F3',2],col="black",labels=("C-F3"))
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-CS',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-CS',2],col="red",labels=("W-CS"))
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-EL',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-EL',2],col="orange",labels=("W-EL"))
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-PO',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-PO',2],col="yellow",labels=("W-PO"))
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-SG',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-SG',2],col="green",labels=("W-SG"))
text(pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-UP',1],pcgcov$x[sosopopinfo[,1] == 'AlnE-BS-W-UP',2],col="darkgreen",labels=("W-UP"))
dev.off()