Post date: May 15, 2014 10:53:22 PM
#See script vcf2genotype.pl
#Instead of estimating allele frequencies separately for each population like we did for the other spring endemic project, we will estimate them all together; we'll use a global allele frequency as a prior. We will convert gprob to a point estimate (a number between 0 and 2; 0 = homozygous for ref allele, 2 = homozygous for alternative allele).
#minmaf (min. minor allele frequency): 0.05
#I kept 252 loci; the file name is filtered_varEnana.genest
#The header in the file are individual names, each row is a locus, the first thing in each row is contig#:position, then genest
#File without the header and locus stuff: filtered_varEnanaNoHdr.genest
#File with just the header info, with ind # in separate column (plus I made all F1s have the "10" id): indIds.csv
#PCA in R
nanapopinfo<-read.csv("indIds.csv",header=F)
nanagendata<-read.table("filtered_varEnanaNoHdr.genest",header=F)
nanapcaout<-prcomp(t(as.matrix(nanagendata)),center=T,scale.=F)
summary(nanapcaout)
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7
#Standard deviation 1.8701 1.34263 1.0236 0.76889 0.71897 0.70453 0.67673
#Proportion of Variance 0.1235 0.06365 0.0370 0.02088 0.01825 0.01753 0.01617
#Cumulative Proportion 0.1235 0.18715 0.2241 0.24503 0.26328 0.28081 0.29698
##compare to pca on NxN genotype covariance matrix
g<-(as.matrix(nanagendata))
gmn<-apply(g,1,mean)
nloci<-252
nind<-195
gmnmat<-matrix(gmn,nrow=nloci,ncol=nind)
gprime<-g-gmnmat
gcovarmat<-matrix(NA,nrow=nind,ncol=nind)
for(i in 1:nind){
for(j in i:nind){
if (i==j){
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
}
else{
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
gcovarmat[j,i]<-gcovarmat[i,j]
}
}
}
pcgcov<-prcomp(x=gcovarmat,center=TRUE,scale=FALSE)
summary(pcgcov)
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7
#Standard deviation 0.1937 0.09791 0.05175 0.02948 0.02781 0.02679 0.02442
#Proportion of Variance 0.5700 0.14565 0.04069 0.01321 0.01175 0.01090 0.00906
#Cumulative Proportion 0.5700 0.71563 0.75632 0.76952 0.78127 0.79218 0.80124
pdf("nana_gcov_PCA.pdf",width=6,height=6)
plot(pcgcov$x[,1],pcgcov$x[,2],type="n",ylab="PC2 (14.6%)",xlab="PC1 (57%)")
title(main="E. nana",adj=0)
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-10',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-10',2],col="purple",labels=("C-F1"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-20',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-20',2],col="blue",labels=("C-F2"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-30',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-30',2],col="gray",labels=("C-F3"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-40',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-40',2],col="black",labels=("C-F4"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-BD',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-BD',2],col="red",labels=("W-BD"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-DS',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-DS',2],col="orange",labels=("W-DS"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-HO',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-HO',2],col="green",labels=("W-HO"))
dev.off()
pdf("nana_PCA.pdf",width=6,height=6)
plot(nanapcaout$x[,1],nanapcaout$x[,2],type="n",ylab="PC2 (6.4%)",xlab="PC1 (12.4%)")
title(main="E. nana",adj=0)
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-10',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-10',2],col="purple",labels=("C-F1"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-20',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-20',2],col="blue",labels=("C-F2"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-30',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-30',2],col="gray",labels=("C-F3"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-40',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-40',2],col="black",labels=("C-F4"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-BD',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-BD',2],col="red",labels=("W-BD"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-DS',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-DS',2],col="orange",labels=("W-DS"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-HO',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-HO',2],col="green",labels=("W-HO"))
dev.off()
#The patter is not driven by sex:
nanapopinfo<-read.csv("indIds_sex.csv",header=F)
nanagendata<-read.table("filtered_varEnanaNoHdr.genest",header=F)
nanapcaout<-prcomp(t(as.matrix(nanagendata)),center=T,scale.=F)
summary(nanapcaout)
pdf("nana_sex_PCA.pdf",width=6,height=6)
plot(nanapcaout$x[,1],nanapcaout$x[,2],type="n",ylab="PC2 (6.4%)",xlab="PC1 (12.4%)")
title(main="E. nana",adj=0)
text(nanapcaout$x[nanapopinfo[,4] == 'M',1],nanapcaout$x[nanapopinfo[,4] == 'M',2],col="blue",labels=("M"))
text(nanapcaout$x[nanapopinfo[,4] == 'F',1],nanapcaout$x[nanapopinfo[,4] == 'F',2],col="pink",labels=("F"))
dev.off()
#These patterns are weird and hard to explain! I'm going to re-do variant calling with -d 0.5 instead of -d 0.8 in case there is something weird driving the pattern.
reuse SAMtools
#See jobsubVariantCall05.sh for variant calling script. Changed -d and names. Outfiles will be varEnana05.bcf, varEnana05.vcf. To run:
qsub jobsubVariantCall05.sh
#257098.torque.rc.usu.edu
#Filtered the .vcf file for quality control, see: vcfFilter.pl
#mincoverage = 585 (this is 3 x 195 to get 3x coverage per individual per locus)
#minAltRds = 10
#maxRevOrient = 0
perl vcfFilter.pl varEnana05.vcf
#retained 1333 variable loci
perl getCoverage.pl filtered_varEnana05.vcf
less coverage.txt #this is the coverage at each locus
> co<-read.table("coverage.txt")
>mean (co[,1])/195
#average coverage is 6.75x
perl vcf2genotype.pl filtered_varEnana05.vcf
#number of loci: 856
#File without the header and locus stuff: filtered_varEnana05NoHdr.genest
tail -n 856 filtered_varEnana05.genest | cut -f 3-197 -d " " > filtered_varEnana05NoHdr.genest
nanapopinfo<-read.csv("indIds.csv",header=F)
nanagendata<-read.table("filtered_varEnana05NoHdr.genest",header=F)
nanapcaout<-prcomp(t(as.matrix(nanagendata)),center=T,scale.=F)
summary(nanapcaout)
#Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 3.1198 2.63528 1.58462 0.9841 0.91812 0.89749 0.89222
Proportion of Variance 0.1246 0.08891 0.03215 0.0124 0.01079 0.01031 0.01019
Cumulative Proportion 0.1246 0.21352 0.24566 0.2581 0.26885 0.27917 0.28936
PC8 PC9 PC10 PC11 PC12 PC13 PC14
##compare to pca on NxN genotype covariance matrix
g<-(as.matrix(nanagendata))
gmn<-apply(g,1,mean)
nloci<-856
nind<-195
gmnmat<-matrix(gmn,nrow=nloci,ncol=nind)
gprime<-g-gmnmat
gcovarmat<-matrix(NA,nrow=nind,ncol=nind)
for(i in 1:nind){
for(j in i:nind){
if (i==j){
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
}
else{
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
gcovarmat[j,i]<-gcovarmat[i,j]
}
}
}
pcgcov<-prcomp(x=gcovarmat,center=TRUE,scale=FALSE)
summary(pcgcov)
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 0.1586 0.1107 0.03205 0.01433 0.01356 0.01297 0.01288
Proportion of Variance 0.5542 0.2702 0.02264 0.00453 0.00405 0.00371 0.00366
Cumulative Proportion 0.5542 0.8244 0.84705 0.85158 0.85563 0.85934 0.86300
pdf("nana_gcov05_PCA.pdf",width=6,height=6)
plot(pcgcov$x[,1],pcgcov$x[,2],type="n",ylab="PC2 (27%)",xlab="PC1 (55%)")
title(main="E. nana",adj=0)
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-10',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-10',2],col="purple",labels=("C-F1"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-20',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-20',2],col="blue",labels=("C-F2"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-30',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-30',2],col="gray",labels=("C-F3"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-40',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-40',2],col="black",labels=("C-F4"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-BD',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-BD',2],col="red",labels=("W-BD"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-DS',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-DS',2],col="orange",labels=("W-DS"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-HO',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-HO',2],col="green",labels=("W-HO"))
dev.off()
pdf("nana05_PCA.pdf",width=6,height=6)
plot(nanapcaout$x[,1],nanapcaout$x[,2],type="n",ylab="PC2 (%)",xlab="PC1 (%)")
title(main="E. nana",adj=0)
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-10',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-10',2],col="purple",labels=("C-F1"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-20',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-20',2],col="blue",labels=("C-F2"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-30',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-30',2],col="gray",labels=("C-F3"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-40',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-C-40',2],col="black",labels=("C-F4"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-BD',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-BD',2],col="red",labels=("W-BD"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-DS',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-DS',2],col="orange",labels=("W-DS"))
text(nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-HO',1],nanapcaout$x[nanapopinfo[,1] == 'alnE-SM-W-HO',2],col="green",labels=("W-HO"))
dev.off()
#got rid of individuals with too low coverage and redoing PCA
#See getIndCoverage.pl
#Used R and found that there was a correlation between abs PC1 / PC2 and coverage per individual (0.44).
#See covHist.pdf for distribution of coverage
#There were 125 inds with a coverage of 5 reads per locus or higher
nanapopinfo<-read.csv("indIds.csv",header=F)
nanagendata<-read.table("filtered_varEnanaNoHdr.genest",header=F)
icov<-read.table("indCov.txt",header=F)
hicov<-which(icov[,1] >= 5)
nanapopinfo<-nanapopinfo[hicov,]
nanagendata<-nanagendata[,hicov]
nanapcaout<-prcomp(t(as.matrix(nanagendata)),center=T,scale.=F)
summary(nanapcaout)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.1916 1.54585 0.94742 0.84088 0.80716 0.77631 0.76114
Proportion of Variance 0.1464 0.07283 0.02736 0.02155 0.01986 0.01837 0.01766
Cumulative Proportion 0.1464 0.21922 0.24657 0.26812 0.28798 0.30635 0.32401
##compare to pca on NxN genotype covariance matrix
g<-(as.matrix(nanagendata))
gmn<-apply(g,1,mean)
nloci<-252
nind<-125
gmnmat<-matrix(gmn,nrow=nloci,ncol=nind)
gprime<-g-gmnmat
gcovarmat<-matrix(NA,nrow=nind,ncol=nind)
for(i in 1:nind){
for(j in i:nind){
if (i==j){
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
}
else{
gcovarmat[i,j]<-cov(gprime[,i],gprime[,j])
gcovarmat[j,i]<-gcovarmat[i,j]
}
}
}
pcgcov<-prcomp(x=gcovarmat,center=TRUE,scale=FALSE)
summary(pcgcov)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 0.2125 0.1042 0.03871 0.03133 0.02889 0.02653 0.02570
Proportion of Variance 0.6114 0.1469 0.02029 0.01329 0.01130 0.00953 0.00894
Cumulative Proportion 0.6114 0.7584 0.77865 0.79195 0.80325 0.81278 0.82172
pdf("nana_gcov_hicov_PCA.pdf",width=6,height=6)
plot(pcgcov$x[,1],pcgcov$x[,2],type="n",ylab="PC2 (14.7%)",xlab="PC1 (61%)")
title(main="E. nana",adj=0)
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-10',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-10',2],col="purple",labels=("C-F1"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-20',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-20',2],col="blue",labels=("C-F2"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-30',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-30',2],col="gray",labels=("C-F3"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-40',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-C-40',2],col="black",labels=("C-F4"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-BD',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-BD',2],col="red",labels=("W-BD"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-DS',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-DS',2],col="orange",labels=("W-DS"))
text(pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-HO',1],pcgcov$x[nanapopinfo[,1] == 'alnE-SM-W-HO',2],col="green",labels=("W-HO"))
dev.off()
----------
#It's worth BLASTing some of the contigs to make sure Eurycea DNA is driving these weird patterns.
#I BLASTed 34 loci. 7 matched something in the database with an Evalue of 0.034-ish (the lowest I saw), one of which matched a salamander, 3 bacteria, 1 mouse, 1 C. elegans, 1 gorilla. BUT this probably means there isn't massive contamination, since there wasn't a prevalence of perfect matches to E. coli or whatever.