Post date: Apr 07, 2016 10:2:42 PM
Starting from mean genotypes (i.e., posterior point estimates), I summarized variation in Fallon by calculating heterozygosities, PC scores, and by creating pairwise identity-by-state matrixes. The results and R script are in /uufs/chpc.utah.edu/common/home/u6000989/projects/alfalfa_fallon/Popgen/.
Here is the R script:
## summarize genetic variation in the Fallon M. sativa population
## Posterior mean genotype file
N<-48
L<-43920
G<-matrix(scan("pntest_mod_filtered4x_alfalfavariants.txt",n=N*L,sep=" "),nrow=L,ncol=N,byrow=TRUE)
## coverage (depth) file
depth<-matrix(scan("mod_depth_filtered4x_alfalfavariants.txt",n=N*L,sep=" "),nrow=L,ncol=N,byrow=TRUE)
## individual ids
ids<-read.table("ids.txt",header=FALSE)
## coverage per individual
mndepth<-apply(depth,2,mean)
mean(mndepth) ## 13.3X
sd(mndepth) ## 6.6X
pdf("coveragePerInd.pdf",width=5,height=5)
par(mar=c(5,5,0.5,0.5))
barplot(mndepth,xlab="individual",ylab="mean coverage",cex.lab=1.5,cex.axis=1.2)
dev.off()
## drop individuals with < 3X coverage (note all three that are dropped had 0 reads)
good<-which(mndepth > 3)
Gx<-G[,good]
dpx<-depth[,good]
idsx<-as.character(ids[good,])
mndepthx<-mndepth[good]
## heterozygosity per individual
## we assume that point estimates between 0.9 and 3.1 are heterozygotes
het<-apply(Gx > 0.9 & Gx < 3.1,2,mean)
pdf("hetbycov.pdf",width=5,height=5)
par(mar=c(5,5,0.5,0.5))
plot(mndepthx,het,xlab="mean coverage",ylab="heterozygosity",cex.lab=1.5,cex.axis=1.2)
dev.off()
cor.test(mndepthx,het)
#Pearson's product-moment correlation
#data: mndepthx and het
#t = 3.637, df = 43, p-value = 0.0007333
#alternative hypothesis: true correlation is not equal to 0
#95 percent confidence interval:
# 0.2232891 0.6815341
#sample estimates:
# cor
#0.4850325
out<-lm(het ~ mndepthx)
hresid<-out$residuals
## PCA, genetic covariance matrix, SNPs with > 4X
mincov<-4
Nx<-length(good)
Gcov<-matrix(NA,nrow=Nx,ncol=Nx)
Gcovdp<-Gcov
for(i in 1:Nx){
for(j in i:Nx){
hic<-which(dpx[,i] >= mincov & dpx[,j] >= mincov)
Gcovdp[i,j]<-length(hic)
Gcovdp[j,i]<-Gcovdp[i,j]
Gcov[i,j]<-cov(Gx[hic,i],Gx[hic,j])
Gcov[j,i]<-Gcov[i,j]
}
}
pcout<-prcomp(Gcov,center=TRUE,scale=FALSE)
summary(pcout)
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7
#Standard deviation 0.2478 0.11741 0.1068 0.10209 0.10055 0.09802 0.09227
#Proportion of Variance 0.2104 0.04724 0.0391 0.03572 0.03464 0.03292 0.02917
#Cumulative Proportion 0.2104 0.25761 0.2967 0.33242 0.36706 0.39999 0.42916
pdf("pcaGcov.pdf",width=5,height=5)
par(mar=c(5,5,0.5,0.5))
plot(pcout$x[,1],pcout$x[,2],xlab="PC1 (21.0%)",ylab="PC2 (4.7%)",cex.lab=1.5,cex.axis=1.2)
dev.off()
## PCs 1 and 2 do not reflect variation in coverage:
cor.test(mndepthx,pcout$x[,1])
#Pearson's product-moment correlation
#data: mndepthx and pcout$x[, 1]
#t = 1.0959, df = 43, p-value = 0.2792
#alternative hypothesis: true correlation is not equal to 0
#95 percent confidence interval:
# -0.1352377 0.4372185
#sample estimates:
# cor
#0.1648399
cor.test(mndepthx,pcout$x[,2])
#Pearson's product-moment correlation
#data: mndepthx and pcout$x[, 2]
#t = 1.0118, df = 43, p-value = 0.3173
#alternative hypothesis: true correlation is not equal to 0
#95 percent confidence interval:
# -0.1476564 0.4269119
#sample estimates:
# cor
#0.1524866
## distance/similarity matrixes
## identity by (pseudo)state matrix... uses the point estimates
ibs<-matrix(NA,nrow=Nx,ncol=Nx)
for(i in 1:Nx){
for(j in i:Nx){
hic<-which(dpx[,i] >= mincov & dpx[,j] >= mincov)
ibs[i,j]<-1-(mean(abs(Gx[hic,i]-Gx[hic,j]))/4)
ibs[j,i]<-ibs[i,j]
}
}
## identity by state with integer genotypes
Gxi<-round(Gx,0)
ibsi<-matrix(NA,nrow=Nx,ncol=Nx)
for(i in 1:Nx){
for(j in i:Nx){
hic<-which(dpx[,i] >= mincov & dpx[,j] >= mincov)
ibsi[i,j]<-1-(mean(abs(Gxi[hic,i]-Gxi[hic,j]))/4)
ibsi[j,i]<-ibsi[i,j]
}
}
## NOTE THESE WERE NEARLY IDENTICAL, r = 0.99
## genetic distance matrix, 1 - ibs
gdist<-1-ibs
## write results
out<-data.frame(idsx,round(mndepthx,4),round(het,4),round(hresid,4),round(pcout$x[,1],4),round(pcout$x[,2],4))
colnames(out)<-c("IDs","depth","het","residhet","pc1","pc2")
write.csv(out,file="summaryGenVarFallon.csv",quote=FALSE,row.names=FALSE)
write.table(ibs,"ibsMatrix.txt",quote=FALSE,row.names=FALSE,col.names=FALSE)
write.table(gdist,"gdistMatrix.txt",quote=FALSE,row.names=FALSE,col.names=FALSE)
save(list=ls(),file="fallonGbs.rdat")