Post date: May 28, 2019 8:27:17 PM
See AfreqChange.R. So far, just playing around. Clearly, some SNPs show considerable change, but I want to see if this is anything but real, e.g., maybe reflecting lower coverage.
## base file names
afiles<-c("p_auto_BCR-13","p_auto_BCR-15","p_auto_BCR-17","p_auto_BNP-13","p_auto_BNP-15","p_auto_BNP-17","p_auto_BTB-13","p_auto_BTB-14","p_auto_BTB-15","p_auto_BTB-17","p_auto_GNP-13","p_auto_GNP-15","p_auto_GNP-17","p_auto_HNV-13","p_auto_HNV-14","p_auto_HNV-15","p_auto_HNV-17","p_auto_MRF-13","p_auto_MRF-15","p_auto_MRF-17","p_auto_PSP-13","p_auto_PSP-15","p_auto_PSP-17","p_auto_RNV-13","p_auto_RNV-14","p_auto_RNV-17","p_auto_SKI-13","p_auto_SKI-14","p_auto_SKI-15","p_auto_SKI-17","p_auto_USL-13","p_auto_USL-15","p_auto_USL-17")
zfiles<-c("p_Z_BCR-13","p_Z_BCR-15","p_Z_BCR-17","p_Z_BNP-13","p_Z_BNP-15","p_Z_BNP-17","p_Z_BTB-13","p_Z_BTB-14","p_Z_BTB-15","p_Z_BTB-17","p_Z_GNP-13","p_Z_GNP-15","p_Z_GNP-17","p_Z_HNV-13","p_Z_HNV-14","p_Z_HNV-15","p_Z_HNV-17","p_Z_MRF-13","p_Z_MRF-15","p_Z_MRF-17","p_Z_PSP-13","p_Z_PSP-15","p_Z_PSP-17","p_Z_RNV-13","p_Z_RNV-14","p_Z_RNV-17","p_Z_SKI-13","p_Z_SKI-14","p_Z_SKI-15","p_Z_SKI-17","p_Z_USL-13","p_Z_USL-15","p_Z_USL-17")
N<-length(afiles)
L<-c(30033,12886,12886) ## no. snps
P<-vector("list",3)
for(k in 1:3){
P[[k]]<-matrix(NA,nrow=N,ncol=L[k])
}
fsub<-c("_GATK_full","_GATK_sub","_SAM_sub")
for(i in 1:N){
for(k in 1:3){
fname<-paste(afiles[i],fsub[k],".txt",sep="")
fa<-read.table(fname,header=FALSE)
fname<-paste(zfiles[i],fsub[k],".txt",sep="")
fz<-read.table(fname,header=FALSE)
P[[k]][i,]<-as.numeric(c(fa[,3],fz[,3]))
}
}
## SAM/GATK gl-based allele freq. comparison
cors<-rep(NA,33)
for(i in 1:33){
cors[i]<-cor(P[[2]][i,],P[[3]][i,])
}
summary(cors)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.9792 0.9809 0.9816 0.9819 0.9831 0.9854
## note, this is good overall, but some SNPs stand out
## as having different allele frequencies in different pops.
## PCAs
pops<-c("BCR-13","BCR-15","BCR-17","BNP-13","BNP-15","BNP-17","BTB-13","BTB-14","BTB-15","BTB-17","GNP-13","GNP-15","GNP-17","HNV-13","HNV-14","HNV-15","HNV-17","MRF-13","MRF-15","MRF-17","PSP-13","PSP-15","PSP-17","RNV-13","RNV-14","RNV-17","SKI-13","SKI-14","SKI-15","SKI-17","USL-13","USL-15","USL-17")
popsI<-c("BCR","BCR","BCR","BNP","BNP","BNP","BTB","BTB","BTB","BTB","GNP","GNP","GNP","HNV","HNV","HNV","HNV","MRF","MRF","MRF","PSP","PSP","PSP","RNV","RNV","RNV","SKI","SKI","SKI","SKI","USL","USL","USL")
yr<-as.numeric(unlist(strsplit(pops,split="-"))[seq(2,66,2)])
library(RColorBrewer)
pdf("AfreqPCA.pdf",width=6,height=5)
par(mar=c(4.5,4.5,.5,7))
par(pty='s')
myc<-brewer.pal(n=10,"Paired")
uP<-unique(popsI)
for(k in 1:3){
o<-prcomp(P[[k]],center=TRUE,scale=FALSE)
oo<-summary(o)
pve<-100 * round(oo$sdev^2/sum(oo$sdev^2),3)
plot(o$x[,1],o$x[,2],type='n',cex.lab=1.5,xlab=paste("PC1 (",pve[1],"%)",sep=""),ylab=paste("PC2 (",pve[2],"%)",sep=""))
for(i in 1:10){
a<-which(popsI==uP[i])
points(o$x[a,1],o$x[a,2],col=myc[i],pch=19,type='b',lwd=1.5)
}
par(xpd=NA)
legend(max(o$x[,1]) * 1.2,max(o$x[,2]),uP,pch=19,lwd=1.5,col=myc,bty='n')
}
dev.off()
## note, the PC2 axis really does look more like time than run
o<-vector("list",3)
for(k in 1:3){
o[[k]]<-prcomp(P[[k]],center=TRUE,scale=FALSE)
}
set1<-c(1,2,4,5,7,9,11,12,18,19,21,22,24,25,31,32)
set2<-c(3,6,8,10,13,14:17,20,23,26,27:30,33)
sets<-rep(1,33)
sets[set2]<-2
pc1<-data.frame(popsI,yr,as.factor(sets),"GATK_full"=o[[1]]$x[,1],"GATK_sub"=o[[2]]$x[,1],"SAM_sub"=o[[3]]$x[,1])
pc2<-data.frame(popsI,yr,as.factor(sets),"GATK_full"=o[[1]]$x[,2],"GATK_sub"=-1*o[[2]]$x[,2],"SAM_sub"=o[[3]]$x[,3])
## test for year, population and set effects
library(blme)
## PC1
vcmat<-matrix(NA,nrow=3,ncol=3)
est<-blmer(GATK_full ~ 1 + (1 | popsI),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[1,1]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_full ~ 1 + (1 | yr),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[1,2]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_full ~ 1 + (1 | sets),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[1,3]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_sub ~ 1 + (1 | popsI),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[2,1]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_sub ~ 1 + (1 | yr),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[2,2]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_sub ~ 1 + (1 | sets),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[2,3]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(SAM_sub ~ 1 + (1 | popsI),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[3,1]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(SAM_sub ~ 1 + (1 | yr),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[3,2]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(SAM_sub ~ 1 + (1 | sets),data=pc1,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[3,3]<-vc$vcov[1]/sum(vc$vcov)
##vcmat
# [,1] [,2] [,3]
#[1,] 0.9993490 0.03679005 0.04103969
#[2,] 0.9986333 0.03680136 0.04104015
#[3,] 0.9984307 0.03684483 0.04099892
## PC2
vcmat<-matrix(NA,nrow=3,ncol=3)
est<-blmer(GATK_full ~ 1 + (1 | popsI),data=pc2,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[1,1]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_full ~ 1 + (1 | yr),data=pc2,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[1,2]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_full ~ 1 + (1 | sets),data=pc2,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[1,3]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_sub ~ 1 + (1 | popsI),data=pc2,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[2,1]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_sub ~ 1 + (1 | yr),data=pc2,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[2,2]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(GATK_sub ~ 1 + (1 | sets),data=pc2,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[2,3]<-vc$vcov[1]/sum(vc$vcov)
## had to sue wishart here for convergence
est<-blmer(SAM_sub ~ 1 + (1 | popsI),data=pc2,REML=TRUE,cov.prior=wishart)
vc<-as.data.frame(VarCorr(est))
vcmat[3,1]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(SAM_sub ~ 1 + (1 | yr),data=pc2,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[3,2]<-vc$vcov[1]/sum(vc$vcov)
est<-blmer(SAM_sub ~ 1 + (1 | sets),data=pc2,REML=TRUE,cov.prior=invgamma)
vc<-as.data.frame(VarCorr(est))
vcmat[3,3]<-vc$vcov[1]/sum(vc$vcov)
# [,1] [,2] [,3]
#[1,] 0.4052466 0.04764759 0.30227968
#[2,] 0.5270792 0.03749916 0.07687001
#[3,] 0.2766804 0.04373793 0.04302430
save(list=ls(),file="af.rdat")
#### allele freq. change
## 2 yr chage
ub2yr<-c(2:3,5:6,9:10,12:13,16:17,19:20,22:23,29:30,32:33)
lb2yr<-ub2yr-1
lb2yr[c(5,9,15)]<-lb2yr[c(5,9,15)]-1
comps2yr<-length(lb2yr)
dp2yr<-matrix(NA,nrow=L[3],ncol=comps2yr)
for(i in 1:comps2yr){
dp2yr[,i]<-P[[3]][ub2yr[i],]-P[[3]][lb2yr[i],]
}
## a few of the more compelling examples plots
cs<-brewer.pal(n=10,"Paired")
pdf("dpSNP12394.pdf",width=7,height=7)
par(mar=c(5,5,.5,.5))
plot(yr+2000,P[[3]][,12394],type='n',xlab="Year",ylab="Allele freq.",cex.lab=1.5)
for(j in 1:10){
a<-which(popsI==upops[j])
points(yr[a]+2000,P[[3]][a,12394],type='b',col=cs[j],pch=19)
}
legend(2013,0.18,col=cs,lty=1,pch=19,upops,ncol=2)
dev.off()
pdf("dpSNP10198.pdf",width=7,height=7)
par(mar=c(5,5,.5,.5))
plot(yr+2000,P[[3]][,10198],type='n',xlab="Year",ylab="Allele freq.",cex.lab=1.5)
for(j in 1:10){
a<-which(popsI==upops[j])
points(yr[a]+2000,P[[3]][a,10198],type='b',col=cs[j],pch=19)
}
legend(2015,0.7,col=cs,lty=1,pch=19,upops,ncol=2)
dev.off()
pdf("dpSNP6540.pdf",width=7,height=7)
par(mar=c(5,5,.5,.5))
plot(yr+2000,P[[3]][,6540],type='n',xlab="Year",ylab="Allele freq.",cex.lab=1.5)
for(j in 1:10){
a<-which(popsI==upops[j])
points(yr[a]+2000,P[[3]][a,6540],type='b',col=cs[j],pch=19)
}
legend(2015,0.26,col=cs,lty=1,pch=19,upops,ncol=2)
dev.off()
i<-200
par(mar=c(5,5,.5,.5))
plot(yr+2000,P[[3]][,i],type='n',xlab="Year",ylab="Allele freq.",cex.lab=1.5)
for(j in 1:10){
a<-which(popsI==upops[j])
points(yr[a]+2000,P[[3]][a,i],type='b',col=cs[j],pch=19)
}
## 4yr change
ub4yr<-ub2yr[seq(2,18,2)]
lb4yr<-lb2yr[seq(1,18,2)]
comps4yr<-length(lb4yr)
dp4yr<-matrix(NA,nrow=L[3],ncol=comps4yr)
for(i in 1:comps4yr){
dp4yr[,i]<-P[[3]][ub4yr[i],]-P[[3]][lb4yr[i],]
}
## mns
mn4yr<-apply(dp4yr,1,mean)
mn2yr<-apply(dp2yr,1,mean)
sd4yr<-apply(dp4yr,1,sd)
sd2yr<-apply(dp2yr,1,sd)
##
az<-rep(2,12886)
az[1:11638]<-1