Post date: Nov 03, 2016 2:24:47 AM
I conducted several analyses based on Dxy to examine evidence for hybridization and ancestry blocks from the genomes. The results and analyses are in /uufs/chpc.utah.edu/common/home/u6000989/projects/lyc_hybanc/genanc/ with all of the code in genanc.R (current version pasted below) and the results saved in the genanc.Rdat. Here are the key points.
Dxy was calculated for all pairs of individuals for the 22 autosomes and with window sizes of 10, 25, 50, 100, 500 and 1000 SNPs (window size matters). Windows were not allowed to cross scaffold or LG boundaries. I made some basic plots of Dxy (nothing too exciting, see dxy.pdf).
A hybrid index of sorts was calculated as H = Dxa/(Dxa + Dxb), where x is an admixed population (Sierra Nevada or Warner Mts.) and a and b are L. anna and L. melissa. This works well when the hybrids are recent, but slowly tends towards 0.5 over time (as divergence post admixture trumps the source population). It is in part for this reason that I abandoned my original plans to fit a HMM for the hybrid index (hard to cleanly define classes).
I calculated correlations between all pairs of Dxy's with and without first regressing them on D(LA, LM). The results are in dxycors*.pdf (v. 3 is the simplest and probably most useful). Results depend some on window size, but overall there are negative correlations between D(LA, X) and D(LM, X) where X is one of the hybrids (stronger evidence with Warner Mts.), which is consistent with hybridization. There are modest (LM) or weak/absent/negative (LS) correlation between D(L, SN) and D(L, WM) where L is either LM or LA (this not strong evidence for or against consistency between hybrids, but with some evidence of more consistency for the L. melissa subset??).
Autocorrelations in Dxy and H were calculated across window sizes to look for LD/ancestry blocks. We see the least autocorrelation for D(LA, LM) (background LD, background variation in divergence) and the most for H (more for H in WM than SN), with intermediate for D(L, SN) and D(L, WM). This all makes good sense in the context of admixture and shows that ancestry blocks extent beyond background LD and variation in divergence. If we take r > 0.1 or 0.05 as evidence of non-negligible autocorrelation (coupled with p < 0.05 for null of no autocorrelation), than (based on H) we have good evidence of ancestry blocks (more atucorrelation than for Dxy with LA vs LM). The signal is a bit complicated and I will need to think more about how to quantify this once I have similar results from local ancestry blocks in popanc.
## read data
dat<-read.table("sorted_LycWGDat.txt",header=FALSE)
## genotype data for autosomes, 912,292 SNPs
## columns are anna, idas, melissa, sierra, warners
aut<-which(dat[,1] != 23)
G<-as.matrix(dat[aut,5:9])
lgs<-dat[aut,1]
map<-dat[aut,1:4]
## Nei's Dxy with different window sizes and between diffrent inds.
w<-c(10,25,50,100,500,1000)
W<-length(w)
D<-vector("list",W)
H<-vector("list",W)
for(i in 1:W){
dam<-nei(G[,1],G[,3],w[i],lgs,map[,3])
das<-nei(G[,1],G[,4],w[i],lgs,map[,3])
dms<-nei(G[,3],G[,4],w[i],lgs,map[,3])
hs<-das[[1]]/(das[[1]]+dms[[1]])
daw<-nei(G[,1],G[,5],w[i],lgs,map[,3])
dmw<-nei(G[,3],G[,5],w[i],lgs,map[,3])
hw<-daw[[1]]/(daw[[1]]+dmw[[1]])
D[[i]]<-list(dam[[1]],das[[1]],dms[[1]],daw[[1]],dmw[[1]])
H[[i]]<-list(hs,hw)
}
save(list=ls(),file="genanc.Rdat")
## make plots and quantitative summaries
i<-4
dam<-nei(G[,1],G[,3],w[i],lgs,map[,3])
lgsw<-dam[[4]]
N<-length(lgsw)
bnds<-which(lgsw[2:N] != lgsw[1:(N-1)])
bnds<-c(bnds,length(lgsw))
bndsPlus<-c(1,bnds,length(lgsw))
mids<-rep(NA,22)
for(i in 1:22){
mids[i]<-(bndsPlus[i]+bndsPlus[i+1])/2
}
pdf("dxy.pdf",width=8,height=8)
par(mfrow=c(5,1))
par(mar=c(4,5,0.7,0.5))
txt<-c("L. anna-L. melissa","L. anna-Sierra Nevada","L. melissa-Sierra Nevada","L. anna-Warner Mts.","L. melissa-Warner Mts.")
for(x in 1:5){
par(xpd=FALSE)
plot(D[[4]][[x]],type='n',ylim=c(0,.7),axes=FALSE,xlab="",ylab=expression(paste(D[XY])),cex.lab=1.5)
for(i in seq(1,21,2)){
j<-i+1
polygon(x=bnds[c(i,j,j,i)],y=c(-1,-1,2,2),border=NA,col="lightblue")
}
par(xpd=NA)
par(xpd=TRUE)
lines(D[[4]][[x]],type='l',lwd=0.3)
m<-round(mean(D[[4]][[x]],na.rm=TRUE),3)
text(1,0.65,paste(txt[x],", mean = ",m,sep=""),adj=0,cex=1.3)
axis(2,at=c(0,0.35,.7))
text(mids,rep(-0.15,22),c(1:19,NA,21,NA))
}
text(length(D[[4]][[x]])/2,-0.3,"Linkage group",cex=1.7)
dev.off()
## autocorrelations
l<-20
acs<-matrix(NA,nrow=W,ncol=l)
acw<-matrix(NA,nrow=W,ncol=l)
pcs<-matrix(NA,nrow=W,ncol=l)
pcw<-matrix(NA,nrow=W,ncol=l)
for(w in 1:W){
acs[w,]<-autocorr(as.mcmc(H[[w]][[1]][is.na(H[[w]][[1]])==FALSE]),lags=seq(1,l,1))
acw[w,]<-autocorr(as.mcmc(H[[w]][[2]][is.na(H[[w]][[2]])==FALSE]),lags=seq(1,l,1))
acsNull<-matrix(NA,nrow=1000,ncol=l)
acwNull<-matrix(NA,nrow=1000,ncol=l)
v1<-H[[w]][[1]][is.na(H[[w]][[1]])==FALSE]
v2<-H[[w]][[1]][is.na(H[[w]][[1]])==FALSE]
for(i in 1:1000){
v1<-sample(v1,length(v1),replace=F)
v2<-sample(v2,length(v2),replace=F)
acsNull[i,]<-autocorr(as.mcmc(v1),lags=seq(1,l,1))
acwNull[i,]<-autocorr(as.mcmc(v2),lags=seq(1,l,1))
}
for(j in 1:l){
pcs[w,j]<-mean(acsNull[,j] > acs[w,j])
pcw[w,j]<-mean(acwNull[,j] > acw[w,j])
}
}
pdf("genhybanc.pdf",width=8,height=8)
layout(matrix(c(1,1,2,2,3,4),nrow=3,ncol=2,byrow=TRUE),widths=c(4,4),heights=c(2,2,4))
par(mar=c(4.5,4.5,2.5,2))
tit<-c("(a) Hybridity in Sierra Nevada","(b) Hybridity in Warner Mts.")
for(x in 1:2){
par(xpd=FALSE)
plot(H[[4]][[x]],type='n',ylim=c(0,1),axes=FALSE,xlab="Linkage group",ylab=expression(paste(I=D[XA]/(D[XA]+D[XB]))),cex.lab=1.4)
for(i in seq(1,21,2)){
j<-i+1
polygon(x=bnds[c(i,j,j,i)],y=c(-1,-1,2,2),border=NA,col="lightblue")
}
par(xpd=TRUE)
lines(H[[4]][[x]],type='l',lwd=0.3)
#text(1,0.65,paste(txt[x],", mean = ",m,sep=""),adj=0,cex=1.3)
axis(2,at=c(0,0.5,1))
text(mids,rep(-0.15,22),c(1:19,NA,21,NA))
title(main=tit[x],adj=0)
}
ub<-0.4
lb<--0.06
pty<-c(19,21)
cs<-c("#99D8C9","#66C2A4","#41AE76","#238B45","#006D2C","#00441B")
plot(acs[1,],pch=pty[(pcs[1,] > 0.05) + 1],ylim=c(lb,ub),col=cs[1],type="b",ylab="Autocorrelation",xlab="Lag",cex.lab=1.4)
for(w in 2:W){
lines(acs[w,],col=cs[w],type='b',pch=pty[(pcs[w,] > 0.05) + 1])
}
legend(14,0.4,c(10,25,50,100,500,1000),lty=1,pch=21,col=cs,bty='n',title="window size")
title(main="(c) Autocorrelation in Sierra Nevada",adj=0)
plot(acw[1,],pch=pty[(pcw[1,] > 0.05) + 1],ylim=c(lb,ub),col=cs[1],type="b",ylab="Autocorrelation",xlab="Lag",cex.lab=1.4)
for(w in 2:W){
lines(acw[w,],pch=pty[(pcw[w,] > 0.05) + 1],col=cs[w],type='b')
}
title(main="(d) Autocorrelation in Warner Mts.",adj=0)
dev.off()
## get max lag autocs for p < 0.05 and r = 0.01 0.05 and 0.1
mlacs<-matrix(NA,ncol=3,nrow=6)
rs<-c(0.01,0.05,0.1)
for(i in 1:6){for(j in 1:3){
m<-max(which(pcs[i,] < 0.05 & acs[i,] > rs[j]))
if(is.infinite(m)==FALSE){mlacs[i,j]<-m}
else{mlacs[i,j]<-0}
}}
mlacw<-matrix(NA,ncol=3,nrow=6)
rs<-c(0.01,0.05,0.1)
for(i in 1:6){for(j in 1:3){
m<-max(which(pcw[i,] < 0.05 & acw[i,] > rs[j]))
if(is.infinite(m)==FALSE){mlacw[i,j]<-m}
else{mlacw[i,j]<-0}
}}
colnames(mlacs)<-rs
rownames(mlacs)<-c(10,25,50,100,500,1000)
colnames(mlacw)<-rs
rownames(mlacw)<-c(10,25,50,100,500,1000)
## Dxy correlations over scales of window size
pdf("dxycors.pdf",width=6,height=12)
layout(matrix(1:10,nrow=5,ncol=2,byrow=TRUE),widths=c(3,3),heights=c(3,3,3,3,3))
par(mar=c(4,4.5,2.5,2))
w<-c(10,25,50,100,500,1000)
dxylabs<-c("LA, LM","LA, SN","LM, SN","LA, WM","LM, WM")
k<-1
for(i in 1:4){for(j in (i+1):5){
cors<-rep(NA,W)
lb<-rep(NA,W)
ub<-rep(NA,W)
for(x in 1:W){
r<-cor.test(D[[x]][[i]],D[[x]][[j]],na.rm=TRUE)
cors[x]<-r$estimate
lb[x]<-r$conf.int[1]
ub[x]<-r$conf.int[2]
}
plot(cors,ylim=c(-.5,.8),xlab="Window size (no. SNPs)",ylab="Correlation (r)",cex.lab=1.3,axes=FALSE)
segments(1:6,lb,1:6,ub)
axis(1,at=1:6,w)
axis(2)
box()
abline(h=0,lty=2)
title(main=paste("(",letters[k],") ",dxylabs[i]," vs. ",dxylabs[j],sep=""),adj=0)
k<-k+1
}
}
dev.off()
## Dxy correlations over scales of window size, controlling for parental dxy
pdf("dxycors2.pdf",width=6,height=9)
layout(matrix(1:6,nrow=3,ncol=2,byrow=TRUE),widths=c(3,3),heights=c(3,3,3))
par(mar=c(4,4.5,2.5,2))
w<-c(10,25,50,100,500,1000)
dxylabs<-c("LA, LM","LA, SN","LM, SN","LA, WM","LM, WM")
k<-1
for(i in 2:4){for(j in (i+1):5){
cors<-rep(NA,W)
lb<-rep(NA,W)
ub<-rep(NA,W)
for(x in 1:W){
nas<-which(is.na(D[[x]][[i]])==FALSE & is.na(D[[x]][[j]])==FALSE & is.na(D[[x]][[1]]) == FALSE)
a<-lm(D[[x]][[i]][nas] ~ D[[x]][[1]][nas])
b<-lm(D[[x]][[j]][nas] ~ D[[x]][[1]][nas])
r<-cor.test(a$residuals,b$residuals,na.rm=TRUE)
cors[x]<-r$estimate
lb[x]<-r$conf.int[1]
ub[x]<-r$conf.int[2]
}
plot(cors,ylim=c(-.6,.2),xlab="Window size (no. SNPs)",ylab="Correlation (r)",cex.lab=1.3,axes=FALSE)
segments(1:6,lb,1:6,ub)
axis(1,at=1:6,w)
axis(2)
box()
abline(h=0,lty=2)
title(main=paste("(",letters[k],") ",dxylabs[i]," vs. ",dxylabs[j],sep=""),adj=0)
k<-k+1
}
}
dev.off()
## Dxy correlations over scales of window size, controlling for parental dxy, focus on 4 key ones, between AX x BX both hybrids and AX x AX for each hybrid
pdf("dxycors3.pdf",width=5,height=5)
par(mar=c(4,4.5,2.5,2))
w<-c(10,25,50,100,500,1000)
dxylabs<-c("LA, LM","LA, SN","LM, SN","LA, WM","LM, WM")
ii<-c(2,4,2,3)
jj<-c(3,5,4,5)
cs<-c("#66C2A5","#FC8D62","#8DA0CB","#E78AC3")
for(z in 1:4){
i<-ii[z]
j<-jj[z]
cors<-rep(NA,W)
lb<-rep(NA,W)
ub<-rep(NA,W)
for(x in 1:W){
nas<-which(is.na(D[[x]][[i]])==FALSE & is.na(D[[x]][[j]])==FALSE & is.na(D[[x]][[1]]) == FALSE)
a<-lm(D[[x]][[i]][nas] ~ D[[x]][[1]][nas])
b<-lm(D[[x]][[j]][nas] ~ D[[x]][[1]][nas])
r<-cor.test(a$residuals,b$residuals,na.rm=TRUE)
cors[x]<-r$estimate
lb[x]<-r$conf.int[1]
ub[x]<-r$conf.int[2]
}
if(z == 1){
plot(1:6,cors,ylim=c(-.6,.2),xlab="Window size (no. SNPs)",ylab="Correlation (r)",cex.lab=1.3,axes=FALSE,type='b',col=cs[z])
axis(1,at=1:6,w)
axis(2)
box()
abline(h=0,lty=2)
}
else{
points(cors,type='b',col=cs[z])
}
segments(1:6,lb,1:6,ub)
}
legend(1,-0.35,c("(LA, SN), (LM, SN)","(LA, WM), (LM, WM)","(LA, SN), (LA, WM)","(LM, SN), (LM, WM)"),col=cs,lty=1,pch=21,bty='n')
dev.off()
## Dxy autocorrelations
l<-20
dxyAc<-vector("list",5)
dxyPv<-vector("list",5)
for(p in 1:5){
dxyAc[[p]]<-matrix(NA,nrow=W,ncol=l)
dxyPv[[p]]<-matrix(NA,nrow=W,ncol=l)
for(w in 1:W){
dxyAc[[p]][w,]<-autocorr(as.mcmc(D[[w]][[p]][is.na(D[[w]][[p]])==FALSE]),lags=seq(1,l,1))
v1<-D[[w]][[p]][is.na(D[[w]][[p]])==FALSE]
null<-matrix(NA,nrow=1000,ncol=l)
for(i in 1:1000){
v1<-sample(v1,length(v1),replace=F)
null[i,]<-autocorr(as.mcmc(v1),lags=seq(1,l,1))
}
for(j in 1:l){
dxyPv[[p]][w,j]<-mean(null[,j] > dxyAc[[p]][w,j])
}
}
}
pair<-c("LA, LM","LA, SN","LM, SN","LA, WM","LM, WM")
pdf("dxyautoc.pdf",width=6,height=9)
layout(matrix(1:6,nrow=3,ncol=2,byrow=TRUE),widths=c(3,3),heights=c(3,3,3))
par(mar=c(4,4.5,2.5,2))
for(p in 1:5){
ub<-0.4
lb<--0.1
cs<-c("#99D8C9","#66C2A4","#41AE76","#238B45","#006D2C","#00441B")
plot(dxyAc[[p]][1,],pch=pty[(dxyPv[[p]][1,] > 0.05) + 1],ylim=c(lb,ub),col=cs[1],type="b",ylab="Autocorrelation",xlab="Lag",cex.lab=1.4)
for(w in 2:W){
lines(dxyAc[[p]][w,],pch=pty[(dxyPv[[p]][w,] > 0.05) + 1],col=cs[w],type='b')
}
if(p==1){legend(14,0.4,c(10,25,50,100,500,1000),lty=1,pch=21,col=cs,bty='n',title="window size")}
txt<-paste("(",letters[p],") Autocorrelation D(",pair[p],")",sep="")
title(main=txt,adj=0)
}
dev.off()
## get max lag autocs for p < 0.05 and r = 0.01 0.05 and 0.1
mlacdxy<-vector("list",5)
for(p in 1:5){
mlacdxy[[p]]<-matrix(NA,ncol=3,nrow=6)
rs<-c(0.01,0.05,0.1)
for(i in 1:6){for(j in 1:3){
m<-max(which(dxyPv[[p]][i,] < 0.05 & dxyAc[[p]][i,] > rs[j]))
if(is.infinite(m)==FALSE){mlacdxy[[p]][i,j]<-m}
else{mlacdxy[[p]][i,j]<-0}
}}
colnames(mlacdxy[[p]])<-rs
rownames(mlacdxy[[p]])<-c(10,25,50,100,500,1000)
}
## function for Nei's Dxy
nei<-function(x=NA,y=NA,window=100,lgs=NA,scafs=NA){
## format data
nlgs<-max(lgs)
nwin<-round(table(lgs)/window)
px<-vector("list",nlgs)
py<-vector("list",nlgs)
for(i in 1:nlgs){
a<-which(lgs==i)
px[[i]]<-x[a]
py[[i]]<-y[a]
}
## window distances
dxy<-vector("list",nlgs)
hetx<-vector("list",nlgs)
hety<-vector("list",nlgs)
for(i in 1:nlgs){
dxy[[i]]<-rep(NA,nwin[i])
hetx[[i]]<-rep(NA,nwin[i])
hety[[i]]<-rep(NA,nwin[i])
for(j in 1:nwin[i]){
lb<-(j-1) * window + 1
ub<-lb + window - 1
## allele 'freqs' for window
xx<-px[[i]][lb:ub]/2
yy<-py[[i]][lb:ub]/2
dxy[[i]][j]<-mean(1 - (sqrt((1-xx)*(1-yy)) + sqrt(xx*yy)),na.rm=TRUE)
hetx[[i]][j]<-mean(2 * xx * 1-xx,na.rm=TRUE)
hety[[i]][j]<-mean(2 * yy * 1-yy,na.rm=TRUE)
}
}
dxy<-unlist(dxy)
hetx<-unlist(hetx)
hety<-unlist(hety)
out<-list(dxy=dxy,hetx=hetx,hety=hety,lgs=rep(1:22,nwin))
out
}
## function for Nei's Dxy, this version does not cross scaffold boundaries
nei<-function(x=NA,y=NA,window=100,lgs=NA,scafs=NA){
## format data
nlgs<-max(lgs)
px<-vector("list",nlgs)
py<-vector("list",nlgs)
for(i in 1:nlgs){
a<-which(lgs==i)
px[[i]]<-x[a]
py[[i]]<-y[a]
}
## window distances
dxy<-vector("list",nlgs)
hetx<-vector("list",nlgs)
hety<-vector("list",nlgs)
for(i in 1:nlgs){
a<-which(lgs==i)
sc<-scafs[a]
nsc<-length(unique(sc))
usc<-unique(sc)
nwin<-round(table(sc)/window)
dxysc<-vector("list",nsc)
hetxsc<-vector("list",nsc)
hetysc<-vector("list",nsc)
for(k in 1:nsc){
scn<-as.numeric(names(nwin)[k])
pxsc<-px[[i]][sc==scn]
pysc<-py[[i]][sc==scn]
dxysc[[k]]<-rep(NA,nwin[k])
hetxsc[[k]]<-rep(NA,nwin[k])
hetysc[[k]]<-rep(NA,nwin[k])
for(j in 1:nwin[k]){
lb<-(j-1) * window + 1
ub<-lb + window - 1
## allele 'freqs' for window
xx<-pxsc[lb:ub]/2
yy<-pysc[lb:ub]/2
dxysc[[k]][j]<-mean(1 - (sqrt((1-xx)*(1-yy)) + sqrt(xx*yy)),na.rm=TRUE)
hetxsc[[k]][j]<-mean(2 * xx * 1-xx,na.rm=TRUE)
hetysc[[k]][j]<-mean(2 * yy * 1-yy,na.rm=TRUE)
}
}
dxy[[i]]<-unlist(dxysc)
hetx[[i]]<-unlist(hetxsc)
hety[[i]]<-unlist(hetysc)
# dxy[[i]]<-dxy[[i]][is.nan(dxy[[i]])==FALSE]
# hetx[[i]]<-hetx[[i]][is.nan(hetx[[i]])==FALSE]
# hety[[i]]<-hety[[i]][is.nan(hety[[i]])==FALSE]
}
nwin<-rep(NA,22)
for(i in 1:22){
nwin[i]<-length(dxy[[i]])
}
dxy<-unlist(dxy)
hetx<-unlist(hetx)
hety<-unlist(hety)
out<-list(dxy=dxy,hetx=hetx,hety=hety,lgs=rep(1:22,nwin))
out
}