Post date: May 09, 2017 11:7:20 PM
I have parameter estimates from 1388 autosomal and 115 sex-linked AIMs with window sizes of 1, 3, 5 or 7 in /uufs/chpc.utah.edu/common/home/u6000989/projects/lyc_hybanc/ancfreqs/params/aims/ and linked to the results sub-directory. Here is the final summary of these, starting with the 3 bp window. The code below comes from summarizePopAncAims.R, but is interspersed with results and comments.
1. Data are read in and organized.
# read in all of the data
autoMTR<-read.table("q_out_aims_w3MTR.txt",header=FALSE,sep=",")
autoCSP<-read.table("q_out_aims_w3CSP.txt",header=FALSE,sep=",")
autoCRP<-read.table("q_out_aims_w3CRP.txt",header=FALSE,sep=",")
autoSOP<-read.table("q_out_aims_w3SOP.txt",header=FALSE,sep=",")
autoLAE<-read.table("q_out_aims_w3LAE.txt",header=FALSE,sep=",")
autoSWM<-read.table("q_out_aims_w3SWM.txt",header=FALSE,sep=",")
autoTIC<-read.table("q_out_aims_w3TIC.txt",header=FALSE,sep=",")
autoSOF<-read.table("q_out_aims_w3SOF.txt",header=FALSE,sep=",")
autoCLH<-read.table("q_out_aims_w3CLH.txt",header=FALSE,sep=",")
autoREF<-read.table("q_out_aims_w3REF.txt",header=FALSE,sep=",")
autoEGP<-read.table("q_out_aims_w3EGP.txt",header=FALSE,sep=",")
autoBKM<-read.table("q_out_aims_w3BKM.txt",header=FALSE,sep=",")
autoSTM<-read.table("q_out_aims_w3STM.txt",header=FALSE,sep=",")
sexMTR<-read.table("q_out_aims_sex_w3MTR.txt",header=FALSE,sep=",")
sexCSP<-read.table("q_out_aims_sex_w3CSP.txt",header=FALSE,sep=",")
sexCRP<-read.table("q_out_aims_sex_w3CRP.txt",header=FALSE,sep=",")
sexSOP<-read.table("q_out_aims_sex_w3SOP.txt",header=FALSE,sep=",")
sexLAE<-read.table("q_out_aims_sex_w3LAE.txt",header=FALSE,sep=",")
sexSWM<-read.table("q_out_aims_sex_w3SWM.txt",header=FALSE,sep=",")
sexTIC<-read.table("q_out_aims_sex_w3TIC.txt",header=FALSE,sep=",")
sexSOF<-read.table("q_out_aims_sex_w3SOF.txt",header=FALSE,sep=",")
sexCLH<-read.table("q_out_aims_sex_w3CLH.txt",header=FALSE,sep=",")
sexREF<-read.table("q_out_aims_sex_w3REF.txt",header=FALSE,sep=",")
sexEGP<-read.table("q_out_aims_sex_w3EGP.txt",header=FALSE,sep=",")
sexBKM<-read.table("q_out_aims_sex_w3BKM.txt",header=FALSE,sep=",")
sexSTM<-read.table("q_out_aims_sex_w3STM.txt",header=FALSE,sep=",")
anc<-list(rbind(autoMTR,sexMTR),rbind(autoCSP,sexCSP),rbind(autoCRP,sexCRP),rbind(autoSOP,sexSOP),rbind(autoLAE,sexLAE),rbind(autoSWM,sexSWM),rbind(autoTIC,sexTIC),rbind(autoSOF,sexSOF),rbind(autoCLH,sexCLH),rbind(autoREF,sexREF),rbind(autoEGP,sexEGP),rbind(autoBKM,sexBKM),rbind(autoSTM,sexSTM))
pnms<-c("MTR","CSP","CRP","SOP","LAE","SWM","TIC","SOF","CLH","REF","EGP","BKM","STM")
## get d
anna<-read.table("../infiles/comb_lychybAutos_Anna.txt",skip=1,header=F)
anna_m<-as.matrix(anna[,-1])
anna_p<-apply(anna_m,1,mean)/2
anna_sex<-read.table("../infiles/comb_lychybSex_Anna.txt",skip=1,header=F)
anna_m<-as.matrix(anna_sex[,-1])
anna_p<-c(anna_p,apply(anna_m,1,mean)/2)
mel<-read.table("../infiles/comb_lychybAutos_MelWest.txt",skip=1,header=F)
mel_m<-as.matrix(mel[,-1])
mel_p<-apply(mel_m,1,mean)/2
mel_sex<-read.table("../infiles/comb_lychybSex_MelWest.txt",skip=1,header=F)
mel_m<-as.matrix(mel_sex[,-1])
mel_p<-c(mel_p,apply(mel_m,1,mean)/2)
## define aims as those with |p_anna - p_mel| >= 2
aims<-which(abs(anna_p-mel_p) > 0.2)
d<-(abs(anna_p-mel_p))
aims<-which(d>0.2)
2. Linkage group level summaries
## read locus information and subset
lgs<-scan("lgs.txt")
lgs<-c(lgs,rep(23,852))[aims]
## LG meanms
lgm<-matrix(NA,nrow=13,ncol=23)
for(i in 1:13){
lgm[i,]<-tapply(X=anc[[i]][,1],INDEX=lgs,mean)
}
## sds by lg
lgsd<-apply(lgm,2,sd)
autoS<-lgs
autoS[lgs<23]<-1
autoS[lgs==23]<-2
lgSa<-matrix(NA,nrow=13,ncol=2)
for(i in 1:13){lgSa[i,]<-tapply(X=anc[[i]][,1],INDEX=autoS,mean)}
## plot of autosomal vs Z ancestry
plot(lgSa[,1],lgSa[,2])
abline(a=0,b=1)
devs<-lgm
mns<-rep(NA,13)
for(i in 1:13){mns[i]<-mean(anc[[i]][,1])}
for(i in 1:13){devs[i,]<-lgm[i,]-mns[i]}
boxplot(devs,col="gray")
abline(h=0)
Chromosome Z (23) has less L. melissa ancestry than the rest of the genome (mean across populations, 40.6% melissa for Z vs. 68.3% for the 22 autosomes). This holds for all populations individually as well. Additionally Z is less variable in mean ancestry across populations (s.d. Z = 0.040, mean for others = 0.122, 32.3% of the mean and it is the least variable overall). Thus, all populations show a reduction in L. melissa ancestry, but the reduction is greater in more L. melissa-like populations reducing the overall variability across populations.
2b. Overall correlations across LGs
mns<-vector("list",13)
sds<-vector("list",13)
for(i in 1:13){
mns[[i]]<-tapply(X=anc[[i]][,1],INDEX=lgs,mean)
sds[[i]]<-tapply(X=anc[[i]][,1],INDEX=lgs,sd)
}
names(mns)<-pnms
names(sds)<-pnms
## cormatrixes
lgCors<-matrix(NA,nrow=13,ncol=13)
for(i in 1:13){for(j in 1:13){
if(i > j){
lgCors[i,j]<-cor(sds[[i]],sds[[j]])
}
if(i < j){
lgCors[i,j]<-cor(mns[[i]],mns[[j]])
}
}}
library(fields)
pdf("ancLgCors.pdf",width=7,height=6)
bls<-brewer.pal(8,"Blues")
par(mar=c(4,4,1,6))
par(xpd=TRUE)
image(lgCors,breaks=c(-1,0,seq(0.1,0.7,0.1),1),col=c("gray",bls),axes=FALSE)
axis(1,at=seq(0,12,1)/12,pnms,las=2)
axis(2,at=seq(0,12,1)/12,pnms,las=2)
box()
par(xpd=FALSE)
image.plot(lgCors,breaks=c(-1,0,seq(0.1,0.7,0.1),1),col=c("gray",bls),legend.lab="cor. coef. (r)",legend.only=TRUE,add=TRUE)
dev.off()
## correlations for means
summary(lgCors[upper.tri(lgCors)])
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.01006 0.57760 0.75810 0.68680 0.89540 0.97560
## correlations for sds
summary(lgCors[lower.tri(lgCors)])
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.2453 0.3578 0.4514 0.4482 0.5926 0.9210
Overall, LG means and s.d. show positive correlations across populations, with a higher average correlation for means (0.69) than s.d. (0.45). Thus, there is greater consistency in LG level differences than in variability across LGs, though both are non-trivial. MTR and CSP are the greatest outliers, particularly with means, while SOP, LAE, SWM and TIC clearly hold together.
3. Plots/analyses of ancestry along chromosomes
## population level-plots
N<-length(lgs)
bnds<-which(lgs[2:N] != lgs[1:(N-1)])
bndsPlus<-c(1,bnds,length(lgs))
mids<-rep(NA,23)
for(i in 1:23){
mids[i]<-(bndsPlus[i]+bndsPlus[i+1])/2
}
## population plots
for(i in 1:13){
plotanc(pop=anc[[i]][,1],nm=pnms[i],mids,bnds)
}
## population plots for talk
pdf("aims_ancplotMTR-CSP.pdf",width=10,height=8)
par(mfrow=c(2,1))
par(mar=c(5.5,5.5,2,0.5))
for(i in c(1,8)){
plotanc(pop=anc[[i]][,1],nm=pnms[i],mids,bnds)
}
dev.off()
Here are summaries of the number of regions with (nearly) fixed (point estimates of 0.01 or 0.99 and CIs of 0.05 or 0.95) or high frequency ancestry regions (point estimates of 0.05 or 0.95 and CIs of 0.1 or 0.9). These suggest most of the genome is segregating for ancestry.
fixedHi<-matrix(NA,nrow=13,ncol=4)
for(i in 1:13){
fixedHi[i,1]<-sum(anc[[i]][,1] > 0.99 & anc[[1]][,3] > 0.95)
fixedHi[i,2]<-sum(anc[[i]][,1] <0.01 & anc[[1]][,4] < 0.05)
fixedHi[i,3]<-sum(anc[[i]][,1] > 0.95 & anc[[1]][,3] > 0.9)
fixedHi[i,4]<-sum(anc[[i]][,1] <0.05 & anc[[1]][,4] < 0.1)
}
colnames(fixedHi)<-c("fixMel","fixAnna","hiMel","hiAnna")
rownames(fixedHi)<-pnms
fixMel fixAnna hiMel hiAnna
MTR 3 27 6 67
CSP 0 2 0 3
CRP 0 0 9 2
SOP 0 1 3 1
LAE 0 0 10 2
SWM 0 0 1 3
TIC 0 1 3 1
SOF 0 0 10 2
CLH 0 1 3 1
REF 0 0 10 2
EGP 0 2 3 2
BKM 0 2 10 2
STM 0 0 3 2
And lastly (for now), I defined excess ancestry as the top 1% or 5% of SNPs (most extreme melissa or anna ancestry) and tested for overlap across populations. There is a clear signal on the Z.
## excess ancestry, here top (and bottom) 1%
bnds<-c(0.01,0.99)
#bnds<-c(0.05,0.95)
## matrix of excess anna
exAnna<-matrix(NA,nrow=dim(anc[[1]])[1],ncol=13)
for(i in 1:13){
ub<-quantile(anc[[i]][,1],probs=bnds[1])
exAnna[,i]<-as.numeric(anc[[i]][,1] < ub)
}
nullMx<-rep(NA,1000)
null99<-rep(NA,1000)
## perm
for(i in 1:1000){
X<-exAnna
for(j in 1:13){
X[,j]<-X[sample(1:dim(X)[1],dim(X),replace=FALSE),j]
}
cnts<-apply(X,1,sum)
nullMx[i]<-max(cnts)
null99[i]<-quantile(cnts,probs=0.99)
}
## matrix of excess melissa
exMel<-matrix(NA,nrow=dim(anc[[1]])[1],ncol=13)
for(i in 1:13){
ub<-quantile(anc[[i]][,1],probs=bnds[2])
exMel[,i]<-as.numeric(anc[[i]][,1] > ub)
}
nullMxM<-rep(NA,1000)
null99M<-rep(NA,1000)
## perm
for(i in 1:1000){
X<-exMel
for(j in 1:13){
X[,j]<-X[sample(1:dim(X)[1],dim(X),replace=FALSE),j]
}
cnts<-apply(X,1,sum)
nullMxM[i]<-max(cnts)
null99M[i]<-quantile(cnts,probs=0.99)
}
pdf("excessAncestryAims.pdf",width=8,heigh=5)
plot(apply(exAnna,1,sum),type='l',ylim=c(0,13))
abline(h=quantile(nullMx,probs=0.95),col="red")
abline(h=quantile(null99,probs=0.95),col="orange")
plot(apply(exMel,1,sum),type='l',ylim=c(0,13))
abline(h=quantile(nullMxM,probs=0.95),col="red")
abline(h=quantile(null99M,probs=0.95),col="orange")
dev.off()
save(list=ls(),file="aimAnc.Rdat")