Post date: Oct 03, 2013 8:14:24 PM
I finished some initial analyses to quantify genetic differentiation between four pairs of Timema populations (three are parapatric, one is allopatric, and all are on different host plants) from the Timema whole genome resequence data. I experimented with multiple estimators of Fst. The unbiased estimator (WC and an unbiased estimator modified from Hudsdon) give many negative Fst values, whereas Nei's unbiased estimator takes on values between 0 and 2. These are undesirable properties. So instead I am using Hudson's original estimators: Fst = 1 - Hw/Hb = 1 - (p1 * (1-p1) + p2 * (1-p2))/(p1 * (1-p2) + p2 * (1-p1)). Mean (Hudson's) Fst is uniformly low, but about twice as high for the allopatric population pair. There is clear evidence of heterogeneity in Fst, with a large chunk of one linkage group showing elevated Fst for the R12 population pair, and multiple regions with higher (visually) Fst for the allopatric pair (fst plot). I am currently testing for parallelism in divergence by calculating the summed empirical quantile of Fst across population pairs and testing whether this is greater than expected if Fst is randomized among loci (I am calculating the maximum and 99th quantile of the Fst quantile under the null hypothesis). Here is all of the R code:
## hudson fst for each locus (see equation 10 in Bhatia et al. 2013, doi:10.1101/gr.154831.113)
hudsonFst<-function(p1=NA,p2=NA,n1=NA,n2=NA){
numerator<-(p1 - p2)^2 - ((p1 * (1 - p1))/(n1 - 1)) - ((p2 * (1 - p2))/(n2 - 1))
denominator<-p1 * (1 - p2) + p2 * (1 - p1)
fst<-numerator/denominator
out<-cbind(numerator,denominator,fst)
return(out)
}
## nei fst
neiFst<-function(p1=NA,p2=NA,n1=NA,n2=NA){
pu<-(p1 + p2)/2
numerator<-(p1 - p2)^2
denominator<-2 * pu * (1 - pu)
fst<-numerator/denominator
out<-cbind(numerator,denominator,fst)
return(out)
}
## original hudson Fst = 1 - Hw/Hb <<<<<< THIS IS THE ONE I AM USING >>>>>>>>>>
hudsonFst2<-function(p1=NA,p2=NA,n1=NA,n2=NA){
numerator<-p1 * (1 - p1) + p2 * (1 - p2)
denominator<-p1 * (1 - p2) + p2 * (1 - p1)
fst<-1 - numerator/denominator
out<-cbind(numerator,denominator,fst)
return(out)
}
## weir and cockerham estimator based on equation 6 in Bhatia's paper
wcFst<-function(p1=NA,p2=NA,n1=NA,n2=NA){
numerator<-(2 * ((n1 * n2)/(n1 + n2)) * (1/(n1 + n2 -2))) * (n1 * p1 * (1 - p1) + n2 * p2 * (1 -p2))
denominator<-((n1 * n2)/(n1 + n2)) * (p1 - p2)^2 + (2 * ((n1 * n2)/(n1 + n2)) - 1) * (1/(n1 + n2 -2)) * (n1 * p1 * (1 - p1) + n2 * p2 * (1 -p2))
fst<-1-numerator/denominator
out<-cbind(numerator,denominator,fst)
return(out)
}
## calculate sliding window of fst, note fst is the matrix from hudson fst
slidingWindow<-function(scaf=NA,pos=NA,fst=NA,window=5000){
nl<-dim(fst)[1]
multifst<-numeric(nl)
cnt<-numeric(nl)
for (i in 1:nl){
focalS<-scaf[i]
focalP<-pos[i]
flag = TRUE
num<-fst[i,1]
den<-fst[i,2]
n<-1
l<-i+1
while (flag == TRUE & l <= nl){
if (scaf[l] == focalS & (pos[l] - focalP) <= window){
if (is.na(fst[l,1]) == FALSE & is.na(fst[l,2]) == FALSE){
num<-num + fst[l,1]
den<-den + fst[l,2]
n<-n+1
l<-l+1
}
}
else {
flag = FALSE
}
}
multifst[i]<-1 - num/den
cnt[i]<-n
}
return(cbind(multifst,cnt))
}
G<-matrix(scan("../variant/pntest_wildwgVarsA.txt",n=160 * 4391556,sep=" "),nrow=4391556,ncol=160,byrow=TRUE)
pops<-read.table("pops.txt",header=F)
plist<-unique(as.character(pops[,1]))
pops<-as.character(pops[,1])
nloc<-dim(G)[1]
npop<-length(plist)
locinfo<-read.table("locids.txt",header=F)
## population allele frequencies
af<-matrix(NA,nrow=nloc,ncol=npop)
for (k in 1:npop){
af[,k]<-apply(G[,which(pops==plist[k])],1,mean)/2
}
N<-table(pops)
## fst for each snp
fstHVAxHVChu<-hudsonFst2(p1=af[,1],p2=af[,2],n1=N[1],n2=N[2])
fstMR1AxMR1Chu<-hudsonFst2(p1=af[,4],p2=af[,5],n1=N[4],n2=N[5])
fstR12AxR12Chu<-hudsonFst2(p1=af[,7],p2=af[,8],n1=N[7],n2=N[8])
fstLAxPRChu<-hudsonFst2(p1=af[,3],p2=af[,6],n1=N[3],n2=N[6])
fstHVAxHVCwc<-wcFst(p1=af[,1],p2=af[,2],n1=N[1],n2=N[2])
fstMR1AxMR1Cwc<-wcFst(p1=af[,4],p2=af[,5],n1=N[4],n2=N[5])
fstR12AxR12Cwc<-wcFst(p1=af[,7],p2=af[,8],n1=N[7],n2=N[8])
fstLAxPRCwc<-wcFst(p1=af[,3],p2=af[,6],n1=N[3],n2=N[6])
fstHVAxHVCnei<-neiFst(p1=af[,1],p2=af[,2],n1=N[1],n2=N[2])
fstMR1AxMR1Cnei<-neiFst(p1=af[,4],p2=af[,5],n1=N[4],n2=N[5])
fstR12AxR12Cnei<-neiFst(p1=af[,7],p2=af[,8],n1=N[7],n2=N[8])
fstLAxPRCnei<-neiFst(p1=af[,3],p2=af[,6],n1=N[3],n2=N[6])
## slding window multi-locus fst
mlfstHV<-slidingWindow(scaf=locinfo[,3],pos=locinfo[,4],fst=fstHVAxHVChu,window=1000)
mlfstMR1<-slidingWindow(scaf=locinfo[,3],pos=locinfo[,4],fst=fstMR1AxMR1Chu,window=1000)
mlfstR12<-slidingWindow(scaf=locinfo[,3],pos=locinfo[,4],fst=fstR12AxR12Chu,window=1000)
mlfstAllopatric<-slidingWindow(scaf=locinfo[,3],pos=locinfo[,4],fst=fstLAxPRChu,window=1000)
## candidate snps
#lateral hue scaffold_1067 217427
#dorsal scaffold_3448 47421
#dorsal scaffold_2100 34576
#dorsal scaffold_3448 26532
#dorsal scaffold_6255 2808
#traitSnps<-cbind(c(1067,3448,2100,3448,6255),c(217427,47421,34576,26532,2808))
#trSnpLoc<-numeric(5)
#for (i in 1:5){
# A<-which(locinfo[,3] == traitSnps[i,1])
# trSnpLoc[i]<-which(min(abs(locinfo[A,4]-traitSnps[i,2])) == abs(locinfo[A,4]-traitSnps[i,2]))
#}
# constant selection, sig
#4313, 10750, beta0 -0.937125
#4313, 10784, beta0 -0.947682
#4313, 10799, beta0 -0.942799
#biggest distance effect:
#1683, 119435, beta = -0.216152
#biggest plant effect:
#8157, 2183, beta -0.09554
#biggest change
#scaffold 1782, position 2962
## ONLY TWO OF THE ABOVE OR ON LG's
A<-which(locinfo[,3] == 1683)
M<-min(abs(locinfo[A,4]-119435)) ## check sign
dsnp<-which(locinfo[,3]==1683 & abs(locinfo[,4]-119435) == M)
A<-which(locinfo[,3] == 1067)
M<-min(abs(locinfo[A,4]-217427)) ## check sign
lhsnp<-which(locinfo[,3]==1067 & abs(locinfo[,4]-217427) == M)
## plot of point estimates
png("~/labs/evolution/projects/timema_wgwild/fst/fsthu.png",width=36,height=12,units="in",res=480)
par(mar=c(4.5,4.5,2,0.5))
par(mfrow=c(4,1))
bnds<-which(as.character(locinfo[2:nloc,1]) != as.character(locinfo[1:(nloc-1),1]))
plot(fstHVAxHVChu[,3],xlab="snp number",ylab=expression(paste(F[ST])),main="HVA x HVC",ylim=c(0,.8))
B<-quantile(fstHVAxHVChu[,3],probs=c(0.999,0.9999),na.rm=TRUE)
abline(h=B,col="red",lty=2)
points(c(dsnp,lhsnp),c(0.77,0.77),pch=c(15,16),col=c("darkblue","forestgreen"),cex=1.2)
abline(v=bnds,lty=2,lwd=2,col="gray")
plot(fstMR1AxMR1Chu[,3],xlab="snp number",ylab=expression(paste(F[ST])),main="MRC x MRC",ylim=c(0,.8))
B<-quantile(fstMR1AxMR1Chu[,3],probs=c(0.999,0.9999),na.rm=TRUE)
abline(h=B,col="red",lty=2)
points(c(dsnp,lhsnp),c(0.77,0.77),pch=c(15,16),col=c("darkblue","forestgreen"),cex=1.2)
abline(v=bnds,lty=2,lwd=2,col="gray")
plot(fstR12AxR12Chu[,3],xlab="snp number",ylab=expression(paste(F[ST])),main="R12A x R12C",ylim=c(0,.8))
B<-quantile(fstR12AxR12Chu[,3],probs=c(0.999,0.9999),na.rm=TRUE)
abline(h=B,col="red",lty=2)
points(c(dsnp,lhsnp),c(0.77,0.77),pch=c(15,16),col=c("darkblue","forestgreen"),cex=1.2)
abline(v=bnds,lty=2,lwd=2,col="gray")
plot(fstLAxPRChu[,3],xlab="snp number",ylab=expression(paste(F[ST])),main="LA x PRC",ylim=c(0,.8))
points(c(dsnp,lhsnp),c(0.77,0.77),pch=c(15,16),col=c("darkblue","forestgreen"),cex=1.2)
B<-quantile(fstLAxPRChu[,3],probs=c(0.999,0.9999),na.rm=TRUE)
abline(h=B,col="red",lty=2)
abline(v=bnds,lty=2,lwd=2,col="gray")
dev.off()
## zoomed plot
pdf("~/labs/evolution/projects/timema_wgwild/fst/fstMR.pdf",width=12,height=6)
plot(X,fstMR1AxMR1Chu[X,3],xlab="snp number",ylab=expression(paste(F[ST])),main="MR1A x MR1C lg 10",ylim=c(0,.8))
points(dsnp,0.77,pch=15,col="darkblue")
dev.off()
## statistics
summary(fstHVAxHVChu[,3])
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 0.0000 0.0014 0.0061 0.0132 0.0176 0.3576 1011
summary(fstMR1AxMR1Chu[,3])
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#0.00000 0.00152 0.00684 0.01514 0.01997 0.46140 83
summary(fstR12AxR12Chu[,3])
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#0.00000 0.00167 0.00739 0.01522 0.02086 0.39750 173
summary(fstLAxPRChu[,3])
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 0.0000 0.0033 0.0146 0.0313 0.0418 0.7955 317
quantile(fstHVAxHVChu[,3],probs=c(0.95,0.99,0.999,0.9999),na.rm=TRUE)
# 95% 99% 99.9% 99.99%
# 0.05008487 0.08640532 0.14204231 0.20142633
quantile(fstMR1AxMR1Chu[,3],probs=c(0.95,0.99,0.999,0.9999),na.rm=TRUE)
# 95% 99% 99.9% 99.99%
# 0.0580757 0.1010189 0.1723164 0.2497428
quantile(fstR12AxR12Chu[,3],probs=c(0.95,0.99,0.999,0.9999),na.rm=TRUE)
# 95% 99% 99.9% 99.99%
# 0.05632994 0.09403784 0.15467574 0.22214953
quantile(fstLAxPRChu[,3],probs=c(0.95,0.99,0.999,0.9999),na.rm=TRUE)
# 95% 99% 99.9% 99.99%
# 0.1161079 0.2079093 0.3736882 0.5456160
lgmnHVAxHVChu<-tapply(X=fstHVAxHVChu[,3],INDEX=locinfo[,1],mean,na.rm=TRUE)
lgmnMR1xMR1Chu<-tapply(X=fstMR1AxMR1Chu[,3],INDEX=locinfo[,1],mean,na.rm=TRUE)
lgmnR12AxR12Chu<-tapply(X=fstR12AxR12Chu[,3],INDEX=locinfo[,1],mean,na.rm=TRUE)
lgmnLAxPRChu<-tapply(X=fstLAxPRChu[,3],INDEX=locinfo[,1],mean,na.rm=TRUE)
lgmn<-rbind(lgmnHVAxHVChu,lgmnMR1xMR1Chu,lgmnR12AxR12Chu,lgmnLAxPRChu)
## quantiles
fstall<-cbind(fstHVAxHVChu[,3],fstMR1AxMR1Chu[,3],fstR12AxR12Chu[,3],fstLAxPRChu[,3])
quant<-fstall
for(i in 1:4){
quant[order(fstall[,i]),i]<-seq(0,1,1/(dim(fstall)[1]-1))
}
qsum<-apply(quant,1,sum)
nrep<-1000
maxq<-numeric(nrep)
q99q<-numeric(nrep)
for(n in 1:nrep){
qrep<-quant
for(j in 1:4){
qrep[,j]<-qrep[sample(1:dim(qrep)[1],dim(qrep)[1],replace=FALSE),j]
}
qrepsum<-apply(qrep,1,sum)
maxq[n]<-max(qrepsum)
q99q[n]<-quantile(qrepsum,probs=0.99)
}