timema whole genome Fst

Post date: Oct 03, 2013 8:14:24 PM

I finished some initial analyses to quantify genetic differentiation between four pairs of Timema populations (three are parapatric, one is allopatric, and all are on different host plants) from the Timema whole genome resequence data. I experimented with multiple estimators of Fst. The unbiased estimator (WC and an unbiased estimator modified from Hudsdon) give many negative Fst values, whereas Nei's unbiased estimator takes on values between 0 and 2. These are undesirable properties. So instead I am using Hudson's original estimators: Fst = 1 - Hw/Hb = 1 - (p1 * (1-p1) + p2 * (1-p2))/(p1 * (1-p2) + p2 * (1-p1)). Mean (Hudson's) Fst is uniformly low, but about twice as high for the allopatric population pair. There is clear evidence of heterogeneity in Fst, with a large chunk of one linkage group showing elevated Fst for the R12 population pair, and multiple regions with higher (visually) Fst for the allopatric pair (fst plot). I am currently testing for parallelism in divergence by calculating the summed empirical quantile of Fst across population pairs and testing whether this is greater than expected if Fst is randomized among loci (I am calculating the maximum and 99th quantile of the Fst quantile under the null hypothesis). Here is all of the R code:

## hudson fst for each locus (see equation 10 in Bhatia et al. 2013, doi:10.1101/gr.154831.113)

hudsonFst<-function(p1=NA,p2=NA,n1=NA,n2=NA){

numerator<-(p1 - p2)^2 - ((p1 * (1 - p1))/(n1 - 1)) - ((p2 * (1 - p2))/(n2 - 1))

denominator<-p1 * (1 - p2) + p2 * (1 - p1)

fst<-numerator/denominator

out<-cbind(numerator,denominator,fst)

return(out)

}

## nei fst

neiFst<-function(p1=NA,p2=NA,n1=NA,n2=NA){

pu<-(p1 + p2)/2

numerator<-(p1 - p2)^2

denominator<-2 * pu * (1 - pu)

fst<-numerator/denominator

out<-cbind(numerator,denominator,fst)

return(out)

}

## original hudson Fst = 1 - Hw/Hb <<<<<< THIS IS THE ONE I AM USING >>>>>>>>>>

hudsonFst2<-function(p1=NA,p2=NA,n1=NA,n2=NA){

numerator<-p1 * (1 - p1) + p2 * (1 - p2)

denominator<-p1 * (1 - p2) + p2 * (1 - p1)

fst<-1 - numerator/denominator

out<-cbind(numerator,denominator,fst)

return(out)

}

## weir and cockerham estimator based on equation 6 in Bhatia's paper

wcFst<-function(p1=NA,p2=NA,n1=NA,n2=NA){

numerator<-(2 * ((n1 * n2)/(n1 + n2)) * (1/(n1 + n2 -2))) * (n1 * p1 * (1 - p1) + n2 * p2 * (1 -p2))

denominator<-((n1 * n2)/(n1 + n2)) * (p1 - p2)^2 + (2 * ((n1 * n2)/(n1 + n2)) - 1) * (1/(n1 + n2 -2)) * (n1 * p1 * (1 - p1) + n2 * p2 * (1 -p2))

fst<-1-numerator/denominator

out<-cbind(numerator,denominator,fst)

return(out)

}

## calculate sliding window of fst, note fst is the matrix from hudson fst

slidingWindow<-function(scaf=NA,pos=NA,fst=NA,window=5000){

nl<-dim(fst)[1]

multifst<-numeric(nl)

cnt<-numeric(nl)

for (i in 1:nl){

focalS<-scaf[i]

focalP<-pos[i]

flag = TRUE

num<-fst[i,1]

den<-fst[i,2]

n<-1

l<-i+1

while (flag == TRUE & l <= nl){

if (scaf[l] == focalS & (pos[l] - focalP) <= window){

if (is.na(fst[l,1]) == FALSE & is.na(fst[l,2]) == FALSE){

num<-num + fst[l,1]

den<-den + fst[l,2]

n<-n+1

l<-l+1

}

else {

flag = FALSE

}

multifst[i]<-1 - num/den

cnt[i]<-n

}

return(cbind(multifst,cnt))

}

G<-matrix(scan("../variant/pntest_wildwgVarsA.txt",n=160 * 4391556,sep=" "),nrow=4391556,ncol=160,byrow=TRUE)

pops<-read.table("pops.txt",header=F)

plist<-unique(as.character(pops[,1]))

pops<-as.character(pops[,1])

nloc<-dim(G)[1]

npop<-length(plist)

locinfo<-read.table("locids.txt",header=F)

## population allele frequencies

af<-matrix(NA,nrow=nloc,ncol=npop)

for (k in 1:npop){

af[,k]<-apply(G[,which(pops==plist[k])],1,mean)/2

}

N<-table(pops)

## fst for each snp

fstHVAxHVChu<-hudsonFst2(p1=af[,1],p2=af[,2],n1=N[1],n2=N[2])

fstMR1AxMR1Chu<-hudsonFst2(p1=af[,4],p2=af[,5],n1=N[4],n2=N[5])

fstR12AxR12Chu<-hudsonFst2(p1=af[,7],p2=af[,8],n1=N[7],n2=N[8])

fstLAxPRChu<-hudsonFst2(p1=af[,3],p2=af[,6],n1=N[3],n2=N[6])

fstHVAxHVCwc<-wcFst(p1=af[,1],p2=af[,2],n1=N[1],n2=N[2])

fstMR1AxMR1Cwc<-wcFst(p1=af[,4],p2=af[,5],n1=N[4],n2=N[5])

fstR12AxR12Cwc<-wcFst(p1=af[,7],p2=af[,8],n1=N[7],n2=N[8])

fstLAxPRCwc<-wcFst(p1=af[,3],p2=af[,6],n1=N[3],n2=N[6])

fstHVAxHVCnei<-neiFst(p1=af[,1],p2=af[,2],n1=N[1],n2=N[2])

fstMR1AxMR1Cnei<-neiFst(p1=af[,4],p2=af[,5],n1=N[4],n2=N[5])

fstR12AxR12Cnei<-neiFst(p1=af[,7],p2=af[,8],n1=N[7],n2=N[8])

fstLAxPRCnei<-neiFst(p1=af[,3],p2=af[,6],n1=N[3],n2=N[6])

## slding window multi-locus fst

mlfstHV<-slidingWindow(scaf=locinfo[,3],pos=locinfo[,4],fst=fstHVAxHVChu,window=1000)

mlfstMR1<-slidingWindow(scaf=locinfo[,3],pos=locinfo[,4],fst=fstMR1AxMR1Chu,window=1000)

mlfstR12<-slidingWindow(scaf=locinfo[,3],pos=locinfo[,4],fst=fstR12AxR12Chu,window=1000)

mlfstAllopatric<-slidingWindow(scaf=locinfo[,3],pos=locinfo[,4],fst=fstLAxPRChu,window=1000)

## candidate snps

#lateral hue scaffold_1067 217427

#dorsal scaffold_3448 47421

#dorsal scaffold_2100 34576

#dorsal scaffold_3448 26532

#dorsal scaffold_6255 2808

#traitSnps<-cbind(c(1067,3448,2100,3448,6255),c(217427,47421,34576,26532,2808))

#trSnpLoc<-numeric(5)

#for (i in 1:5){

# A<-which(locinfo[,3] == traitSnps[i,1])

# trSnpLoc[i]<-which(min(abs(locinfo[A,4]-traitSnps[i,2])) == abs(locinfo[A,4]-traitSnps[i,2]))

# constant selection, sig

#4313, 10750, beta0 -0.937125

#4313, 10784, beta0 -0.947682

#4313, 10799, beta0 -0.942799

#biggest distance effect:

#1683, 119435, beta = -0.216152

#biggest plant effect:

#8157, 2183, beta -0.09554

#biggest change

#scaffold 1782, position 2962

## ONLY TWO OF THE ABOVE OR ON LG's

A<-which(locinfo[,3] == 1683)

M<-min(abs(locinfo[A,4]-119435)) ## check sign

dsnp<-which(locinfo[,3]==1683 & abs(locinfo[,4]-119435) == M)

A<-which(locinfo[,3] == 1067)

M<-min(abs(locinfo[A,4]-217427)) ## check sign

lhsnp<-which(locinfo[,3]==1067 & abs(locinfo[,4]-217427) == M)

## plot of point estimates

png("~/labs/evolution/projects/timema_wgwild/fst/fsthu.png",width=36,height=12,units="in",res=480)

par(mar=c(4.5,4.5,2,0.5))

par(mfrow=c(4,1))

bnds<-which(as.character(locinfo[2:nloc,1]) != as.character(locinfo[1:(nloc-1),1]))

plot(fstHVAxHVChu[,3],xlab="snp number",ylab=expression(paste(F[ST])),main="HVA x HVC",ylim=c(0,.8))

B<-quantile(fstHVAxHVChu[,3],probs=c(0.999,0.9999),na.rm=TRUE)

abline(h=B,col="red",lty=2)

points(c(dsnp,lhsnp),c(0.77,0.77),pch=c(15,16),col=c("darkblue","forestgreen"),cex=1.2)

abline(v=bnds,lty=2,lwd=2,col="gray")

plot(fstMR1AxMR1Chu[,3],xlab="snp number",ylab=expression(paste(F[ST])),main="MRC x MRC",ylim=c(0,.8))

B<-quantile(fstMR1AxMR1Chu[,3],probs=c(0.999,0.9999),na.rm=TRUE)

abline(h=B,col="red",lty=2)

points(c(dsnp,lhsnp),c(0.77,0.77),pch=c(15,16),col=c("darkblue","forestgreen"),cex=1.2)

abline(v=bnds,lty=2,lwd=2,col="gray")

plot(fstR12AxR12Chu[,3],xlab="snp number",ylab=expression(paste(F[ST])),main="R12A x R12C",ylim=c(0,.8))

B<-quantile(fstR12AxR12Chu[,3],probs=c(0.999,0.9999),na.rm=TRUE)

abline(h=B,col="red",lty=2)

points(c(dsnp,lhsnp),c(0.77,0.77),pch=c(15,16),col=c("darkblue","forestgreen"),cex=1.2)

abline(v=bnds,lty=2,lwd=2,col="gray")

plot(fstLAxPRChu[,3],xlab="snp number",ylab=expression(paste(F[ST])),main="LA x PRC",ylim=c(0,.8))

points(c(dsnp,lhsnp),c(0.77,0.77),pch=c(15,16),col=c("darkblue","forestgreen"),cex=1.2)

B<-quantile(fstLAxPRChu[,3],probs=c(0.999,0.9999),na.rm=TRUE)

abline(h=B,col="red",lty=2)

abline(v=bnds,lty=2,lwd=2,col="gray")

dev.off()

## zoomed plot

pdf("~/labs/evolution/projects/timema_wgwild/fst/fstMR.pdf",width=12,height=6)

plot(X,fstMR1AxMR1Chu[X,3],xlab="snp number",ylab=expression(paste(F[ST])),main="MR1A x MR1C lg 10",ylim=c(0,.8))

points(dsnp,0.77,pch=15,col="darkblue")

dev.off()

## statistics

summary(fstHVAxHVChu[,3])

# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's

# 0.0000 0.0014 0.0061 0.0132 0.0176 0.3576 1011

summary(fstMR1AxMR1Chu[,3])

# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's

#0.00000 0.00152 0.00684 0.01514 0.01997 0.46140 83

summary(fstR12AxR12Chu[,3])

# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's

#0.00000 0.00167 0.00739 0.01522 0.02086 0.39750 173

summary(fstLAxPRChu[,3])

# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's

# 0.0000 0.0033 0.0146 0.0313 0.0418 0.7955 317

quantile(fstHVAxHVChu[,3],probs=c(0.95,0.99,0.999,0.9999),na.rm=TRUE)

# 95% 99% 99.9% 99.99%

# 0.05008487 0.08640532 0.14204231 0.20142633

quantile(fstMR1AxMR1Chu[,3],probs=c(0.95,0.99,0.999,0.9999),na.rm=TRUE)

# 95% 99% 99.9% 99.99%

# 0.0580757 0.1010189 0.1723164 0.2497428

quantile(fstR12AxR12Chu[,3],probs=c(0.95,0.99,0.999,0.9999),na.rm=TRUE)

# 95% 99% 99.9% 99.99%

# 0.05632994 0.09403784 0.15467574 0.22214953

quantile(fstLAxPRChu[,3],probs=c(0.95,0.99,0.999,0.9999),na.rm=TRUE)

# 95% 99% 99.9% 99.99%

# 0.1161079 0.2079093 0.3736882 0.5456160

lgmnHVAxHVChu<-tapply(X=fstHVAxHVChu[,3],INDEX=locinfo[,1],mean,na.rm=TRUE)

lgmnMR1xMR1Chu<-tapply(X=fstMR1AxMR1Chu[,3],INDEX=locinfo[,1],mean,na.rm=TRUE)

lgmnR12AxR12Chu<-tapply(X=fstR12AxR12Chu[,3],INDEX=locinfo[,1],mean,na.rm=TRUE)

lgmnLAxPRChu<-tapply(X=fstLAxPRChu[,3],INDEX=locinfo[,1],mean,na.rm=TRUE)

lgmn<-rbind(lgmnHVAxHVChu,lgmnMR1xMR1Chu,lgmnR12AxR12Chu,lgmnLAxPRChu)

## quantiles

fstall<-cbind(fstHVAxHVChu[,3],fstMR1AxMR1Chu[,3],fstR12AxR12Chu[,3],fstLAxPRChu[,3])

quant<-fstall

for(i in 1:4){

quant[order(fstall[,i]),i]<-seq(0,1,1/(dim(fstall)[1]-1))

}

qsum<-apply(quant,1,sum)

nrep<-1000

maxq<-numeric(nrep)

q99q<-numeric(nrep)

for(n in 1:nrep){

qrep<-quant

for(j in 1:4){

qrep[,j]<-qrep[sample(1:dim(qrep)[1],dim(qrep)[1],replace=FALSE),j]

}

qrepsum<-apply(qrep,1,sum)

maxq[n]<-max(qrepsum)

q99q[n]<-quantile(qrepsum,probs=0.99)

}

Page updated

Google Sites

Report abuse