IBD/IBA for natural L. melissa populations

Post date: Mar 26, 2017 5:55:42 PM

I tested for genetic isolation by geographic distance and host for the 25 L. melissa populations (this excludes ABM).

I used Sam's ML allele frequency estimates as a starting point. These were based on my original variants file (see discussion here), and are thus the same starting point for my entropy/pca analyses of this data. This includes 206,028 SNPs.

These were copied (eventually, initially done on my desktop) to king:/uufs/chpc.utah.edu/common/home/u6000989/projects/lmelissa_hostAdaptation/lmelIBA (freqs sub-directory). I then created a single combined allele frequency file with populations (rows) in alphabetical order (lmelFreqs.txt):

perl makeFreqFile.pl p_*txt

The IBD/IBA analysis was conucuted in R/rjags and fits the Bayesian linear model with population effect for distance matrixes I have used in the past. The code is pasted below. In summary, I first only retained the subset of 26,330 common (maf > 5%) SNPs. I used Hudson's Fst for genetic distance, and the logit of Fst for the analysis.

Host distance was treated as a binary (alfalfa or not as hosts, and 0 or 1 for same vs. different), which as then centered. Geographic distance was calculated with rdist.earth as the great circle distance (in km). This was centered.

For MCMC I fit the model with 3 chains, 30,000 iterations + 10,000 burnin, and a thinning interval of 5. Diagnostics looked great: ESS > 16,000 for all, and Gelman-Rubin = 1.00 (point estimate and CIs). The estimates are below (beta[1] = host, beta[2] = geo and beta0 = intercept). There is clear IBD and no evidence of IBA.

2.5% 25% 50% 75% 97.5%

beta[1] -0.07747 -0.03807 -0.01738 0.003403 0.04228

beta[2] 0.39186 0.42155 0.43747 0.452798 0.48301

beta0 -2.56462 -2.54522 -2.53496 -2.525029 -2.50572

taua 2.93607 4.50076 5.47019 6.592635 9.01333

taue 12.58692 14.09272 14.94700 15.808209 17.52748

Here is the code and a plot:

## R function to fit a Bayesian linear model with population effect for distance matrixes

## similar to clarke et al. 2002

## variable definitions

## Y = genetic distance matrix (dist)

## X = predictor distance matrixes (list of dist)

## n = number of populations or matrix rows (integer)

## p = number of predictor matrixes

## nmcmc = number of mcmc iterations (integer)

## burnin = number of initial mcmc iterations to discard (integer)

## mixed model regression MCMC function

mmrMcmc<-function(Y=NA,X1=NULL,X2=NULL,npop=NA,nmcmc=5000,burnin=1000,thin=5,nchain=2){

library(rjags)

load.module("dic")

n<-(npop*(npop-1))/2

# Y<-Y-mean(Y)

if (is.null(X1)==FALSE){

X1<-X1-mean(X1)

# X1<-X1/sd(X1)

}

if (is.null(X2)==FALSE){

X2<-X2-mean(X2)

#X2<-X2/sd(X2)

}

## create random effect

R1<-numeric(n)

R2<-numeric(n)

k<-1

for(i in 1:(npop-1)){

for(j in (i+1):npop){

R1[k]<-i

R2[k]<-j

k<-k+1

}

## specifies the linear model for jags, single path or differences

jagsModel="

model {

## normal likelihood and linear model: mu = beta0 + beta X + alpha_i + alpha_j

for (i in 1:n) {

Y[i] ~ dnorm(mu[i],taue)

mu[i]<-beta0 + inprod(X[i,],beta) + alpha[pop1[i]] + alpha[pop2[i]]

}

## covariate priors

for (k in 1:p) {

beta[k] ~ dnorm(0,0.001)

}

## intercept prior

beta0 ~ dnorm(0,0.001)

## population effect priors

for (j in 1:(npop-1)){

alpha[j] ~ dnorm(0, taua)

}

alpha[npop]<- -1 * sum(alpha[1:(npop-1)])

## residual priors

taua ~ dgamma(1,0.01)

taue ~ dgamma(1,0.01)

}

"

## format data

if (is.null(X1)){

X<-as.matrix(as.vector(X2))

p<-1

}

else if (is.null(X2)){

X<-as.matrix(as.vector(X1))

p<-1

}

else{

X<-as.matrix(cbind(as.vector(X1),as.vector(X2)))

p<-2

}

data<-list(Y=as.vector(Y),X=X,n=n,p=p,npop=npop,pop1=R1,pop2=R2)

## model initiatlization

model<-jags.model(textConnection(jagsModel),data=data,n.chains=nchain,n.adapt=0) ## add back init

update(model,n.iter=burnin)

## mcmc estimation of model parameters

mcmcSamples<-coda.samples(model=model,variable.names=c("beta0","beta","taue","taua"), n.iter=nmcmc,thin=thin)

#dicSams<-dic.samples(model=model,n.iter=nmcmc,thin=thin,type="pD")

return(list(param=mcmcSamples,NA))

#return(list(param=mcmcSamples,dic=dicSams))

}

## read in data

pops<-25

snps<-206028

p<-matrix(scan("lmelFreqs.txt",n=pops*snps,sep=" "),nrow=pops,ncol=snps,byrow=TRUE)

pbar<-apply(p,2,mean)

common<-which(pbar > 0.05 & pbar < 0.95)

pc<-p[,common] ## 26,330 common SNPs

dat<-read.table("hosts-geo.txt",header=T)

## calculate fst; hudson fst = 1 - hw/hb

fst<-matrix(NA,nrow=pops,ncol=pops)

qc<-1-pc

het<-2 * pc * qc

for(i in 1:(pops-1)){

for(j in (i+1):pops){

hw<-mean((het[i,]+het[j,])/2)

pbar<-(pc[i,]+pc[j,])/2

hb<-mean(2 * pbar * (1-pbar))

fst[i,j]<-1-hw/hb

fst[j,i]<-fst[i,j]

}

y<-log(fst/(1-fst))

y<-as.dist(y)

## host

X1<-dist(as.numeric(dat$host))

X1[X1 > 0]<-1

X1<-dist(dat$host=='medicago')

## geo

library fields

X2<-rdist.earth(as.matrix(dat[,c(4,3)]),miles=FALSE)

X2<-as.dist(log(X2))

out<-mmrMcmc(Y=as.vector(y),X1=as.vector(X1),X2=as.vector(X2),npop=pops,nmcmc=30000,burnin=10000,thin=5,nchain=3)

summary(out[[1]])

#Iterations = 10005:40000

#Thinning interval = 5

#Number of chains = 3

#Sample size per chain = 6000

#1. Empirical mean and standard deviation for each variable,

# plus standard error of the mean:

# Mean SD Naive SE Time-series SE

#beta[1] -0.01732 0.03056 0.0002278 0.0002231

#beta[2] 0.43730 0.02332 0.0001738 0.0001845

#beta0 -2.53508 0.01501 0.0001119 0.0001126

#taua 5.61333 1.56541 0.0116679 0.0116677

#taue 14.97438 1.27082 0.0094721 0.0094738

#2. Quantiles for each variable:

# 2.5% 25% 50% 75% 97.5%

#beta[1] -0.07747 -0.03807 -0.01738 0.003403 0.04228

#beta[2] 0.39186 0.42155 0.43747 0.452798 0.48301

#beta0 -2.56462 -2.54522 -2.53496 -2.525029 -2.50572

#taua 2.93607 4.50076 5.47019 6.592635 9.01333

#taue 12.58692 14.09272 14.94700 15.808209 17.52748

## ess > 16,000, gelman rubin = 1.00

## plot

pdf("ibd.pdf",width=5,height=5)

par(mar=c(5,5,0.5,0.5))

plot(X2,y,xlab="log distance (km)",ylab=expression(paste("logit ",F[ST])),cex.lab=1.4,cex.axis=1.2,col=c("darkgray","orange")[X1+1],pch=19)

abline(a=-2.5-mean(X2)*0.44,b=0.44)

legend(2.9,-1.3,c("same host","different host"),pch=19,col=c("darkgray","orange"))

dev.off()