Post date: Jan 31, 2018 10:28:59 PM
Considering the three species we have the best data for-T. cristinae, T. chumash, and T. bartmani-we wanted to know whether those species with the simplest (in terms of structural variation) had the least distinct color pattern morphs and the most similarity between stem and leaf colors of their hosts. Thus, overlap (for morphs and hosts) should be T. chumash > T. bartmani > T. cristinae.
We used the RG and GB data (lateral for bugs) from Clarissa in 2015Timema_phenotype_gwas_radiation.csv and 2015host_colour.data.csv.
We assigned hosts based on Patrik's natural history knowledge:
- T. bartmani = WF and WP
- T. chumash = Q and MM
- T. cristinae = A and C
Sample sizes were as follows:
- T. bartman = 150
- T. chumash = 541
- T. cristinae = 190
- WF = 101, WP = 87, total = 188
- Q = 287, MM = 98, total = 385
- A = 102, C = 86, total = 188
We used the UPGMA algorithm in hclust (from R 3.2.3) to cluster Timema into two groups based on the RG and GB color measurements. Host plants were categorized as stems (which tend to be browner) or leaves (which tend to be greener). We then used a Bayesian approach to fit the color pattern data for each group (Timema cluster or host plant sample type) to a bivariate normal distribution. We placed relatively uninformative priors on the mean vectors (normal with mu = 0 and tau = 1e-3 for both means) and for the precision matrix (Wishart with with 2 degrees of freedom and a diagonal scale matrix = 0.001 I, where I is the identity matrix). We used MCMC to obtain samples from the posterior distribution via the rjags (version 4.6) interface with JAGS (version 4.1.0) (1000 iteration burnin, 5000 sampling iterations and a thinning interval of 4). We then estimated the Kullback Leibler distances (that is, the Kullback-Leibler divergence in both directions, e.g., from stems to leaves and leaves to stems). This was calculated over the posterior distribution of the bivariate normal parameters, and thus accounts for uncertainty in these parameters.
Posterior means (point estimates) of the Kullback Leibler distance between host stem and leave colors were 10.1 for T. chumash hosts, 21.8 for T. bartmani hosts, and 39.3 for T. cristinae hosts, with a posterior probability > 0.99 that the distance was greater for T. bartmani than T. chumash, and of 0.97 that T. cristinae was greater than T. bartmani (the pp that T. cristinae had a greater distance than T. chumash was ~1.0). Similarly, posterior means (point estimates) of the Kullback Leibler distance between color morphs of Timema were T. chumash = 14.0, T. bartmani = 17.4, and T. cristinae = 29.2, with a posterior probability 0.87 that the distance was greater for T. bartmani than T. chumash, and of 0.98 that T. cristinae was greater than T. bartmani (the pp that T. cristinae had a greater distance than T. chumash was ~1.0).
Here is the code (from overlap2d.R). This was run on my computer, but is now on kingspeak at /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_genarch/color_overlap/.
library(ggplot2)
library(dplyr)
library(mclust)
library(NbClust)
timema <- read.csv("2015Timema_phenotype_gwas_radiation.csv", header=TRUE)
host <- read.csv("2015host_colour.data.csv", header=TRUE)
#=====================
# Clustering analyses
#=====================
# This is the function I used to obtain the best number of clusters using
# hierarchical clustering using UPGMA
upgma <- function(data)
{ set.seed(123)
data_scaled <- scale(data)
d <- (dist(data_scaled, method = "euclidean"))^2
nb <- NbClust(data_scaled, distance=NULL, diss = d, min.nc = 1,
max.nc = 10, method = "average", index ="silhouette")
k <- 2 ## ONLY CONSIDER K = 2, best for all but podura anyway, and podura
## has a very low sample size of N = 20
#k <- nb$Best.nc[1]
upgma <- hclust(d, method = "average")
plot <- plot(upgma, cex = 0.6, hang = -1)
if(k > 1){
rect <- rect.hclust(upgma, k = k, border = 2:5)
grp <- cutree(upgma, k = k)
table <- table(grp)
table <- as.data.frame(table)
} else {rect = NULL; table=NULL}
result <- list("dend"=plot, "rect"=rect, "table"=table, "best.nc" = nb$Best.nc)
return(result)
}
# cluster criteria: RG
cluster <- function(data){
group <- upgma(data)
grp1 <- as.data.frame(group$rect[1])
t1 <- cbind(grp1, rep(0,length(grp1)))
names(t1) <- c("row", "col")
grp2 <- as.data.frame(group$rect[2])
t2 <- cbind(grp2, rep(1,length(grp2)))
names(t2) <- c("row", "col")
matrix <- rbind(t1, t2)
attach(matrix)
matrix <- matrix[order(row),]
result <- matrix$col
return(result)
}
## subset of species we want to work with
## T. bart, T. cali., T chumash, T. crist, T land, T. podura
spec<-c("T. bartmani","T. chumash","T. cristinae")
tclust<-vector("list",3)
for(i in 1:3){
x<-which(as.character(timema$species) == spec[i])
tclust[[i]]<-cluster(timema[x,6:7])
}
## visualize
for(i in 1:3){
x<-which(as.character(timema$species) == spec[i])
plot(timema$latRG[x], timema$latGB[x], pch=19, col=as.factor(tclust[[i]]), xlab = "RG", ylab="GB", main= spec[i])
}
## KL div
kldiv<-function(mu0, mu1, sig0, sig1){
a<-sum(diag(solve(sig1) %*% sig0))
b<-t(mu1 - mu0) %*% solve(sig1) %*% (mu1 - mu0)
c<-log(det(sig1)/det(sig0))
kl<-1/2 * (a + b - 2 + c)
return(kl)
}
## calculate MVN summaries for each species and group
parms <- list()
parms$n.iter <- 5 * 10^3
parms$n.burnin <- 1000
parms$n.thin <- 4
parms$n.chains <- 2
priors <- list()
priors$R <- 0.001 * diag(2)
priors$k <- 2
priors$tau.mu <- 1.0E-3
library(rjags)
mnsA<-vector("list",3)
vcvA<-vector("list",3)
mnsB<-vector("list",3)
vcvB<-vector("list",3)
o_kldiv<-matrix(NA,nrow=3,ncol=2)
o_dist<-matrix(NA,nrow=3,ncol=1)
o_kldiv_mcmcA<-rep(NA,500)
o_kldiv_mcmcB<-rep(NA,500)
o_kldiv_dist_mcmc<-vector("list",3)
for(i in 1:3){
x<-which(as.character(timema$species) == spec[i])
X<-timema[x,6:7]
A<-X[tclust[[i]]==0,]
B<-X[tclust[[i]]==1,]
## MCMC estiamte
ppA<-fitEllipse(x=A[,1],y=A[,2],parms=parms,priors=priors)
ppB<-fitEllipse(x=B[,1],y=B[,2],parms=parms,priors=priors)
for(j in 1:500){
vcvA<-matrix(ppA[[1]][j,1:4],nrow=2,ncol=2,byrow=FALSE)
vcvB<-matrix(ppB[[1]][j,1:4],nrow=2,ncol=2,byrow=FALSE)
o_kldiv_mcmcA[j]<-kldiv(ppA[[1]][j,5:6],ppB[[1]][j,5:6],vcvA,vcvB)
o_kldiv_mcmcB[j]<-kldiv(ppB[[1]][j,5:6],ppA[[1]][j,5:6],vcvB,vcvA)
}
o_kldiv_dist_mcmc[[i]]<-(o_kldiv_mcmcA+o_kldiv_mcmcB)/2
o_kldiv[i,1]<-mean(o_kldiv_mcmcA)
o_kldiv[i,2]<-mean(o_kldiv_mcmcB)
}
kldist_timema<-apply(o_kldiv,1,mean)
mean(o_kldiv_dist_mcmc[[1]] > o_kldiv_dist_mcmc[[2]])
##[1] 0.868 chum < bart
mean(o_kldiv_dist_mcmc[[3]] > o_kldiv_dist_mcmc[[2]])
##[1] 1 chum < tcr
mean(o_kldiv_dist_mcmc[[3]] > o_kldiv_dist_mcmc[[1]])
##[1] 0.982 bart < tcr
## now host
## define hosts for each species
## T. bart = WF, WP
## T. chumash = Q, MM
## T. crist = A, C
## based on Patrik's assignments
plantsToKeepA<-vector("list",3)
plantsToKeepB<-vector("list",3)
plantsToKeepA[[1]]<-list(which(host$host=='WF' & host$organ=='leaf'),which(host$host=='WP' & host$organ=='leaf'))
plantsToKeepB[[1]]<-list(which(host$host=='WF' & host$organ=='stem'),which(host$host=='WP' & host$organ=='stem'))
plantsToKeepA[[2]]<-list(which(host$host=='Q' & host$organ=='leaf'),which(host$host=='MM' & host$organ=='leaf'))
plantsToKeepB[[2]]<-list(which(host$host=='Q' & host$organ=='stem'),which(host$host=='MM' & host$organ=='stem'))
plantsToKeepA[[3]]<-list(which(host$host=='A' & host$organ=='leaf'),which(host$host=='C' & host$organ=='leaf'))
plantsToKeepB[[3]]<-list(which(host$host=='A' & host$organ=='stem'),which(host$host=='C' & host$organ=='stem'))
parms <- list()
parms$n.iter <- 5 * 10^3
parms$n.burnin <- 1000
parms$n.thin <- 4
parms$n.chains <- 2
priors <- list()
priors$R <- 0.001 * diag(2)
priors$k <- 2
priors$tau.mu <- 1.0E-3
mnsA<-vector("list",3)
vcvA<-vector("list",3)
mnsB<-vector("list",3)
vcvB<-vector("list",3)
ho_kldiv_list<-vector("list",2)
ho_kldiv<-matrix(NA,nrow=3,ncol=2)
for(p in 1:2){
ho_kldiv_list[[p]]<-ho_kldiv
}
ho_dist<-matrix(NA,nrow=3,ncol=1)
ho_kldiv_mcmcA<-rep(NA,500)
ho_kldiv_mcmcB<-rep(NA,500)
ho_kldiv_dist_mcmcL<-vector("list",3)
ho_kldiv_dist_mcmc<-vector("list",3)
for(i in 1:3){
ho_kldiv_dist_mcmcL[[i]]<-matrix(NA,nrow=500,ncol=2)
for(p in 1:2){
A<-host[plantsToKeepA[[i]][[p]],7:8]
B<-host[plantsToKeepB[[i]][[p]],7:8]
ppA<-fitEllipse(x=A[,1],y=A[,2],parms=parms,priors=priors)
ppB<-fitEllipse(x=B[,1],y=B[,2],parms=parms,priors=priors)
for(j in 1:500){
vcvA<-matrix(ppA[[1]][j,1:4],nrow=2,ncol=2,byrow=FALSE)
vcvB<-matrix(ppB[[1]][j,1:4],nrow=2,ncol=2,byrow=FALSE)
ho_kldiv_mcmcA[j]<-kldiv(ppA[[1]][j,5:6],ppB[[1]][j,5:6],vcvA,vcvB)
ho_kldiv_mcmcB[j]<-kldiv(ppB[[1]][j,5:6],ppA[[1]][j,5:6],vcvB,vcvA)
}
ho_kldiv_dist_mcmcL[[i]][,p]<-(ho_kldiv_mcmcA+ho_kldiv_mcmcB)/2
ho_kldiv[i,1]<-mean(ho_kldiv_mcmcA)
ho_kldiv[i,2]<-mean(ho_kldiv_mcmcB)
ho_kldiv_list[[p]][i,]<-ho_kldiv[i,]
}
ho_kldiv_dist_mcmc[[i]]<-apply(ho_kldiv_dist_mcmcL[[i]],1,mean)
}
ho_kldiv<-(ho_kldiv_list[[1]] + ho_kldiv_list[[2]])/2
kldist_host<-apply(ho_kldiv,1,mean)
mean(ho_kldiv_dist_mcmc[[1]] > ho_kldiv_dist_mcmc[[2]])
##[1] 0.996 chum < bart
mean(ho_kldiv_dist_mcmc[[3]] > ho_kldiv_dist_mcmc[[2]])
##[1] 1 chum < tcr
mean(ho_kldiv_dist_mcmc[[3]] > ho_kldiv_dist_mcmc[[1]])
##[1] 0.974 bart < tcr
## make plot
tits<-c("(a) T. chumash hosts","(b) T. bartmani hosts","(c) T. cristinae hosts",
"(e) T. chumash","(f) T. bartmani","(g) T. cristinae","(d) Host distances","(h) Timema distances")
pdf("colorOverlap.pdf",width=6,height=12)
layout(matrix(c(1:3,7,4:6,8),nrow=4,ncol=2,byrow=F),widths=c(4,4),heights=c(4,4,4,4))
par(mar=c(4.5,5.5,2.5,1.5))
re_ho_kldiv_dist_mcmc<-ho_kldiv_dist_mcmc
re_ho_kldiv_dist_mcmc[[1]]<-ho_kldiv_dist_mcmc[[2]]
re_ho_kldiv_dist_mcmc[[2]]<-ho_kldiv_dist_mcmc[[1]]
re_o_kldiv_dist_mcmc<-o_kldiv_dist_mcmc
re_o_kldiv_dist_mcmc[[1]]<-o_kldiv_dist_mcmc[[2]]
re_o_kldiv_dist_mcmc[[2]]<-o_kldiv_dist_mcmc[[1]]
cs<-c("forestgreen","brown")
## plants
j<-1
for(i in c(2,1,3)){
A<-rbind(host[plantsToKeepA[[i]][[1]],7:8],host[plantsToKeepA[[i]][[2]],7:8])
B<-rbind(host[plantsToKeepB[[i]][[1]],7:8],host[plantsToKeepB[[i]][[2]],7:8])
X<-rbind(A,B)
plot(X[,1],X[,2],type='n',xlab = "RG", ylab="GB", main= tits[j],cex.lab=1.4)
points(A[,1],A[,2], pch=19,col=cs[1])
points(B[,1],B[,2], pch=19,col=cs[2])
j<-j+1
}
## Timema
for(i in c(2,1,3)){
x<-which(as.character(timema$species) == spec[i])
plot(timema$latRG[x], timema$latGB[x], pch=19, col=cs[1+as.numeric(tclust[[i]])], xlab = "RG", ylab="GB", main= tits[j],cex.lab=1.4)
j<-j+1
}
## post dist
o<-barplot(unlist(lapply(re_ho_kldiv_dist_mcmc,median)),ylim=c(0,50),ylab="KL distance",cex.lab=1.4,names.arg=c("T. chumash","T. bartmani","T. cristinae"))
etpi<-matrix(unlist(lapply(re_ho_kldiv_dist_mcmc,quantile,probs=c(.025,.975))),nrow=3,ncol=2,byrow=T)
title(main=tits[7])
segments(o[,1],etpi[,1],o[,1],etpi[,2],lwd=2)
o<-barplot(unlist(lapply(re_o_kldiv_dist_mcmc,median)),ylim=c(0,45),ylab="KL distance",cex.lab=1.4,names.arg=c("T. chumash","T. bartmani","T. cristinae"))
etpi<-matrix(unlist(lapply(re_o_kldiv_dist_mcmc,quantile,probs=c(.025,.975))),nrow=3,ncol=2,byrow=T)
segments(o[,1],etpi[,1],o[,1],etpi[,2],lwd=2)
title(main=tits[8])
dev.off()