Color overlap

Post date: Jan 31, 2018 10:28:59 PM

Considering the three species we have the best data for-T. cristinae, T. chumash, and T. bartmani-we wanted to know whether those species with the simplest (in terms of structural variation) had the least distinct color pattern morphs and the most similarity between stem and leaf colors of their hosts. Thus, overlap (for morphs and hosts) should be T. chumash > T. bartmani > T. cristinae.

We used the RG and GB data (lateral for bugs) from Clarissa in 2015Timema_phenotype_gwas_radiation.csv and 2015host_colour.data.csv.

We assigned hosts based on Patrik's natural history knowledge:

- T. bartmani = WF and WP

- T. chumash = Q and MM

- T. cristinae = A and C

Sample sizes were as follows:

- T. bartman = 150

- T. chumash = 541

- T. cristinae = 190

- WF = 101, WP = 87, total = 188

- Q = 287, MM = 98, total = 385

- A = 102, C = 86, total = 188

We used the UPGMA algorithm in hclust (from R 3.2.3) to cluster Timema into two groups based on the RG and GB color measurements. Host plants were categorized as stems (which tend to be browner) or leaves (which tend to be greener). We then used a Bayesian approach to fit the color pattern data for each group (Timema cluster or host plant sample type) to a bivariate normal distribution. We placed relatively uninformative priors on the mean vectors (normal with mu = 0 and tau = 1e-3 for both means) and for the precision matrix (Wishart with with 2 degrees of freedom and a diagonal scale matrix = 0.001 I, where I is the identity matrix). We used MCMC to obtain samples from the posterior distribution via the rjags (version 4.6) interface with JAGS (version 4.1.0) (1000 iteration burnin, 5000 sampling iterations and a thinning interval of 4). We then estimated the Kullback Leibler distances (that is, the Kullback-Leibler divergence in both directions, e.g., from stems to leaves and leaves to stems). This was calculated over the posterior distribution of the bivariate normal parameters, and thus accounts for uncertainty in these parameters.

Posterior means (point estimates) of the Kullback Leibler distance between host stem and leave colors were 10.1 for T. chumash hosts, 21.8 for T. bartmani hosts, and 39.3 for T. cristinae hosts, with a posterior probability > 0.99 that the distance was greater for T. bartmani than T. chumash, and of 0.97 that T. cristinae was greater than T. bartmani (the pp that T. cristinae had a greater distance than T. chumash was ~1.0). Similarly, posterior means (point estimates) of the Kullback Leibler distance between color morphs of Timema were T. chumash = 14.0, T. bartmani = 17.4, and T. cristinae = 29.2, with a posterior probability 0.87 that the distance was greater for T. bartmani than T. chumash, and of 0.98 that T. cristinae was greater than T. bartmani (the pp that T. cristinae had a greater distance than T. chumash was ~1.0).

Here is the code (from overlap2d.R). This was run on my computer, but is now on kingspeak at /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_genarch/color_overlap/.

library(ggplot2)

library(dplyr)

library(mclust)

library(NbClust)

timema <- read.csv("2015Timema_phenotype_gwas_radiation.csv", header=TRUE)

host <- read.csv("2015host_colour.data.csv", header=TRUE)

#=====================

# Clustering analyses

#=====================

# This is the function I used to obtain the best number of clusters using

# hierarchical clustering using UPGMA

upgma <- function(data)

{ set.seed(123)

data_scaled <- scale(data)

d <- (dist(data_scaled, method = "euclidean"))^2

nb <- NbClust(data_scaled, distance=NULL, diss = d, min.nc = 1,

max.nc = 10, method = "average", index ="silhouette")

k <- 2 ## ONLY CONSIDER K = 2, best for all but podura anyway, and podura

## has a very low sample size of N = 20

#k <- nb$Best.nc[1]

upgma <- hclust(d, method = "average")

plot <- plot(upgma, cex = 0.6, hang = -1)

if(k > 1){

rect <- rect.hclust(upgma, k = k, border = 2:5)

grp <- cutree(upgma, k = k)

table <- table(grp)

table <- as.data.frame(table)

} else {rect = NULL; table=NULL}

result <- list("dend"=plot, "rect"=rect, "table"=table, "best.nc" = nb$Best.nc)

return(result)

}

# cluster criteria: RG

cluster <- function(data){

group <- upgma(data)

grp1 <- as.data.frame(group$rect[1])

t1 <- cbind(grp1, rep(0,length(grp1)))

names(t1) <- c("row", "col")

grp2 <- as.data.frame(group$rect[2])

t2 <- cbind(grp2, rep(1,length(grp2)))

names(t2) <- c("row", "col")

matrix <- rbind(t1, t2)

attach(matrix)

matrix <- matrix[order(row),]

result <- matrix$col

return(result)

}

## subset of species we want to work with

## T. bart, T. cali., T chumash, T. crist, T land, T. podura

spec<-c("T. bartmani","T. chumash","T. cristinae")

tclust<-vector("list",3)

for(i in 1:3){

x<-which(as.character(timema$species) == spec[i])

tclust[[i]]<-cluster(timema[x,6:7])

}

## visualize

for(i in 1:3){

x<-which(as.character(timema$species) == spec[i])

plot(timema$latRG[x], timema$latGB[x], pch=19, col=as.factor(tclust[[i]]), xlab = "RG", ylab="GB", main= spec[i])

}

## KL div

kldiv<-function(mu0, mu1, sig0, sig1){

a<-sum(diag(solve(sig1) %*% sig0))

b<-t(mu1 - mu0) %*% solve(sig1) %*% (mu1 - mu0)

c<-log(det(sig1)/det(sig0))

kl<-1/2 * (a + b - 2 + c)

return(kl)

}

## calculate MVN summaries for each species and group

parms <- list()

parms$n.iter <- 5 * 10^3

parms$n.burnin <- 1000

parms$n.thin <- 4

parms$n.chains <- 2

priors <- list()

priors$R <- 0.001 * diag(2)

priors$k <- 2

priors$tau.mu <- 1.0E-3

library(rjags)

mnsA<-vector("list",3)

vcvA<-vector("list",3)

mnsB<-vector("list",3)

vcvB<-vector("list",3)

o_kldiv<-matrix(NA,nrow=3,ncol=2)

o_dist<-matrix(NA,nrow=3,ncol=1)

o_kldiv_mcmcA<-rep(NA,500)

o_kldiv_mcmcB<-rep(NA,500)

o_kldiv_dist_mcmc<-vector("list",3)

for(i in 1:3){

x<-which(as.character(timema$species) == spec[i])

X<-timema[x,6:7]

A<-X[tclust[[i]]==0,]

B<-X[tclust[[i]]==1,]

## MCMC estiamte

ppA<-fitEllipse(x=A[,1],y=A[,2],parms=parms,priors=priors)

ppB<-fitEllipse(x=B[,1],y=B[,2],parms=parms,priors=priors)

for(j in 1:500){

vcvA<-matrix(ppA[[1]][j,1:4],nrow=2,ncol=2,byrow=FALSE)

vcvB<-matrix(ppB[[1]][j,1:4],nrow=2,ncol=2,byrow=FALSE)

o_kldiv_mcmcA[j]<-kldiv(ppA[[1]][j,5:6],ppB[[1]][j,5:6],vcvA,vcvB)

o_kldiv_mcmcB[j]<-kldiv(ppB[[1]][j,5:6],ppA[[1]][j,5:6],vcvB,vcvA)

}

o_kldiv_dist_mcmc[[i]]<-(o_kldiv_mcmcA+o_kldiv_mcmcB)/2

o_kldiv[i,1]<-mean(o_kldiv_mcmcA)

o_kldiv[i,2]<-mean(o_kldiv_mcmcB)

}

kldist_timema<-apply(o_kldiv,1,mean)

mean(o_kldiv_dist_mcmc[[1]] > o_kldiv_dist_mcmc[[2]])

##[1] 0.868 chum < bart

mean(o_kldiv_dist_mcmc[[3]] > o_kldiv_dist_mcmc[[2]])

##[1] 1 chum < tcr

mean(o_kldiv_dist_mcmc[[3]] > o_kldiv_dist_mcmc[[1]])

##[1] 0.982 bart < tcr

## now host

## define hosts for each species

## T. bart = WF, WP

## T. chumash = Q, MM

## T. crist = A, C

## based on Patrik's assignments

plantsToKeepA<-vector("list",3)

plantsToKeepB<-vector("list",3)

plantsToKeepA[[1]]<-list(which(host$host=='WF' & host$organ=='leaf'),which(host$host=='WP' & host$organ=='leaf'))

plantsToKeepB[[1]]<-list(which(host$host=='WF' & host$organ=='stem'),which(host$host=='WP' & host$organ=='stem'))

plantsToKeepA[[2]]<-list(which(host$host=='Q' & host$organ=='leaf'),which(host$host=='MM' & host$organ=='leaf'))

plantsToKeepB[[2]]<-list(which(host$host=='Q' & host$organ=='stem'),which(host$host=='MM' & host$organ=='stem'))

plantsToKeepA[[3]]<-list(which(host$host=='A' & host$organ=='leaf'),which(host$host=='C' & host$organ=='leaf'))

plantsToKeepB[[3]]<-list(which(host$host=='A' & host$organ=='stem'),which(host$host=='C' & host$organ=='stem'))

parms <- list()

parms$n.iter <- 5 * 10^3

parms$n.burnin <- 1000

parms$n.thin <- 4

parms$n.chains <- 2

priors <- list()

priors$R <- 0.001 * diag(2)

priors$k <- 2

priors$tau.mu <- 1.0E-3

mnsA<-vector("list",3)

vcvA<-vector("list",3)

mnsB<-vector("list",3)

vcvB<-vector("list",3)

ho_kldiv_list<-vector("list",2)

ho_kldiv<-matrix(NA,nrow=3,ncol=2)

for(p in 1:2){

ho_kldiv_list[[p]]<-ho_kldiv

}

ho_dist<-matrix(NA,nrow=3,ncol=1)

ho_kldiv_mcmcA<-rep(NA,500)

ho_kldiv_mcmcB<-rep(NA,500)

ho_kldiv_dist_mcmcL<-vector("list",3)

ho_kldiv_dist_mcmc<-vector("list",3)

for(i in 1:3){

ho_kldiv_dist_mcmcL[[i]]<-matrix(NA,nrow=500,ncol=2)

for(p in 1:2){

A<-host[plantsToKeepA[[i]][[p]],7:8]

B<-host[plantsToKeepB[[i]][[p]],7:8]

ppA<-fitEllipse(x=A[,1],y=A[,2],parms=parms,priors=priors)

ppB<-fitEllipse(x=B[,1],y=B[,2],parms=parms,priors=priors)

for(j in 1:500){

vcvA<-matrix(ppA[[1]][j,1:4],nrow=2,ncol=2,byrow=FALSE)

vcvB<-matrix(ppB[[1]][j,1:4],nrow=2,ncol=2,byrow=FALSE)

ho_kldiv_mcmcA[j]<-kldiv(ppA[[1]][j,5:6],ppB[[1]][j,5:6],vcvA,vcvB)

ho_kldiv_mcmcB[j]<-kldiv(ppB[[1]][j,5:6],ppA[[1]][j,5:6],vcvB,vcvA)

}

ho_kldiv_dist_mcmcL[[i]][,p]<-(ho_kldiv_mcmcA+ho_kldiv_mcmcB)/2

ho_kldiv[i,1]<-mean(ho_kldiv_mcmcA)

ho_kldiv[i,2]<-mean(ho_kldiv_mcmcB)

ho_kldiv_list[[p]][i,]<-ho_kldiv[i,]

}

ho_kldiv_dist_mcmc[[i]]<-apply(ho_kldiv_dist_mcmcL[[i]],1,mean)

}

ho_kldiv<-(ho_kldiv_list[[1]] + ho_kldiv_list[[2]])/2

kldist_host<-apply(ho_kldiv,1,mean)

mean(ho_kldiv_dist_mcmc[[1]] > ho_kldiv_dist_mcmc[[2]])

##[1] 0.996 chum < bart

mean(ho_kldiv_dist_mcmc[[3]] > ho_kldiv_dist_mcmc[[2]])

##[1] 1 chum < tcr

mean(ho_kldiv_dist_mcmc[[3]] > ho_kldiv_dist_mcmc[[1]])

##[1] 0.974 bart < tcr

## make plot

tits<-c("(a) T. chumash hosts","(b) T. bartmani hosts","(c) T. cristinae hosts",

"(e) T. chumash","(f) T. bartmani","(g) T. cristinae","(d) Host distances","(h) Timema distances")

pdf("colorOverlap.pdf",width=6,height=12)

layout(matrix(c(1:3,7,4:6,8),nrow=4,ncol=2,byrow=F),widths=c(4,4),heights=c(4,4,4,4))

par(mar=c(4.5,5.5,2.5,1.5))

re_ho_kldiv_dist_mcmc<-ho_kldiv_dist_mcmc

re_ho_kldiv_dist_mcmc[[1]]<-ho_kldiv_dist_mcmc[[2]]

re_ho_kldiv_dist_mcmc[[2]]<-ho_kldiv_dist_mcmc[[1]]

re_o_kldiv_dist_mcmc<-o_kldiv_dist_mcmc

re_o_kldiv_dist_mcmc[[1]]<-o_kldiv_dist_mcmc[[2]]

re_o_kldiv_dist_mcmc[[2]]<-o_kldiv_dist_mcmc[[1]]

cs<-c("forestgreen","brown")

## plants

j<-1

for(i in c(2,1,3)){

A<-rbind(host[plantsToKeepA[[i]][[1]],7:8],host[plantsToKeepA[[i]][[2]],7:8])

B<-rbind(host[plantsToKeepB[[i]][[1]],7:8],host[plantsToKeepB[[i]][[2]],7:8])

X<-rbind(A,B)

plot(X[,1],X[,2],type='n',xlab = "RG", ylab="GB", main= tits[j],cex.lab=1.4)

points(A[,1],A[,2], pch=19,col=cs[1])

points(B[,1],B[,2], pch=19,col=cs[2])

j<-j+1

}

## Timema

for(i in c(2,1,3)){

x<-which(as.character(timema$species) == spec[i])

plot(timema$latRG[x], timema$latGB[x], pch=19, col=cs[1+as.numeric(tclust[[i]])], xlab = "RG", ylab="GB", main= tits[j],cex.lab=1.4)

j<-j+1

}

## post dist

o<-barplot(unlist(lapply(re_ho_kldiv_dist_mcmc,median)),ylim=c(0,50),ylab="KL distance",cex.lab=1.4,names.arg=c("T. chumash","T. bartmani","T. cristinae"))

etpi<-matrix(unlist(lapply(re_ho_kldiv_dist_mcmc,quantile,probs=c(.025,.975))),nrow=3,ncol=2,byrow=T)

title(main=tits[7])

segments(o[,1],etpi[,1],o[,1],etpi[,2],lwd=2)

o<-barplot(unlist(lapply(re_o_kldiv_dist_mcmc,median)),ylim=c(0,45),ylab="KL distance",cex.lab=1.4,names.arg=c("T. chumash","T. bartmani","T. cristinae"))

etpi<-matrix(unlist(lapply(re_o_kldiv_dist_mcmc,quantile,probs=c(.025,.975))),nrow=3,ncol=2,byrow=T)

segments(o[,1],etpi[,1],o[,1],etpi[,2],lwd=2)

title(main=tits[8])

dev.off()

Page updated

Google Sites

Report abuse