Post date: Apr 20, 2017 3:12:31 PM
We have data on stripe frequencies at HV across 17 consecutive years. My goal was to fit a model to this data to assess predictive ability. I settled on a generalized autoregressive moving average model (GARMA), which extends ARMA models to non-Gaussian error models (binomial, Poisson, etc.). See Benjamin et al. (2003) for a nice overview. I specifically used a Bayesian variant of this model, described in de Andrade et al. 2016. These models differ, specifically for the relevant binomial case, in whether they use the log of the count data or the logit of the frequency data for the ARMA component. The latter makes more sense to me and is proposed in the Benjamin paper, but the former (log y) mixed much, much better and was suggested in the Bayesian paper. It is a bit odd to me, but it works empirically so it is what I have stuck with.
The data and code are in /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_fluct/timemaFluctMorph/ (the melanic time series stuff is there too).
An initial round of model comparison across GARMA (0,0), (1,0), (0,1), (1,1), (2,1), (1,2), and (2,2) suggested GARMA(2,2) was the best based on DIC (see dics object, but I will re-run this for quantitative details once we have 2017 data).
Here is the model block (for rjags):
model{
## GARMA(2,2): binomial observation model with an ARMA(2,2) structure on
## logit(m[t]). Per the post, the AR terms use log of the lagged counts
## y[t-1], y[t-2] (as in de Andrade et al. 2016) rather than lagged logits.
## y[t]: striped count in year t; n[t]: total sampled; m[t]: stripe frequency.
for (t in 1:N){
y[t] ~ dbinom(m[t],n[t])
}
## The ARMA recursion needs two lags, so it starts at t = 3.
for (t in 3:N){
logit(m[t]) <- c + theta[1] * log(y[t-1]) + theta[2] * log(y[t-2]) + phi[1] * eps[t-1] + phi[2] * eps[t-2]
## moving-average residual: log observed count minus log expected count
eps[t] <- log(y[t]) - log(m[t] * n[t])
}
## First two years have no lagged terms available; their m[t] instead get
## a Beta(0.5,0.5) (Jeffreys) prior, and residuals are still defined so
## eps[1], eps[2] can feed the MA part of the t = 3,4 recursions.
for (t in 1:2){
m[t] ~ dbeta(0.5,0.5)
eps[t] <- log(y[t]) - log(m[t]*n[t])
}
## priors
## vague normal priors (precision 1e-5) on all regression-type parameters
for (k in 1:2){
theta[k] ~ dnorm(0,1e-5)
phi[k] ~ dnorm(0,1e-5)
}
c ~ dnorm(0,1e-5)
}
I ran leave-one-out cross-validation for all years except the first two.
80% of the true stripe frequencies are included in the 95% prediction CIs.
The squared correlation between the true and predicted frequencies is 0.345 (r = 0.588). This is probably the most important number as it lets us explicitly quantify prediction such that we can say that our predictions explain 35% of the variability in the data.
The first attached plot has the same format as the last one but with the predicted values from cross-validation. Note that the first two years are not included as these essentially have a different simpler model since the data from the two years before them don't exist. I have also included a plot of observed vs. predicted frequency.
Here is the relevant R code from stripeAnalysis.R:
## Load the master data set and aggregate the HV stripe counts by year.
dat<-read.csv("Tcris_master_26.csv")
#dat<-read.csv("Tcris_master_locality_host_year_17.csv")
hv<-which(dat$location=="HV")
## yearly striped counts and yearly totals (striped + unstriped)
y<-tapply(X=dat$striped[hv],INDEX=dat$year[hv],sum)
n<-tapply(X=dat$striped[hv]+dat$unstriped[hv],INDEX=dat$year[hv],sum)
## Holds one DIC result per GARMA model (7 candidate models compared below).
## NOTE(review): the original line here was the orphaned fragment "7)";
## reconstructed as the list allocation that the dics[[1]]..dics[[7]]
## assignments below require -- confirm against the original script.
dics<-vector("list",7)
## Fit the six simpler GARMA(p,q) candidates and store DIC for each.
## The original script repeated the same eight-line fit/plot chunk six
## times; this loop over a config table is equivalent. Monitored
## parameters differ by model order (theta only if p > 0, phi only if
## q > 0), and dic.index preserves the original, non-sequential slot
## each model's DIC was stored in (dics[[1]], [[3]], [[2]], [[4]], ...).
garma.specs<-list(
  list(file="garma00model",vars=c("c","m"),dic.index=1),
  list(file="garma01model",vars=c("c","m","phi"),dic.index=3),
  list(file="garma10model",vars=c("theta","c","m"),dic.index=2),
  list(file="garma11model",vars=c("theta","c","m","phi"),dic.index=4),
  list(file="garma21model",vars=c("theta","c","m","phi"),dic.index=5),
  list(file="garma12model",vars=c("theta","c","m","phi"),dic.index=6)
)
## drop the first two years (no lags available); N = 17 remaining years
dat<-list(y=y[-c(1:2)],n=n[-c(1:2)],N=17)
for(spec in garma.specs){
  mod<-jags.model(file=spec$file,data=dat,n.chains=3)
  update(mod,n.iter=10000)  # burn-in
  sam<-coda.samples(mod,n.iter=100000,thin=50,variable.names=spec$vars)
  dsam<-dic.samples(mod,n.iter=100000,thin=50,type="pD")
  dics[[spec$dic.index]]<-dsam
  ## quick visual check: observed frequency (line) vs posterior m[t]
  o<-summary(sam)[[2]]  # quantile matrix; rows: c, m[1..17], phi, theta
  plot(dat$y/dat$n,type='l',ylim=c(0.3,1))
  points(o[2:18,3],col="blue")                        # posterior medians
  segments(1:17,o[2:18,1],1:17,o[2:18,5],col="blue")  # 95% CIs
}
## Fit the best-supported model, GARMA(2,2), and draw the publication
## figure: observed stripe frequency (line) with posterior medians and
## 95% CIs of m[t] overlaid in blue.
dat<-list(y=y[-c(1:2)],n=n[-c(1:2)],N=17)
mod<-jags.model(file="garma22model",data=dat,n.chains=3)
update(mod,n.iter=10000)  # burn-in
sam<-coda.samples(mod,n.iter=100000,thin=50,
                  variable.names=c("theta","c","m","phi"))
dsam<-dic.samples(mod,n.iter=100000,thin=50,type="pD")
dics[[7]]<-dsam
o<-summary(sam)$quantiles  # rows: c, m[1..17], phi[], theta[]
yr<-2000:2016
pdf("stripeTimeSeriesGARMA22.pdf",width=6,height=5)
par(mar=c(5,5,0.5,0.5))
plot(yr,dat$y/dat$n,type='l',ylim=c(0.3,1),cex.lab="1.5",
     xlab="year",ylab="stripe frequency")
points(yr,o[2:18,3],col="blue")                 # posterior median of m[t]
segments(yr,o[2:18,1],yr,o[2:18,5],col="blue")  # 95% CI
dev.off()
## Leave-one-out cross validation: refit the GARMA(2,2) model with each
## year t (t = 3..17) masked as NA so JAGS treats y[t] as unobserved,
## then record the posterior quantiles of m[t] as the prediction.
cve<-matrix(NA,nrow=17,ncol=5)
for(t in 3:17){
  dat.loo<-dat
  dat.loo$y[t]<-NA  # hold out year t
  mod<-jags.model(file="garma22model",data=dat.loo,n.chains=3)
  update(mod,n.iter=10000)  # burn-in
  sam.loo<-coda.samples(mod,n.iter=100000,thin=50,
                        variable.names=c("theta","c","m","phi"))
  q.loo<-summary(sam.loo)$quantiles
  cve[t,]<-q.loo[t+1,]  # row t+1 is m[t] (row 1 is c)
}
## Plot the observed stripe-frequency series with the cross-validated
## predictions (posterior median + 95% CI) overlaid in blue.
pdf("stripeTimeSeriesPredict.pdf",width=6,height=5)
par(mar=c(5,5,0.5,0.5))
obs.freq<-dat$y/dat$n
plot(yr,obs.freq,type='l',ylim=c(0.3,1),cex.lab="1.5",
     xlab="year",ylab="stripe frequency")
points(yr,cve[,3],col="blue")
segments(yr,cve[,1],yr,cve[,5],col="blue")
dev.off()
I tried another run predicting the last five years when the model was fit on the first 12. I had to add a fudge factor to the GARMA model to avoid problems when y = 0 (log(0) is bad). Here is the tweaked model and relevant additional R code (below). The cross-validated r^2 was 0.11 (r=0.3261581).
## GARMA(2,2) model, tweaked with a 0.01 fudge factor inside each log()
## so that y = 0 (possible when forecasting held-out years) does not
## produce log(0) = -Inf.
## NOTE(review): the opening "model{" line was missing from the pasted
## block (the closing "}" at the end was unmatched); restored here so
## the file parses under rjags -- confirm against garma22xmodel on disk.
model{
for (t in 1:N){
y[t] ~ dbinom(m[t],n[t])
}
## ARMA recursion on the logit scale, starting at t = 3 (needs two lags)
for (t in 3:N){
logit(m[t]) <- c + theta[1] * log(y[t-1]+0.01) + theta[2] * log(y[t-2]+0.01) + phi[1] * eps[t-1] + phi[2] * eps[t-2]
eps[t] <- log(y[t]+0.01) - log(m[t] * n[t])
}
## first two years lack lagged terms; m gets a Jeffreys prior instead
for (t in 1:2){
m[t] ~ dbeta(0.5,0.5)
eps[t] <- log(y[t]+0.01) - log(m[t]*n[t])
}
## priors: vague normals (precision 1e-5) on all parameters
for (k in 1:2){
theta[k] ~ dnorm(0,1e-5)
phi[k] ~ dnorm(0,1e-5)
}
c ~ dnorm(0,1e-5)
}
## 5-year forward validation: mask the last five years and predict them
## from a model fit to the first 12 (garma22xmodel adds the 0.01 fudge
## factor inside log() to guard against y = 0).
cve2<-matrix(NA,nrow=17,ncol=5)
datx<-dat
datx$y[13:17]<-NA  # hold out the last five years
mod<-jags.model(file="garma22xmodel",data=datx,n.chains=3)
update(mod,n.iter=10000)  # burn-in
samx<-coda.samples(mod,n.iter=100000,thin=50,variable.names=c("theta","c","m","phi"))
ox<-summary(samx)[[2]]
cve2[13:17,]<-ox[(13:17)+1,]  # quantiles of m[13..17] (row 1 is c)
pdf("stripeObsVpredict5yr.pdf",width=5,height=5)
par(mar=c(5,5,0.5,0.5))
## NOTE(review): the original plotted an undefined object `p` on the x
## axis; replaced with dat$y/dat$n, the observed stripe frequencies,
## which matches the axis label -- confirm `p` was not defined elsewhere.
plot(dat$y/dat$n,cve2[,3],pch=19,xlab="observed frequency",ylab="predicted frequency")
abline(a=0,b=1)
dev.off()