Post date: Mar 14, 2018 9:8:26 PM
I made a few tweaks to this, including getting the clusters right. Here is the final code.
I defined the deletion based on a HMM with state 2 set to mean = 0.2 quantile and sd = 0.8 * empirical, and inclusizve of all species (for morphs) = 4.97 mb to 6.19 mb.
For genotypes (collapsing stripe axis) T. cristinae and T. bartmani only:
## fit HMMs for coverage data
## this version is for the PCA clusters
## read in data, or load from workspace
L<-15771578
N<-12
Cmat<-matrix(scan("covdat_cluster.txt",n=N*L,sep=" "),nrow=L,ncol=N,byrow=TRUE)
save(list=ls(),file="coverage_cluster.rdat")
load("coverage_cluster.rdat")
library(HiddenMarkov)
## combine columns for genotypes
## from notes in het,
## Tbart should be gr/gr = 1,2,3, me/me = 4, het = 5,6 ... ignoring PC2
## Tcristinae should be gr/gr = 1,4,5, me/me = 2, het = 3,6 ... ignoring stripe
## bart than cristinae
## dropping hets
CmatTemp<-matrix(NA,nrow=L,ncol=4)
CmatTemp[,1]<-apply(Cmat[,1:3],1,sum,na.rm=TRUE)
CmatTemp[,2]<-Cmat[,4]
CmatTemp[,3]<-apply(Cmat[,c(1,4,5)+6],1,sum,na.rm=TRUE)
CmatTemp[,4]<-Cmat[,2+6]
Cmat<-CmatTemp
N<-dim(Cmat)[2]
## standardize coverage matrix
CmatS<-Cmat
for(i in 1:N){
xna<-which(is.na(Cmat[,i])==TRUE)
CmatS[xna,i]<-0
mn<-mean(CmatS[,i])
s<-sd(CmatS[,i])
CmatS[,i]<-(CmatS[,i]-mn)/s
}
## cap at +5 sds
for(i in 1:N){
x5<-which(CmatS[,i] > 5)
CmatS[x5,i]<-5
}
L<-dim(CmatS)[1]
L20k<-floor(L/20000)
bnds<-matrix(NA,nrow=L20k,ncol=2)
bnds[,1]<-seq(1,(L-20000),20000)
bnds[,2]<-seq(20000,L,20000)
CmatS20k<-matrix(NA,nrow=L20k,ncol=N)
for(i in 1:N){
for(j in 1:L20k){
CmatS20k[j,i]<-mean(CmatS[bnds[j,1]:bnds[j,2],i])
}
}
## Gaussian HMM with mean and sd fixed
Mstep.normc <- function(x, cond, pm, pn){
nms <- sort(names(pm))
n <- length(x)
m <- ncol(cond$u)
if (all(nms==c("mean", "sd"))){
mean <- pm$mean
sd <- pm$sd
return(list(mean=mean, sd=sd))
}
if (all(nms == "mean")) {
mean <- pm$mean
return(list(mean = mean))
}
if (all(nms == "sd")) {
sd <- pm$sd
return(list(sd = sd))
}
stop("Invalid specification of parameters")
}
rnormc <- rnorm
dnormc <- dnorm
pnormc <- pnorm
qnormc <- qnorm
## set initial values and means
delta<-c(0.05,0.95)
PiA<-matrix(c(0.9,0.1,0.1,0.9),nrow=2,byrow=T)
PiB<-matrix(c(0.8,0.2,0.01,0.99),nrow=2,byrow=T)
Pi<-list(PiA,PiB)
## variables for storing results
fitA<-vector("list",N)
fitB<-vector("list",N)
paramA<-vector("list",N)
paramB<-vector("list",N)
## loop over species/groups
for(i in 1:N){
mns<-c(mean(CmatS20k[,i]),quantile(CmatS20k[,i],probs=c(0.2)))
sds<-c(sd(CmatS20k[,i]),0.8*sd(CmatS20k[,i]))
## rep A
init<-dthmm(CmatS20k[,i],PiA,delta,"normc",list(mean=mns,sd=sds),discrete=FALSE)
paramA[[i]]<-BaumWelch(init,bwcontrol(maxiter = 500, tol = 1e-04,
prt = TRUE, posdiff = FALSE,converge = expression(diff < tol)))
fitA[[i]]<-Viterbi(paramA[[i]])
## rep B
init<-dthmm(CmatS20k[,i],PiB,delta,"normc",list(mean=mns,sd=sds),discrete=FALSE)
paramB[[i]]<-BaumWelch(init,bwcontrol(maxiter = 500, tol = 1e-04,
prt = TRUE, posdiff = FALSE,converge = expression(diff < tol)))
fitB[[i]]<-Viterbi(paramB[[i]])
}
## identical results for two fits in terms of classification, just use A for plots
wins<-788 # all 788 windows
## ids
mel<-c(2,4)
grn<-c(1,3)
cs<-rep(NA,N)
cs[mel]<-"chocolate"
cs[grn]<-"darkgreen"
## define bounds of high windows
bndsHmm<-vector("list",N)
for(i in 1:N){
b<-which(fitA[[i]][-1]!=fitA[[i]][-wins])
bn<-length(b)## 3 = redhead is funky
if(bn > 1){bndsHmm[[i]]<-matrix(b,nrow=bn/2,ncol=2,byrow=TRUE)}
else{bndsHmm[[i]]<-NA}
}
hdrL<-LETTERS[1:4]
sp<-rep(c("T. bartmani","T. cristinae"),times=c(2,2))
## compare to other windows of the same size
ss<-1:4
ssL<-length(ss)
out<-matrix(NA,nrow=ssL,ncol=3)
## obs, null, p
L<-length(249:310)
o<-249:310
for(i in 1:ssL){
x<-ss[i]
obs<-mean(CmatS20k[o,x])
null<-rep(NA,wins-L)
for(j in 1:(wins-L)){
null[j]<-mean(CmatS20k[j:(j+L-1),x])
}
p<-mean(obs > null)
out[i,]<-c(obs,mean(null),p)
}
ss<-c(1:4)
pdf("fig3_indel_cluster.pdf",width=8,height=6.5)
par(mfrow=c(2,2))
par(mar=c(4,4,3,0.5))
a<-1
for(i in ss){
xwin<-floor(apply(bnds,1,mean))
myc<-c(cs[i],"darkgray")[fitB[[i]]]
plot(xwin,CmatS20k[,i],col=myc,pch=19,cex=.8,xlab="Position (bp)",ylab="Coverage",cex.lab=1.3)
offset<-1.04
polygon(xwin[c(249,249,310,310)],
c(min(CmatS20k[,i])*offset,max(CmatS20k[,i])*offset,
max(CmatS20k[,i])*offset,min(CmatS20k[,i])*offset),
lty=2,lwd=.8)
A<-hdrL[a];B<-sp[i]
if(cs[i]=='darkgreen'){col<-"green"}
else{col<-"melanistic"}
title(main=paste("(",A,") ",B,", ",col,", P = ",round(out[a,3],3),sep=""))
a<-a+1
}
dev.off()
save(list=ls(),file="covhmm_cluster.rdat")
For morphs, all species, just green vs. brown/melanistic:
## fit HMMs for coverage data
## read in data, or load from workspace
#L<-15771578
#N<-18
#Cmat<-matrix(scan("covdat.txt",n=N*L,sep=" "),nrow=L,ncol=N,byrow=TRUE)
#save(list=ls(),file="coverage.rdat")
load("coverage.rdat")
library(HiddenMarkov)
N<-dim(Cmat)[2]
## standardize coverage matrix
CmatS<-Cmat
for(i in 1:N){
xna<-which(is.na(Cmat[,i])==TRUE)
CmatS[xna,i]<-0
mn<-mean(CmatS[,i])
s<-sd(CmatS[,i])
CmatS[,i]<-(CmatS[,i]-mn)/s
}
## cap at +5 sds
for(i in 1:N){
x5<-which(CmatS[,i] > 5)
CmatS[x5,i]<-5
}
L<-dim(CmatS)[1]
L20k<-floor(L/20000)
bnds<-matrix(NA,nrow=L20k,ncol=2)
bnds[,1]<-seq(1,(L-20000),20000)
bnds[,2]<-seq(20000,L,20000)
CmatS20k<-matrix(NA,nrow=L20k,ncol=N)
for(i in 1:N){
for(j in 1:L20k){
CmatS20k[j,i]<-mean(CmatS[bnds[j,1]:bnds[j,2],i])
}
}
## Gaussian HMM with mean and sd fixed
Mstep.normc <- function(x, cond, pm, pn){
nms <- sort(names(pm))
n <- length(x)
m <- ncol(cond$u)
if (all(nms==c("mean", "sd"))){
mean <- pm$mean
sd <- pm$sd
return(list(mean=mean, sd=sd))
}
if (all(nms == "mean")) {
mean <- pm$mean
return(list(mean = mean))
}
if (all(nms == "sd")) {
sd <- pm$sd
return(list(sd = sd))
}
stop("Invalid specification of parameters")
}
rnormc <- rnorm
dnormc <- dnorm
pnormc <- pnorm
qnormc <- qnorm
## set initial values and means
delta<-c(0.05,0.95)
PiA<-matrix(c(0.9,0.1,0.1,0.9),nrow=2,byrow=T)
PiB<-matrix(c(0.8,0.2,0.01,0.99),nrow=2,byrow=T)
Pi<-list(PiA,PiB)
## variables for storing results
fitA<-vector("list",N)
fitB<-vector("list",N)
paramA<-vector("list",N)
paramB<-vector("list",N)
## loop over species/groups
for(i in 1:N){
mns<-c(mean(CmatS20k[,i]),quantile(CmatS20k[,i],probs=c(0.2)))
sds<-c(sd(CmatS20k[,i]),0.8*sd(CmatS20k[,i]))
## rep A
init<-dthmm(CmatS20k[,i],PiA,delta,"normc",list(mean=mns,sd=sds),discrete=FALSE)
paramA[[i]]<-BaumWelch(init,bwcontrol(maxiter = 500, tol = 1e-04,
prt = TRUE, posdiff = FALSE,converge = expression(diff < tol)))
fitA[[i]]<-Viterbi(paramA[[i]])
## rep B
init<-dthmm(CmatS20k[,i],PiB,delta,"normc",list(mean=mns,sd=sds),discrete=FALSE)
paramB[[i]]<-BaumWelch(init,bwcontrol(maxiter = 500, tol = 1e-04,
prt = TRUE, posdiff = FALSE,converge = expression(diff < tol)))
fitB[[i]]<-Viterbi(paramB[[i]])
}
## identical results for two fits in terms of classification, just use A for plots
wins<-788 # all 788 windows
## ids
ids<-read.table("names_covdat.txt",sep="_",header=FALSE)
mel<-which(ids[,3]=='melanic')
grn<-which(ids[,3]=='green' | ids[,3]=="pooled")
cs<-rep(NA,N)
cs[mel]<-"chocolate"
cs[grn]<-"darkgreen"
## define bounds of high windows
bndsHmm<-vector("list",N)
for(i in 1:N){
b<-which(fitA[[i]][-1]!=fitA[[i]][-wins])
bn<-length(b)## 3 = redhead is funky
if(bn > 1 & i!=3){bndsHmm[[i]]<-matrix(b,nrow=bn/2,ncol=2,byrow=TRUE)}
else{bndsHmm[[i]]<-NA}
}
hdrL<-LETTERS[1:16]
sp<-rep(c("T. bartmani","T. californicum","T. chumash","T. cristinae","T. curi","T. knulli","T. landelsensis","T. petita","T. podura","T. poppensis"),times=c(3,2,2,2,2,1,2,1,2,1))
## based on mean = 0.2 quant., sd = 0.8 * empirical
## inclusizve = 4.97 mb to 6.19 mb
## tcr = 5.03 mb to 6.15 mb
## compare to other windows of the same size
ssL<-length(ss)
out<-matrix(NA,nrow=ssL,ncol=3)
## obs, null, p
L<-length(249:310)
o<-249:310
for(i in 1:ssL){
x<-ss[i]
obs<-mean(CmatS20k[o,x])
null<-rep(NA,wins-L)
for(j in 1:(wins-L)){
null[j]<-mean(CmatS20k[j:(j+L-1),x])
}
p<-mean(obs > null)
out[i,]<-c(obs,mean(null),p)
}
ss<-c(c(10,12,15,18),c(4:5,13:14,8:9,16:17,1:2,6:7))
pdf("fig3_indel.pdf",width=8,height=14)
par(mfrow=c(8,2))
par(mar=c(4,4,3,0.5))
a<-1
for(i in ss){
xwin<-floor(apply(bnds,1,mean))
myc<-c(cs[i],"darkgray")[fitA[[i]]]
plot(xwin,CmatS20k[,i],pch=19,col=myc,cex=.5,xlab="Position (bp)",ylab="Coverage",cex.lab=1.3)
offset<-1.04
polygon(xwin[c(249,249,310,310)],
c(min(CmatS20k[,i])*offset,max(CmatS20k[,i])*offset,
max(CmatS20k[,i])*offset,min(CmatS20k[,i])*offset),
lty=2,lwd=.8)
A<-hdrL[a];B<-sp[i]
if(cs[i]=='darkgreen'){col<-"green"}
else{col<-"melanistic"}
title(main=paste("(",A,") ",B,", ",col,", P = ",round(out[a,3],3),sep=""))
a<-a+1
}
dev.off()
save(list=ls(),file="covhmm.rdat")