Post date: Mar 26, 2018 11:41:20 PM
We used patterns of LD (melanistic vs. green homozygotes from K mean clustering/PCA) and comparative alignments of the green vs. melanistic morph genomes to delimit the putative inversion. All of this work was done in /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_genarch/inversion/.
1. Get the data together.
## Get LG8 data for FHA T. cristinae
cd /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_genarch/mapping/Tcristinae_FHA2013_gemma_lgNA_excluded_v1.3b2/latRG
grep ^lg8 Tcristinae_FHA.latRG.bial.noindel.qs20.cov50.mdp295Mdp5900.maf0.01.lgNA.excluded.geno > ../LG8_Tcristinae_FHA.latRG.bial.noindel.qs20.cov50.mdp295Mdp5900.maf0.01.lgNA.excluded.geno
perl -p -i -e 's/lg8_ord\d+_scaf([0-9\.]+)\:(\d+)\s+[A-Z]\s+[A-Z]/\1 \2/' LG8_Tcristinae_FHA.latRG.bial.noindel.qs20.cov50.mdp295Mdp5900.maf0.01.lgNA.excluded.geno
## from inversion
mv ../mapping/Tcristinae_FHA2013_gemma_lgNA_excluded_v1.3b2/LG8_Tcristinae_FHA.latRG.bial.noindel.qs20.cov50.mdp295Mdp5900.maf0.01.lgNA.excluded.geno ./
2. The following script performs the key LD calculations and makes several plots, but not all of this was used in the end. Instead the invLD*txt files it writes were used in a second script.
FindInversion.R
## read files
dat<-read.table("LG8_Tcristinae_FHA.latRG.bial.noindel.qs20.cov50.mdp295Mdp5900.maf0.01.lgNA.excluded.geno",header=FALSE)
G702<-dat[dat[,1]==702.1,-c(1:2)]
G128<-dat[dat[,1]==128,-c(1:2)]
## positions
p702<-dat[dat[,1]==702.1,2]
p128<-dat[dat[,1]==128,2]
## flip 702 positions
top<-14171514
p702<-top-p702
## get colors
grp<-scan("../het/TcrPCAclusters.txt")
## green = 1,4,5; melanistic = 2
grn<-which(grp %in% c(1,4,5))
brn<-which(grp == 2)
## LD calcs
LD702brown<-cor(t(G702[,brn]))^2
LD702green<-cor(t(G702[,grn]))^2
LD128brown<-cor(t(G128[,brn]))^2
LD128green<-cor(t(G128[,grn]))^2
## LD combined
LD702<-cor(t(G702))^2
LD128<-cor(t(G128))^2
## green and brown LD mats, snps in windows, and quantile to compare
ldwin<-function(LDg=NA,LDb=NA,win1=NA,win2=NA,q=0.9){
LDgWin<-as.numeric(LDg[win1,win2])
LDbWin<-as.numeric(LDb[win1,win2])
o<-c(mean(LDbWin),mean(LDgWin))
#o<-c(quantile(LDbWin,probs=q),quantile(LDgWin,probs=q))
return(o)
}
## pips
gwaRG<-read.table("../mapping/Tcristinae_FHA2013_gemma_lgNA_excluded_v1.3b2/latRG/output/Tcristinae_FHA.latRG.bial.noindel.qs20.cov50.mdp295Mdp5900.maf0.01.lgNA.excluded_sparse_effects_all_SNPs_except_MAFcutoff_formatted.txt",header=TRUE)
x<-which(gwaRG$scaffold==702.1)
gwa702<-gwaRG[x,]
gwa702$PS<-top-gwa702$PS
x<-which(gwaRG$scaffold==128)
gwa128<-gwaRG[x,]
## gwa genabel
gwaGenRg<-read.table("../mapping/Tcristinae_FHA2013_GenABEL_v1.3b2/latRG/Tcristinae_FHA.latRG.GenABEL_sparse_without_correction_for_pop_structure.txt",header=TRUE)
x<-which(gwaGenRg$scaf==702.1)
gwaG702<-gwaGenRg[x,]
gwaG702$pos<-top-gwaG702$pos
x<-which(gwaGenRg$scaf==128)
gwaG128<-gwaGenRg[x,]
align<-top-4139489
###################################################################
library(HiddenMarkov)
## Gaussian HMM with mean and sd fixed
Mstep.normc <- function(x, cond, pm, pn){
nms <- sort(names(pm))
n <- length(x)
m <- ncol(cond$u)
if (all(nms==c("mean", "sd"))){
mean <- pm$mean
sd <- pm$sd
return(list(mean=mean, sd=sd))
}
if (all(nms == "mean")) {
mean <- pm$mean
return(list(mean = mean))
}
if (all(nms == "sd")) {
sd <- pm$sd
return(list(sd = sd))
}
stop("Invalid specification of parameters")
}
rnormc <- rnorm
dnormc <- dnorm
pnormc <- pnorm
qnormc <- qnorm
########################################################################33
pdf("LDcomps.pdf",width=12,height=25)
par(mfrow=c(5,2))
## 702.1 comparisons
ub<-max(p702)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
lddif702<-matrix(NA,nrow=L,ncol=2)
for(i in 1:L){
w1<-c(wins[i],wins[i+3])
w2<-c(wins[i+4],wins[i+6])
win1<-which(p702 >= w1[1] & p702 <= w1[2])
win2<-which(p702 >= w2[1] & p702 <= w2[2])
if(length(win1) > 2 & length(win2) > 2){
lddif702[i,]<-ldwin(LD702green,LD702brown,win1,win2,q=0.9)
}
}
mat702<-matrix(NA,nrow=L,ncol=4)
## suggests break with brown higher LD than green
plot(wins[1:L],lddif702[,1]/lddif702[,2],type='l',main="LD brown/green 702.1",xlab="position",ylab="LD ratio")
abline(v=align,col="red",lty=2)
wins702<-wins
## 128 comparisons
ub<-max(p128)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
lddif128<-matrix(NA,nrow=L,ncol=2)
for(i in 1:L){
w1<-c(wins[i],wins[i+3])
w2<-c(wins[i+4],wins[i+6])
win1<-which(p128 >= w1[1] & p128 <= w1[2])
win2<-which(p128 >= w2[1] & p128 <= w2[2])
if(length(win1) > 2 & length(win2) > 2){
lddif128[i,]<-ldwin(LD128green,LD128brown,win1,win2,q=0.9)
}
}
mat128<-matrix(NA,nrow=L,ncol=4)
plot(wins[1:L],lddif128[,1]/lddif128[,2],type='l',main="LD brown/green 128",xlab="position",ylab="LD ratio")
wins128<-wins
## now stand dif
y<-(lddif702[,1]-lddif702[,2])/apply(lddif702,1,mean)
### fit hmm ###
## set initial values and means
delta<-c(0.05,0.95)
Pi<-matrix(c(0.9,0.1,0.1,0.9),nrow=2,byrow=T)
yx<-which(is.na(y)==FALSE)
yy<-y[yx]
## between 0.9 and 0.99
mns<-c(mean(yy,na.rm=TRUE),quantile(yy,probs=0.95,na.rm=TRUE))
sds<-rep(sd(yy,na.rm=TRUE) * 0.8,2)
init<-dthmm(yy,Pi,delta,"normc",list(mean=mns,sd=sds),discrete=FALSE)
parm702<-BaumWelch(init,bwcontrol(maxiter = 500, tol = 1e-04,
prt = TRUE, posdiff = FALSE,converge = expression(diff < tol)))
fit702<-Viterbi(parm702)
xx<-wins[-c(1:3,1414:1416)]
xx702<-xx[yx]
yy702<-yy
plot(xx702,yy702,col=c("gray","orange")[fit702])
mat702[,1]<-xx
mat702[,2]<-y
mat702[yx,3]<-fit702
y<-(lddif128[,1]-lddif128[,2])/apply(lddif128,1,mean)
### fit hmm ###
## set initial values and means
delta<-c(0.05,0.95)
Pi<-matrix(c(0.9,0.1,0.1,0.9),nrow=2,byrow=T)
yx<-which(is.na(y)==FALSE)
yy<-y[yx]
## between 0.9 and 0.99
mns<-c(mean(yy,na.rm=TRUE),quantile(yy,probs=0.95,na.rm=TRUE))
sds<-rep(sd(yy,na.rm=TRUE) * 0.8,2)
init<-dthmm(yy,Pi,delta,"normc",list(mean=mns,sd=sds),discrete=FALSE)
parm128<-BaumWelch(init,bwcontrol(maxiter = 500, tol = 1e-04,
prt = TRUE, posdiff = FALSE,converge = expression(diff < tol)))
fit128<-Viterbi(parm128)
xx<-wins[-c(1:3,1574:1576)]
xx128<-xx[yx]
yy128<-yy
plot(xx128,yy128,col=c("gray","orange")[fit128])
abline(v=c(4970000,6190000),col="green")
mat128[,1]<-xx
mat128[,2]<-y
mat128[yx,3]<-fit128
## within LD wins
ub<-max(p702)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
ldx702<-matrix(NA,nrow=L,ncol=1)
for(i in 1:L){
w1<-c(wins[i],wins[i+6])
win1<-which(p702 >= w1[1] & p702 <= w1[2])
if(length(win1) > 2){
XX<-as.matrix(LD702[win1,win1])
ldx702[i,]<-mean(XX[upper.tri(XX)],na.rm=TRUE)
}
}
plot(wins[1:L],ldx702,type='l',main="LD combined 702.1",xlab="position",ylab="LD (r2)")
abline(v=align,col="red",lty=2)
mat702[,4]<-ldx702
ub<-max(p128)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
ldx128<-matrix(NA,nrow=L,ncol=1)
for(i in 1:L){
w1<-c(wins[i],wins[i+6])
win1<-which(p128 >= w1[1] & p128 <= w1[2])
if(length(win1) > 2){
XX<-as.matrix(LD128[win1,win1])
ldx128[i,]<-mean(XX[upper.tri(XX)])
}
}
plot(wins[1:L],ldx128,type='l',main="LD combined 128",xlab="position",ylab="LD (r2)")
mat128[,4]<-ldx128
#################### write key results ########################
rite.table(mat702,file="invLD702.txt",row.names=F,col.names=F,quote=F)
write.table(mat128,file="invLD702.txt",row.names=F,col.names=F,quote=F)
###############################################################
## sum model average effects
ub<-max(gwa702$PS)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
beta702<-matrix(NA,nrow=L,ncol=1)
for(i in 1:L){
w1<-c(wins[i],wins[i+6])
win1<-which(gwa702$PS >= w1[1] & gwa702$PS <= w1[2])
if(length(win1) > 2){
beta702[i,]<-mean(gwa702$abs.beta.g[win1])
}
}
plot(wins[1:L],beta702,type='l',main="GWA 702.1",xlab="position",ylab="Model-averaged effect")
abline(v=align,col="red",lty=2)
ub<-max(gwa128$PS)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
beta128<-matrix(NA,nrow=L,ncol=1)
for(i in 1:L){
w1<-c(wins[i],wins[i+6])
win1<-which(gwa128$PS >= w1[1] & gwa128$PS <= w1[2])
if(length(win1) > 2){
beta128[i,]<-mean(gwa128$abs.beta.g[win1])
}
}
plot(wins[1:L],beta128,type='l',main="GWA 128",xlab="position",ylab="Model-averaged effect")
## genabel
ub<-max(gwaG702$pos)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
pv702<-matrix(NA,nrow=L,ncol=1)
for(i in 1:L){
w1<-c(wins[i],wins[i+6])
win1<-which(gwaG702$pos >= w1[1] & gwaG702$pos <= w1[2])
if(length(win1) > 2){
pv702[i,]<-mean(gwaG702$neg.log.p[win1])
}
}
plot(wins[1:L],pv702,type='l',main="GWA Genabel 702.1",xlab="position",ylab="Neg. log p")
abline(v=align,col="red",lty=2)
ub<-max(gwaG128$pos)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
pv128<-matrix(NA,nrow=L,ncol=1)
for(i in 1:L){
w1<-c(wins[i],wins[i+6])
win1<-which(gwaG128$pos >= w1[1] & gwaG128$pos <= w1[2])
if(length(win1) > 2){
pv128[i,]<-mean(gwaG128$neg.log.p[win1])
}
}
plot(wins[1:L],pv128,type='l',main="GWA Genabel 128",xlab="position",ylab="Neg. log p")
dev.off()
save(list=ls(),file="inversion.rdat")
3. The script mkFigure3Combined.R makes figure 3, which combines the indel delienation (mostly in the indel subdirectory) with the final plot with my LD difference metric and the GWA signal from genabel (also in windows). Here is the script:
ld128<-read.table("invLD128.txt",header=F)
ld702<-read.table("invLD702.txt",header=F)
## gwa genabel
gwaGenRg<-read.table("../mapping/Tcristinae_FHA2013_GenABEL_v1.3b2/latRG/Tcristinae_FHA.latRG.GenABEL_sparse_without_correction_for_pop_structure.txt",header=TRUE)
x<-which(gwaGenRg$scaf==702.1)
gwaG702<-gwaGenRg[x,]
gwaG702$pos<-top-gwaG702$pos
x<-which(gwaGenRg$scaf==128)
gwaG128<-gwaGenRg[x,]
ldwin<-function(LDg=NA,LDb=NA,win1=NA,win2=NA,q=0.9){
LDgWin<-as.numeric(LDg[win1,win2])
LDbWin<-as.numeric(LDb[win1,win2])
o<-c(mean(LDbWin),mean(LDgWin))
#o<-c(quantile(LDbWin,probs=q),quantile(LDgWin,probs=q))
return(o)
}
## genabel
ub<-max(gwaG702$pos)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
pv702<-matrix(NA,nrow=L,ncol=1)
for(i in 1:L){
w1<-c(wins[i],wins[i+6])
win1<-which(gwaG702$pos >= w1[1] & gwaG702$pos <= w1[2])
if(length(win1) > 2){
pv702[i,]<-mean(gwaG702$neg.log.p[win1])
}
}
L702<-L
wins702<-wins
ub<-max(gwaG128$pos)
ub<-round(ub,-4)
wins<-seq(1,ub,10000)
L<-length(wins)-6
pv128<-matrix(NA,nrow=L,ncol=1)
for(i in 1:L){
w1<-c(wins[i],wins[i+6])
win1<-which(gwaG128$pos >= w1[1] & gwaG128$pos <= w1[2])
if(length(win1) > 2){
pv128[i,]<-mean(gwaG128$neg.log.p[win1])
}
}
L128<-L
wins128<-wins
## indel results
load("../indel/covhmm_cluster.rdat")
pdf("fig3_inversion_deletion.pdf",width=8,height=13)
par(mfrow=c(4,2))
xx<-which(ld702[,1] < (14171514-4139489))
ld702[xx,3]<-1
xx<-which(ld128[,1] > 6190000)
ld128[xx,3]<-1
cs<-c("darkgray","orange")
par(mar=c(4.5,4.5,3,0))
plot(ld702[,1],ld702[,2],col=cs[ld702[,3]],ylim=c(-2,2),axes=FALSE,xlab="Position 702.1 (bp)",ylab="Standardized LD dif.",cex.lab=1.5,pch=20)
axis(1)
axis(2)
abline(v=14171514-4139489,col="red",lty=1,lwd=2)
title(main="(a) LD melanistic vs. green")
par(mar=c(4.5,0.25,3,0.5))
plot(ld128[,1],ld128[,2],col=cs[ld128[,3]],axes=FALSE,ylim=c(-2,2),xlab="Position 128 (bp)",ylab="",cex.lab=1.5,pch=20)
axis(1)
offset<-1.04
indel<-c(4970000,6190000)
polygon(c(indel,rev(indel)),
c(2*offset,2*offset,-2*offset,-2*offset),
lty=2,lwd=.8)
par(mar=c(4.5,4.5,3,0))
plot(wins702[c(1:L702)+3],rev(pv702),col="black",ylim=c(0,15.5),axes=FALSE,xlab="Position 702.1 (bp)",ylab="Neg. log p-value",cex.lab=1.5,pch=20)
axis(1)
axis(2)
abline(v=14171514-4139489,col="red",lty=1,lwd=2)
title(main="(b) GWA mapping of RG color")
par(mar=c(4.5,0.25,3,0.5))
plot(wins128[c(1:L128)+3],pv128,col="black",axes=FALSE,ylim=c(0,15.5),xlab="Position 128 (bp)",ylab="",cex.lab=1.5,pch=20)
axis(1)
offset<-1.04
indel<-c(4970000,6190000)
polygon(c(indel,rev(indel)),
c(15.5*offset,15.5*offset,0*offset,0*offset),
lty=2,lwd=.8)
ss<-1:4
## ids
mel<-c(2,4)
grn<-c(1,3)
css<-rep(NA,N)
css[mel]<-"brown"
css[grn]<-"darkgreen"
hdrL<-letters[3:6]
par(mar=c(4.5,4.5,3,0.5))
a<-1
for(i in ss){
xwin<-floor(apply(bnds,1,mean))
myc<-c(css[i],"darkgray")[fitB[[i]]]
plot(xwin,CmatS20k[,i],col=myc,pch=19,cex=.8,xlab="Position (bp)",ylab="Coverage",cex.lab=1.5)
offset<-1.04
polygon(xwin[c(249,249,310,310)],
c(min(CmatS20k[,i])*offset,max(CmatS20k[,i])*offset,
max(CmatS20k[,i])*offset,min(CmatS20k[,i])*offset),
lty=2,lwd=.8)
A<-hdrL[a];B<-sp[i]
if(css[i]=='darkgreen'){col<-"green"}
else{col<-"melanistic"}
title(main=paste("(",A,") ",B,", ",col,", P = ",round(out[a,3],3),sep=""))
a<-a+1
}
dev.off()
## 702.1 11.69--12.9 mb
## 128 4.98--6.43
And here is my text summary of the whole thing for the paper:
Several lines of evidence were used to delineate the approximate breakpoints for a putative large inversion in T. cristinae associated with green versus melanistic color morphs. We focused on two scaffolds on LG 8, 702.1 and 128, where we had previously mapped color and where a large number of SNPs were associated with color suggestive of an inversion or an otherwise region of reduced recombination (Nosil et al., 2018). We first used a comparative alignment of de novo genome assemblies from melanistic and green T. cristinae morphs to constrain the possible bounds of the putative inversion. Both genome assemblies combined data from standard fragment libraries, mate-pair libraries and Chicago libraries. These assemblies along with the comparative alignment were described in detail in Nosil et al. (2018). The alignment between melanistic morph scaffold 702.1 and green morph scaffold 1575 indicated that these genomes were co-linear along scaffold 702.1 up to bp 10,032,025 (the end of the alignment between these two scaffolds). Likewise, we found that scaffold 128 from the melanistic genome aligned to xxx from the green genome and that these two scaffolds were co-linear beyond the boundary of a putative large indel (see below) at 6.19 mb on scaffold 128. Thus, we fixed the breakpoints for the putative inversion between ~10 mb on melanistic scaffold 702.1 and ~6 mb on melanistic scaffold 128. This region in slightly narrower than the broad region of elevated GWA signal for color from the single SNP analysis in genabel (see Fig. 3 in the main text). Within this region, our green morph genome comprises many very small scaffolds, preventing a clear identification of the inversion based on these data alone. We think the reason for this poor assembly was that the green morph individual used for the de novo assembly was actually heterozygote for the green and melanistic haplotypes, and thus for the inversion, creating difficulty with the assembly.
We fit a hidden Markov Model (HMM) based on patterns of LD across scaffolds 702.1 and 128 to explicitly test for and better resolve the bounds of the putative inversion. In T. cristinae individuals homozygous for the brown morph haplotype, we would expect normal/high LD for SNPs on either side of the breakpoint when sequences are aligned to the melanistic morph reference genome. In contrast, for individuals homozygous for the green haplotype, LD should be lower when SNPs span the inversion breakpoint as such SNPs are not actually near each other and thus have a greater opportunity for recombination to reduce LD. This is the specific signal we searched for. First, we classified T. cristinae individuals from FHA as homozygous for the melanistic (N = 52) or green haplotype (N = 217) using PCA and k-means clustering as described below (heterozgyous individuals were ignored; see Heterozygote deficiency or excess).
We then divided scaffolds 702.1 and 128 into non-overlapping 10kb windows. We tooks sets of seven 10 kb windows at a time, and for each group (i.e., melanistic homozygotes and green homozygotes) we calculated the mean LD between all the SNPs in the first three and last three windows in the set of seven (i.e., two 30 kb windows separated by a 10 kb window). We measured LD as the coefficient of determination calculated from the genotype estimates. Again, we would expect mean LD to be lower in the green homozygotes if the breakpoint occurred within the set of seven 10 kb windows, and particularly so if it was in the middle 10 kb window. To capture this, we calculated what we refer to as the standardized difference in LD between melanistic and green homozygotes at ΔLDi = (LDmeli - LDgreeni)/(LDmeli + LDgreeni), where LDmeli and LDgreeni are the mean LD for window set i for melanistic and green homozygotes, respectively. Note that this metric is analogous to a signed coefficient of variation in LD between groups. We then computed this statistic in sliding sets of seven 10 kb windows with 10 kb window shifts.
We fit a HMM to the ΔLDi metrics using the R (version 3.4.2) package HiddenMarkov (version 1.8.11, Harte 2017) to fit the models, but modified the Mstep function to allow for these fixed parameter values. Doing so allowed us to focus on hidden states of interest for detecting the inversion. We assumed a Gaussian error distribution with means set to the empirical mean of the ΔLDi vector (‘normal’ state) and to the 95th quantile of the ΔLDi distribution (‘high’ or breakpoint state). Standard deviations for both states were set at 80% of the empirical standard deviation. We used the Baum-Welch algorithm with 500 iterations and a tolerance of 0.0001 to estimate the transition matrix between hidden states and the Viterbi algorithm to estimate the hidden states themselves (Baum et al., 1970).
We found several clustered regions of the ‘high’ LD state on scaffold 702.1 within the possible bounds of the inversion given by the comparative alignment (a smaller region of the high LD state was found outside of this region, and was ignored). We used the combination of these clustered regions to define the likely location of the ‘left’ breakpoint of the putative inversion between 11.69 and 12.90 mb on scaffold 702.1. The ‘right’ bound was similarly defined on scaffold 128 between 4.98 and 6.19 mb, which corresponds nearly perfectly with the green morph deletion. Note that while this region does have reduced coverage in green/green homozygotes (see below) we had sufficient data for some SNPs to calculate LD here. This could mean that there is copy number variation or that the deletion isn’t fixed.