Post date: May 08, 2019 4:23:26 PM
Scripts and results for the whole genome phylogenetic analyses are in /uufs/chpc.utah.edu/common/home/u6000989/projects/lyc_hybanc/phylo_2019_V2/. The goals of these analyses are to generate a consensus species tree, and then to ask whether autosomes or the Z chromosome support this true more (i.e., if more chunks of the genome support the tree more). Trees were inferred with RAxML (version 8.2.9). Note that most analyses use the standard GTR model with no rate heterogeneity (because the program suggested that is what I should do given the values of alpha inferred). There is a model that corrects for ascertainment bias, but that only works if sites are not heterozygous (so can't use it).
0. In this second version, I am using the *V2* files that dropped SNPs withing 5 bp of each other. See morefilterV2_filtered6x_LycWgVars.vcf and sorted*V2 in /uufs/chpc.utah.edu/common/home/u6000989/projects/lyc_hybanc/genome_variants_2019/. This includes 2,054,096 SNPs.
Here are the details:
0a. Filter SNPs within 5 bp dropped and high coverage = 450
perl filterSomeMoreV2.pl filtered6x_LycWgVars.vcf
Retained 2118336 variable loci
0b. Extract genotype estimates and sort them by LG
perl getGenotypesV2.pl
perl addLgInfo.pl lgs.txt LycWGSnpDataV2.txt
perl addLgInfo.pl lgs.txt LycWGDatV2.txt
this includes 2054096 SNPs on the LG scaffolds
0c. Convert the genotypes an alignment; assign putative heterozygotes based on IUPAC ambiguity codes
perl mkAlignmentV2.pl
1. Generate alignment file for just autosomes (drops the Z). This is what I want to use for the "species tree"
ls 1kb/LG*/RA*best*win* | perl -p -i -e 's/^\S+lg([0-9Z]+)win(\d+)/\1 \2/g' > treeLgWin1kb.txt
perl getWinScafPos.pl 1000 ../genome_variants_2019/sorted_LycWGSnpDataV2.txt > wins1kb.tx
perl appendWins.pl treeLgWin1kb.txt wins1kb.txt > treeLgWinDat1kb.txt
raxmlHPC-SSE3 -E subAutos -s LycWGSnpAlignmentV2 -m GTRGAMMA -n LycWGSnpAlignAutoV2
You have excluded 40895 out of 2054096 columns
An alignment file with excluded columns is printed to file LycWGSnpAlignmentV2.subAutos
raxmlHPC-SSE3 -f a -x 12345 -p 12345 -# 100 -m GTRGAMMAX -s LycWGSnpAlignmentV2.subAutos -n raxmlAuto
Model Parameters of Partition 0, Name: No Name Provided, Type of Data: DNA
alpha: 1.877112
Tree-Length: 0.163444
rate A <-> C: 0.998534
rate A <-> G: 3.734758
rate A <-> T: 2.236187
rate C <-> G: 0.620290
rate C <-> T: 3.741286
rate G <-> T: 1.000000
freq pi(A): 0.240313
freq pi(C): 0.259204
freq pi(G): 0.259646
freq pi(T): 0.240838
2. Then, I split the alignment by LG (including Z = 23).
perl mkLgSubsets.pl
perl moveFiles.pl LycWGSnpAlignmentV2.subLg*
3. I then fit ML tress for non-overlapping windows of 1000 SNPs for each LG.
sbatch SubWrapWindows.sh
#!/bin/sh
##SBATCH --time=72:00:00
##SBATCH --nodes=1
##SBATCH --ntasks=20
##SBATCH --account=gompert-kp
##SBATCH --partition=gompert-kp
##SBATCH --job-name=raxml
##SBATCH --mail-type=FAIL
##SBATCH --mail-user=zach.gompert@usu.edu
cd /uufs/chpc.utah.edu/common/home/u6000989/projects/lyc_hybanc/phylo_2019_V2
for i in {1..23}
do
echo "working on LG $i\n"
cd LG$i
perl ../mkRunWindows.pl LycWGSnpAlignmentV2.subLg$i > log
cd ..
done
4. Add notes from labptop!!
5. My main analyses and plots are all in phylo.R. Some of the required packages do not install successfully on the cluster, so this needs to be run on my desktop or laptop.
library(ape)
library(phytools)
trs<-read.tree("catWindowTrees1kb")
#####
myUniqueTrees<-function (x, incomparables = FALSE, use.edge.length = FALSE,
use.tip.label = TRUE)
{
n <- length(x)
cnts<-NULL
keep <- 1L
old.index <- seq_len(n)
for (i in 2:n) {
already.seen <- FALSE
for (j in keep) {
if (all.equal(x[[j]], x[[i]], use.edge.length = use.edge.length,
use.tip.label = use.tip.label)) {
already.seen <- TRUE
old.index[i] <- j
cnts[j]<-cnts[j]+1
break
}
}
if (!already.seen) {
keep <- c(keep, i)
cnts<-c(cnts,1) ## here
}
}
res <- x[keep]
attr(res, "old.index") <- old.index
out <- list(res,old.index)
out
}
## unrooted
o<-myUniqueTrees(trs)
cnts<-table(o[[2]])
props<-cnts/sum(cnts)
#midroot
Nt<-length(trs)
mtrs<-trs
for(k in 1:Nt){
mtrs[[k]]<-midpoint.root(trs[[k]])
}
om<-myUniqueTrees(mtrs)
pdf("trees.pdf",width=8,height=8)
trord<-rev(as.numeric(names(sort(table(om[[2]])))))
par(mfrow=c(3,3))
for(k in 1:9){
i<-trord[k]
plot.phylo(mtrs[[i]],use.edge.length=FALSE)
}
trord<-rev(as.numeric(names(sort(table(o[[2]])))))
par(mfrow=c(3,3))
for(k in 1:9){
i<-trord[k]
plot.phylo(trs[[i]],"unrooted",use.edge.length=FALSE)
}
dev.off()
lgs<-scan("treeLGs1kb.txt")
spTr<-tapply(X=o[[2]]==1,INDEX=lgs,mean)
barplot(spTr)
spTr<-tapply(X=om[[2]]==1,INDEX=lgs,mean)
barplot(spTr)
a<-rep(0,length(lgs))
a[lgs==23]<-1
spTr<-tapply(X=om[[2]]==1,INDEX=a,mean)
spTr<-tapply(X=o[[2]]==1,INDEX=a,mean)
## barplots comparing top trees
## mid-point rooted
N<-length(table(om[[2]]))
prmat<-matrix(NA,nrow=N,ncol=2)
trNum<-as.numeric(names(table(om[[2]])))
for(i in 1:N){
prmat[i,]<-tapply(X=om[[2]]==trNum[i],INDEX=lgs==23,mean)
}
barplot(t(prmat),beside=TRUE,horiz=FALSE,names.arg=trNum)
## unrooted
N<-length(table(o[[2]]))
prmat<-matrix(NA,nrow=N,ncol=2)
trNum<-as.numeric(names(table(o[[2]])))
for(i in 1:N){
prmat[i,]<-tapply(X=o[[2]]==trNum[i],INDEX=lgs==23,mean)
}
barplot(t(prmat),beside=TRUE,horiz=FALSE,names.arg=trNum)
## big mat for image plot
library(RColorBrewer)
library(fields)
prM<-matrix(NA,nrow=N,ncol=23)
trNum<-as.numeric(names(table(o[[2]])))
for(i in 1:N){
prM[i,]<-tapply(X=o[[2]]==trNum[i],INDEX=lgs,mean)
}
rmns<-apply(prM,1,mean)
prM<-prM[rev(order(rmns)),rev(1:23)]
brks<-c(-0.001,0.025,0.05,0.075,0.1,0.15,0.2,0.3,0.4,0.6)
cs<-brewer.pal(n=9,"BuGn")
pdf("treeXlg.pdf",width=7,height=7)
image.plot(prM,breaks=brks,col=cs,axes=FALSE,xlab="Tree no.",ylab="Linkage group",cex.lab=1.6)
axis(2,at=seq(0,23,24/23)/23,rev(c(1:22,"Z")),las=2)
axis(1,at=seq(0,15,16/15)/15,1:15,las=2)
box()
dev.off()
trNum[rev(order(rmns))]
# [1] 1 4 9 11 3 15 37 82 66 30 201 47 20 139 608
## tree 1 = species tree
## tree 37 = SN and W grouped
## tree 4 = warner with melissa
## tree 15 = idas with anna, sierra closer to meliass
## tree 1
obs<-sum(o[[2]][lgs==23]==1)
null<-rep(NA,1000)
null[1]<-obs
x<-o[[2]]
for(i in 2:1000){
rx<-sample(x,length(x),replace=FALSE)
null[i]<-sum(rx[lgs==23]==1)
}
mean(obs > null)
#[1] 0.643
obs/mean(null)
#[1] 1.116427
## tree 37
obs<-sum(o[[2]][lgs==23]==37)
null<-rep(NA,1000)
null[1]<-obs
x<-o[[2]]
for(i in 2:1000){
rx<-sample(x,length(x),replace=FALSE)
null[i]<-sum(rx[lgs==23]==37)
}
mean(obs <= null)
#[1] 0.001
#obs/mean(null)
#[1] 5.358013
## tree 4
obs<-sum(o[[2]][lgs==23]==4)
null<-rep(NA,1000)
null[1]<-obs
x<-o[[2]]
for(i in 2:1000){
rx<-sample(x,length(x),replace=FALSE)
null[i]<-sum(rx[lgs==23]==4)
}
mean(obs <= null)
#[1] 0.001
obs/mean(null)
#[1] 0.09882399
### tree 15 = idas with anna, sierra closer to meliass
obs<-sum(o[[2]][lgs==23]==15)
null<-rep(NA,1000)
null[1]<-obs
x<-o[[2]]
for(i in 2:1000){
rx<-sample(x,length(x),replace=FALSE)
null[i]<-sum(rx[lgs==23]==15)
}
mean(obs >= null)
#[1] 0.079
## comparison with Dubois/JH
snpWins<-read.table("treeLgWinDat1kb.txt")
## top beta SNPs ##
b<-read.table("betaSnps.txt",header=TRUE)
bwins<-NULL
for(i in 1:dim(b)[1]){
awin<-which(snpWins[,3]==b[i,1] & snpWins[,4] <= b[i,2] & snpWins[,6] >= b[i,2])
if(length(awin) > 0){
bwins<-c(awin,bwins)
}
}
ubwins<-unique(bwins)
## ubwins tree 4
obs<-sum(o[[2]][ubwins]==4)
null<-rep(NA,1000)
x<-o[[2]]
for(i in 1:1000){
rx<-sample(x,length(x),replace=FALSE)
null[i]<-sum(rx[ubwins]==4)
}
mean(obs >= null)
#[1] 0.047
obs/mean(null)
#[1] 0.4254112
## ubwins tree 37
obs<-sum(o[[2]][ubwins]==37)
null<-rep(NA,1000)
x<-o[[2]]
for(i in 1:1000){
rx<-sample(x,length(x),replace=FALSE)
null[i]<-sum(rx[ubwins]==37)
}
mean(obs <= null)
#[1] 0.008
obs/mean(null)
#[1] 3.615329
### MAKE FIGURE ###
mltr<-read.tree("RAxML_bestTree.raxmlAuto")
mltr$tip.label<-c("Sierra_Nevada","Warner_Mts","L_melissa","L_idas","L_anna")
## figure
cs<-c("black","#E69F00","royalblue4","hotpink4","lightseagreen")
css<-c("seagreen","black","#E69F00","royalblue4","hotpink4","lightseagreen")
library(phangorn)
pdf("phylo.pdf",width=9,height=9)
layout(matrix(c(1,1,2,2,3,4,4,5,6,7,7,8),nrow=3,ncol=4,byrow=TRUE),widths=c(4,2,2,4),heights=c(4,4,4))
par(mar=c(0,0,2.5,0))
cm<-1.7
ct<-1.4
cl<-1.6
ca<-1.2
## map
popDat<-read.table("popLatLong.txt",header=TRUE)
library(ggplot2)
library(ggmap)
library(maps)
library(mapdata)
library(grid)
library(gridBase)
usa <- map_data("usa")
state_dat<-map_data("state")
map_dat<-rbind(state_dat,usa)
p0 <- ggplot() +
geom_polygon(data=map_dat,aes(x=long,y=lat,group=group, fill=region),fill="white",color="black", show.legend=FALSE) +
coord_map("gilbert",xlim=c(-125,-108),ylim=c(37.5,48)) +
labs(x=expression("Longitude"*~degree*W), y=expression("Latitude"*~degree*N)) +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
#plot.margin=unit(c(1,0.5,1,1),'cm'),
legend.position='right') + theme(axis.title.y = element_text(size = 15, vjust=3,hjust=0.5)) + theme(axis.ticks = element_blank(),axis.text = element_blank(),axis.title.x = element_text(size = 15,vjust=-1.3,hjust=0.5))
p1 <- p0 + geom_point(data=popDat,aes(x=longitude,y=latitude,colour=species,show.legend=TRUE)) + scale_color_manual(values=css)
plot.new()
vps <- baseViewports()
pushViewport(vps$figure)
vp1 <-plotViewport(c(0.2,0.2,1.5,0.1))
print(p1, vp=vp1)
title(main="(A) Map of localities",cex.main = cm, adj=0)
## ML tree
plot(midpoint(mltr),tip.color=cs[rank(mltr$tip.label)],cex=ct)
add.scale.bar()
title(main="(B) Whole genome phylogeny",cex.main = cm, adj=0)
plot.phylo(trs[[1]],"unrooted",use.edge.length=FALSE,
tip.color=cs[rank(trs[[1]]$tip.label)],cex=ct)
title(main="(C) Consensus tree",cex.main = cm, adj=0)
plot.phylo(trs[[4]],"unrooted",use.edge.length=FALSE,
tip.color=cs[rank(trs[[4]]$tip.label)],cex=ct,
rotate.tree=270)
title(main="(D) Warner-melissa tree",cex.main = cm, adj=0)
plot.phylo(trs[[37]],"unrooted",use.edge.length=FALSE,
tip.color=cs[rank(trs[[37]]$tip.label)],cex=ct,
rotate.tree=45)
title(main="(E) Warner-sierra tree",cex.main = cm, adj=0)
par(mar=c(5.5,4.5,2.5,1))
trx<-1
grNams=c("Autosomes","Z chrom.","Barrier loci")
props<-c(mean(o[[2]][lgs!=23] == trx),mean(o[[2]][lgs==23]==trx),mean(o[[2]][ubwins]==trx))
barplot(props,names.arg=grNams,cex.names=ca,ylab="Proportion",
xlab="Subset",cex.lab=cl)
title(main="(F) Prop. consensus",cex.main = cm, adj=0)
trx<-4
props<-c(mean(o[[2]][lgs!=23] == trx),mean(o[[2]][lgs==23]==trx),mean(o[[2]][ubwins]==trx))
barplot(props,names.arg=grNams,cex.names=ca,ylab="Proportion",
xlab="Subset",cex.lab=cl)
title(main="(G) Prop. warner-melissa",cex.main = cm, adj=0)
trx<-37
props<-c(mean(o[[2]][lgs!=23] == trx),mean(o[[2]][lgs==23]==trx),mean(o[[2]][ubwins]==trx))
barplot(props,names.arg=grNams,cex.names=ca,ylab="Proportion",
xlab="Subset",cex.lab=cl)
title(main="(F) Prop. warner-sierra",cex.main = cm, adj=0)
dev.off()
## plot along chroms
barTrees<-vector("list",23)
for(i in 1:23){
a<-which(snpWins[,1]==i)
winsA<-snpWins[a,2]
Nw<-length(a)
aMat<-matrix(NA,nrow=Nw,ncol=5)
for(j in 1:Nw){
if(o[[2]][a][order(winsA)][j] == 1){
aMat[j,]<-c(1,0,0,0,0)
}
else if(o[[2]][a][order(winsA)][j] == 4){
aMat[j,]<-c(0,1,0,0,0)
}
else if(o[[2]][a][order(winsA)][j] == 37){
aMat[j,]<-c(0,0,1,0,0)
}
else if(o[[2]][a][order(winsA)][j] == 9){
aMat[j,]<-c(0,0,0,1,0)
}
else{
aMat[j,]<-c(0,0,0,0,1)
}
}
barTrees[[i]]<-aMat
}
pdf("treesByLg.pdf",width=9,height=12)
par(mfrow=c(23,1))
par(mar=c(0,3,0,0))
for(i in 1:23){
barplot(t(barTrees[[i]]),col=c("black","#FB9A99","#E31A1C","darkgray","lightgray"),xlim=c(0,1),
axes=FALSE,width=1/160,border=NA)
if(i < 23){
mtext(i,2,las=2,cex=1.5)
}
else{
mtext("Z",2,las=2,cex=1.5)
}
}
dev.off()
pdf("legendPlust.pdf",width=9,height=4)
layout(matrix(1:5,nrow=1,ncol=5),widths=c(2,4,4,4,4),heights=5)
par(mar=c(0,0,1.5,0))
cm<-1.4
ct<-1.2
cl<-1.6
ca<-1.2
plot(c(0,1),c(0,1),type='n',xlab="",ylab="",axes=FALSE)
legend(0,1,c("Tree 1","Tree 2","Tree 7","Tree 3","other"),fill=c("black","#FB9A99","#E31A1C","darkgray","lightgray"),cex=1.5, bty='n')
plot.phylo(trs[[1]],"unrooted",use.edge.length=FALSE,
tip.color=cs[rank(trs[[1]]$tip.label)],cex=ct)
title("Tree 1",cex.main=cm)
plot.phylo(trs[[4]],"unrooted",use.edge.length=FALSE,
tip.color=cs[rank(trs[[4]]$tip.label)],cex=ct,
rotate.tree=270)
title("Tree 2",cex.main=cm)
plot.phylo(trs[[37]],"unrooted",use.edge.length=FALSE,
tip.color=cs[rank(trs[[37]]$tip.label)],cex=ct,
rotate.tree=45)
title("Tree 7",cex.main=cm)
plot.phylo(trs[[9]],"unrooted",use.edge.length=FALSE,
tip.color=cs[rank(trs[[9]]$tip.label)],cex=ct,
rotate.tree=345)
title("Tree 3",cex.main=cm)
dev.off()