Post date: Dec 05, 2017 9:39:39 PM
I calculated heterozygosity excess/deficits for various species/populations. I worked with the genotype files from Romain, which are basically (but not exactly) those used for final gemma runs. In particular, for T. bartmani I examined all three populations and included individuals without phenotype data. I also considered only species with green and dark morphs (same as other recent analyses). We can always expand this to other species/populations. Even with these criteria, some populations/species had to be dropped because the clusters were not informative. This included T. podura (because of chromosomal forms I think), and the PCT and BMCG3 T. bartmani populations (weird issues with one or a few individuals standing out by themselves and compressing the signal, plus small sample size for BMCG3).
The general approach was to run PCAs on the genotypes for the color locus (mel-stipe) only, and then to cluster with k-means. I always did this first at k=6, but dropped if one of the six clusters appeared to be missing. I then basically collapsed PC 2 clusters to get homozygotes and hets (always the middle cluster). I only report results for those that I was pretty comfortable with. Everything is in /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_genarch/het/.
For those that 'worked' the table below gives obs and expected het, the difference between the two, and results form a x-squared contingency test. Basically, we have an excess of hets and T. cristinae (FHA) and a deficit in T. bartmani (JL), and nothing else that we can be confident in.
Here is the code (commands.R in /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_genarch/het/):
## pca and kmeans
pcaKmns<-function(G=NA,snp=NA,k=6){
L<-dim(snp)[1]
snpDat<-matrix(NA,nrow=L,ncol=2)
for(i in 1:L){
o<-strsplit(as.character(snp[i,1]),split="f")
oo<-strsplit(as.character(o[[1]][2]),split=":")
snpDat[i,]<-as.numeric(oo[[1]])
}
mlb<-4139489
mub<-6414835
mel<-c(which(snpDat[,1]==702.1 & snpDat[,2] < mlb),
which(snpDat[,1]==128 & snpDat[,2] < mub))
Gx<-matrix(as.numeric(G[mel,-c(1:3)]),nrow=length(mel),ncol=dim(G)[2]-3,byrow=FALSE)
## standardize Gx
gmns<-apply(Gx,1,mean)
p<-(apply(Gx,1,sum)+1)/(2+2 *dim(Gx)[2])
Gxs<-Gx
for(i in 1:dim(Gx)[1]){
Gxs[i,]<-(Gx[i,]-gmns[i])/sqrt(p[i] * (1-p[i]))
}
pc<-prcomp(t(Gxs),center=FALSE,scale=FALSE)
library(MASS)
## kmeans clustering
ko<-kmeans(x=pc$x[,1:2],centers=k,iter.max=100,nstart=100)
plot(pc$x[,1],pc$x[,2],type='n')
text(pc$x[,1],pc$x[,2],ko$cluster)
return(ko)
}
summarizePca<-function(Ko=Ko,h1=NA,h2=NA,he=NA){
gfrq<-table(Ko$cluster)/length(Ko$cluster)
## order by green vs. melanic
hom1<-sum(gfrq[h1])
hom2<-sum(gfrq[h2])
het<-sum(gfrq[he])
p<-hom1+.5*het
q<-1-p
expH<-2*p*q
o<-c(het,expH,het-expH)
names(o)<-c("obs","hw","obs-hw")
cnts<-table(Ko$cluster)
## order by green vs. melanic
hom1<-sum(cnts[h1])
hom2<-sum(cnts[h2])
het<-sum(cnts[he])
N<-sum(cnts)
ex<-c(p^2, 2 * p * q, q^2)
obs<-c(hom1,het,hom2)
oo<-chisq.test(x=obs,p=ex,simulate.p.value=TRUE,B=10000)
out<-c(o,oo$statistic,oo$p.value)
return(out)
}
hetExcess<-matrix(NA,nrow=10,ncol=5)
colnames(hetExcess)<-c("obs_het","HW_het","obs-Hw","X-squared","P")
rownames(hetExcess)<-rep("Tsp",10)
## timema cristinae
G<-as.matrix(read.table("Tcristinae_FHA.latRG.bial.noindel.qs20.cov50.mdp295Mdp5900.maf0.01.lgNA.excluded.geno",header=F))
snp<-read.table("Tcristinae_FHA.latRG.bial.noindel.qs20.cov50.mdp295Mdp5900.maf0.01.lgNA.excluded.snploc",header=FALSE)
trcKo<-pcaKmns(G=G,snp=snp,k=6)
hetExcess[1,]<-summarizePca(Ko=trcKo,h1=c(1,4,5),h2=c(2),he=c(3,6))
row.names(hetExcess)[1]<-"Tcristinae_FHA"
## timema landelsensis
G<-as.matrix(read.table("Tlandelsensis_BCBOG-BCHC-BCHR-BCOG-BCSUM.latRG.bial.noindel.qs20.cov50.mdp47Mdp940.maf0.01.lgNA.excluded.geno",header=F))
snp<-read.table("Tlandelsensis_BCBOG-BCHC-BCHR-BCOG-BCSUM.latRG.bial.noindel.qs20.cov50.mdp47Mdp940.maf0.01.lgNA.excluded.snploc",header=FALSE)
tlanKo<-pcaKmns(G=G,snp=snp,k=5)
hetExcess[2,]<-summarizePca(Ko=tlanKo,h1=c(3,4),h2=c(2),he=c(1,5))
row.names(hetExcess)[2]<-"Tlandelsensis"
## timema californicum
G<-as.matrix(read.table("Tcalifornicum_LPLickSM.latRG.bial.noindel.qs20.cov50.mdp41Mdp830.maf0.01.lgNA.excluded.geno",header=F))
snp<-read.table("Tcalifornicum_LPLickSM.latRG.bial.noindel.qs20.cov50.mdp41Mdp830.maf0.01.lgNA.excluded.snploc",header=FALSE)
tcalKo<-pcaKmns(G=G,snp=snp,k=5)
## timema podura
G<-as.matrix(read.table("Tpodura_BSC.latRG.bial.noindel.qs20.cov50.mdp25Mdp420.maf0.01.lgNA.excluded.geno",header=F))
snp<-read.table("Tpodura_BSC.latRG.bial.noindel.qs20.cov50.mdp25Mdp420.maf0.01.lgNA.excluded.snploc",header=FALSE)
tpodKo<-pcaKmns(G=G,snp=snp,k=3) ## NA, chromosome forms, no hets
## timema chumash
G<-as.matrix(read.table("Tchumash_GR8.06.latRG.bial.noindel.qs20.cov50.mdp265Mdp5310.maf0.01.lgNA.excluded.geno",header=F))
snp<-read.table("Tchumash_GR8.06.latRG.bial.noindel.qs20.cov50.mdp265Mdp5310.maf0.01.lgNA.excluded.snploc",header=FALSE)
tchumKo<-pcaKmns(G=G,snp=snp,k=5)
hetExcess[4,]<-summarizePca(Ko=tchumKo,h1=c(1,4),h2=c(5),he=c(2,3))
row.names(hetExcess)[4]<-"Tchumash_GR8"
## timema bartmani JL
G<-as.matrix(read.table("Tbartmani_JL.allgreen.bin.bial.noindel.qs20.cov50.mdp204Mdp4080.maf0.01.lgNA.excluded.geno",header=F))
snp<-read.table("Tbartmani_JL.allgreen.bin.bial.noindel.qs20.cov50.mdp204Mdp4080.maf0.01.lgNA.excluded.snploc",header=FALSE)
tbartKo<-pcaKmns(G=G,snp=snp,k=6)
hetExcess[5,]<-summarizePca(Ko=tbartKo,h1=c(1,2,3),h2=c(4),he=c(5,6))
row.names(hetExcess)[5]<-"Tbartmani_JL"
## timema bartmani PCT
G<-as.matrix(read.table("Tbartmani_PCT.allgreen.bin.bial.noindel.qs20.cov50.mdp121Mdp2420.maf0.01.lgNA.excluded.geno",header=F))
snp<-read.table("Tbartmani_PCT.allgreen.bin.bial.noindel.qs20.cov50.mdp121Mdp2420.maf0.01.lgNA.excluded.snploc",header=FALSE)
tbartPctKo<-pcaKmns(G=G,snp=snp,k=6) ## not very clean, a few individuals stand way out on PC2
## timema bartmani BMCG3
G<-as.matrix(read.table("Tbartmani_BMCG3.allgreen.bin.bial.noindel.qs20.cov50.mdp26Mdp520.maf0.01.lgNA.excluded.geno",header=F))
snp<-read.table("Tbartmani_BMCG3.allgreen.bin.bial.noindel.qs20.cov50.mdp26Mdp520.maf0.01.lgNA.excluded.snploc",header=FALSE)
tbartBmcgKo<-pcaKmns(G=G,snp=snp,k=6) ## also weird indidiuals standing out, and not too many inds
# obs_het HW_het obs-Hw X-squared P
#Tcristinae_FHA 0.5440678 0.4608949 0.08317294 19.2137608 0.00009999
#Tlandelsensis 0.2553191 0.2962879 -0.04096876 1.7972385 0.38426157
#Tcalifornicum_LPLickSM 0.5662651 0.4927421 0.07352301 1.8479294 0.40185981
#Tchumash_GR8 0.2824859 0.2864793 -0.00399346 0.1031826 0.94890511
#Tbartmani_JL 0.4166667 0.4844291 -0.06776240 7.9832058 0.01809819
write.table(round(hetExcess[1:5,],5),"hetExcess.txt",quote=FALSE)
save(list=ls(),file="timemaHet.rdat")
Here are the results:
obs_het HW_het obs-Hw X-squared P
Tcristinae_FHA 0.5441 0.4609 0.0832 19.2138 0.0001
Tlandelsensis 0.2553 0.2963 -0.0410 1.7972 0.3843
Tcalifornicum_LPLickSM 0.5663 0.4927 0.0735 1.8479 0.4019
Tchumash_GR8 0.2825 0.2865 -0.0040 0.1032 0.9489
Tbartmani_JL 0.4167 0.4844 -0.0678 7.9832 0.0181