Post date: Oct 16, 2015 1:42:7 AM
We used six metrics to discriminate between diploids and triploids. Two were haplotype-based (prop. of haplotype loci with 2 or more or 3 or more alleles), one SNP based (prop. het.) and three were based on uneven coverage (proportion of SNPs in different coverage bins). The first three were normalized by coverage. Here is the R code, which uses PCA to create two composite variable from these six, k-means clustering to find two groups (roughly dips vs. trips), and LDA to assign individuals probabilistically to groups:
ids<-read.table("ids.txt",header=F)
G<-as.matrix(read.table("pntest_filt2xAspenMaf1.txt",header=F))
Ghat<-round(G,0)
Het<-apply(Ghat==1,2,mean)
het<-as.matrix(read.table("hetAlleleDepth.txt",header=F))
a<-seq(1,380,2)
b<-seq(2,380,2)
ratio<-het[,a]/het[,b]
depth<-het[,a]+het[,b]
mnDpth<-apply(depth,2,mean,na.rm=T)
vlratio<-apply(log10(ratio),2,var,na.rm=T)
hn<-read.table("hapnum.txt",header=TRUE)
## relationship between depth and number of haplotype alleles
d<-which(ids[,3]=='D')
t<-which(ids[,3]=='T')
## 2 or more or 3 or more haplotype alleles
plot(mnDpth,hn$exH3ormor,type='n')
points(mnDpth[t],hn$exH3ormor[t],col="orange")
points(mnDpth[d],hn$exH3ormor[d],col="blue")
outd<-lm(hn$exH3ormor[d] ~ mnDpth[d])
outt<-lm(hn$exH3ormor[t] ~ mnDpth[t])
abline(a=-4.869e-03,b=8.601e-04,col="orange")
abline(a=-7.504e-04,b=2.212e-04,col="blue")
plot(out2$residuals,out3$residuals,type='n',xlim=c(-0.05,0.06),ylim=c(-0.012,0.02))
points(out2$residuals[d],out3$residuals[d],col="blue")
points(out2$residuals[t],out3$residuals[t],col="orange")
## heterozygosity
plot(mnDpth,Het,type='n')
points(mnDpth[t],Het[t],col="orange")
points(mnDpth[d],Het[d],col="blue")
out3<-lm(hn$exH3ormor ~ mnDpth)
out2<-lm(hn$exH2ormor ~ mnDpth)
outH<-lm(Het ~ mnDpth)
plot(outH$residuals,out3$residuals,type='n')
points(outH$residuals[d],out3$residuals[d],col="blue")
points(outH$residuals[t],out3$residuals[t],col="orange")
plot(Het,hn$exH3ormor,type='n')
points(Het[d],hn$exH3ormor[d],col="blue")
points(Het[t],hn$exH3ormor[t],col="orange")
##
rngs<-matrix(NA,nrow=3,ncol=2)
rngs[1,]<-c(-.15,0.15)
rngs[2,]<-c(-.45,-0.15)
rngs[3,]<-c(.15,.45)
props<-matrix(NA,nrow=190,ncol=3)
for(i in 1:190){
total<-sum(is.na(log10(ratio[,i]))==FALSE)
for(j in 1:3){
props[i,j]<-sum(log10(ratio[,i]) > rngs[j,1] & log10(ratio[,i]) < rngs[j,2],na.rm=TRUE)/total
}
}
datamatrix<-cbind(out2$residuals,out3$residuals,outH$residuals,props[-129,])
dt<-ids[-129,3]
pcout<-prcomp(datamatrix,center=TRUE,scale=TRUE)
kout<-kmeans(x=pcout$x[,1:2],centers=2)
plot(pcout$x[,1],pcout$x[,2],type='n')
a<-which(kout$cluster==1)
b<-which(kout$cluster==2)
points(pcout$x[a,1],pcout$x[a,2],col="blue")
points(pcout$x[b,1],pcout$x[b,2],col="orange")
lout<-lds(kout$cluster ~ pcout$x[,1] + pcout$x[,2],CV=TRUE,prior=c(0.5,0.5))
plot(density(lout$posterior[which(dt=='D'),1],from=0,to=1),col="blue")
lines(density(lout$posterior[which(dt=='T'),1],from=0,to=1),col="orange")
out<-cbind(ids[-129,],round(lout$posterior,3))
olnames(out)<-c("pop","id","ploidy","prD","prT")
write.csv(out,"diptripProbs.csv",row.names=FALSE)
save(list=ls(),file="aspenDT.Rdata")
This seems to have worked well. Most individuals are accurately assigned if we take previous data as the truth. Interestingly, we found that uneven of coverage was biased by orientation, with an excess of 2:1 for trips and 1:2 for dips. This is odd, but could be explained if the reference sequence is from a diploid and triploids are hybrids. We need to look into this.