To do:
compare high alpha with high ancmeans
compare low alpha with low ancmeans
compare high beta with folded high ancmeans
Check Z and autosomes for each of them
After redo the SNP annotations
### read in the popanc files ##########
bcr3<-read.table("../popanc/outfiles/aims/out_aims3_BCR.txt", header=T, sep=",")
bic3<-read.table("../popanc/outfiles/aims/out_aims3_BIC.txt", header=T, sep=",")
bld3<-read.table("../popanc/outfiles/aims/out_aims3_BLD.txt", header=T, sep=",")
bnp3<-read.table("../popanc/outfiles/aims/out_aims3_BNP.txt", header=T, sep=",")
btb3<-read.table("../popanc/outfiles/aims/out_aims3_BTB.txt", header=T, sep=",")
frc3<-read.table("../popanc/outfiles/aims/out_aims3_FRC.txt", header=T, sep=",")
pin3<-read.table("../popanc/outfiles/aims/out_aims3_PIN.txt", header=T, sep=",")
psp3<-read.table("../popanc/outfiles/aims/out_aims3_PSP.txt", header=T, sep=",")
rdl3<-read.table("../popanc/outfiles/aims/out_aims3_RDL.txt", header=T, sep=",")
rnv3<-read.table("../popanc/outfiles/aims/out_aims3_RNV.txt", header=T, sep=",")
## take ancestry means across populations ####
ancmean<-(bcr3$mean + bic3$mean + bld3$mean + bnp3$mean + btb3$mean + frc3$mean + rdl3$mean + rnv3$mean + pin3$mean + psp3$mean) / 10
## folding the values ####
ancmean_fold<-ancmean
ancmean_fold[ancmean_fold < 0.5]<-1-ancmean_fold[ancmean_fold < 0.5]
### read in the clines files. Here I am using the medians across MCMC and different chains for each parameter alpha and beta ###############
cline3<-read.table("../clines/outfiles/pct3_alphabeta_mcmcmedian.txt", header=TRUE)
### get the scaffold position file #########
mappos3<-read.table("../popanc/infiles/aims/mappos3.txt", header=F, sep=":")
### create a master file with scaffold position and clines and ancestry values
datfile<-cbind(mappos3, ancmean, ancmean_fold, cline3)
colnames(datfile)<-c("scaffold","position","ancmean", "ancfold","alpha","beta")
head(datfile)
##########################################################################
## trying out randomization for top 10% quantiles ###
##########################################################################
### high alpha ####
anctop = quantile(datfile$ancmean,probs=c(0.9,0.99,0.999,0.9999))
alphatop = quantile(datfile$alpha,probs=c(0.9,0.99,0.999,0.9999))
length(which(datfile$ancmean > anctop[1] & datfile$alpha > alphatop[1]))
#subset the common rows from the master data file
comsnps<-datfile[which(datfile$ancmean > anctop[1] & datfile$alpha > alphatop[1]),]
## randomization for shared snps
null_ha<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-0.9)*length(datfile$ancmean),0)
ar<-sample(1:length(datfile$ancmean),x, replace=FALSE)
cr<-sample(1:length(datfile$alpha),x,replace=FALSE)
null_ha[i]<- sum(ar %in% cr)
}
obs_ha<-length(which(datfile$ancmean > anctop[1] & datfile$alpha > alphatop[1])) #22
p_ha<-mean(null_ha >= obs_ha) #7e-04
xfold_ha<-obs_ha / mean(null_ha) #1.909092
obs_ha
p_ha
xfold_ha
## randomization for excess on Z
null<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-0.9)*length(datfile[,1]),0)
n = length(comsnps[,1])
cnull = sample(1:length(datfile[,1]), n , replace=F)
null[i] = sum(datfile$scaffold[cnull] == 1631)
}
obs = sum(comsnps$scaffold == 1631) #12
p<-mean(null >= obs) #3e-04
xfold<- obs/mean(null) #2.821339
cbind(obs, xfold, p)
## randomization for excess on autosomes
null<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-0.9)*length(datfile[,1]),0)
n = length(comsnps[,1])
cnull = sample(1:length(datfile[,1]), n , replace=F)
null[i] = sum(datfile$scaffold[cnull] != 1631)
}
obs = sum(comsnps$scaffold != 1631) #10
p<-mean(null >= obs) #1
xfold<- obs/mean(null) #0.5639713
cbind(obs, xfold, p)
### low alpha #####
anclo = quantile(datfile$ancmean,probs=c(0.1,0.01,0.001))
alphalo = quantile(datfile$alpha,probs=c(0.1,0.01,0.001))
length(which(datfile$ancmean < anclo[1] & datfile$alpha < alphalo[1]))
#subset the common rows from the master data file
comsnps_al<-datfile[which(datfile$ancmean < anclo[1] & datfile$alpha < alphalo[1]),]
## randomization for shared snps
null_la<-rep(NA, 10000)
for (i in 1:10000){
x<-round(0.1*length(datfile$ancmean),0)
ar<-sample(1:length(datfile$ancmean),x, replace=FALSE)
cr<-sample(1:length(datfile$alpha),x,replace=FALSE)
null_la[i]<- sum(ar %in% cr)
}
obs_la<-length(which(datfile$ancmean < anclo[1] & datfile$alpha < alphalo[1])) #9
p_la<-mean(null_la >= obs_la) #0.8394
xfold_la<-obs_la / mean(null_la) #0.7797001
obs_la
p_la
xfold_la
## randomization for excess on Z
null<-rep(NA, 10000)
for (i in 1:10000){
x<-round(0.1*length(datfile[,1]),0)
n = length(comsnps_al[,1])
cnull = sample(1:length(datfile[,1]), n , replace=F)
null[i] = sum(datfile$scaffold[cnull] == 1631)
}
obs = sum(comsnps_al$scaffold == 1631) #2
p<-mean(null >= obs) #0.5378
xfold<- obs/mean(null) #1.157675
cbind(obs, xfold, p)
## randomization for excess on autosomes
null<-rep(NA, 10000)
for (i in 1:10000){
x<-round((0.1)*length(datfile[,1]),0)
n = length(comsnps_al[,1])
cnull = sample(1:length(datfile[,1]), n , replace=F)
null[i] = sum(datfile$scaffold[cnull] != 1631)
}
obs = sum(comsnps_al$scaffold != 1631) #7
p<-mean(null >= obs) #0.7529
xfold<- obs/mean(null) #0.9631393
cbind(obs, xfold, p)
### high beta ####
anctop = quantile(datfile$ancfold,probs=c(0.9,0.99,0.999,0.9999))
betatop = quantile(datfile$beta,probs=c(0.9,0.99,0.999,0.9999))
length(which(datfile$ancfold > anctop[1] & datfile$beta > betatop[1]))
#subset the common rows from the master data file
comsnps_b<-datfile[which(datfile$ancfold > anctop[1] & datfile$beta > betatop[1]),]
## randomization for shared snps
null_hb<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-0.9)*length(datfile$ancfold),0)
ar<-sample(1:length(datfile$ancfold),x, replace=FALSE)
cr<-sample(1:length(datfile$beta),x,replace=FALSE)
null_hb[i]<- sum(ar %in% cr)
}
obs_hb<-length(which(datfile$ancfold > anctop[1] & datfile$beta > betatop[1])) #49
p_hb<-mean(null_hb >= obs_hb) #<0
xfold_hb<-obs_hb / mean(null_hb) #4.234395
obs_hb
p_hb
xfold_hb
## randomization for excess on Z
null<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-0.9)*length(datfile[,1]),0)
n = length(comsnps_b[,1])
cnull = sample(1:length(datfile[,1]), n , replace=F)
null[i] = sum(datfile$scaffold[cnull] == 1631)
}
obs = sum(comsnps_b$scaffold == 1631) #37
p<-mean(null >= obs) #<0
xfold<- obs/mean(null) #3.919575
cbind(obs, xfold, p)
## randomization for excess on autosomes
null<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-0.9)*length(datfile[,1]),0)
n = length(comsnps_b[,1])
cnull = sample(1:length(datfile[,1]), n , replace=F)
null[i] = sum(datfile$scaffold[cnull] != 1631)
}
obs = sum(comsnps_b$scaffold != 1631) #12
p<-mean(null >= obs) #1
xfold<- obs/mean(null) #0.3036829
cbind(obs, xfold, p)
#######################################################################################
####### randomization functions for shared SNPs in top quantiles ######################
#######################################################################################
quants<-c(0.99,0.98, 0.97, 0.96, 0.95,0.94,0.93,0.92,0.91,0.9)
quants.lo<-c(0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1)
randomshared_high <- function(dfile = NA, anc = NA, cline = NA, q = NA) {
{
anctop = quantile(dfile[[anc]],probs=q)
clinetop = quantile(dfile[[cline]],probs=q)
#subset the common rows from the master data file
comsnps<-dfile[which(dfile[[anc]] > anctop & dfile[[cline]] > clinetop),]
## Randomization 1: randomization for shared snps ##########################
null_shared<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-q)*length(dfile[[anc]]),0)
ar<-sample(1:length(dfile[[anc]]),x, replace=FALSE)
cr<-sample(1:length(dfile[[cline]]),x,replace=FALSE)
null_shared[i]<- sum(ar %in% cr)
}
obs_shared<-length(which(dfile[[anc]] > anctop & dfile[[cline]] > clinetop))
p_shared<-mean(null_shared >= obs_shared)
xfold_shared<-obs_shared / mean(null_shared)
result_shared<-cbind(obs_shared, xfold_shared, p_shared)
}
return(result_shared)
}
#randomization_high(datfile, 'ancmean','alpha',0.9)
enrichshared_hialpha<-vector("list",10)
enrichshared_hibeta<-vector("list",10)
for (i in 1:10){
#hi alpha, hi raw ancmeans
enrichshared_hialpha[[i]]<-randomshared_high(datfile, 'ancmean', 'alpha', quants[i])
#hi beta, hi folded ancmeans
enrichshared_hibeta[[i]]<-randomshared_high(datfile, 'ancfold', 'beta', quants[i])
}
### randomization for low alpha ####
randomshared_low <- function(dfile = NA, anc = NA, cline = NA, q = NA) {
{
anctop = quantile(dfile[[anc]],probs=q)
clinetop = quantile(dfile[[cline]],probs=q)
#subset the common rows from the master data file
comsnps<-dfile[which(dfile[[anc]] < anctop & dfile[[cline]] < clinetop),]
## Randomization 1: randomization for shared snps ##########################
null_shared<-rep(NA, 10000)
for (i in 1:10000){
x<-round((q)*length(dfile[[anc]]),0)
ar<-sample(1:length(dfile[[anc]]),x, replace=FALSE)
cr<-sample(1:length(dfile[[cline]]),x,replace=FALSE)
null_shared[i]<- sum(ar %in% cr)
}
obs_shared<-length(which(dfile[[anc]] < anctop & dfile[[cline]] < clinetop))
p_shared<-mean(null_shared >= obs_shared)
xfold_shared<-obs_shared / mean(null_shared)
result_shared<-cbind(obs_shared, xfold_shared, p_shared)
}
return(result_shared)
}
enrichshared_loalpha<-vector("list",10)
for (i in 1:10){
#low alpha, low raw ancmean
enrichshared_loalpha[[i]]<-randomshared_low(datfile, 'ancmean', 'alpha', quants.lo[i])
}
### randomization for excess on z ##########
randomsex_high<- function(dfile = NA, anc = NA, cline = NA, q = NA) {
{
anctop = quantile(dfile[[anc]],probs=q)
clinetop = quantile(dfile[[cline]],probs=q)
#subset the common rows from the master data file
comsnps<-dfile[which(dfile[[anc]] > anctop & dfile[[cline]] > clinetop),]
## Randomization 2: randomization for excess on Z ##########################
null_sex<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-0.9)*length(dfile[,1]),0)
n = length(comsnps[,1])
cnull = sample(1:length(dfile[,1]), n , replace=F)
null_sex[i] = sum(dfile$scaffold[cnull] == 1631)
}
obs_sex<-sum(comsnps$scaffold == 1631)
p_sex<-mean(null_sex >= obs_sex)
xfold_sex<- obs_sex/mean(null_sex)
result_sex<-cbind(obs_sex, xfold_sex, p_sex)
}
return(result_sex)
}
enrichsex_hialpha<-vector("list",10)
enrichsex_hibeta<-vector("list",10)
for (i in 1:10){
#hi alpha, hi raw ancmeans
enrichsex_hialpha[[i]]<-randomsex_high(datfile, 'ancmean', 'alpha', quants[i])
#hi beta, hi folded ancmeans
enrichsex_hibeta[[i]]<-randomsex_high(datfile, 'ancfold', 'beta', quants[i])
}
## for low alpha ###
randomsex_low<- function(dfile = NA, anc = NA, cline = NA, q = NA) {
{
anctop = quantile(dfile[[anc]],probs=q)
clinetop = quantile(dfile[[cline]],probs=q)
#subset the common rows from the master data file
comsnps<-dfile[which(dfile[[anc]] < anctop & dfile[[cline]] < clinetop),]
## Randomization 2: randomization for excess on Z ##########################
null_sex<-rep(NA, 10000)
for (i in 1:10000){
x<-round(q*length(dfile[,1]),0)
n = length(comsnps[,1])
cnull = sample(1:length(dfile[,1]), n , replace=F)
null_sex[i] = sum(dfile$scaffold[cnull] == 1631)
}
obs_sex<-sum(comsnps$scaffold == 1631)
p_sex<-mean(null_sex >= obs_sex)
xfold_sex<- obs_sex/mean(null_sex)
result_sex<-cbind(obs_sex, xfold_sex, p_sex)
}
return(result_sex)
}
enrichsex_loalpha<-vector("list",10)
for (i in 1:10){
#low alpha, low raw ancmean
enrichsex_loalpha[[i]]<-randomsex_low(datfile, 'ancmean', 'alpha', quants.lo[i])
}
### randomization for excess on autosomes ##########
randomauto_high<- function(dfile = NA, anc = NA, cline = NA, q = NA) {
{
anctop = quantile(dfile[[anc]],probs=q)
clinetop = quantile(dfile[[cline]],probs=q)
#subset the common rows from the master data file
comsnps<-dfile[which(dfile[[anc]] > anctop & dfile[[cline]] > clinetop),]
## Randomization 2: randomization for excess on Z ##########################
null_auto<-rep(NA, 10000)
for (i in 1:10000){
x<-round((1-0.9)*length(dfile[,1]),0)
n = length(comsnps[,1])
cnull = sample(1:length(dfile[,1]), n , replace=F)
null_auto[i] = sum(dfile$scaffold[cnull] != 1631)
}
obs_auto<-sum(comsnps$scaffold != 1631)
p_auto<-mean(null_auto >= obs_auto)
xfold_auto<- obs_auto/mean(null_auto)
result_auto<-cbind(obs_auto, xfold_auto, p_auto)
}
return(result_auto)
}
enrichauto_hialpha<-vector("list",10)
enrichauto_hibeta<-vector("list",10)
for (i in 1:10){
#hi alpha, hi raw ancmeans
enrichauto_hialpha[[i]]<-randomauto_high(datfile, 'ancmean', 'alpha', quants[i])
#hi beta, hi folded ancmeans
enrichauto_hibeta[[i]]<-randomauto_high(datfile, 'ancfold', 'beta', quants[i])
}
## for low alpha ###
randomauto_low<- function(dfile = NA, anc = NA, cline = NA, q = NA) {
{
anctop = quantile(dfile[[anc]],probs=q)
clinetop = quantile(dfile[[cline]],probs=q)
#subset the common rows from the master data file
comsnps<-dfile[which(dfile[[anc]] < anctop & dfile[[cline]] < clinetop),]
## Randomization 2: randomization for excess on Z ##########################
null_auto<-rep(NA, 10000)
for (i in 1:10000){
x<-round(q*length(dfile[,1]),0)
n = length(comsnps[,1])
cnull = sample(1:length(dfile[,1]), n , replace=F)
null_auto[i] = sum(dfile$scaffold[cnull] != 1631)
}
obs_auto<-sum(comsnps$scaffold != 1631)
p_auto<-mean(null_auto >= obs_auto)
xfold_auto<- obs_auto/mean(null_auto)
result_auto<-cbind(obs_auto, xfold_auto, p_auto)
}
return(result_auto)
}
enrichauto_loalpha<-vector("list",10)
for (i in 1:10){
#low alpha, low raw ancmean
enrichauto_loalpha[[i]]<-randomauto_low(datfile, 'ancmean', 'alpha', quants.lo[i])
}
##################### plotting ############################################################
### reorganizing data for plotting #####
organize <- function(file=NA){
a = file
obs = a[seq(1, length(a), 3)]
x = a[seq(2, length(a), 3)]
p = a[seq(3, length(a), 3)]
finaltab = cbind(obs, x, p)
colnames(finaltab) = c("observed","x-fold", "p")
return(finaltab)
#print(finaltab[1])
}
#### Z ####
zalphahi<-organize(t(as.data.frame(enrichsex_hialpha)))
zalphalo<-organize(t(as.data.frame(enrichsex_loalpha)))
zbeta<-organize(t(as.data.frame(enrichsex_hibeta)))
### for pch #####
ha_z<-zalphahi[,3]
la_z<-zalphalo[,3]
b_z<-zbeta[,3]
ha_z[ha_z > 0.05] <- 1
ha_z[ha_z <= 0.05]<-2
la_z[la_z > 0.05] <- 1
la_z[la_z <= 0.05] <-2
la_z[la_z = NaN] <- 2
b_z[b_z > 0.05]<-1
b_z[b_z <= 0.05]<-2
#### autosomes #####
aalphahi<-organize(t(as.data.frame(enrichauto_hialpha)))
aalphalo<-organize(t(as.data.frame(enrichauto_loalpha)))
abeta<-organize(t(as.data.frame(enrichauto_hibeta)))
### for pch #####
ha_a<-aalphahi[,3]
la_a<-aalphalo[,3]
b_a<-abeta[3]
ha_a[ha_a > 0.05] <- 1
ha_a[ha_a <= 0.05]<-2
la_a[la_a > 0.05] <- 1
la_a[la_a <= 0.05] <-2
la_a[la_a = NaN] <- 2
b_a[b_a > 0.05]<-1
b_a[b_a <= 0.05]<-2
######## actual plot code #############################################
library("wesanderson")
cols<-wes_palette("BottleRocket2")
pdf("predictability.pdf", width=15, height=10,bg="white")
## Split the screen into two rows and one column, defining screens 1 and 2.
split.screen( figs = c( 2, 1 ) )
## Split screen 1 into one row and three columns, defining screens 3, 4, and 5.
split.screen( figs = c( 1, 3 ), screen = 1 )
## Split screen 2 into one row and two columns, defining screens 6 and 7.
split.screen( figs = c( 1, 2 ), screen = 2 )
## The first plot is located in screen 3:
screen( 3 )
par(mar=c(5,5,3,0.1))
#high alpha
hist(null_ha, col="gray", xlim=c(0,50), ylim=c(0,3000), ylab="Density", xlab = "No. of SNPs", cex.lab = 2, cex.axis=1.5, main="")
abline(v=obs_ha, col=cols[1], lwd=4)
title(main= expression(paste("(A) High ", alpha)),cex.main = 2, adj=0)
## The second plot is located in screen 4:
screen( 4 )
par(mar=c(5,2.5,3,0.1))
#low alpha
hist(null_la, ylab="", col="gray", xlim=c(0,50), ylim=c(0,3000), xlab = "No. of SNPs",cex.lab = 2, cex.axis=1.5, main="")
abline(v=obs_la, col=cols[2], lwd=4)
title(main=expression(paste("(B) Low ", alpha)),cex.main = 2, adj=0)
## The third plot is located in screen 5:
screen( 5 )
par(mar=c(5,2.5,3,0.1))
#beta
hist(null_hb, ylab="", col="gray", xlim=c(0,50), ylim=c(0,3000), xlab = "No. of SNPs",cex.lab = 2, cex.axis=1.5, ,main="")
abline(v=obs_hb, col=cols[3], lwd=4)
title(main=expression(paste("(C) High ", beta)),cex.main = 2, adj=0)
## The fourth plot is located in screen 6:
screen( 6 )
par(mar=c(5,6,3.5,.5))
plot(zbeta[,2],xaxt ='n', type = "o", col=cols[3],lwd=4, xlab = "Top % SNPs", ylab = "X-folds", cex.lab=2, cex.axis = 1, ylim=c(0,7), pch=c(1,16)[b_z], cex=2)
lines(zalphahi[,2], type = "o", col=cols[1],lwd=4, pch=c(1,16)[ha_z], cex=2, lty=1)
lines(zalphalo[,2], type = "o", col=cols[2], lwd = 4, pch=c(1,16)[la_z], cex=2, lty=1)
#legend(7.5,17, c(expression(paste("High ", alpha)),expression(paste("Low ", alpha)), expression(paste("High ", beta))), fill = c(cols[1],cols[2], cols[3]), cex=1.5)
abline(h=1,lty=2,lwd=4)
axis(side=1, at=1:10, labels=c("0.01", "0.02", "0.03", "0.04","0.05","0.06", "0.07","0.08","0.09","0.1"))
title(main="(D) ",cex.main = 2, adj=0)
## The fifth plot is located in screen 7:
screen( 7 )
## autosomes #########################
par(mar=c(5,6,3.5,.5))
plot(abeta[,2],xaxt ='n', type = "o", col=cols[3],lwd=4, xlab = "Top % SNPs", ylab = "", cex.lab=2, cex.axis = 1, ylim=c(0,2), pch=c(1,16)[b_a], cex=2)
lines(aalphahi[,2], type = "o", col=cols[1],lwd=4, pch=c(1,16)[ha_a], cex=2, lty=1)
lines(aalphalo[,2], type = "o", col=cols[2], lwd = 4, pch=c(1,16)[la_a], cex=2, lty=1)
legend(7.5,1.9, c(expression(paste("High ", alpha)),expression(paste("Low ", alpha)), expression(paste("High ", alpha))), fill = c(cols[1],cols[2],cols[3]), cex=1.5)
abline(h=1,lty=2,lwd=4)
axis(side=1, at=1:10, labels=c("0.01", "0.02", "0.03", "0.04","0.05","0.06", "0.07","0.08","0.09","0.1"))
title(main="(E) ",cex.main = 2, adj=0)
dev.off()
##################### plot the quantiles for just shared SNPs ##################################
salphahi<-organize(t(as.data.frame(enrichshared_hialpha)))
salphalo<-organize(t(as.data.frame(enrichshared_loalpha)))
sbeta<-organize(t(as.data.frame(enrichshared_hibeta)))
### for pch #####
ha_s<-salphahi[,3]
la_s<-salphalo[,3]
b_s<-sbeta[,3]
ha_s[ha_s > 0.05] <- 1
ha_s[ha_s <= 0.05]<-2
la_s[la_s > 0.05] <- 1
la_s[la_s <= 0.05] <-2
la_s[la_s = NaN] <- 2
b_s[b_s > 0.05]<-1
b_s[b_s <= 0.05]<-2
pdf("shared_enrichments.pdf", width=6, height=6,bg="white")
par(mar=c(5,6,3.5,.5))
plot(sbeta[,2],xaxt ='n', type = "o", col=cols[3],lwd=4, xlab = "Top % SNPs", ylab = "X-folds", cex.lab=2, cex.axis = 1, ylim=c(0,43), pch=c(1,16)[b_s], cex=2)
lines(salphahi[,2], type = "o", col=cols[1],lwd=4, pch=c(1,16)[ha_s], cex=2, lty=1)
lines(salphalo[,2], type = "o", col=cols[2], lwd = 4, pch=c(1,16)[la_s], cex=2, lty=1)
legend(7,42, c(expression(paste("High ", alpha)),expression(paste("Low ", alpha)), expression(paste("High ", beta))), fill = c(cols[1],cols[2], cols[3]), cex=1.5)
abline(h=1,lty=2,lwd=4)
axis(side=1, at=1:10, labels=c("0.01", "0.02", "0.03", "0.04","0.05","0.06", "0.07","0.08","0.09","0.1"))
#title(main="(D) ",cex.main = 2, adj=0)
dev.off()