predictability

To do:

compare high alpha with high ancmeans
compare low alpha with low ancmeans
compare high beta with folded high ancmeans
Check Z and autosomes for each of them
After redo the SNP annotations

### read in the popanc files ##########

bcr3<-read.table("../popanc/outfiles/aims/out_aims3_BCR.txt", header=T, sep=",")

bic3<-read.table("../popanc/outfiles/aims/out_aims3_BIC.txt", header=T, sep=",")

bld3<-read.table("../popanc/outfiles/aims/out_aims3_BLD.txt", header=T, sep=",")

bnp3<-read.table("../popanc/outfiles/aims/out_aims3_BNP.txt", header=T, sep=",")

btb3<-read.table("../popanc/outfiles/aims/out_aims3_BTB.txt", header=T, sep=",")

frc3<-read.table("../popanc/outfiles/aims/out_aims3_FRC.txt", header=T, sep=",")

pin3<-read.table("../popanc/outfiles/aims/out_aims3_PIN.txt", header=T, sep=",")

psp3<-read.table("../popanc/outfiles/aims/out_aims3_PSP.txt", header=T, sep=",")

rdl3<-read.table("../popanc/outfiles/aims/out_aims3_RDL.txt", header=T, sep=",")

rnv3<-read.table("../popanc/outfiles/aims/out_aims3_RNV.txt", header=T, sep=",")

## take ancestry means across populations ####

ancmean<-(bcr3$mean + bic3$mean + bld3$mean + bnp3$mean + btb3$mean + frc3$mean + rdl3$mean + rnv3$mean + pin3$mean + psp3$mean) / 10

## folding the values ####

ancmean_fold<-ancmean

ancmean_fold[ancmean_fold < 0.5]<-1-ancmean_fold[ancmean_fold < 0.5]

### read in the clines files. Here I am using the medians across MCMC and different chains for each parameter alpha and beta ###############

cline3<-read.table("../clines/outfiles/pct3_alphabeta_mcmcmedian.txt", header=TRUE)

### get the scaffold position file #########

mappos3<-read.table("../popanc/infiles/aims/mappos3.txt", header=F, sep=":")

### create a master file with scaffold position and clines and ancestry values

datfile<-cbind(mappos3, ancmean, ancmean_fold, cline3)

colnames(datfile)<-c("scaffold","position","ancmean", "ancfold","alpha","beta")

head(datfile)

##########################################################################

## trying out randomization for top 10% quantiles ###

##########################################################################

### high alpha ####

anctop = quantile(datfile$ancmean,probs=c(0.9,0.99,0.999,0.9999))

alphatop = quantile(datfile$alpha,probs=c(0.9,0.99,0.999,0.9999))

length(which(datfile$ancmean > anctop[1] & datfile$alpha > alphatop[1]))

#subset the common rows from the master data file

comsnps<-datfile[which(datfile$ancmean > anctop[1] & datfile$alpha > alphatop[1]),]

## randomization for shared snps

null_ha<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-0.9)*length(datfile$ancmean),0)

ar<-sample(1:length(datfile$ancmean),x, replace=FALSE)

cr<-sample(1:length(datfile$alpha),x,replace=FALSE)

null_ha[i]<- sum(ar %in% cr)

}

obs_ha<-length(which(datfile$ancmean > anctop[1] & datfile$alpha > alphatop[1])) #22

p_ha<-mean(null_ha >= obs_ha) #7e-04

xfold_ha<-obs_ha / mean(null_ha) #1.909092

obs_ha

p_ha

xfold_ha

## randomization for excess on Z

null<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-0.9)*length(datfile[,1]),0)

n = length(comsnps[,1])

cnull = sample(1:length(datfile[,1]), n , replace=F)

null[i] = sum(datfile$scaffold[cnull] == 1631)

}

obs = sum(comsnps$scaffold == 1631) #12

p<-mean(null >= obs) #3e-04

xfold<- obs/mean(null) #2.821339

cbind(obs, xfold, p)

## randomization for excess on autosomes

null<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-0.9)*length(datfile[,1]),0)

n = length(comsnps[,1])

cnull = sample(1:length(datfile[,1]), n , replace=F)

null[i] = sum(datfile$scaffold[cnull] != 1631)

}

obs = sum(comsnps$scaffold != 1631) #10

p<-mean(null >= obs) #1

xfold<- obs/mean(null) #0.5639713

cbind(obs, xfold, p)

### low alpha #####

anclo = quantile(datfile$ancmean,probs=c(0.1,0.01,0.001))

alphalo = quantile(datfile$alpha,probs=c(0.1,0.01,0.001))

length(which(datfile$ancmean < anclo[1] & datfile$alpha < alphalo[1]))

#subset the common rows from the master data file

comsnps_al<-datfile[which(datfile$ancmean < anclo[1] & datfile$alpha < alphalo[1]),]

## randomization for shared snps

null_la<-rep(NA, 10000)

for (i in 1:10000){

x<-round(0.1*length(datfile$ancmean),0)

ar<-sample(1:length(datfile$ancmean),x, replace=FALSE)

cr<-sample(1:length(datfile$alpha),x,replace=FALSE)

null_la[i]<- sum(ar %in% cr)

}

obs_la<-length(which(datfile$ancmean < anclo[1] & datfile$alpha < alphalo[1])) #9

p_la<-mean(null_la >= obs_la) #0.8394

xfold_la<-obs_la / mean(null_la) #0.7797001

obs_la

p_la

xfold_la

## randomization for excess on Z

null<-rep(NA, 10000)

for (i in 1:10000){

x<-round(0.1*length(datfile[,1]),0)

n = length(comsnps_al[,1])

cnull = sample(1:length(datfile[,1]), n , replace=F)

null[i] = sum(datfile$scaffold[cnull] == 1631)

}

obs = sum(comsnps_al$scaffold == 1631) #2

p<-mean(null >= obs) #0.5378

xfold<- obs/mean(null) #1.157675

cbind(obs, xfold, p)

## randomization for excess on autosomes

null<-rep(NA, 10000)

for (i in 1:10000){

x<-round((0.1)*length(datfile[,1]),0)

n = length(comsnps_al[,1])

cnull = sample(1:length(datfile[,1]), n , replace=F)

null[i] = sum(datfile$scaffold[cnull] != 1631)

}

obs = sum(comsnps_al$scaffold != 1631) #7

p<-mean(null >= obs) #0.7529

xfold<- obs/mean(null) #0.9631393

cbind(obs, xfold, p)

### high beta ####

anctop = quantile(datfile$ancfold,probs=c(0.9,0.99,0.999,0.9999))

betatop = quantile(datfile$beta,probs=c(0.9,0.99,0.999,0.9999))

length(which(datfile$ancfold > anctop[1] & datfile$beta > betatop[1]))

#subset the common rows from the master data file

comsnps_b<-datfile[which(datfile$ancfold > anctop[1] & datfile$beta > betatop[1]),]

## randomization for shared snps

null_hb<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-0.9)*length(datfile$ancfold),0)

ar<-sample(1:length(datfile$ancfold),x, replace=FALSE)

cr<-sample(1:length(datfile$beta),x,replace=FALSE)

null_hb[i]<- sum(ar %in% cr)

}

obs_hb<-length(which(datfile$ancfold > anctop[1] & datfile$beta > betatop[1])) #49

p_hb<-mean(null_hb >= obs_hb) #<0

xfold_hb<-obs_hb / mean(null_hb) #4.234395

obs_hb

p_hb

xfold_hb

## randomization for excess on Z

null<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-0.9)*length(datfile[,1]),0)

n = length(comsnps_b[,1])

cnull = sample(1:length(datfile[,1]), n , replace=F)

null[i] = sum(datfile$scaffold[cnull] == 1631)

}

obs = sum(comsnps_b$scaffold == 1631) #37

p<-mean(null >= obs) #<0

xfold<- obs/mean(null) #3.919575

cbind(obs, xfold, p)

## randomization for excess on autosomes

null<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-0.9)*length(datfile[,1]),0)

n = length(comsnps_b[,1])

cnull = sample(1:length(datfile[,1]), n , replace=F)

null[i] = sum(datfile$scaffold[cnull] != 1631)

}

obs = sum(comsnps_b$scaffold != 1631) #12

p<-mean(null >= obs) #1

xfold<- obs/mean(null) #0.3036829

cbind(obs, xfold, p)

#######################################################################################

####### randomization functions for shared SNPs in top quantiles ######################

#######################################################################################

quants<-c(0.99,0.98, 0.97, 0.96, 0.95,0.94,0.93,0.92,0.91,0.9)

quants.lo<-c(0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1)

randomshared_high <- function(dfile = NA, anc = NA, cline = NA, q = NA) {

{

anctop = quantile(dfile[[anc]],probs=q)

clinetop = quantile(dfile[[cline]],probs=q)

#subset the common rows from the master data file

comsnps<-dfile[which(dfile[[anc]] > anctop & dfile[[cline]] > clinetop),]

## Randomization 1: randomization for shared snps ##########################

null_shared<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-q)*length(dfile[[anc]]),0)

ar<-sample(1:length(dfile[[anc]]),x, replace=FALSE)

cr<-sample(1:length(dfile[[cline]]),x,replace=FALSE)

null_shared[i]<- sum(ar %in% cr)

}

obs_shared<-length(which(dfile[[anc]] > anctop & dfile[[cline]] > clinetop))

p_shared<-mean(null_shared >= obs_shared)

xfold_shared<-obs_shared / mean(null_shared)

result_shared<-cbind(obs_shared, xfold_shared, p_shared)

}

return(result_shared)

}

#randomization_high(datfile, 'ancmean','alpha',0.9)

enrichshared_hialpha<-vector("list",10)

enrichshared_hibeta<-vector("list",10)

for (i in 1:10){

#hi alpha, hi raw ancmeans

enrichshared_hialpha[[i]]<-randomshared_high(datfile, 'ancmean', 'alpha', quants[i])

#hi beta, hi folded ancmeans

enrichshared_hibeta[[i]]<-randomshared_high(datfile, 'ancfold', 'beta', quants[i])

}

### randomization for low alpha ####

randomshared_low <- function(dfile = NA, anc = NA, cline = NA, q = NA) {

{

anctop = quantile(dfile[[anc]],probs=q)

clinetop = quantile(dfile[[cline]],probs=q)

#subset the common rows from the master data file

comsnps<-dfile[which(dfile[[anc]] < anctop & dfile[[cline]] < clinetop),]

## Randomization 1: randomization for shared snps ##########################

null_shared<-rep(NA, 10000)

for (i in 1:10000){

x<-round((q)*length(dfile[[anc]]),0)

ar<-sample(1:length(dfile[[anc]]),x, replace=FALSE)

cr<-sample(1:length(dfile[[cline]]),x,replace=FALSE)

null_shared[i]<- sum(ar %in% cr)

}

obs_shared<-length(which(dfile[[anc]] < anctop & dfile[[cline]] < clinetop))

p_shared<-mean(null_shared >= obs_shared)

xfold_shared<-obs_shared / mean(null_shared)

result_shared<-cbind(obs_shared, xfold_shared, p_shared)

}

return(result_shared)

}

enrichshared_loalpha<-vector("list",10)

for (i in 1:10){

#low alpha, low raw ancmean

enrichshared_loalpha[[i]]<-randomshared_low(datfile, 'ancmean', 'alpha', quants.lo[i])

}

### randomization for excess on z ##########

randomsex_high<- function(dfile = NA, anc = NA, cline = NA, q = NA) {

{

anctop = quantile(dfile[[anc]],probs=q)

clinetop = quantile(dfile[[cline]],probs=q)

#subset the common rows from the master data file

comsnps<-dfile[which(dfile[[anc]] > anctop & dfile[[cline]] > clinetop),]

## Randomization 2: randomization for excess on Z ##########################

null_sex<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-0.9)*length(dfile[,1]),0)

n = length(comsnps[,1])

cnull = sample(1:length(dfile[,1]), n , replace=F)

null_sex[i] = sum(dfile$scaffold[cnull] == 1631)

}

obs_sex<-sum(comsnps$scaffold == 1631)

p_sex<-mean(null_sex >= obs_sex)

xfold_sex<- obs_sex/mean(null_sex)

result_sex<-cbind(obs_sex, xfold_sex, p_sex)

}

return(result_sex)

}

enrichsex_hialpha<-vector("list",10)

enrichsex_hibeta<-vector("list",10)

for (i in 1:10){

#hi alpha, hi raw ancmeans

enrichsex_hialpha[[i]]<-randomsex_high(datfile, 'ancmean', 'alpha', quants[i])

#hi beta, hi folded ancmeans

enrichsex_hibeta[[i]]<-randomsex_high(datfile, 'ancfold', 'beta', quants[i])

}

## for low alpha ###

randomsex_low<- function(dfile = NA, anc = NA, cline = NA, q = NA) {

{

anctop = quantile(dfile[[anc]],probs=q)

clinetop = quantile(dfile[[cline]],probs=q)

#subset the common rows from the master data file

comsnps<-dfile[which(dfile[[anc]] < anctop & dfile[[cline]] < clinetop),]

## Randomization 2: randomization for excess on Z ##########################

null_sex<-rep(NA, 10000)

for (i in 1:10000){

x<-round(q*length(dfile[,1]),0)

n = length(comsnps[,1])

cnull = sample(1:length(dfile[,1]), n , replace=F)

null_sex[i] = sum(dfile$scaffold[cnull] == 1631)

}

obs_sex<-sum(comsnps$scaffold == 1631)

p_sex<-mean(null_sex >= obs_sex)

xfold_sex<- obs_sex/mean(null_sex)

result_sex<-cbind(obs_sex, xfold_sex, p_sex)

}

return(result_sex)

}

enrichsex_loalpha<-vector("list",10)

for (i in 1:10){

#low alpha, low raw ancmean

enrichsex_loalpha[[i]]<-randomsex_low(datfile, 'ancmean', 'alpha', quants.lo[i])

}

### randomization for excess on autosomes ##########

randomauto_high<- function(dfile = NA, anc = NA, cline = NA, q = NA) {

{

anctop = quantile(dfile[[anc]],probs=q)

clinetop = quantile(dfile[[cline]],probs=q)

#subset the common rows from the master data file

comsnps<-dfile[which(dfile[[anc]] > anctop & dfile[[cline]] > clinetop),]

## Randomization 2: randomization for excess on Z ##########################

null_auto<-rep(NA, 10000)

for (i in 1:10000){

x<-round((1-0.9)*length(dfile[,1]),0)

n = length(comsnps[,1])

cnull = sample(1:length(dfile[,1]), n , replace=F)

null_auto[i] = sum(dfile$scaffold[cnull] != 1631)

}

obs_auto<-sum(comsnps$scaffold != 1631)

p_auto<-mean(null_auto >= obs_auto)

xfold_auto<- obs_auto/mean(null_auto)

result_auto<-cbind(obs_auto, xfold_auto, p_auto)

}

return(result_auto)

}

enrichauto_hialpha<-vector("list",10)

enrichauto_hibeta<-vector("list",10)

for (i in 1:10){

#hi alpha, hi raw ancmeans

enrichauto_hialpha[[i]]<-randomauto_high(datfile, 'ancmean', 'alpha', quants[i])

#hi beta, hi folded ancmeans

enrichauto_hibeta[[i]]<-randomauto_high(datfile, 'ancfold', 'beta', quants[i])

}

## for low alpha ###

randomauto_low<- function(dfile = NA, anc = NA, cline = NA, q = NA) {

{

anctop = quantile(dfile[[anc]],probs=q)

clinetop = quantile(dfile[[cline]],probs=q)

#subset the common rows from the master data file

comsnps<-dfile[which(dfile[[anc]] < anctop & dfile[[cline]] < clinetop),]

## Randomization 2: randomization for excess on Z ##########################

null_auto<-rep(NA, 10000)

for (i in 1:10000){

x<-round(q*length(dfile[,1]),0)

n = length(comsnps[,1])

cnull = sample(1:length(dfile[,1]), n , replace=F)

null_auto[i] = sum(dfile$scaffold[cnull] != 1631)

}

obs_auto<-sum(comsnps$scaffold != 1631)

p_auto<-mean(null_auto >= obs_auto)

xfold_auto<- obs_auto/mean(null_auto)

result_auto<-cbind(obs_auto, xfold_auto, p_auto)

}

return(result_auto)

}

enrichauto_loalpha<-vector("list",10)

for (i in 1:10){

#low alpha, low raw ancmean

enrichauto_loalpha[[i]]<-randomauto_low(datfile, 'ancmean', 'alpha', quants.lo[i])

}

##################### plotting ############################################################

### reorganizing data for plotting #####

organize <- function(file=NA){

a = file

obs = a[seq(1, length(a), 3)]

x = a[seq(2, length(a), 3)]

p = a[seq(3, length(a), 3)]

finaltab = cbind(obs, x, p)

colnames(finaltab) = c("observed","x-fold", "p")

return(finaltab)

#print(finaltab[1])

}

#### Z ####

zalphahi<-organize(t(as.data.frame(enrichsex_hialpha)))

zalphalo<-organize(t(as.data.frame(enrichsex_loalpha)))

zbeta<-organize(t(as.data.frame(enrichsex_hibeta)))

### for pch #####

ha_z<-zalphahi[,3]

la_z<-zalphalo[,3]

b_z<-zbeta[,3]

ha_z[ha_z > 0.05] <- 1

ha_z[ha_z <= 0.05]<-2

la_z[la_z > 0.05] <- 1

la_z[la_z <= 0.05] <-2

la_z[la_z = NaN] <- 2

b_z[b_z > 0.05]<-1

b_z[b_z <= 0.05]<-2

#### autosomes #####

aalphahi<-organize(t(as.data.frame(enrichauto_hialpha)))

aalphalo<-organize(t(as.data.frame(enrichauto_loalpha)))

abeta<-organize(t(as.data.frame(enrichauto_hibeta)))

### for pch #####

ha_a<-aalphahi[,3]

la_a<-aalphalo[,3]

b_a<-abeta[3]

ha_a[ha_a > 0.05] <- 1

ha_a[ha_a <= 0.05]<-2

la_a[la_a > 0.05] <- 1

la_a[la_a <= 0.05] <-2

la_a[la_a = NaN] <- 2

b_a[b_a > 0.05]<-1

b_a[b_a <= 0.05]<-2

######## actual plot code #############################################

library("wesanderson")

cols<-wes_palette("BottleRocket2")

pdf("predictability.pdf", width=15, height=10,bg="white")

## Split the screen into two rows and one column, defining screens 1 and 2.

split.screen( figs = c( 2, 1 ) )

## Split screen 1 into one row and three columns, defining screens 3, 4, and 5.

split.screen( figs = c( 1, 3 ), screen = 1 )

## Split screen 2 into one row and two columns, defining screens 6 and 7.

split.screen( figs = c( 1, 2 ), screen = 2 )

## The first plot is located in screen 3:

screen( 3 )

par(mar=c(5,5,3,0.1))

#high alpha

hist(null_ha, col="gray", xlim=c(0,50), ylim=c(0,3000), ylab="Density", xlab = "No. of SNPs", cex.lab = 2, cex.axis=1.5, main="")

abline(v=obs_ha, col=cols[1], lwd=4)

title(main= expression(paste("(A) High ", alpha)),cex.main = 2, adj=0)

## The second plot is located in screen 4:

screen( 4 )

par(mar=c(5,2.5,3,0.1))

#low alpha

hist(null_la, ylab="", col="gray", xlim=c(0,50), ylim=c(0,3000), xlab = "No. of SNPs",cex.lab = 2, cex.axis=1.5, main="")

abline(v=obs_la, col=cols[2], lwd=4)

title(main=expression(paste("(B) Low ", alpha)),cex.main = 2, adj=0)

## The third plot is located in screen 5:

screen( 5 )

par(mar=c(5,2.5,3,0.1))

#beta

hist(null_hb, ylab="", col="gray", xlim=c(0,50), ylim=c(0,3000), xlab = "No. of SNPs",cex.lab = 2, cex.axis=1.5, ,main="")

abline(v=obs_hb, col=cols[3], lwd=4)

title(main=expression(paste("(C) High ", beta)),cex.main = 2, adj=0)

## The fourth plot is located in screen 6:

screen( 6 )

par(mar=c(5,6,3.5,.5))

plot(zbeta[,2],xaxt ='n', type = "o", col=cols[3],lwd=4, xlab = "Top % SNPs", ylab = "X-folds", cex.lab=2, cex.axis = 1, ylim=c(0,7), pch=c(1,16)[b_z], cex=2)

lines(zalphahi[,2], type = "o", col=cols[1],lwd=4, pch=c(1,16)[ha_z], cex=2, lty=1)

lines(zalphalo[,2], type = "o", col=cols[2], lwd = 4, pch=c(1,16)[la_z], cex=2, lty=1)

#legend(7.5,17, c(expression(paste("High ", alpha)),expression(paste("Low ", alpha)), expression(paste("High ", beta))), fill = c(cols[1],cols[2], cols[3]), cex=1.5)

abline(h=1,lty=2,lwd=4)

axis(side=1, at=1:10, labels=c("0.01", "0.02", "0.03", "0.04","0.05","0.06", "0.07","0.08","0.09","0.1"))

title(main="(D) ",cex.main = 2, adj=0)

## The fifth plot is located in screen 7:

screen( 7 )

## autosomes #########################

par(mar=c(5,6,3.5,.5))

plot(abeta[,2],xaxt ='n', type = "o", col=cols[3],lwd=4, xlab = "Top % SNPs", ylab = "", cex.lab=2, cex.axis = 1, ylim=c(0,2), pch=c(1,16)[b_a], cex=2)

lines(aalphahi[,2], type = "o", col=cols[1],lwd=4, pch=c(1,16)[ha_a], cex=2, lty=1)

lines(aalphalo[,2], type = "o", col=cols[2], lwd = 4, pch=c(1,16)[la_a], cex=2, lty=1)

legend(7.5,1.9, c(expression(paste("High ", alpha)),expression(paste("Low ", alpha)), expression(paste("High ", alpha))), fill = c(cols[1],cols[2],cols[3]), cex=1.5)

abline(h=1,lty=2,lwd=4)

axis(side=1, at=1:10, labels=c("0.01", "0.02", "0.03", "0.04","0.05","0.06", "0.07","0.08","0.09","0.1"))

title(main="(E) ",cex.main = 2, adj=0)

dev.off()

##################### plot the quantiles for just shared SNPs ##################################

salphahi<-organize(t(as.data.frame(enrichshared_hialpha)))

salphalo<-organize(t(as.data.frame(enrichshared_loalpha)))

sbeta<-organize(t(as.data.frame(enrichshared_hibeta)))

### for pch #####

ha_s<-salphahi[,3]

la_s<-salphalo[,3]

b_s<-sbeta[,3]

ha_s[ha_s > 0.05] <- 1

ha_s[ha_s <= 0.05]<-2

la_s[la_s > 0.05] <- 1

la_s[la_s <= 0.05] <-2

la_s[la_s = NaN] <- 2

b_s[b_s > 0.05]<-1

b_s[b_s <= 0.05]<-2

pdf("shared_enrichments.pdf", width=6, height=6,bg="white")

par(mar=c(5,6,3.5,.5))

plot(sbeta[,2],xaxt ='n', type = "o", col=cols[3],lwd=4, xlab = "Top % SNPs", ylab = "X-folds", cex.lab=2, cex.axis = 1, ylim=c(0,43), pch=c(1,16)[b_s], cex=2)

lines(salphahi[,2], type = "o", col=cols[1],lwd=4, pch=c(1,16)[ha_s], cex=2, lty=1)

lines(salphalo[,2], type = "o", col=cols[2], lwd = 4, pch=c(1,16)[la_s], cex=2, lty=1)

legend(7,42, c(expression(paste("High ", alpha)),expression(paste("Low ", alpha)), expression(paste("High ", beta))), fill = c(cols[1],cols[2], cols[3]), cex=1.5)

abline(h=1,lty=2,lwd=4)

axis(side=1, at=1:10, labels=c("0.01", "0.02", "0.03", "0.04","0.05","0.06", "0.07","0.08","0.09","0.1"))

#title(main="(D) ",cex.main = 2, adj=0)

dev.off()

Page updated

Google Sites

Report abuse