Structural Annotation

File Names and information:

bfmeansWithScafpositions_all, east, west #files with bf and scaffold from baypass #206,028 SNPs
main_structAnnot.txt #structural annot file #206,049
bf_strucAnnot_combine.py #script to combine file above
all_structAnnot_bfmeans, west_structAnnot_bfmeans, east_structAnnot_bfmeans, shared58_structAnnot_bfmeans #output for all, east and west and 58 shared SNPs from above script
top58snps_scafPosBayesfactors : file baypass and scaffold and positions information of 58 shared SNPs
top_all : file with 58 shared snps info and just all bayesfactors. Used to merge bayesfactors with structural annotation info
structAnnot_barplot.pdf: made this plot for ESA meeting. Need to remake this again.

#R code for randomization example

sum(is.na(west$ongene)==FALSE)

sum(is.na(west$ongene[x])==FALSE)

mean(is.na(west$ongene[x])==FALSE)

mean(is.na(west$ongene)==FALSE)

#for top 0.001%

x=c(1:20600)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(all)[1],20600,replace=FALSE)

null[i]=mean(is.na(all$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(all$ongene[x])==FALSE)

obs

xfold

#barplot

counts = (c(0.39,0.37,0.28,0.27, 0.39,0.37,0.28,0.27,0.39,0.37,0.28,0.27))

barplot(counts, col = c("pink","green"), xlab = "On gene (pink) and On codon (green)", ylab = "Observed number of SNVs", cex.lab = 1.5, ylim = c(0,0.5))

#results on May 2nd, 2017 after dropping NA from the files

#getting the top quantiles for the snps

all = read.table("all_structAnnot_bfmeans", header=T)

east = read.table("east_structAnnot_bfmeans", header=T)

west = read.table("west_structAnnot_bfmeans", header=T)

shared = read.table("shared58_structAnnot_bfmeans", header=T)

#data frames without NA

#all populations

x.all = data.frame(all)

sum(is.na(x.all$bfmeans)) #19

cc = is.na(x.all$bfmeans)

m = which(cc==c("TRUE"))

DF.all = x.all[-m,]

dim(DF.all) #206028 15

#western populations

x.west = data.frame(west)

sum(is.na(x.west$bfmeans)) #19

cc = is.na(x.west$bfmeans)

m = which(cc==c("TRUE"))

DF.west = x.west[-m,]

dim(DF.west) #206028 15

#eastern populations

x.east = data.frame(east)

sum(is.na(x.east$bfmeans)) #19

cc = is.na(x.east$bfmeans)

m = which(cc==c("TRUE"))

DF.east = x.east[-m,]

## randomization

bndsw = quantile(DF.west$bfmeans,probs=c(0.9,0.99,0.999,0.9999))

sum(DF.west$bfmeans > bndsw[1]) #20603

sum(DF.west$bfmeans > bndsw[2]) #2061

sum(DF.west$bfmeans > bndsw[3]) #207

sum(DF.west$bfmeans > bndsw[4]) #27

sum(is.na(west$ongene)==FALSE) #58760

mean(is.na(west$ongene)==FALSE) #0.2851

#western

#for top 1%

#ongene

x=c(1:20603)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.west)[1],20603,replace=FALSE)

null[i]=mean(is.na(DF.west$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.west$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:20603)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.west)[1],20603,replace=FALSE)

null[i]=mean(is.na(DF.west$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.west$oncds[x])==FALSE)

obs

xfold

#top 0.01%

#ongene

x=c(1:2061)

null<-rep(NA,100)

for(i in 1:100){

y<-sample(1:dim(DF.west)[1],2061,replace=FALSE)

null[i]=mean(is.na(DF.west$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.west$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:2061)

null<-rep(NA,100)

for(i in 1:100){

y<-sample(1:dim(DF.west)[1],2061,replace=FALSE)

null[i]=mean(is.na(DF.west$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.west$oncds[x])==FALSE)

obs

xfold

#top 0.001%

#ongene

x=c(1:207)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.west)[1],207,replace=FALSE)

null[i]=mean(is.na(DF.west$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.west$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:207)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.west)[1],207,replace=FALSE)

null[i]=mean(is.na(DF.west$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.west$oncds[x])==FALSE)

obs

xfold

############### eastern ####################################

#for top 1%

#ongene

x=c(1:20603)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.east)[1],20603,replace=FALSE)

null[i]=mean(is.na(DF.east$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.east$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:20603)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.east)[1],20603,replace=FALSE)

null[i]=mean(is.na(DF.east$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.east$oncds[x])==FALSE)

obs

xfold

#top 0.01%

#ongene

x=c(1:2061)

null<-rep(NA,100)

for(i in 1:100){

y<-sample(1:dim(DF.east)[1],2061,replace=FALSE)

null[i]=mean(is.na(DF.east$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.east$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:2061)

null<-rep(NA,100)

for(i in 1:100){

y<-sample(1:dim(DF.east)[1],2061,replace=FALSE)

null[i]=mean(is.na(DF.east$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.east$oncds[x])==FALSE)

obs

xfold

#top 0.001%

#ongene

x=c(1:207)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.east)[1],207,replace=FALSE)

null[i]=mean(is.na(DF.east$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.east$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:207)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.east)[1],207,replace=FALSE)

null[i]=mean(is.na(DF.east$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.east$oncds[x])==FALSE)

obs

xfold

################# all ##################################

#for top 1%

#ongene

x=c(1:20603)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.all)[1],20603,replace=FALSE)

null[i]=mean(is.na(DF.all$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.all$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:20603)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.all)[1],20603,replace=FALSE)

null[i]=mean(is.na(DF.all$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.all$oncds[x])==FALSE)

obs

xfold

#top 0.01%

#ongene

x=c(1:2061)

null<-rep(NA,100)

for(i in 1:100){

y<-sample(1:dim(DF.all)[1],2061,replace=FALSE)

null[i]=mean(is.na(DF.all$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.all$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:2061)

null<-rep(NA,100)

for(i in 1:100){

y<-sample(1:dim(DF.all)[1],2061,replace=FALSE)

null[i]=mean(is.na(DF.all$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.all$oncds[x])==FALSE)

obs

xfold

#top 0.001%

#ongene

x=c(1:207)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.all)[1],207,replace=FALSE)

null[i]=mean(is.na(DF.all$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.all$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:207)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.all)[1],207,replace=FALSE)

null[i]=mean(is.na(DF.all$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.all$oncds[x])==FALSE)

obs

xfold

################# shared 58 SNPs ###########################

#ongene

x=c(1:58)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.shared)[1],58,replace=FALSE)

null[i]=mean(is.na(DF.shared$ongene[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.shared$ongene[x])==FALSE)

obs

xfold

#oncds

x=c(1:58)

null<-rep(NA,1000)

for(i in 1:1000){

y<-sample(1:dim(DF.shared)[1],58,replace=FALSE)

null[i]=mean(is.na(DF.shared$oncds[y])==FALSE)

}

p = mean(null >= obs)

xfold = obs / mean(null)

obs = mean(is.na(DF.shared$oncds[x])==FALSE)

obs

xfold

FINAL RESULTS