To do randomizations for parallelism across species, I will divide the entire genome into SNP windows. Here is the code to get the sequence positions from the genome:
Genome is in the folder: /uufs/chpc.utah.edu/common/home/u6007910/projects/timema_adaptation/fasta_genome
grep -o -E "^>ScRvNkF_[0-9_-.,]+:LG-[0-9.,]+:SCAF-[0-9.,]+:MAPPOS-[0-9A-Z.;]+" re_mod_map_timema_06Jun2016_RvNkF702.fasta > mappos.txt
Then created a file containing scaffold lengths called scaffold_length.txt using calcScafLength.py (in the same folder).
Create mappos file for each species using vcf files (saved in folder: /uufs/chpc.utah.edu/common/home/u6007910/projects/timema_adaptation/consensus_align/variantcalling)
#scaffolds
grep ^Sc morefilter3X_filtered3x_variantsTimemaBart.vcf | cut -d ':' -f 3 > scafBart3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaCali.vcf | cut -d ':' -f 3 > scafCali3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaChum.vcf | cut -d ':' -f 3 > scafChum3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaCris.vcf | cut -d ':' -f 3 > scafCris3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaKnul.vcf | cut -d ':' -f 3 > scafKnul3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaLand.vcf | cut -d ':' -f 3 > scafLand3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaPodu.vcf | cut -d ':' -f 3 > scafPodu3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaPopp.vcf | cut -d ':' -f 3 > scafPopp3x.txt
#replace the "SCAF-" prefix from all scaf files
perl -p -i -e s/SCAF-//g scaf*3x.txt
#positions
grep ^Sc morefilter3X_filtered3x_variantsTimemaBart.vcf | cut -f 2 > posBart3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaCali.vcf | cut -f 2 > posCali3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaChum.vcf | cut -f 2 > posChum3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaCris.vcf | cut -f 2 > posCris3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaKnul.vcf | cut -f 2 > posKnul3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaLand.vcf | cut -f 2 > posLand3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaPodu.vcf | cut -f 2 > posPodu3x.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaPopp.vcf | cut -f 2 > posPopp3x.txt
#mappos files
paste scafBart3x.txt posBart3x.txt > mappos_bart.txt
paste scafCali3x.txt posCali3x.txt > mappos_cali.txt
paste scafChum3x.txt posChum3x.txt > mappos_chum.txt
paste scafCris3x.txt posCris3x.txt > mappos_cris.txt
paste scafKnul3x.txt posKnul3x.txt > mappos_knul.txt
paste scafLand3x.txt posLand3x.txt > mappos_land.txt
paste scafPodu3x.txt posPodu3x.txt > mappos_podu.txt
paste scafPopp3x.txt posPopp3x.txt > mappos_popp.txt
1. run combine_mappos_scaflength.py to combine the mappos files for each species with scaffold length (I had to fix scaffold 702 in scaffold_length.txt file and change it to 702.1,702.2 and 702.3). This is in the genofiles folder.
python combine_mappos_scaflength.py scaffold_length.txt mappos_bart.txt mappos_len_bart.txt
python combine_mappos_scaflength.py scaffold_length.txt mappos_cali.txt mappos_len_cali.txt
python combine_mappos_scaflength.py scaffold_length.txt mappos_chum.txt mappos_len_chum.txt
python combine_mappos_scaflength.py scaffold_length.txt mappos_cris.txt mappos_len_cris.txt
python combine_mappos_scaflength.py scaffold_length.txt mappos_knul.txt mappos_len_knul.txt
python combine_mappos_scaflength.py scaffold_length.txt mappos_land.txt mappos_len_land.txt
python combine_mappos_scaflength.py scaffold_length.txt mappos_podu.txt mappos_len_podu.txt
python combine_mappos_scaflength.py scaffold_length.txt mappos_popp.txt mappos_len_popp.txt
2. Use the script combine_bfmeans_mapposlen.R to take bfmeans for each species across chains and combine them with mappos_scaflen files
Here is the script:
#this for loop iterates through species name and writes out bfmeans file for alt, lat and long
#define species name to iterate
species<-c("bart","cali","chum","cris","knul","land","podu","popp")
#for loop to calculate bfmeans and write out the file for each climate variable
for (i in 2:length(species)){
allfiles<-list.files(path=".", pattern = species[i],full.names = TRUE)
mapfiles<-list.files(path="../../genofiles", pattern = "mappos_len", full.names = TRUE)
sps_file<-mapfiles[grepl(species[i],mapfiles)]
spl<-read.table(sps_file, header=T, sep="\t")
#subset clims
alt<-allfiles[grepl("./alt",allfiles)]
alt<-alt[grepl("summary_betai_reg.out", alt)]
lat<-allfiles[grepl("./lat",allfiles)]
lat<-lat[grepl("summary_betai_reg.out", lat)]
long<-allfiles[grepl("./long",allfiles)]
long<-long[grepl("summary_betai_reg.out", long)]
#take means
#altitude
alt1<-read.table(alt[1], header=T)
alt2<-read.table(alt[2], header=T)
alt3<-read.table(alt[3], header=T)
alt4<-read.table(alt[4], header=T)
altmean<-(alt1[,5] + alt2[,5] + alt3[,5] + alt4[,5])/4
spl_altmean<-cbind(spl,altmean[-1])
colnames(spl_altmean)<-c("scaffold","position","length","bfmean")
outname<-paste('alt_scaf_pos_len_bfmeans', basename(species[i]),sep = "_")
write.table(spl_altmean, file=outname, col.names=T, row.names=F, sep= " ", quote=F)
#latitude
lat1<-read.table(lat[1], header=T)
lat2<-read.table(lat[2], header=T)
lat3<-read.table(lat[3], header=T)
lat4<-read.table(lat[4], header=T)
latmean<-(lat1[,5] + lat2[,5] + lat3[,5] + lat4[,5])/4
spl_latmean<-cbind(spl,latmean[-1])
colnames(spl_latmean)<-c("scaffold","position","length","bfmean")
outname<-paste('lat_scaf_pos_len_bfmeans', basename(species[i]),sep = "_")
write.table(spl_latmean, file=outname, col.names=T, row.names=F, sep= " ", quote=F)
#longitude
long1<-read.table(long[1], header=T)
long2<-read.table(long[2], header=T)
long3<-read.table(long[3], header=T)
long4<-read.table(long[4], header=T)
longmean<-(long1[,5] + long2[,5] + long3[,5] + long4[,5])/4
spl_longmean<-cbind(spl,longmean[-1])
colnames(spl_longmean)<-c("scaffold","position","length","bfmean")
outname<-paste('long_scaf_pos_len_bfmeans', basename(species[i]),sep = "_")
write.table(spl_longmean, file=outname, col.names=T, row.names=F, sep= " ", quote=F)
}
This script will write out the lat_scaf_pos_len_bfmeans*, long_scaf_pos_len_bfmeans*, alt_scaf_pos_len_bfmeans* files for each species
3. Use the calcBFmeans.R file to calculate bayesfactor means for windows along scaffolds.
Here is the script for this:
##take bfmeans
species<-c("bart","cali","chum","cris","knul","land","podu","popp")
allfiles<-list.files(path=".", pattern = "_scaf_pos_len_bfmeans",full.names = TRUE)
for (i in 1:length(allfiles)){
mp_bf<-read.table(allfiles[i], header=T)
##psuedocode
#iterate through all unique scaffolds
#assign a bin ID based on position for scaffold
#divide position by bin width to get a bin ID
#floor the divided value and add 1 for the integer bin ID
window_size<-10000 #keeping 10kb window
window_ids<-floor(mp_bf$position/window_size) + 1 #higher value of the range is bin ID (5-6 then 6 is ID)
mp_bf<-cbind(mp_bf[,c(1:4)],window_ids)
head(mp_bf)
#iterate through all unique scaffolds and bin IDs
#take mean of all positions with same bin ID
#write this into a output file
scaf_ids<-unique(mp_bf$scaffold)
out_file <- data.frame(scaffold=numeric(), windowID=numeric(), bfmean=numeric(), stringsAsFactors=FALSE)
for (id in scaf_ids){
group_scafid<-mp_bf[mp_bf$scaffold == id,]
group_window<-unique(group_scafid$window_ids)
#print(id)
#print(group_window)
#print(dim(group_scafid))
group_window_dim<-dim(group_window)
#print(group_dim)
for (window_id in group_window){
window_group_scafid<-group_scafid[group_scafid$window_ids %in% window_id,]
#print(window_group_scafid)
window_group_scafid_mean<-mean(window_group_scafid[,4])
outdat<-cbind(unique(window_group_scafid$scaffold),unique(window_group_scafid$window_ids),window_group_scafid_mean)
out_file<-rbind(out_file,outdat)
#colnames(out_file)<-c("scaffold","windowID","bfmean")
outname<-paste('window_bfmeans', basename(allfiles[i]), sep = "_")
write.table(out_file, file=outname, row.names = F, col.names = T, sep=" ", quote = F)
}
}
}
This will create the files: window_bfmeans_alt_scaf_pos_len_bfmeans_*, window_bfmeans_lat_scaf_pos_len_bfmeans_*, window_bfmeans_long_scaf_pos_len_bfmeans_*
4. Do randomization tests for parallelism: Used the script randomizations_quants.R for this and wrote results for enrichments for different quantiles in files stored in the folder randomization_results
5. Go to folder randomization_results. Use the code in plot_enrichments.R to plot line plots for the enrichments.