correlations_randomizationsR

To do randomizations for parallelism across species, I will divide the entire genome into SNP windows. Here is the code to get the sequence positions from the genome:

Genome is in the folder: /uufs/chpc.utah.edu/common/home/u6007910/projects/timema_adaptation/fasta_genome

grep -o -E "^>ScRvNkF_[0-9_-.,]+:LG-[0-9.,]+:SCAF-[0-9.,]+:MAPPOS-[0-9A-Z.;]+" re_mod_map_timema_06Jun2016_RvNkF702.fasta > mappos.txt

Then created a file containing scaffold lengths called scaffold_length.txt using calcScafLength.py (in the same folder).

Create mappos file for each species using vcf files (saved in folder: /uufs/chpc.utah.edu/common/home/u6007910/projects/timema_adaptation/consensus_align/variantcalling)

#scaffolds

grep ^Sc morefilter3X_filtered3x_variantsTimemaBart.vcf | cut -d ':' -f 3 > scafBart3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaCali.vcf | cut -d ':' -f 3 > scafCali3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaChum.vcf | cut -d ':' -f 3 > scafChum3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaCris.vcf | cut -d ':' -f 3 > scafCris3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaKnul.vcf | cut -d ':' -f 3 > scafKnul3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaLand.vcf | cut -d ':' -f 3 > scafLand3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaPodu.vcf | cut -d ':' -f 3 > scafPodu3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaPopp.vcf | cut -d ':' -f 3 > scafPopp3x.txt

#replace the "SCAF-" prefix from all scaf files

perl -p -i -e s/SCAF-//g scaf*3x.txt

#positions

grep ^Sc morefilter3X_filtered3x_variantsTimemaBart.vcf | cut -f 2 > posBart3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaCali.vcf | cut -f 2 > posCali3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaChum.vcf | cut -f 2 > posChum3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaCris.vcf | cut -f 2 > posCris3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaKnul.vcf | cut -f 2 > posKnul3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaLand.vcf | cut -f 2 > posLand3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaPodu.vcf | cut -f 2 > posPodu3x.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaPopp.vcf | cut -f 2 > posPopp3x.txt

#mappos files

paste scafBart3x.txt posBart3x.txt > mappos_bart.txt

paste scafCali3x.txt posCali3x.txt > mappos_cali.txt

paste scafChum3x.txt posChum3x.txt > mappos_chum.txt

paste scafCris3x.txt posCris3x.txt > mappos_cris.txt

paste scafKnul3x.txt posKnul3x.txt > mappos_knul.txt

paste scafLand3x.txt posLand3x.txt > mappos_land.txt

paste scafPodu3x.txt posPodu3x.txt > mappos_podu.txt

paste scafPopp3x.txt posPopp3x.txt > mappos_popp.txt

1. run combine_mappos_scaflength.py to combine the mappos files for each species with scaffold length (I had to fix scaffold 702 in scaffold_length.txt file and change it to 702.1,702.2 and 702.3). This is in the genofiles folder.

python combine_mappos_scaflength.py scaffold_length.txt mappos_bart.txt mappos_len_bart.txt

python combine_mappos_scaflength.py scaffold_length.txt mappos_cali.txt mappos_len_cali.txt

python combine_mappos_scaflength.py scaffold_length.txt mappos_chum.txt mappos_len_chum.txt

python combine_mappos_scaflength.py scaffold_length.txt mappos_cris.txt mappos_len_cris.txt

python combine_mappos_scaflength.py scaffold_length.txt mappos_knul.txt mappos_len_knul.txt

python combine_mappos_scaflength.py scaffold_length.txt mappos_land.txt mappos_len_land.txt

python combine_mappos_scaflength.py scaffold_length.txt mappos_podu.txt mappos_len_podu.txt

python combine_mappos_scaflength.py scaffold_length.txt mappos_popp.txt mappos_len_popp.txt

2. Use the script combine_bfmeans_mapposlen.R to take bfmeans for each species across chains and combine them with mappos_scaflen files

Here is the script:

#this for loop iterates through species name and writes out bfmeans file for alt, lat and long

#define species name to iterate

species<-c("bart","cali","chum","cris","knul","land","podu","popp")

#for loop to calculate bfmeans and write out the file for each climate variable

for (i in 2:length(species)){

allfiles<-list.files(path=".", pattern = species[i],full.names = TRUE)

mapfiles<-list.files(path="../../genofiles", pattern = "mappos_len", full.names = TRUE)

sps_file<-mapfiles[grepl(species[i],mapfiles)]

spl<-read.table(sps_file, header=T, sep="\t")

#subset clims

alt<-allfiles[grepl("./alt",allfiles)]

alt<-alt[grepl("summary_betai_reg.out", alt)]

lat<-allfiles[grepl("./lat",allfiles)]

lat<-lat[grepl("summary_betai_reg.out", lat)]

long<-allfiles[grepl("./long",allfiles)]

long<-long[grepl("summary_betai_reg.out", long)]

#take means

#altitude

alt1<-read.table(alt[1], header=T)

alt2<-read.table(alt[2], header=T)

alt3<-read.table(alt[3], header=T)

alt4<-read.table(alt[4], header=T)

altmean<-(alt1[,5] + alt2[,5] + alt3[,5] + alt4[,5])/4

spl_altmean<-cbind(spl,altmean[-1])

colnames(spl_altmean)<-c("scaffold","position","length","bfmean")

outname<-paste('alt_scaf_pos_len_bfmeans', basename(species[i]),sep = "_")

write.table(spl_altmean, file=outname, col.names=T, row.names=F, sep= " ", quote=F)

#latitude

lat1<-read.table(lat[1], header=T)

lat2<-read.table(lat[2], header=T)

lat3<-read.table(lat[3], header=T)

lat4<-read.table(lat[4], header=T)

latmean<-(lat1[,5] + lat2[,5] + lat3[,5] + lat4[,5])/4

spl_latmean<-cbind(spl,latmean[-1])

colnames(spl_latmean)<-c("scaffold","position","length","bfmean")

outname<-paste('lat_scaf_pos_len_bfmeans', basename(species[i]),sep = "_")

write.table(spl_latmean, file=outname, col.names=T, row.names=F, sep= " ", quote=F)

#longitude

long1<-read.table(long[1], header=T)

long2<-read.table(long[2], header=T)

long3<-read.table(long[3], header=T)

long4<-read.table(long[4], header=T)

longmean<-(long1[,5] + long2[,5] + long3[,5] + long4[,5])/4

spl_longmean<-cbind(spl,longmean[-1])

colnames(spl_longmean)<-c("scaffold","position","length","bfmean")

outname<-paste('long_scaf_pos_len_bfmeans', basename(species[i]),sep = "_")

write.table(spl_longmean, file=outname, col.names=T, row.names=F, sep= " ", quote=F)

}

This script will write out the lat_scaf_pos_len_bfmeans*, long_scaf_pos_len_bfmeans*, alt_scaf_pos_len_bfmeans* files for each species

3. Use the calcBFmeans.R file to calculate bayesfactor means for windows along scaffolds.

Here is the script for this:

##take bfmeans

species<-c("bart","cali","chum","cris","knul","land","podu","popp")

allfiles<-list.files(path=".", pattern = "_scaf_pos_len_bfmeans",full.names = TRUE)

for (i in 1:length(allfiles)){

mp_bf<-read.table(allfiles[i], header=T)

##psuedocode

#iterate through all unique scaffolds

#assign a bin ID based on position for scaffold

#divide position by bin width to get a bin ID

#floor the divided value and add 1 for the integer bin ID

window_size<-10000 #keeping 10kb window

window_ids<-floor(mp_bf$position/window_size) + 1 #higher value of the range is bin ID (5-6 then 6 is ID)

mp_bf<-cbind(mp_bf[,c(1:4)],window_ids)

head(mp_bf)

#iterate through all unique scaffolds and bin IDs

#take mean of all positions with same bin ID

#write this into a output file

scaf_ids<-unique(mp_bf$scaffold)

out_file <- data.frame(scaffold=numeric(), windowID=numeric(), bfmean=numeric(), stringsAsFactors=FALSE)

for (id in scaf_ids){

group_scafid<-mp_bf[mp_bf$scaffold == id,]

group_window<-unique(group_scafid$window_ids)

#print(id)

#print(group_window)

#print(dim(group_scafid))

group_window_dim<-dim(group_window)

#print(group_dim)

for (window_id in group_window){

window_group_scafid<-group_scafid[group_scafid$window_ids %in% window_id,]

#print(window_group_scafid)

window_group_scafid_mean<-mean(window_group_scafid[,4])

outdat<-cbind(unique(window_group_scafid$scaffold),unique(window_group_scafid$window_ids),window_group_scafid_mean)

out_file<-rbind(out_file,outdat)

#colnames(out_file)<-c("scaffold","windowID","bfmean")

outname<-paste('window_bfmeans', basename(allfiles[i]), sep = "_")

write.table(out_file, file=outname, row.names = F, col.names = T, sep=" ", quote = F)

}

This will create the files: window_bfmeans_alt_scaf_pos_len_bfmeans_*, window_bfmeans_lat_scaf_pos_len_bfmeans_*, window_bfmeans_long_scaf_pos_len_bfmeans_*

4. Do randomization tests for parallelism: Used the script randomizations_quants.R for this and wrote results for enrichments for different quantiles in files stored in the folder randomization_results

5. Go to folder randomization_results. Use the code in plot_enrichments.R to plot line plots for the enrichments.

Page updated

Google Sites

Report abuse