Get the genotype likelihood files for each population for each species - this is the entropy input file
Subset the top SNPs for each species from each population
Get the pairwise LD for top SNPs
Pairwise LD for top snps squared correlation histogram, mean, quantiles
Pairwise LD for top species
Find the max LD for a given SNP with any SNP - collapse to max LD for each SNP - make a histogram of that - how associated is each SNP with which it is associated - if those values are low then independence or some are high
Use estimates from entropy
Each population pairwise LD X 3 PCs X 8 species
Average LD among the top SNPs = 45 values of each PC and then a histogram of all 45 values
1. Prepare the input file. I used the input file from entropy. I converted the gl file to genotype estimate file $dir = /uufs/chpc.utah.edu/common/home/u6007910/projects/timema_adaptation/analyses/ldcalc mkdir genest_inputfiles cp ../../entropy/bart/timemaBart.gl ./ cp ../../entropy/cali/timemaCali.gl ./ cp ../../entropy/chum/timemaChum.gl ./ cp ../../entropy/cris/timemaCris.gl ./ cp ../../entropy/land/timemaLand.gl ./ cp ../../entropy/knul/timemaKnul.gl ./ cp ../../entropy/podu/timemaPodu.gl ./ cp ../../entropy/popp/timemaPopp.gl ./ head -1 *gl ==> timemaBart.gl <== 195 3074 ==> timemaCali.gl <== 77 7858 ==> timemaChum.gl <== 358 4172 ==> timemaCris.gl <== 205 196252 ==> timemaKnul.gl <== 89 11139 ==> timemaLand.gl <== 125 8548 ==> timemaPodu.gl <== 255 6000 ==> timemaPopp.gl <== 116 7157 Drop the first line from each file using vim. 2. Subset the top SNPs from each species for each PC #get the top SNPs for each PC and get there lg,scaf, position #read in the file for baypass means for each species #function to get topsnps file for each species to subset them from the genotype likelihood file library(DataCombine) pcvars<-c("pc1","pc2","pc3") species<-c("bart","cali","chum","cris","knul","land","podu","popp") for(s in species){ dat_bfm<-read.table(paste("../../baypass/pcvars_run/outfiles/bfmeans/lgpc1_scaf_pos_len_bfmeans",s,sep="_"), header=T, sep=",") topsnps_bfm<-dat_bfm[which(dat_bfm[,5] > quantile(dat_bfm[,5],probs=0.9,na.rm=T)),] topsnps_bfm_o<-topsnps_bfm[order(topsnps_bfm$bfmeans, decreasing=T),][1:10,] tr<-rep("TRUE",10) write.table(cbind(topsnps_bfm_o[,c(1:3)],tr),file=paste("topsnps_pc1",s,sep="_"), sep=" ",quote=F,col.names=T,row.names=F) } for(s in species){ dat_bfm<-read.table(paste("../../baypass/pcvars_run/outfiles/bfmeans/lgpc2_scaf_pos_len_bfmeans",s,sep="_"), header=T, sep=",") topsnps_bfm<-dat_bfm[which(dat_bfm[,5] > quantile(dat_bfm[,5],probs=0.9,na.rm=T)),] topsnps_bfm_o<-topsnps_bfm[order(topsnps_bfm$bfmeans, decreasing=T),][1:10,] tr<-rep("TRUE",10) write.table(cbind(topsnps_bfm_o[,c(1:3)],tr),file=paste("topsnps_pc2",s,sep="_"), sep=" ",quote=F,col.names=T,row.names=F) } for(s in species){ dat_bfm<-read.table(paste("../../baypass/pcvars_run/outfiles/bfmeans/lgpc3_scaf_pos_len_bfmeans",s,sep="_"), header=T, sep=",") topsnps_bfm<-dat_bfm[which(dat_bfm[,5] > quantile(dat_bfm[,5],probs=0.9,na.rm=T)),] topsnps_bfm_o<-topsnps_bfm[order(topsnps_bfm$bfmeans, decreasing=T),][1:10,] tr<-rep("TRUE",10) write.table(cbind(topsnps_bfm_o[,c(1:3)],tr),file=paste("topsnps_pc3",s,sep="_"), sep=" ",quote=F,col.names=T,row.names=F) } #subset them from the entire matrix on command line using perl script gl2genest_subloci.pl perl gl2genest_subloci.pl topsnps_pc1_bart timemaBart.gl perl gl2genest_subloci.pl topsnps_pc1_cali timemaCali.gl perl gl2genest_subloci.pl topsnps_pc1_chum timemaChum.gl perl gl2genest_subloci.pl topsnps_pc1_cris timemaCris.gl perl gl2genest_subloci.pl topsnps_pc1_knul timemaKnul.gl perl gl2genest_subloci.pl topsnps_pc1_land timemaLand.gl perl gl2genest_subloci.pl topsnps_pc1_podu timemaPodu.gl perl gl2genest_subloci.pl topsnps_pc1_popp timemaPopp.gl perl gl2genest_subloci_pc2.pl topsnps_pc2_bart timemaBart.gl perl gl2genest_subloci_pc2.pl topsnps_pc2_cali timemaCali.gl perl gl2genest_subloci_pc2.pl topsnps_pc2_chum timemaChum.gl perl gl2genest_subloci_pc2.pl topsnps_pc2_cris timemaCris.gl perl gl2genest_subloci_pc2.pl topsnps_pc2_knul timemaKnul.gl perl gl2genest_subloci_pc2.pl topsnps_pc2_land timemaLand.gl perl gl2genest_subloci_pc2.pl topsnps_pc2_podu timemaPodu.gl perl gl2genest_subloci_pc2.pl topsnps_pc2_popp timemaPopp.gl perl gl2genest_subloci_pc3.pl topsnps_pc3_bart timemaBart.gl perl gl2genest_subloci_pc3.pl topsnps_pc3_cali timemaCali.gl perl gl2genest_subloci_pc3.pl topsnps_pc3_chum timemaChum.gl perl gl2genest_subloci_pc3.pl topsnps_pc3_cris timemaCris.gl perl gl2genest_subloci_pc3.pl topsnps_pc3_knul timemaKnul.gl perl gl2genest_subloci_pc3.pl topsnps_pc3_land timemaLand.gl perl gl2genest_subloci_pc3.pl topsnps_pc3_podu timemaPodu.gl perl gl2genest_subloci_pc3.pl topsnps_pc3_popp timemaPopp.gl #edit the pop files perl -p -i -e 's/_SRR\d+.fastq//g' *_pops #podura had a population PCTCR with just one individual so I dropped that column by editing in R ```R #then subset columns of each population #calculate LD and get summaries #repeat for each PC wherein I subset top SNPs for each PC, repeat for each population #read in snp lg, scaf, position file for a population, for PC1 species<-c("bart","cali","chum","cris","knul","land","podu","popp") ### pc1 ####### pdf("pc1_LD.pdf", height=20, width=16) par(mfrow=c(9,6)) for (s in species){ pops<-read.table(paste(s,"pops",sep="_"),header=F) #read in genotyope estimate file genest<-read.table(paste0("pntest_pc1_",s,".txt"),header=F) colnames(genest)<-t(pops[1:length(pops)]) genest<-as.data.frame(genest) posi<-unique(colnames(genest)) for (p in posi){ mat<-genest[,which(colnames(genest) == p)] #calculate LD ld<-cor(t(mat))^2 #get summaries diag(ld)<-NA #plot histogram par(mar=c(4,3,4,3)) hist(ld[lower.tri(ld, diag = F)],xlab = "Pairwise LD", main = paste(p,s)) } } dev.off() pdf("pc1_maxLD.pdf", height=20, width=16) par(mfrow=c(9,6)) for (s in species){ pops<-read.table(paste(s,"pops",sep="_"),header=F) #read in genotyope estimate file genest<-read.table(paste0("pntest_pc1_",s,".txt"),header=F) colnames(genest)<-t(pops[1:length(pops)]) genest<-as.data.frame(genest) posi<-unique(colnames(genest)) for (p in posi){ mat<-genest[,which(colnames(genest) == p)] #calculate LD ld<-cor(t(mat))^2 #get summaries diag(ld)<-NA #get max ld for a given SNP with any other SNP ld.max<-apply(ld,1,max,na.rm=T) #plot histogram par(mar=c(4,3,4,3)) hist(ld.max,xlab = "Pairwise max LD", main = paste(p,s)) } } dev.off() ### pc2 ####### pdf("pc2_LD.pdf", height=20, width=16) par(mfrow=c(9,6)) for (s in species){ pops<-read.table(paste(s,"pops",sep="_"),header=F) #read in genotyope estimate file genest<-read.table(paste0("pntest_pc2_",s,".txt"),header=F) colnames(genest)<-t(pops[1:length(pops)]) genest<-as.data.frame(genest) posi<-unique(colnames(genest)) for (p in posi){ mat<-genest[,which(colnames(genest) == p)] #calculate LD ld<-cor(t(mat))^2 #get summaries diag(ld)<-NA #plot histogram par(mar=c(4,3,4,3)) hist(ld[lower.tri(ld, diag = F)],xlab = "Pairwise LD", main = paste(p,s)) } } dev.off() pdf("pc2_maxLD.pdf", height=20, width=16) par(mfrow=c(9,6)) for (s in species){ pops<-read.table(paste(s,"pops",sep="_"),header=F) #read in genotyope estimate file genest<-read.table(paste0("pntest_pc2_",s,".txt"),header=F) colnames(genest)<-t(pops[1:length(pops)]) genest<-as.data.frame(genest) posi<-unique(colnames(genest)) for (p in posi){ mat<-genest[,which(colnames(genest) == p)] #calculate LD ld<-cor(t(mat))^2 #get summaries diag(ld)<-NA #get max ld for a given SNP with any other SNP ld.max<-apply(ld,1,max,na.rm=T) #plot histogram par(mar=c(4,3,4,3)) hist(ld.max,xlab = "Pairwise max LD", main = paste(p,s)) } } dev.off() ### pc3 ####### pdf("pc3_LD.pdf", height=20, width=16) par(mfrow=c(9,6)) for (s in species){ pops<-read.table(paste(s,"pops",sep="_"),header=F) #read in genotyope estimate file genest<-read.table(paste0("pntest_pc3_",s,".txt"),header=F) colnames(genest)<-t(pops[1:length(pops)]) genest<-as.data.frame(genest) posi<-unique(colnames(genest)) for (p in posi){ mat<-genest[,which(colnames(genest) == p)] #calculate LD ld<-cor(t(mat))^2 #get summaries diag(ld)<-NA #plot histogram par(mar=c(4,3,4,3)) hist(ld[lower.tri(ld, diag = F)],xlab = "Pairwise LD", main = paste(p,s)) } } dev.off() pdf("pc3_maxLD.pdf", height=20, width=16) par(mfrow=c(9,6)) for (s in species){ pops<-read.table(paste(s,"pops",sep="_"),header=F) #read in genotyope estimate file genest<-read.table(paste0("pntest_pc3_",s,".txt"),header=F) colnames(genest)<-t(pops[1:length(pops)]) genest<-as.data.frame(genest) posi<-unique(colnames(genest)) for (p in posi){ mat<-genest[,which(colnames(genest) == p)] #calculate LD ld<-cor(t(mat))^2 #get summaries diag(ld)<-NA #get max ld for a given SNP with any other SNP ld.max<-apply(ld,1,max,na.rm=T) #plot histogram par(mar=c(4,3,4,3)) hist(ld.max,xlab = "Pairwise max LD", main = paste(p,s)) } } dev.off() mean(apply(ld,1,mean,na.rm=TRUE)) ## 0.9933381 sd(apply(ld,1,mean,na.rm=TRUE)) ## 0.004128813 mean(apply(ld,1,max,na.rm=TRUE)) ## 0.9972744 quantile(ld, na.rm = T) ## 0% 25% 50% 75% 100% ## 0.9818334 0.9905138 0.9945487 0.9985837 1.0000000 ```