#create separate linkage group and scaffold files from the filtered vcfs for each species
grep ^Sc morefilter3X_filtered3x_variantsTimemaBart.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_bart.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaBart.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_bart.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaCali.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_cali.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaCali.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_cali.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaChum.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_chum.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaChum.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_chum.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaCris.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_cris.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaCris.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_cris.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaLand.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_land.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaLand.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_land.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaKnul.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_knul.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaKnul.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_knul.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaPodu.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_podu.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaPodu.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_podu.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaPopp.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_popp.txt
grep ^Sc morefilter3X_filtered3x_variantsTimemaPopp.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_popp.txt
#combine lg and scaffolds
paste lg_bart.txt scaffold_bart.txt > lg_scaffold_bart.txt
paste lg_cali.txt scaffold_cali.txt > lg_scaffold_cali.txt
paste lg_chum.txt scaffold_chum.txt > lg_scaffold_chum.txt
paste lg_cris.txt scaffold_cris.txt > lg_scaffold_cris.txt
paste lg_knul.txt scaffold_knul.txt > lg_scaffold_knul.txt
paste lg_land.txt scaffold_land.txt > lg_scaffold_land.txt
paste lg_podu.txt scaffold_podu.txt > lg_scaffold_podu.txt
paste lg_popp.txt scaffold_popp.txt > lg_scaffold_popp.txt
I then edited the files above manually to add comma delimiter between columns and added a column header. I then moved this final file to the timema_adaptation/analyses/baypass/pcvars_run/outfiles/bfmeans folder
## then in R
#list lg scaffold files for each species
lsfiles<-Sys.glob("lg_scaffold_*")
#list pc bfmeans files for each species
pc1files<-Sys.glob("pc1_scaf_pos_len_bfmeans_*")
pc2files<-Sys.glob("pc2_scaf_pos_len_bfmeans_*")
pc3files<-Sys.glob("pc3_scaf_pos_len_bfmeans_*")
#loops to create files for each species
#pc1
for (i in 1:length(pc1files)){
dat<-read.table(pc1files[i], header=T)
#print(i)
ls<-read.table(lsfiles[i], header=T, sep=",")
datc<-cbind(ls[,1],dat)
colnames(datc)<-c("lg","scaffold","position","length","bfmeans")
#print(head(datc))
write.table(datc, paste("lg",pc1files[[i]],sep = ""), col.names=T, row.names=F, sep=",", quote=F)
}
#pc2
for (i in 1:length(pc2files)){
dat<-read.table(pc2files[i], header=T)
#print(i)
ls<-read.table(lsfiles[i], header=T, sep=",")
datc<-cbind(ls[,1],dat)
colnames(datc)<-c("lg","scaffold","position","length","bfmeans")
#print(head(datc))
write.table(datc, paste("lg",pc2files[[i]],sep = ""), col.names=T, row.names=F, sep=",", quote=F)
}
#pc3
for (i in 1:length(pc3files)){
dat<-read.table(pc3files[i], header=T)
#print(i)
ls<-read.table(lsfiles[i], header=T, sep=",")
datc<-cbind(ls[,1],dat)
colnames(datc)<-c("lg","scaffold","position","length","bfmeans")
#print(head(datc))
write.table(datc, paste("lg",pc3files[[i]],sep = ""), col.names=T, row.names=F, sep=",", quote=F)
}
Then I edited the TimemaAnalyses.py script to TimemaAnalyses_addLG.py to add linkage group column to the final pc file which is used for all downstream analyses.
python TimemaAnalysis_addLG.py lgpc1_scaf_pos_len_bfmeans_ pc1_bin_lgscaf_out.csv
python TimemaAnalysis_addLG.py lgpc2_scaf_pos_len_bfmeans_ pc2_bin_lgscaf_out.csv
python TimemaAnalysis_addLG.py lgpc3_scaf_pos_len_bfmeans_ pc3_bin_lgscaf_out.csv
########
I got the linkage groups for the scaffolds from the timema annotation file in the folder: /uufs/chpc.utah.edu/common/home/gompert-group1/projects/timema_adaptation/analyses/annotation/inputfiles
I first got the info in bash:
cut -d$'\t' -f 1 timema_cristinae_genomeannotation.txt > timema_lg_scaffolds.txt
In vim: replace "_" with "," and remove lg, ord and scaf.
Then in R:
dat<-read.csv("timema_lg_scaffolds.txt", header=F)
datuniq<-unique(dat)
colnames(datuniq)<-c("lg","ord","scaffold")
write.table(datuniq, "timema_lg_scaffolds_uniq.txt",row.names=F, col.names=T, sep=" ", quote=F) #use this for analyses