linkagegroups_scaf

#create separate linkage group and scaffold files from the filtered vcfs for each species

grep ^Sc morefilter3X_filtered3x_variantsTimemaBart.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_bart.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaBart.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_bart.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaCali.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_cali.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaCali.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_cali.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaChum.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_chum.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaChum.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_chum.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaCris.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_cris.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaCris.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_cris.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaLand.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_land.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaLand.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_land.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaKnul.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_knul.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaKnul.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_knul.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaPodu.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_podu.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaPodu.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_podu.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaPopp.vcf | cut -d":" -f 2 | cut -d"-" -f 2 > lg_popp.txt

grep ^Sc morefilter3X_filtered3x_variantsTimemaPopp.vcf | cut -d":" -f 3 | cut -d"-" -f 2 > scaffold_popp.txt

#combine lg and scaffolds

paste lg_bart.txt scaffold_bart.txt > lg_scaffold_bart.txt

paste lg_cali.txt scaffold_cali.txt > lg_scaffold_cali.txt

paste lg_chum.txt scaffold_chum.txt > lg_scaffold_chum.txt

paste lg_cris.txt scaffold_cris.txt > lg_scaffold_cris.txt

paste lg_knul.txt scaffold_knul.txt > lg_scaffold_knul.txt

paste lg_land.txt scaffold_land.txt > lg_scaffold_land.txt

paste lg_podu.txt scaffold_podu.txt > lg_scaffold_podu.txt

paste lg_popp.txt scaffold_popp.txt > lg_scaffold_popp.txt

I then edited the files above manually to add comma delimiter between columns and added a column header. I then moved this final file to the timema_adaptation/analyses/baypass/pcvars_run/outfiles/bfmeans folder

## then in R

#list lg scaffold files for each species

lsfiles<-Sys.glob("lg_scaffold_*")

#list pc bfmeans files for each species

pc1files<-Sys.glob("pc1_scaf_pos_len_bfmeans_*")

pc2files<-Sys.glob("pc2_scaf_pos_len_bfmeans_*")

pc3files<-Sys.glob("pc3_scaf_pos_len_bfmeans_*")

#loops to create files for each species

#pc1

for (i in 1:length(pc1files)){

dat<-read.table(pc1files[i], header=T)

#print(i)

ls<-read.table(lsfiles[i], header=T, sep=",")

datc<-cbind(ls[,1],dat)

colnames(datc)<-c("lg","scaffold","position","length","bfmeans")

#print(head(datc))

write.table(datc, paste("lg",pc1files[[i]],sep = ""), col.names=T, row.names=F, sep=",", quote=F)

}

#pc2

for (i in 1:length(pc2files)){

dat<-read.table(pc2files[i], header=T)

#print(i)

ls<-read.table(lsfiles[i], header=T, sep=",")

datc<-cbind(ls[,1],dat)

colnames(datc)<-c("lg","scaffold","position","length","bfmeans")

#print(head(datc))

write.table(datc, paste("lg",pc2files[[i]],sep = ""), col.names=T, row.names=F, sep=",", quote=F)

}

#pc3

for (i in 1:length(pc3files)){

dat<-read.table(pc3files[i], header=T)

#print(i)

ls<-read.table(lsfiles[i], header=T, sep=",")

datc<-cbind(ls[,1],dat)

colnames(datc)<-c("lg","scaffold","position","length","bfmeans")

#print(head(datc))

write.table(datc, paste("lg",pc3files[[i]],sep = ""), col.names=T, row.names=F, sep=",", quote=F)

}

Then I edited the TimemaAnalyses.py script to TimemaAnalyses_addLG.py to add linkage group column to the final pc file which is used for all downstream analyses.

python TimemaAnalysis_addLG.py lgpc1_scaf_pos_len_bfmeans_ pc1_bin_lgscaf_out.csv

python TimemaAnalysis_addLG.py lgpc2_scaf_pos_len_bfmeans_ pc2_bin_lgscaf_out.csv

python TimemaAnalysis_addLG.py lgpc3_scaf_pos_len_bfmeans_ pc3_bin_lgscaf_out.csv

########

I got the linkage groups for the scaffolds from the timema annotation file in the folder: /uufs/chpc.utah.edu/common/home/gompert-group1/projects/timema_adaptation/analyses/annotation/inputfiles

I first got the info in bash:

cut -d$'\t' -f 1 timema_cristinae_genomeannotation.txt > timema_lg_scaffolds.txt

In vim: replace "_" with "," and remove lg, ord and scaf.

Then in R:

dat<-read.csv("timema_lg_scaffolds.txt", header=F)

datuniq<-unique(dat)

colnames(datuniq)<-c("lg","ord","scaffold")

write.table(datuniq, "timema_lg_scaffolds_uniq.txt",row.names=F, col.names=T, sep=" ", quote=F) #use this for analyses

Page updated

Google Sites

Report abuse