Post date: Sep 21, 2018 9:29:4 PM
As it is unclear whether ++ or +- is the inverted orientation (see note here), I decided on a two-step approach to identify inversions from the comparative alignment.
1. First, I putatively defined inversions based on the dominant orientation (which was defined as the non-inverted orientation). This will not always be true, but provides a justifiable best guess.
From the /uufs/chpc.utah.edu/common/home/u6000989/data/timema/tcrDovetail/MugsyComp, I ran
perl ExtractOrientInversions.pl orientation_mualnTcrBrwGrnDec16.txt mualnTcrBrwGrnDec16.maf
This generates the list of putative inversions, inversions_mualnTcrBrwGrnDec16.txt. This file lists the scaffold, start, stop and size of each inversion followed by some constant information fields that can be ignored.
2. Then, I identified those set of these inversions that overlapped with those found by lumpy or delly (I took the union of these two sets).
This was done in the /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_SV/SVs directory with FindSvsMugsy.R, which is provided below with embedded summaries.
## runs intansv with modifications to work with current formats to generate a combined SV list
library(intansv)
myReadLumpy<-function (file = "", regSizeLowerCutoff = 100, regSizeUpperCutoff = 1e+06,
readsSupport = 3, method = "Lumpy")
{
LumpyCont <- read.table(file, as.is = TRUE,fill=TRUE)
LumpyCont <- LumpyCont[, c(1:2, 5, 11, 12, 16)]
names(LumpyCont) <- c("chr", "start", "end", "type", "info", "rp_support")
LumpyCont$type <- gsub("TYPE:([A-Z]+)", "\\1", LumpyCont$type)
LumpyCont <- LumpyCont[LumpyCont$type %in% c("DELETION", "DUPLICATION",
"INVERSION"), ]
LumpyCont$size <- as.numeric(LumpyCont$end - LumpyCont$start)
LumpyCont <- LumpyCont[LumpyCont$rp_support >= readsSupport &
LumpyCont$size >= regSizeLowerCutoff & LumpyCont$size <=
regSizeUpperCutoff, ]
LumpyCont <- LumpyCont[, c("chr", "start", "end", "size",
"rp_support", "type")]
LumpyDelDf <- LumpyCont[LumpyCont$type == "DELETION", ]
if (!is.null(LumpyDelDf) && nrow(LumpyDelDf) > 0) {
LumpyDelIrange <- GRanges(seqnames = LumpyDelDf$chr,
ranges = IRanges(start = LumpyDelDf$start, end = LumpyDelDf$end))
LumpyDelIrangeRes <- findOverlaps(LumpyDelIrange, reduce(LumpyDelIrange))
LumpyDelDf$clu <- subjectHits(LumpyDelIrangeRes)
LumpyDelDfFilMer <- ddply(LumpyDelDf, ("clu"), LumpyCluster)
LumpyDelDfFilMer <- LumpyDelDfFilMer[, c("chr", "start",
"end", "size", "rp_support")]
names(LumpyDelDfFilMer) <- c("chromosome", "pos1", "pos2",
"size", "readsSupport")
LumpyDelDfFilMer$info <- paste0("SU=", LumpyDelDfFilMer$readsSupport)
LumpyDelDfFilMer$readsSupport <- NULL
}
else {
LumpyDelDfFilMer <- NULL
LumpyDelDfFilMer <- NULL
}
LumpyDupDf <- LumpyCont[LumpyCont$type == "DUPLICATION", ]
if (!is.null(LumpyDupDf) && nrow(LumpyDupDf) > 0) {
LumpyDupIrange <- GRanges(seqnames = LumpyDupDf$chr,
ranges = IRanges(start = LumpyDupDf$start, end = LumpyDupDf$end))
LumpyDupIrangeRes <- findOverlaps(LumpyDupIrange, reduce(LumpyDupIrange))
LumpyDupDf$clu <- subjectHits(LumpyDupIrangeRes)
LumpyDupDfFilMer <- ddply(LumpyDupDf, ("clu"), LumpyCluster)
LumpyDupDfFilMer <- LumpyDupDfFilMer[, c("chr", "start",
"end", "size", "rp_support")]
names(LumpyDupDfFilMer) <- c("chromosome", "pos1", "pos2",
"size", "readsSupport")
LumpyDupDfFilMer$info <- paste0("SU=", LumpyDupDfFilMer$readsSupport)
LumpyDupDfFilMer$readsSupport <- NULL
}
else {
LumpyDupDfFilMer <- NULL
}
LumpyInvDf <- LumpyCont[LumpyCont$type == "INVERSION", ]
if (!is.null(LumpyInvDf) && nrow(LumpyInvDf) > 0) {
LumpyInvIrange <- GRanges(seqnames = LumpyInvDf$chr,
ranges = IRanges(start = LumpyInvDf$start, end = LumpyInvDf$end))
LumpyInvIrangeRes <- findOverlaps(LumpyInvIrange, reduce(LumpyInvIrange))
LumpyInvDf$clu <- subjectHits(LumpyInvIrangeRes)
LumpyInvDfFilMer <- ddply(LumpyInvDf, ("clu"), LumpyCluster)
LumpyInvDfFilMer <- LumpyInvDfFilMer[, c("chr", "start",
"end", "size", "rp_support")]
names(LumpyInvDfFilMer) <- c("chromosome", "pos1", "pos2",
"size", "readsSupport")
LumpyInvDfFilMer$info <- paste0("SU=", LumpyInvDfFilMer$readsSupport)
LumpyInvDfFilMer$readsSupport <- NULL
}
else {
LumpyInvDfFilMer <- NULL
}
retuRes <- list(del = LumpyDelDfFilMer, dup = LumpyDupDfFilMer,
inv = LumpyInvDfFilMer)
attributes(retuRes) <- c(attributes(retuRes), list(method = method))
return(retuRes)
}
e<-environment(readDelly)
environment(myReadLumpy)<-e
lumpy<-myReadLumpy("lumpy/rs_sub_lumpy_test2.bedpe",regSizeLowerCutoff=1000,regSizeUpperCutoff=10000000, readsSupport=3,method="Lumpy")
delly<-readDelly("delly/Delly_filtering_out_overlapping_reads/vcfs/",regSizeLowerCutoff=1000,regSizeUpperCutoff=10000000, readsSupport=3,method="DELLY", pass=TRUE, minMappingQuality=30)
## add pos1 and pos2 to delly info field so that it is carried into final files
delly[[1]]$info<-paste(delly[[1]]$info,delly[[1]]$pos1,delly[[1]]$pos2,sep=";")
delly[[2]]$info<-paste(delly[[2]]$info,delly[[2]]$pos1,delly[[2]]$pos2,sep=";")
delly[[3]]$info<-paste(delly[[3]]$info,delly[[3]]$pos1,delly[[3]]$pos2,sep=";")
## delly & lumpy
sv<-methodsMerge(delly,lumpy)
## mugsy inversions
mugsy<-read.table("inversions_mualnTcrBrwGrnDec16.txt")
mugsy<-mugsy[,-6]
colnames(mugsy)<-c("chromosome","pos1","pos2","size","info")
mugsy$info<-paste(mugsy[,1],mugsy[,2],mugsy[,3],sep="_")
mugsyL<-list(del=NULL,dup=NULL,inv=mugsy)
attr(mugsyL,"method")<-"mugsy"
## mugsy plus lumpy
sv_ml<-methodsMerge(lumpy,mugsyL)
## 106 inverions
## mugsy plus delly
sv_md<-methodsMerge(delly,mugsyL)
## 567 inverions
## get unique ids
ids_sv_md<-matrix(unlist(strsplit(sv_md[[3]][,5],"ScRvNkF_")),ncol=2,byrow=TRUE)
ids_sv_ml<-matrix(unlist(strsplit(sv_ml[[3]][,5],"ScRvNkF_")),ncol=2,byrow=TRUE)
## number shared
sum(ids_sv_md[,2] %in% ids_sv_ml[,2])
#[1] 48
add<-which(!ids_sv_ml[,2] %in% ids_sv_md[,2])
## merge the two, take union
sv_mugsy<-rbind(sv_md[[3]],sv_ml[[3]][add,])
size<-sv_mugsy[,3]-sv_mugsy[,2]
sv_mugsy<-data.frame(sv_mugsy,size)
summary(size)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1078 2243 3267 3824 4594 17444
write.table(sv_mugsy,"sv_mugsy.txt",row.names=FALSE,quote=FALSE)
save(list=ls(),file="sv_mugsy.rdat")