Post date: Jun 27, 2018 10:30:19 PM
I merged the SV calls from delly and lumpy, which came from Kay. This was mainly done in R, though note that I had first reformat the bedpe file from lumpy some (see AddReadSupport.pl).
The work in general was from /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_SV/SVs/. And the main R script is FindSvs.R.
## runs intansv with modifications to work with current formats to generate a combined SV list
library(intansv)
myReadLumpy<-function (file = "", regSizeLowerCutoff = 100, regSizeUpperCutoff = 1e+06,
readsSupport = 3, method = "Lumpy")
{
LumpyCont <- read.table(file, as.is = TRUE,fill=TRUE)
LumpyCont <- LumpyCont[, c(1:2, 5, 11, 12, 16)]
names(LumpyCont) <- c("chr", "start", "end", "type", "info", "rp_support")
LumpyCont$type <- gsub("TYPE:([A-Z]+)", "\\1", LumpyCont$type)
LumpyCont <- LumpyCont[LumpyCont$type %in% c("DELETION", "DUPLICATION",
"INVERSION"), ]
LumpyCont$size <- as.numeric(LumpyCont$end - LumpyCont$start)
LumpyCont <- LumpyCont[LumpyCont$rp_support >= readsSupport &
LumpyCont$size >= regSizeLowerCutoff & LumpyCont$size <=
regSizeUpperCutoff, ]
LumpyCont <- LumpyCont[, c("chr", "start", "end", "size",
"rp_support", "type")]
LumpyDelDf <- LumpyCont[LumpyCont$type == "DELETION", ]
if (!is.null(LumpyDelDf) && nrow(LumpyDelDf) > 0) {
LumpyDelIrange <- GRanges(seqnames = LumpyDelDf$chr,
ranges = IRanges(start = LumpyDelDf$start, end = LumpyDelDf$end))
LumpyDelIrangeRes <- findOverlaps(LumpyDelIrange, reduce(LumpyDelIrange))
LumpyDelDf$clu <- subjectHits(LumpyDelIrangeRes)
LumpyDelDfFilMer <- ddply(LumpyDelDf, ("clu"), LumpyCluster)
LumpyDelDfFilMer <- LumpyDelDfFilMer[, c("chr", "start",
"end", "size", "rp_support")]
names(LumpyDelDfFilMer) <- c("chromosome", "pos1", "pos2",
"size", "readsSupport")
LumpyDelDfFilMer$info <- paste0("SU=", LumpyDelDfFilMer$readsSupport)
LumpyDelDfFilMer$readsSupport <- NULL
}
else {
LumpyDelDfFilMer <- NULL
LumpyDelDfFilMer <- NULL
}
LumpyDupDf <- LumpyCont[LumpyCont$type == "DUPLICATION", ]
if (!is.null(LumpyDupDf) && nrow(LumpyDupDf) > 0) {
LumpyDupIrange <- GRanges(seqnames = LumpyDupDf$chr,
ranges = IRanges(start = LumpyDupDf$start, end = LumpyDupDf$end))
LumpyDupIrangeRes <- findOverlaps(LumpyDupIrange, reduce(LumpyDupIrange))
LumpyDupDf$clu <- subjectHits(LumpyDupIrangeRes)
LumpyDupDfFilMer <- ddply(LumpyDupDf, ("clu"), LumpyCluster)
LumpyDupDfFilMer <- LumpyDupDfFilMer[, c("chr", "start",
"end", "size", "rp_support")]
names(LumpyDupDfFilMer) <- c("chromosome", "pos1", "pos2",
"size", "readsSupport")
LumpyDupDfFilMer$info <- paste0("SU=", LumpyDupDfFilMer$readsSupport)
LumpyDupDfFilMer$readsSupport <- NULL
}
else {
LumpyDupDfFilMer <- NULL
}
LumpyInvDf <- LumpyCont[LumpyCont$type == "INVERSION", ]
if (!is.null(LumpyInvDf) && nrow(LumpyInvDf) > 0) {
LumpyInvIrange <- GRanges(seqnames = LumpyInvDf$chr,
ranges = IRanges(start = LumpyInvDf$start, end = LumpyInvDf$end))
LumpyInvIrangeRes <- findOverlaps(LumpyInvIrange, reduce(LumpyInvIrange))
LumpyInvDf$clu <- subjectHits(LumpyInvIrangeRes)
LumpyInvDfFilMer <- ddply(LumpyInvDf, ("clu"), LumpyCluster)
LumpyInvDfFilMer <- LumpyInvDfFilMer[, c("chr", "start",
"end", "size", "rp_support")]
names(LumpyInvDfFilMer) <- c("chromosome", "pos1", "pos2",
"size", "readsSupport")
LumpyInvDfFilMer$info <- paste0("SU=", LumpyInvDfFilMer$readsSupport)
LumpyInvDfFilMer$readsSupport <- NULL
}
else {
LumpyInvDfFilMer <- NULL
}
retuRes <- list(del = LumpyDelDfFilMer, dup = LumpyDupDfFilMer,
inv = LumpyInvDfFilMer)
attributes(retuRes) <- c(attributes(retuRes), list(method = method))
return(retuRes)
}
e<-environment(readDelly)
environment(myReadLumpy)<-e
lumpy<-myReadLumpy("lumpy/rs_sub_lumpy_test2.bedpe",regSizeLowerCutoff=1000,regSizeUpperCutoff=10000000, readsSupport=3,method="Lumpy")
delly<-readDelly("delly/Delly_filtering_out_overlapping_reads/vcfs/",regSizeLowerCutoff=1000,regSizeUpperCutoff=10000000, readsSupport=3,method="DELLY", pass=TRUE, minMappingQuality=30)
## add pos1 and pos2 to delly info field so that it is carried into final files
delly[[1]]$info<-paste(delly[[1]]$info,delly[[1]]$pos1,delly[[1]]$pos2,sep=";")
delly[[2]]$info<-paste(delly[[2]]$info,delly[[2]]$pos1,delly[[2]]$pos2,sep=";")
delly[[3]]$info<-paste(delly[[3]]$info,delly[[3]]$pos1,delly[[3]]$pos2,sep=";")
sv<-methodsMerge(delly,lumpy)
for(i in 1:3){
size<-sv[[i]][,3]-sv[[i]][,2]
sv[[i]]<-data.frame(sv[[i]],size)
}
## 194 deletions, 223 duplications, and 492 inversions
## with sizes of 1000 to 10 million
## size distributions
#> summary(sv[[1]][,6]) # deletions
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 2373 9400 16116 29372 26832 817971
#> summary(sv[[2]][,6]) # duplications
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 8577 18638 27034 56604 47568 1067830
#> summary(sv[[3]][,6]) # inversions
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1020 4669 9420 33365 17926 6061122
write.table(sv[[1]],"sv_deletions.txt",row.names=FALSE,quote=FALSE)
write.table(sv[[2]],"sv_dups.txt",row.names=FALSE,quote=FALSE)
write.table(sv[[3]],"sv_inversions.txt",row.names=FALSE,quote=FALSE)
save(list=ls(),file="sv.rdat")