Identifying inversions from the mugsy comparative alignment, continued

Post date: Sep 21, 2018 9:29:4 PM

As it is unclear whether ++ or +- is the inverted orientation (see note here), I decided on a two-step approach to identify inversions from the comparative alignment.

1. First, I putatively defined inversions based on the dominant orientation (which was defined as the non-inverted orientation). This will not always be true, but provides a justifiable best guess.

From the /uufs/chpc.utah.edu/common/home/u6000989/data/timema/tcrDovetail/MugsyComp, I ran

perl ExtractOrientInversions.pl orientation_mualnTcrBrwGrnDec16.txt mualnTcrBrwGrnDec16.maf

This generates the list of putative inversions, inversions_mualnTcrBrwGrnDec16.txt. This file lists the scaffold, start, stop and size of each inversion followed by some constant information fields that can be ignored.

2. Then, I identified those set of these inversions that overlapped with those found by lumpy or delly (I took the union of these two sets).

This was done in the /uufs/chpc.utah.edu/common/home/u6000989/projects/timema_SV/SVs directory with FindSvsMugsy.R, which is provided below with embedded summaries.

## runs intansv with modifications to work with current formats to generate a combined SV list

library(intansv)

myReadLumpy<-function (file = "", regSizeLowerCutoff = 100, regSizeUpperCutoff = 1e+06,

readsSupport = 3, method = "Lumpy")

{

LumpyCont <- read.table(file, as.is = TRUE,fill=TRUE)

LumpyCont <- LumpyCont[, c(1:2, 5, 11, 12, 16)]

names(LumpyCont) <- c("chr", "start", "end", "type", "info", "rp_support")

LumpyCont$type <- gsub("TYPE:([A-Z]+)", "\\1", LumpyCont$type)

LumpyCont <- LumpyCont[LumpyCont$type %in% c("DELETION", "DUPLICATION",

"INVERSION"), ]

LumpyCont$size <- as.numeric(LumpyCont$end - LumpyCont$start)

LumpyCont <- LumpyCont[LumpyCont$rp_support >= readsSupport &

LumpyCont$size >= regSizeLowerCutoff & LumpyCont$size <=

regSizeUpperCutoff, ]

LumpyCont <- LumpyCont[, c("chr", "start", "end", "size",

"rp_support", "type")]

LumpyDelDf <- LumpyCont[LumpyCont$type == "DELETION", ]

if (!is.null(LumpyDelDf) && nrow(LumpyDelDf) > 0) {

LumpyDelIrange <- GRanges(seqnames = LumpyDelDf$chr,

ranges = IRanges(start = LumpyDelDf$start, end = LumpyDelDf$end))

LumpyDelIrangeRes <- findOverlaps(LumpyDelIrange, reduce(LumpyDelIrange))

LumpyDelDf$clu <- subjectHits(LumpyDelIrangeRes)

LumpyDelDfFilMer <- ddply(LumpyDelDf, ("clu"), LumpyCluster)

LumpyDelDfFilMer <- LumpyDelDfFilMer[, c("chr", "start",

"end", "size", "rp_support")]

names(LumpyDelDfFilMer) <- c("chromosome", "pos1", "pos2",

"size", "readsSupport")

LumpyDelDfFilMer$info <- paste0("SU=", LumpyDelDfFilMer$readsSupport)

LumpyDelDfFilMer$readsSupport <- NULL

}

else {

LumpyDelDfFilMer <- NULL

}

LumpyDupDf <- LumpyCont[LumpyCont$type == "DUPLICATION", ]

if (!is.null(LumpyDupDf) && nrow(LumpyDupDf) > 0) {

LumpyDupIrange <- GRanges(seqnames = LumpyDupDf$chr,

ranges = IRanges(start = LumpyDupDf$start, end = LumpyDupDf$end))

LumpyDupIrangeRes <- findOverlaps(LumpyDupIrange, reduce(LumpyDupIrange))

LumpyDupDf$clu <- subjectHits(LumpyDupIrangeRes)

LumpyDupDfFilMer <- ddply(LumpyDupDf, ("clu"), LumpyCluster)

LumpyDupDfFilMer <- LumpyDupDfFilMer[, c("chr", "start",

"end", "size", "rp_support")]

names(LumpyDupDfFilMer) <- c("chromosome", "pos1", "pos2",

"size", "readsSupport")

LumpyDupDfFilMer$info <- paste0("SU=", LumpyDupDfFilMer$readsSupport)

LumpyDupDfFilMer$readsSupport <- NULL

}

else {

LumpyDupDfFilMer <- NULL

}

LumpyInvDf <- LumpyCont[LumpyCont$type == "INVERSION", ]

if (!is.null(LumpyInvDf) && nrow(LumpyInvDf) > 0) {

LumpyInvIrange <- GRanges(seqnames = LumpyInvDf$chr,

ranges = IRanges(start = LumpyInvDf$start, end = LumpyInvDf$end))

LumpyInvIrangeRes <- findOverlaps(LumpyInvIrange, reduce(LumpyInvIrange))

LumpyInvDf$clu <- subjectHits(LumpyInvIrangeRes)

LumpyInvDfFilMer <- ddply(LumpyInvDf, ("clu"), LumpyCluster)

LumpyInvDfFilMer <- LumpyInvDfFilMer[, c("chr", "start",

"end", "size", "rp_support")]

names(LumpyInvDfFilMer) <- c("chromosome", "pos1", "pos2",

"size", "readsSupport")

LumpyInvDfFilMer$info <- paste0("SU=", LumpyInvDfFilMer$readsSupport)

LumpyInvDfFilMer$readsSupport <- NULL

}

else {

LumpyInvDfFilMer <- NULL

}

retuRes <- list(del = LumpyDelDfFilMer, dup = LumpyDupDfFilMer,

inv = LumpyInvDfFilMer)

attributes(retuRes) <- c(attributes(retuRes), list(method = method))

return(retuRes)

}

e<-environment(readDelly)

environment(myReadLumpy)<-e

lumpy<-myReadLumpy("lumpy/rs_sub_lumpy_test2.bedpe",regSizeLowerCutoff=1000,regSizeUpperCutoff=10000000, readsSupport=3,method="Lumpy")

delly<-readDelly("delly/Delly_filtering_out_overlapping_reads/vcfs/",regSizeLowerCutoff=1000,regSizeUpperCutoff=10000000, readsSupport=3,method="DELLY", pass=TRUE, minMappingQuality=30)

## add pos1 and pos2 to delly info field so that it is carried into final files

delly[[1]]$info<-paste(delly[[1]]$info,delly[[1]]$pos1,delly[[1]]$pos2,sep=";")

delly[[2]]$info<-paste(delly[[2]]$info,delly[[2]]$pos1,delly[[2]]$pos2,sep=";")

delly[[3]]$info<-paste(delly[[3]]$info,delly[[3]]$pos1,delly[[3]]$pos2,sep=";")

## delly & lumpy

sv<-methodsMerge(delly,lumpy)

## mugsy inversions

mugsy<-read.table("inversions_mualnTcrBrwGrnDec16.txt")

mugsy<-mugsy[,-6]

colnames(mugsy)<-c("chromosome","pos1","pos2","size","info")

mugsy$info<-paste(mugsy[,1],mugsy[,2],mugsy[,3],sep="_")

mugsyL<-list(del=NULL,dup=NULL,inv=mugsy)

attr(mugsyL,"method")<-"mugsy"

## mugsy plus lumpy

sv_ml<-methodsMerge(lumpy,mugsyL)

## 106 inverions

## mugsy plus delly

sv_md<-methodsMerge(delly,mugsyL)

## 567 inverions

## get unique ids

ids_sv_md<-matrix(unlist(strsplit(sv_md[[3]][,5],"ScRvNkF_")),ncol=2,byrow=TRUE)

ids_sv_ml<-matrix(unlist(strsplit(sv_ml[[3]][,5],"ScRvNkF_")),ncol=2,byrow=TRUE)

## number shared

sum(ids_sv_md[,2] %in% ids_sv_ml[,2])

#[1] 48

add<-which(!ids_sv_ml[,2] %in% ids_sv_md[,2])

## merge the two, take union

sv_mugsy<-rbind(sv_md[[3]],sv_ml[[3]][add,])

size<-sv_mugsy[,3]-sv_mugsy[,2]

sv_mugsy<-data.frame(sv_mugsy,size)

summary(size)

# Min. 1st Qu. Median Mean 3rd Qu. Max.

# 1078 2243 3267 3824 4594 17444

write.table(sv_mugsy,"sv_mugsy.txt",row.names=FALSE,quote=FALSE)

save(list=ls(),file="sv_mugsy.rdat")