William Gilks - Structural variant QC

Structural variant detection in whole-genome resequencing II.

R code for analysis of locus from Genomestrip CNV pipeline output, https://zenodo.org/record/159282

Summary data generated by GATK variantsToTable function.

package.list = c('tidyr', 'dplyr', 'ggplot2', 'cowplot', 'xtable', 'car')

new.packages = package.list[!(package.list %in% installed.packages()[,"Package"])]

if (length(new.packages)) install.packages(new.packages)

rm(package.list, new.packages)

library(ggplot2)

library(tidyr)

library(dplyr)

library(cowplot)

library(xtable)

library(car)

Settings for graphs.

theme_f3 = function() {

  theme_bw(base_size = 10) +

    theme(axis.line = element_line(size = .3, colour = "black"),

          legend.background = element_rect(fill = "white", size = 0.25, linetype = 1, colour = "grey"),

          legend.key = element_rect(colour = "white"),

            panel.border = element_rect(color = "black", size = 0),

            panel.margin.x = unit(c(.1,1,.1,1), "lines"),

            strip.background = element_rect(colour = "white", fill = "white"),

            axis.line.x = element_line(size = .3, colour = "black"),

            axis.line.y = element_line(size = .3, colour = "black"),

            panel.grid.major = element_blank(),

            panel.grid.minor = element_blank())}

attr(theme_f3(), "complete")

Load data

cnv1 = tbl_df(read.delim(file = "filtered.goodS.lhm_gs.cnvs.raw.vcf.qc.tbl"))

Format data

m.cnv1 = cnv1 %>%

  mutate(Mb = POS / 1e+06, lkb = GCLENGTH / 1000) %>%

  select(-c(AN, AF, AC, GLALTSUM, GLHETSUM, POS, GCLENGTH, HOM.VAR, HOM.REF, NO.CALL, GSNNONREF)) %>%

  gather(key = metric, value = value, -c(CHROM, Mb, ID, GSCNCATEGORY))

Calculate median values for each metric grouped by type

cnv.meds = m.cnv1 %>%

  group_by(metric, GSCNCATEGORY) %>%

  summarize(med = median(value, na.rm = T)) %>%

  spread(key = GSCNCATEGORY, med) %>%

  rename(del.med = DEL, dup.med = DUP)

Calculate IQR values for each metric by type

cnv.iqr = m.cnv1 %>%

  group_by(metric, GSCNCATEGORY) %>%

  summarize(iqr = IQR(value, na.rm = T)) %>%

  spread(key = GSCNCATEGORY, iqr) %>%

  rename(del.iqr = DEL, dup.iqr = DUP)

Combine the median and IQR data.

cnv.summ = cbind(cnv.meds[,1:2], cnv.iqr[,2], cnv.meds[,3], cnv.iqr[,3])

Output table in latex format for manuscript.

print( xtable(cnv.summ, caption = "Quality metrics and genotype counts for filtered Genomestrip variants"),

        "filtered_cnvs_qc_summary_latex.txt", type = "latex", include.rownames = FALSE)

Read CNV genotypes data from output of GATK variantsToTable function.

gt = tbl_df(read.delim("filtered.goodS.lhm_gs.cnvs.raw.GT.tbl", header = TRUE))

Convert positions to Mb, convert lengths to Kb, make a stop position for each CNV, reformat data frame as 'long', classify event type by copy-number.

mgt = gt %>%

  mutate(Mb = POS / 1e+06) %>%

  mutate(len.Mb = GCLENGTH / 1e+06) %>%

  mutate(stop = Mb + len.Mb) %>%

  gather(key = sample, value = cn, -c(CHROM, ID, POS, Mb, stop, len.Mb, GSCNCATEGORY, GCLENGTH, GSCNQUAL)) %>%

  mutate(event.type = recode(cn, "1='Deletion';2='wild-type';3='Duplication';4='Duplication';5='Duplication';6='Duplication';7='Duplication'"))

To order get the samples in the correct order on the plot, I assign rank numbers in the other direction.

212 samples, including 1 ref line. Code could be made more elegant. e.g. 212 = length (x), etc.

v1 = sort(rep(seq(1:212), 167), decreasing = TRUE)

cnv.gdat = cbind(mgt, v1)

      head(cnv.gdat)

      dim(mgt)

cnv.gplot =

  ggplot(data = filter(cnv.gdat, CHROM != "chr4"), aes(as.factor(v1), Mb, fill = event.type, colour = event.type)) +

  geom_crossbar(aes(ymin = Mb, ymax = stop)) +

  scale_x_discrete(labels = NULL, breaks = NULL, name = "Sample (in order sequenced)") +

  scale_y_continuous(name = "Position (Mb)", breaks = c(0, 5, 10, 15, 20, 25, 30), expand = c(0,0)) +

  scale_colour_manual(values = c("grey10", "forestgreen", "grey95"), name = "CNV event type") +

  scale_fill_manual(values = c("grey10", "forestgreen", "grey95"), name = "CNV event type") +

  theme_bw() +

  coord_flip() +

  guides(fill = guide_legend(ncol = 1)) +

  facet_grid(.~CHROM, scales = "free", space = "free", switch = "x") +

  theme(

    panel.margin.x = unit(c(-.02,.4,-.02,.4), "cm"),

    legend.position = "bottom",

    panel.background = element_blank(),

    strip.background = element_rect(colour = "white", fill = "white"),

    panel.grid = element_blank())

Make extra variables, and catagorise heterozygote count as a proxy for MAF.

distr.dat = cnv1 %>%

  mutate(gcount = HOM.REF + HET + HOM.VAR + NO.CALL) %>%

  mutate(prop.het1 = 100 * (HET / gcount)) %>%

  mutate(het.group = cut(prop.het1, breaks = seq(0,100,20), labels = c("0-20","20-40","40-60","60-80","80-100"))) %>%

  mutate(kb.len = GCLENGTH / 1000) %>%

  mutate(Mb = POS / 1000000) %>%

  select(-c(HOM.REF, HOM.VAR, NO.CALL, GLALTSUM, GLHETSUM, AN, AF, AC, GSNNONREF, Mb, prop.het1, gcount, GCLENGTH)) %>%

  gather(metric, value, -c(CHROM, ID, POS, GSCNCATEGORY, het.group))

Labels for facet plot

cnvdis.labs = as_labeller(c(`DEL` = "Deletions", `DUP` = "Duplications",

    `GCFRACTION` = "GC-content", `GSCLUSTERSEP` = "Cluster separation",`GSCNQUAL` = "Quality",

    `HET` = "het. count", `kb.len` = "Length(Kb)"))

Make plot for distributions of cnv metrics, coloured by proportion of heterozygote genotypes.

cnv.dis =

ggplot(distr.dat, aes(value, fill = het.group)) +

  geom_histogram(position = "stack", colour = "grey10", size = .01, bins = 10) +

  theme_bw(base_size = 28) +

  scale_fill_manual(name = "% Heterozygous Genotypes", values = alpha(c(

    "lemonchiffon", "tan1", "orangered", "darkorchid", "black"),.8)) +

  labs(y = "Variant Count") +

  coord_cartesian(expand = FALSE) +

  facet_grid(GSCNCATEGORY~metric, scales = "free", switch = "y", labeller = cnvdis.labs) +

  theme(strip.background = element_rect(colour = "white", fill = "white"),

        legend.position = "bottom",

        panel.grid.minor = element_blank())

Google Sites

Report abuse