R code for analysis of locus from Genomestrip CNV pipeline output, https://zenodo.org/record/159282
Summary data generated by GATK variantsToTable function.
package.list = c('tidyr', 'dplyr', 'ggplot2', 'cowplot', 'xtable', 'car')new.packages = package.list[!(package.list %in% installed.packages()[,"Package"])]if (length(new.packages)) install.packages(new.packages)rm(package.list, new.packages)library(ggplot2)library(tidyr)library(dplyr)library(cowplot)library(xtable)library(car)Settings for graphs.
theme_f3 = function() { theme_bw(base_size = 10) + theme(axis.line = element_line(size = .3, colour = "black"), legend.background = element_rect(fill = "white", size = 0.25, linetype = 1, colour = "grey"), legend.key = element_rect(colour = "white"), panel.border = element_rect(color = "black", size = 0), panel.margin.x = unit(c(.1,1,.1,1), "lines"), strip.background = element_rect(colour = "white", fill = "white"), axis.line.x = element_line(size = .3, colour = "black"), axis.line.y = element_line(size = .3, colour = "black"), panel.grid.major = element_blank(), panel.grid.minor = element_blank())}attr(theme_f3(), "complete")Load data
cnv1 = tbl_df(read.delim(file = "filtered.goodS.lhm_gs.cnvs.raw.vcf.qc.tbl"))Format data
m.cnv1 = cnv1 %>% mutate(Mb = POS / 1e+06, lkb = GCLENGTH / 1000) %>% select(-c(AN, AF, AC, GLALTSUM, GLHETSUM, POS, GCLENGTH, HOM.VAR, HOM.REF, NO.CALL, GSNNONREF)) %>% gather(key = metric, value = value, -c(CHROM, Mb, ID, GSCNCATEGORY))Calculate median values for each metric grouped by type
cnv.meds = m.cnv1 %>% group_by(metric, GSCNCATEGORY) %>% summarize(med = median(value, na.rm = T)) %>% spread(key = GSCNCATEGORY, med) %>% rename(del.med = DEL, dup.med = DUP)Calculate IQR values for each metric by type
cnv.iqr = m.cnv1 %>% group_by(metric, GSCNCATEGORY) %>% summarize(iqr = IQR(value, na.rm = T)) %>% spread(key = GSCNCATEGORY, iqr) %>% rename(del.iqr = DEL, dup.iqr = DUP)Combine the median and IQR data.
cnv.summ = cbind(cnv.meds[,1:2], cnv.iqr[,2], cnv.meds[,3], cnv.iqr[,3])Output table in latex format for manuscript.
print( xtable(cnv.summ, caption = "Quality metrics and genotype counts for filtered Genomestrip variants"), "filtered_cnvs_qc_summary_latex.txt", type = "latex", include.rownames = FALSE)Read CNV genotypes data from output of GATK variantsToTable function.
gt = tbl_df(read.delim("filtered.goodS.lhm_gs.cnvs.raw.GT.tbl", header = TRUE))Convert positions to Mb, convert lengths to Kb, make a stop position for each CNV, reformat data frame as 'long', classify event type by copy-number.
mgt = gt %>% mutate(Mb = POS / 1e+06) %>% mutate(len.Mb = GCLENGTH / 1e+06) %>% mutate(stop = Mb + len.Mb) %>% gather(key = sample, value = cn, -c(CHROM, ID, POS, Mb, stop, len.Mb, GSCNCATEGORY, GCLENGTH, GSCNQUAL)) %>% mutate(event.type = recode(cn, "1='Deletion';2='wild-type';3='Duplication';4='Duplication';5='Duplication';6='Duplication';7='Duplication'")) To order get the samples in the correct order on the plot, I assign rank numbers in the other direction.
212 samples, including 1 ref line. Code could be made more elegant. e.g. 212 = length (x), etc.
v1 = sort(rep(seq(1:212), 167), decreasing = TRUE)cnv.gdat = cbind(mgt, v1) head(cnv.gdat) dim(mgt)cnv.gplot = ggplot(data = filter(cnv.gdat, CHROM != "chr4"), aes(as.factor(v1), Mb, fill = event.type, colour = event.type)) + geom_crossbar(aes(ymin = Mb, ymax = stop)) + scale_x_discrete(labels = NULL, breaks = NULL, name = "Sample (in order sequenced)") + scale_y_continuous(name = "Position (Mb)", breaks = c(0, 5, 10, 15, 20, 25, 30), expand = c(0,0)) + scale_colour_manual(values = c("grey10", "forestgreen", "grey95"), name = "CNV event type") + scale_fill_manual(values = c("grey10", "forestgreen", "grey95"), name = "CNV event type") + theme_bw() + coord_flip() + guides(fill = guide_legend(ncol = 1)) + facet_grid(.~CHROM, scales = "free", space = "free", switch = "x") + theme( panel.margin.x = unit(c(-.02,.4,-.02,.4), "cm"), legend.position = "bottom", panel.background = element_blank(), strip.background = element_rect(colour = "white", fill = "white"), panel.grid = element_blank())Make extra variables, and catagorise heterozygote count as a proxy for MAF.
distr.dat = cnv1 %>% mutate(gcount = HOM.REF + HET + HOM.VAR + NO.CALL) %>% mutate(prop.het1 = 100 * (HET / gcount)) %>% mutate(het.group = cut(prop.het1, breaks = seq(0,100,20), labels = c("0-20","20-40","40-60","60-80","80-100"))) %>% mutate(kb.len = GCLENGTH / 1000) %>% mutate(Mb = POS / 1000000) %>% select(-c(HOM.REF, HOM.VAR, NO.CALL, GLALTSUM, GLHETSUM, AN, AF, AC, GSNNONREF, Mb, prop.het1, gcount, GCLENGTH)) %>% gather(metric, value, -c(CHROM, ID, POS, GSCNCATEGORY, het.group))Labels for facet plot
cnvdis.labs = as_labeller(c(`DEL` = "Deletions", `DUP` = "Duplications", `GCFRACTION` = "GC-content", `GSCLUSTERSEP` = "Cluster separation",`GSCNQUAL` = "Quality", `HET` = "het. count", `kb.len` = "Length(Kb)"))Make plot for distributions of cnv metrics, coloured by proportion of heterozygote genotypes.
cnv.dis =ggplot(distr.dat, aes(value, fill = het.group)) + geom_histogram(position = "stack", colour = "grey10", size = .01, bins = 10) + theme_bw(base_size = 28) + scale_fill_manual(name = "% Heterozygous Genotypes", values = alpha(c( "lemonchiffon", "tan1", "orangered", "darkorchid", "black"),.8)) + labs(y = "Variant Count") + coord_cartesian(expand = FALSE) + facet_grid(GSCNCATEGORY~metric, scales = "free", switch = "y", labeller = cnvdis.labs) + theme(strip.background = element_rect(colour = "white", fill = "white"), legend.position = "bottom", panel.grid.minor = element_blank())