R code for analysis of locus from Genomestrip CNV pipeline output, https://zenodo.org/record/159282
Summary data generated by GATK variantsToTable function.
package.list = c('tidyr', 'dplyr', 'ggplot2', 'cowplot', 'xtable', 'car')
new.packages = package.list[!(package.list %in% installed.packages()[,"Package"])]
if (length(new.packages)) install.packages(new.packages)
rm(package.list, new.packages)
library(ggplot2)
library(tidyr)
library(dplyr)
library(cowplot)
library(xtable)
library(car)
Settings for graphs.
theme_f3 = function() {
theme_bw(base_size = 10) +
theme(axis.line = element_line(size = .3, colour = "black"),
legend.background = element_rect(fill = "white", size = 0.25, linetype = 1, colour = "grey"),
legend.key = element_rect(colour = "white"),
panel.border = element_rect(color = "black", size = 0),
panel.margin.x = unit(c(.1,1,.1,1), "lines"),
strip.background = element_rect(colour = "white", fill = "white"),
axis.line.x = element_line(size = .3, colour = "black"),
axis.line.y = element_line(size = .3, colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())}
attr(theme_f3(), "complete")
Load data
cnv1 = tbl_df(read.delim(file = "filtered.goodS.lhm_gs.cnvs.raw.vcf.qc.tbl"))
Format data
m.cnv1 = cnv1 %>%
mutate(Mb = POS / 1e+06, lkb = GCLENGTH / 1000) %>%
select(-c(AN, AF, AC, GLALTSUM, GLHETSUM, POS, GCLENGTH, HOM.VAR, HOM.REF, NO.CALL, GSNNONREF)) %>%
gather(key = metric, value = value, -c(CHROM, Mb, ID, GSCNCATEGORY))
Calculate median values for each metric grouped by type
cnv.meds = m.cnv1 %>%
group_by(metric, GSCNCATEGORY) %>%
summarize(med = median(value, na.rm = T)) %>%
spread(key = GSCNCATEGORY, med) %>%
rename(del.med = DEL, dup.med = DUP)
Calculate IQR values for each metric by type
cnv.iqr = m.cnv1 %>%
group_by(metric, GSCNCATEGORY) %>%
summarize(iqr = IQR(value, na.rm = T)) %>%
spread(key = GSCNCATEGORY, iqr) %>%
rename(del.iqr = DEL, dup.iqr = DUP)
Combine the median and IQR data.
cnv.summ = cbind(cnv.meds[,1:2], cnv.iqr[,2], cnv.meds[,3], cnv.iqr[,3])
Output table in latex format for manuscript.
print( xtable(cnv.summ, caption = "Quality metrics and genotype counts for filtered Genomestrip variants"),
"filtered_cnvs_qc_summary_latex.txt", type = "latex", include.rownames = FALSE)
Read CNV genotypes data from output of GATK variantsToTable function.
gt = tbl_df(read.delim("filtered.goodS.lhm_gs.cnvs.raw.GT.tbl", header = TRUE))
Convert positions to Mb, convert lengths to Kb, make a stop position for each CNV, reformat data frame as 'long', classify event type by copy-number.
mgt = gt %>%
mutate(Mb = POS / 1e+06) %>%
mutate(len.Mb = GCLENGTH / 1e+06) %>%
mutate(stop = Mb + len.Mb) %>%
gather(key = sample, value = cn, -c(CHROM, ID, POS, Mb, stop, len.Mb, GSCNCATEGORY, GCLENGTH, GSCNQUAL)) %>%
mutate(event.type = recode(cn, "1='Deletion';2='wild-type';3='Duplication';4='Duplication';5='Duplication';6='Duplication';7='Duplication'"))
To order get the samples in the correct order on the plot, I assign rank numbers in the other direction.
212 samples, including 1 ref line. Code could be made more elegant. e.g. 212 = length (x), etc.
v1 = sort(rep(seq(1:212), 167), decreasing = TRUE)
cnv.gdat = cbind(mgt, v1)
head(cnv.gdat)
dim(mgt)
cnv.gplot =
ggplot(data = filter(cnv.gdat, CHROM != "chr4"), aes(as.factor(v1), Mb, fill = event.type, colour = event.type)) +
geom_crossbar(aes(ymin = Mb, ymax = stop)) +
scale_x_discrete(labels = NULL, breaks = NULL, name = "Sample (in order sequenced)") +
scale_y_continuous(name = "Position (Mb)", breaks = c(0, 5, 10, 15, 20, 25, 30), expand = c(0,0)) +
scale_colour_manual(values = c("grey10", "forestgreen", "grey95"), name = "CNV event type") +
scale_fill_manual(values = c("grey10", "forestgreen", "grey95"), name = "CNV event type") +
theme_bw() +
coord_flip() +
guides(fill = guide_legend(ncol = 1)) +
facet_grid(.~CHROM, scales = "free", space = "free", switch = "x") +
theme(
panel.margin.x = unit(c(-.02,.4,-.02,.4), "cm"),
legend.position = "bottom",
panel.background = element_blank(),
strip.background = element_rect(colour = "white", fill = "white"),
panel.grid = element_blank())
Make extra variables, and catagorise heterozygote count as a proxy for MAF.
distr.dat = cnv1 %>%
mutate(gcount = HOM.REF + HET + HOM.VAR + NO.CALL) %>%
mutate(prop.het1 = 100 * (HET / gcount)) %>%
mutate(het.group = cut(prop.het1, breaks = seq(0,100,20), labels = c("0-20","20-40","40-60","60-80","80-100"))) %>%
mutate(kb.len = GCLENGTH / 1000) %>%
mutate(Mb = POS / 1000000) %>%
select(-c(HOM.REF, HOM.VAR, NO.CALL, GLALTSUM, GLHETSUM, AN, AF, AC, GSNNONREF, Mb, prop.het1, gcount, GCLENGTH)) %>%
gather(metric, value, -c(CHROM, ID, POS, GSCNCATEGORY, het.group))
Labels for facet plot
cnvdis.labs = as_labeller(c(`DEL` = "Deletions", `DUP` = "Duplications",
`GCFRACTION` = "GC-content", `GSCLUSTERSEP` = "Cluster separation",`GSCNQUAL` = "Quality",
`HET` = "het. count", `kb.len` = "Length(Kb)"))
Make plot for distributions of cnv metrics, coloured by proportion of heterozygote genotypes.
cnv.dis =
ggplot(distr.dat, aes(value, fill = het.group)) +
geom_histogram(position = "stack", colour = "grey10", size = .01, bins = 10) +
theme_bw(base_size = 28) +
scale_fill_manual(name = "% Heterozygous Genotypes", values = alpha(c(
"lemonchiffon", "tan1", "orangered", "darkorchid", "black"),.8)) +
labs(y = "Variant Count") +
coord_cartesian(expand = FALSE) +
facet_grid(GSCNCATEGORY~metric, scales = "free", switch = "y", labeller = cnvdis.labs) +
theme(strip.background = element_rect(colour = "white", fill = "white"),
legend.position = "bottom",
panel.grid.minor = element_blank())