R code for exploring quality data derived from a VCF-format genotypes file.
Input data are generated by the GATK HaplotypeCaller and VariantsToTable tools.
The largest file is 380 MB and takes about 3 seconds to load on an 8 GB Mac.
1. Settings
2. Locus metrics data
3. Variants-per-sample data
The standard repertoire for basic big-data manipulation, plus 'fields' for binned summary statistics and 'xtable' for exporting tables to LaTeX/PDF.
package.list = c('data.table', 'tidyr', 'dplyr', 'ggplot2', 'cowplot', 'car', 'fields', 'xtable')
new.packages = package.list[!(package.list %in% installed.packages()[,"Package"])]
if (length(new.packages)) install.packages(new.packages)
rm(package.list, new.packages)
library(data.table) ## for big data.
library(tidyr) ## data formatting.
library(dplyr) ## data formatting.
library(ggplot2) ## fancy plots.
library(cowplot) ## combining swanky plots into one figure
library(car) ## for the 'recode character variables' function
library(fields) ## for calculating summary stats in bins.
library(xtable) ## for outputing data frames as tables in latex format.
theme_f2 = function() {
theme_bw(base_size = 10) +
theme(axis.line = element_line(size = .3, colour = "black"),
legend.background = element_rect(fill = "white", size = 0.25, linetype = 1, colour = "grey"),
legend.key = element_rect(colour = "white"),
panel.border = element_rect(color = "grey", size = 0.5),
panel.spacing.x = unit(c(.1,1,.1,1), "lines"),
strip.background = element_rect(colour = "white", fill = "white"),
axis.line.x = element_line(size = .3, colour = "black"),
axis.line.y = element_line(size = .3, colour = "black"),
panel.grid.major = element_line(colour = "lightgrey", size = 0.1),
panel.grid.minor = element_blank())}
attr(theme_f2(), "complete") ## check that the custom theme counts as a 'complete' theme
This uses the fread function of the data.table package to load the large locus metrics data set.
hcmets = fread("f1.lhm_rg_HC_raw.vcf.locmets.tbl", verbose = TRUE, header = TRUE)
Log output from fread while loading the locus metrics table (verbose = TRUE).
Replace the column names with syntactically valid versions using make.names.
decent.names = make.names(names(hcmets), unique = TRUE)
names(hcmets) = decent.names
Glimpse of the locus metrics table
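The glimpse output itself is not reproduced here; a minimal call that would generate it, assuming dplyr's glimpse function:
glimpse(hcmets) ## one line per column: name, type, and the first few values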
View unique values in the Variant Type field
unique(hcmets$VariantType)
There's a bewildering array of variant classifier types.
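A quick tally makes the relative abundance of each classifier easier to see (a small sketch, not part of the original output):
sort(table(hcmets$VariantType), decreasing = TRUE) ## counts per VariantType, most common first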
summary(hcmets)
Summary of the variable values in the locus metrics table
This adds a group identifier column ('lhm'), removes data from chromosome 4 and the mitochondria, creates a column for the total genotype count (including 'no call'), calculates the drop-out/fail rate and the minor and major allele counts (and from these the MAF), converts genomic positions to megabases, assigns each locus to a 'rare' or 'common' frequency class, and finally drops unused factor levels.
f.hcmets = hcmets %>%
mutate(group = "lhm") %>%
filter(CHROM != "chr4", CHROM != "chrM") %>%
mutate(gcount = HOM.REF + HET + HOM.VAR + NO.CALL) %>%
mutate(fail.rate = NO.CALL / gcount) %>%
mutate(g2 = pmax(HET, HOM.VAR)) %>%
mutate(min.ac = pmin(HOM.REF, g2)) %>%
mutate(maj.ac = pmax(HOM.REF, g2)) %>%
mutate(MAF = min.ac / gcount) %>%
mutate(Mb = (POS / 1e+06)) %>%
mutate(freq.grp = ifelse(MAF < 0.05, yes = "rare", no = "common")) %>%
droplevels()
Create separate tables by filtering on each major VariantType, assign a usable factor level to each, then combine them; a simpler single-step alternative is sketched after this block.
bi.snps = filter(f.hcmets, VariantType == "SNP") %>%
mutate(type = "A_bi_snp")
mu.snps = filter(f.hcmets, VariantType != "SNP", EVENTLENGTH == 0) %>%
mutate(type = "B_mu_snp")
bi.indels = filter(f.hcmets, VariantType != "MULTIALLELIC_MIXED", VariantType != "MULTIALLELIC_COMPLEX.Other", EVENTLENGTH > 0) %>%
mutate(type = "C_bi_indel")
mu.indels = filter(f.hcmets, VariantType == "MULTIALLELIC_MIXED" | VariantType == "MULTIALLELIC_COMPLEX.Other") %>%
mutate(type = "D_mu_indel")
var.comb = bind_rows(bi.snps, mu.snps, bi.indels, mu.indels) ## bind_rows replaces the deprecated rbind_all
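A possibly simpler alternative (a sketch only, not tested against this data set) is to assign the type in a single mutate with dplyr's case_when. Note that case_when applies the first matching rule, so any locus satisfying more than one of the filters above would be classified once here rather than appearing in two tables; the object name var.comb.alt is hypothetical.
var.comb.alt = f.hcmets %>%
mutate(type = case_when(
VariantType == "SNP" ~ "A_bi_snp",
VariantType %in% c("MULTIALLELIC_MIXED", "MULTIALLELIC_COMPLEX.Other") ~ "D_mu_indel",
EVENTLENGTH == 0 ~ "B_mu_snp",
EVENTLENGTH > 0 ~ "C_bi_indel")) %>% ## loci matching none of the rules are left as NA
filter(!is.na(type))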
Four major 'types' of variant are now assigned.
The MAF of multi-allelic variants is flagged with an asterisk (*) because it counts both 'minor' alleles.
af.plot =
ggplot(var.comb, aes(MAF, colour = type, linetype = type)) +
geom_line(stat = "density", size = 0.8, alpha = 0.7) +
scale_x_continuous(name = "Minor allele frequency*", expand = c(0,0)) +
scale_linetype_manual(values = c(1,2,1,3), name = "Variant type",
labels = c("SNPs (biallelic)", "SNPs (multi-allelic)*", "Indels (biallelic)", "Indels (multi-allelic)*")) +
scale_colour_manual(values = c("blue", "blue", "orangered1", "orangered1"), name = "Variant type",
labels = c("SNPs (biallelic)", "SNPs (multi-allelic)*", "Indels (biallelic)", "Indels (multi-allelic)*")) +
theme_f2() +
theme(legend.position = c(0.7, 0.7),
legend.title.align = 0.5,
legend.key.width = unit(0.8, "cm"),
legend.key.height = unit(0.5, "cm"))
This re-formats the data as long, and calculates the medians for each metric and locus type.
locus.meds = var.comb %>%
select(c(ID, EVENTLENGTH, QUAL, DP, QD, MQ, FS, MQRankSum, HOM.REF, HET, HOM.VAR, NO.CALL, type)) %>%
gather(key = metric, value = value, -c(ID, type)) %>%
group_by(metric, type) %>%
summarize(med = round(median(value, na.rm = T), 2)) %>%
spread(key = type, value = med)
This re-formats the data as long, and calculates the IQR for each metric and locus type.
locus.iqr = var.comb %>%
select(c(ID, EVENTLENGTH, QUAL, DP, QD, MQ, FS, MQRankSum, HOM.REF, HET, HOM.VAR, NO.CALL, type)) %>%
gather(key = metric, value = value, -c(ID, type)) %>%
group_by(metric, type) %>%
summarize(iqr = round(IQR(value, na.rm = T), 2)) %>%
spread(key = type, value = iqr)
locus.n = var.comb %>%
group_by(type) %>%
summarize(num = n()) %>%
spread(key = type, value = num)
locus.comb = cbind(locus.meds, locus.iqr)
names(locus.comb) <- c("metric.med", "SNPs.bi.med", "SNPs.mu.med", "Indels.bi.med", "Indels.mu.med",
"metric.iqr", "SNPs.bi.iqr", "SNPs.mu.iqr", "Indels.bi.iqr", "Indels.mu.iqr")
loc.summ = locus.comb %>%
select(metric.med, SNPs.bi.med, SNPs.bi.iqr, SNPs.mu.med, SNPs.mu.iqr,
Indels.bi.med, Indels.bi.iqr, Indels.mu.med, Indels.mu.iqr)
Save the table as latex code using the xtable package
print( xtable(loc.summ, caption = "Quality metrics and genotype counts for filtered Haplotype Caller variants"),
file = "f1.locus.metric.summary.txt", type = "latex", include.rownames = FALSE)
Quality-metric distributions, separated by variant type and coloured by the proportion of heterozygous genotypes. Assign each locus to a heterozygosity group (using the cut function), then reformat as 'long' for plotting.
ex.hcmets = var.comb %>%
mutate(prop.het1 = 100 * (HET / gcount)) %>%
mutate(het.group = cut(prop.het1, breaks = seq(0,100,20), labels = c("0-20","20-40","40-60","60-80","80-100"))) %>%
mutate(mil.qual = QUAL / 1e+06) %>%
select(CHROM, ID, DP, QD, MQ, FS, MQRankSum, ReadPosRankSum, HET, het.group, type, mil.qual) %>%
gather(metric, value, -c(CHROM, ID, type, het.group))
Define group labels for facet plot
metdis.labs = as_labeller(c(`A_bi_snp` = "SNPs.bi", `B_mu_snp` = "SNPs.mu",
`C_bi_indel` = "Indels.bi", `D_mu_indel` = "Indels.mu",
`DP` = "Depth", `FS` = "Strand bias", `mil.qual` = "Qualx10^6", `MQ` = "Map.qual",
`MQRankSum` = "MQRankSum", `HET` = "Het. count",
`QD` = "Qual-by-depth", `ReadPosRankSum` = "Read.pos RS"))
Fill colour is grouped by allele frequency (using het.group as a proxy); the faceting variables are locus type and metric.
metdis.plot =
ggplot(ex.hcmets, aes(value, fill = het.group)) +
geom_histogram(position = "stack", colour = "grey30", size = .001) +
scale_fill_manual(name = "% Heterozygous Genotypes", values = alpha(c(
"lemonchiffon", "tan1", "orangered", "darkorchid", "black"),.8)) +
labs(y = "Variant Count") +
coord_cartesian(expand = FALSE) +
facet_grid(type~metric, scales = "free", labeller = metdis.labs, switch = "both" ) +
theme_bw() +
theme(strip.placement = "outside",
strip.background = element_rect(colour = "white", fill = "white"),
legend.position = "bottom",
panel.grid.minor = element_blank())
There are many line-plots of variant density across the genome, but I wanted to show the extent of small-window variation at the same time. To do this, I used ggplot to create an intermediate plot and extracted the useful data from it.
Dummy histogram for the variant density data. The 'stacked' option is used so positions are equal between groups.
vd.dummy = ggplot(filter(var.comb, freq.grp == "common"), aes(Mb, colour = type, fill = type)) +
geom_histogram(binwidth = 0.01, position = "stack") +
scale_colour_manual(values = c("red", "blue", "green", "purple")) +
scale_fill_manual(values = c("red", "blue", "green", "purple")) +
facet_grid(.~CHROM, scales = "free_x") +
theme(legend.position = "bottom")
vd.dat1 = ggplot_build(vd.dummy) ## build the plot to extract the binned counts, rather than relying on print()'s return value
Remake the category labels.
vd.plotdat = vd.dat1$data[[1]] %>%
mutate(chrom = recode(PANEL, "1 = 'chr2L'; 2 = 'chr2R'; 3 = 'chr3L'; 4 = 'chr3R'; 5 = 'chrX'")) %>%
mutate(type = recode(group, "1 = 'A_bi_snp' ; 2 = 'B_mu_snp'; 3 = 'C_bi_indel'; 4 = 'D_mu_indel'")) %>%
dplyr::rename(Mb = x)
Get summary data in windows, extract only the means per window, and use the row names to make columns for chromosome and variant type.
vd.summ = lapply(split(vd.plotdat, vd.plotdat[,c("chrom", "type")]), function(x) stats.bin(x[,"Mb"],
x[,"count"], breaks = seq(0,max(x[,"Mb"]), 1)))
vd.sudat = do.call("rbind", lapply(vd.summ, function(x) data.frame(Mb = x$centers, mean = x$stats["mean",])))
vd.sudat = vd.sudat %>% mutate(temp = rownames(vd.sudat)) %>% separate(temp, into = c("chrom", "type", "misc"), sep = "\\.")
Labels for facet plot
vd.labs = as_labeller(c(`A_bi_snp` = "SNPs (biallelic)", `B_mu_snp` = "SNPs (multi-allelic)",
`C_bi_indel` = "Indels (biallelic)", `D_mu_indel` = "Indels (multi-allelic)",
`chr2L` = "chr2L", `chr2R` = "chr2R", `chr3L` = "chr3L", `chr3R` = "chr3R", `chrX` = "chrX"))
vd.plot = ggplot() +
geom_point(data = vd.plotdat, aes(Mb, count), colour = "black", size = .5, alpha = .1) +
geom_line(data = vd.sudat, aes(Mb, mean, group = type), alpha = 1, colour = "white", size = .6) +
geom_line(data = vd.sudat, aes(Mb, mean, group = type , colour = type), alpha = 1, size = .5) +
theme_f2() +
scale_x_continuous(expand = c(0,0), breaks = c(0, 10, 20, 30)) +
scale_y_continuous(expand = c(.02,0)) +
scale_colour_manual(values = c("skyblue1", "lightblue1", "red", "orangered1"), name = "",
labels = c("SNPs (biallelic)", "SNPs (multi)", "Indels (biallelic)", "Indels (multi)")) +
labs(x = "Genomic position (chromosome, Mb)",
y = "Variant density, per 10kb") +
facet_grid( type ~ chrom, scales = "free", space = "free_x", switch = "y", labeller = vd.labs) +
theme(panel.grid.major = element_blank(),
legend.position = "none", panel.spacing.x = unit(c(-.02,.4,-.02,.4), "cm"))
Density of common variants (MAF >= 0.05) across the Drosophila melanogaster genome (LHM Sussex hemiclones). Lines indicate means in 1 Mb windows.
ind.sum = read.table("f1.lhm_rg_HC_raw.vcf.vareval.txt", header = TRUE)
head(ind.sum)
Filters out redundant/duplicated fields, selects the useful columns, converts to 'long' format, and calculates counts in millions.
f.ind.sum = ind.sum %>%
filter(Novelty == "all", Sample != "all") %>%
select(Sample, nNoCalls, nHets, nHomRef, nHomVar) %>%
gather(Genotype, counts, -Sample) %>%
mutate(mcounts = counts / 1e+06)
Filters out redundant/duplicated fields, selects the useful columns, and calculates the proportion of each genotype class per sample.
sorted = ind.sum %>%
filter(Novelty == "all", Sample != "all") %>%
select(Sample, nNoCalls, nHets, nHomRef, nHomVar) %>%
mutate(total = rowSums(.[2:5])) %>%
mutate(p_noCall = nNoCalls / total ) %>%
mutate(p_nHets = nHets / total ) %>%
mutate(p_nHomRef = nHomRef / total ) %>%
mutate(p_nHomVar = nHomVar / total ) %>%
arrange(-nHomVar)
Genotype counts per individual within LHM samples
var.count.summ.lhm = f.ind.sum %>% ## assuming the long-format per-sample counts table (f.ind.sum) created above
group_by(Genotype) %>%
summarize(med = median(counts, na.rm = T), iqr = IQR(counts, na.rm = T))
ig.plot =
ggplot(f.ind.sum, aes(Sample, mcounts, fill = Genotype, colour = Genotype)) +
geom_bar(stat = "identity", linetype = 0) +
theme_f2() +
scale_y_continuous(name = expression("Genotype counts," %*% 10 ^ {6}), expand = c(0,0), breaks = seq(0,2,.2)) +
scale_x_discrete(name = "Sample (in order sequenced)", breaks = c(50, 100, 150, 200)) +
scale_fill_manual(values = c("palegreen3", "white", "red", "black"),
labels = c("Het", "Hom.Ref", "Hom.Var", "No call")) +
scale_colour_manual(values = c("palegreen1", "white", "red", "black"),
labels = c("Het", "Hom.Ref", "Hom.Var", "No call")) +
guides(fill = guide_legend(ncol = 2)) +
geom_vline(xintercept = 220, linetype = 3, size = 0.4) +
geom_vline(xintercept = 222, size = 0.1) +
annotate("segment", x = 205, xend = 221, y = .9, yend = .86,
arrow = arrow(angle = 15, length = unit(0.25, "cm")), size = 0.25, colour = "grey20") +
annotate("text", x = 191, y = .95, label = "Reference\nstrain\nindividuals", colour = "grey20", size = 2.5) +
theme(legend.position = c(.7,.8),
legend.title.align = 0.4,
legend.key.width = unit(.5, "cm"),
legend.key.height = unit(.5, "cm"),
legend.key = element_rect(colour = "black", size = .2),
panel.grid = element_blank(), panel.border = element_blank())
fig.hcmets = ggdraw() +
draw_plot(vd.plot, x = 0.010, y = 0.000, width = 0.550, height = 1.000) +
draw_plot(af.plot, x = 0.598, y = 0.515, width = 0.392, height = 0.450) +
draw_plot(ig.plot, x = 0.580, y = 0.021, width = 0.410, height = 0.475) +
draw_plot_label(LETTERS[1:3], x = c(0.01, .59, .59), y = c(.99, .99, .5), size = 11)
save_plot("fig_smallVariants_summary.pdf", fig.hcmets, base_aspect_ratio = 1.5, base_height = 6)
Adding the variant counts to the legend of each graph (ideally generated automatically from the data) would be good; one possible approach is sketched below.
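A sketch of one way to do this for af.plot, assuming the var.comb table above and the same label order used in its manual scales (af.labels is a hypothetical helper object):
type.counts = var.comb %>% count(type) ## per-type counts, in the same alphabetical order as the plot labels
af.labels = paste0(c("SNPs (biallelic)", "SNPs (multi-allelic)*", "Indels (biallelic)", "Indels (multi-allelic)*"),
", n = ", format(type.counts$n, big.mark = ","))
af.labels could then replace the 'labels' arguments in af.plot's scale_linetype_manual and scale_colour_manual calls.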
> Sys.time()
[1] "2018-03-25 13:03:15 BST"
> sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2 fields_9.6 maps_3.2.0 spam_2.1-2
[5] dotCall64_0.9-5.2 xtable_1.8-2 car_2.1-6 cowplot_0.9.1
[9] ggplot2_2.2.1 dplyr_0.7.4 tidyr_0.7.2 data.table_1.10.4-3
loaded via a namespace (and not attached):
[1] Rcpp_0.12.16 compiler_3.4.3 nloptr_1.0.4 plyr_1.8.4
[5] bindr_0.1 tools_3.4.3 digest_0.6.12 lme4_1.1-14
[9] tibble_1.3.4 gtable_0.2.0 nlme_3.1-131 lattice_0.20-35
[13] mgcv_1.8-22 pkgconfig_2.0.1 rlang_0.1.4 Matrix_1.2-12
[17] yaml_2.1.18 parallel_3.4.3 SparseM_1.77 stringr_1.3.0
[21] MatrixModels_0.4-1 tidyselect_0.2.3 nnet_7.3-12 glue_1.2.0
[25] R6_2.2.2 minqa_1.2.4 reshape2_1.4.3 purrr_0.2.4
[29] magrittr_1.5 scales_0.5.0 MASS_7.3-47 splines_3.4.3
[33] assertthat_0.2.0 pbkrtest_0.4-7 colorspace_1.3-2 labeling_0.3
[37] quantreg_5.34 stringi_1.1.6 lazyeval_0.2.1 munsell_0.4.3