Post date: Dec 05, 2019 11:27:38 PM
We calculate the probability of being heterozygote by reverting the genotype likelihood Phred score of the vcf filtered file.
We test several thresholds for allele frequency to decide for heterozygosity (from 0.01 to 0.1)
Plot can be seen here.
Proportion of heterozygotes as a function of the threshold:
I am surprised not to see more variation.The mean proportion of heterozygotes starts at 0.24 for the lowest cut (0.01) and goes to 0.28 at the highest cut (0.1). The standard deviation is high and constant around 0.3.
Proportion of Nas as a function of the threshold:
The proportion of NAs drops rapidly from 0.55 to 0.33, and stays at this plateau after the threshold reaches 0.06.
Proportion of homozygotes as a function of the threshold:
It seems that the proportion of homozygotes "answers" the proportion of NAs variation, as it increases when the latter drops.
######################################################################################################
Code to generate the figure in R
#!/bin/user/Rscript
# The goal of this script is to intake the files containing the heterozygotes labelled to calculate how much heterozygotes and NA each threshold gives.
# go in the right directory (to be change)
setwd("/uufs/chpc.utah.edu/common/home/u6028866/data/Pando/variants/pando_only_variants/res/")
# Initizalization of variables and storage files
thresh <- seq(0.01,0.1,0.01)
store_h <- c() # stores the number of heterozygotes per SNP
store_NA <- c() # stores the number of NAs per SNP
store_ho <- c() # stores the number of homozygotes per SNP
for (t in seq(1,length(thresh)))
{
file_name <- sprintf("heteroZ_label_%g.txt",thresh[t])
data <- as.matrix(read.table(file_name, sep = "\t", header = FALSE))
# transform into factors
data <- apply(data, 2, as.factor)
# Number of individuals is number of columns minus 1 (last column is \n)
n <- dim(data)[2]-1 # number of individuals
N <- dim(data)[1] # number of SNPs
# The last column has the \n only - delete it
data <- data[,1:n]
# Initialization of the temporary vectors storing the data
nb_h <- rep(-1,N) # -1 because none of the data can be negative: control
nb_NA <- rep(-1,N)
nb_ho <- rep(-1,N)
for (i in seq(1,N))
{
# Calculate the number of 1 = heterozygotes
nb_h[i] <- length(which(data[i,]==1))
# Calculate the number of 2 = NA = we don't know
nb_NA[i] <- length(which(data[i,]==2))
# Calculate the number of 0 = homozygotes
nb_ho[i] <- length(which(data[i,]==0))
}
store_h <- cbind(store_h, nb_h)
store_NA <- cbind(store_NA, nb_NA)
store_ho <- cbind(store_ho, nb_ho)
}
# Calculations
prop_h <- apply(store_h, 2, function(cval) cval/(n))
mean_h <- apply(prop_h, 2, mean)
sd_h <- apply(prop_h, 2, sd)
# Proportion of NA's as a function of the threshold:
prop_NA <- apply(store_NA, 2, function(cval) cval/(n))
mean_NA <- apply(prop_NA, 2, mean)mean_NA <- apply(prop_NA, 2, mean)
sd_NA <- apply(prop_h, 2, sd)
# Proportion of homozygotes as a function of the threshold:
prop_ho <- apply(store_ho, 2, function(cval) cval/(n))
mean_ho <- apply(prop_ho, 2, mean)
sd_ho <- apply(prop_ho, 2, sd)
# Plot
pdf("plot_test.pdf")
# Heterozygotes
plot(thresh, mean_h, xlab="Threshold", ylab="Proportion", col = "blue",
type="both", lty=1, ylim=c(0.2,0.6))
# NAs
points(thresh, mean_NA, col = "red", pch = "+")
lines(thresh, mean_NA, col = "red", lty = 3)
# Homozygotes
points(thresh, mean_ho, col = "black", pch = "o")
lines(thresh, mean_ho, col = "black", lty = 3)
legend(0.070,0.58,legend=c("Heterozygotes","NAs","Homozygotes"), col=c("blue","red","black"),
pch=c("o","+","+"),lty=c(1,3,3), ncol=1)
dev.off()
######################################################################################################