Post date: Nov 12, 2019 6:40:41 PM
PCA with every individual, before filtering: here.
I am now looking at the filtered raw depths (depth_matrix.txt).
Filter by read depth.
####################################################################
######## Filter by read depth ##############
# After running "Prepare the data"
# Take the raw depths files and visualize
dp <- as.matrix(read.table("depth_matrix.txt"))
# transpose to have individuals as lines and variables (SNVs) as columns
dp <- t(dp)
indMean <- rowMeans(dp)
plot(sort(indMeans))
# Individuals with less than 5 reads total
threshold = 5 # number of thresholds
keepInd = matrix(0,nrow = 296, ncol = threshold ) # matrix to store the indices boolean value
for (i in seq(1:threshold))
{
temp <- t(indMean > i)
keepInd[,i] <- temp
}
# Use these indices to only plot the "TRUE" values
dataFiltered <- data[keepInd[,5]==1,]
res.pca <- prcomp(dataFiltered[,-c(1,2)], center = TRUE)
fviz_pca_ind(res.pca,
label = "none", # hide individual labels
habillage = dataFiltered[,2], # color by groups
palette = c("#af8dc3", "#E7B800", "#5ab4ac"),
#addEllipses = TRUE # Concentration ellipses
)
###############################################################################
> quantile(indMean)
0% 25% 50% 75% 100%
0.03298948 4.36446098 8.45310745 13.59518308 37.64490348
Plot can be found here.
Filter by allele frequency.
#######################################################################
######## Filter by allele frequency ##############
# Alleles with frequencies less than 1-5% are considered "rare variants"
# Mean by column to get the allele frequency for each allele accorss all individuals
allMean <- colMeans(data[,-c(1,2)])
# allMean <- cbind(data[])
threshold <- 0.05 # number of thresholds
keepAll <- matrix(0,ncol = dim(data)[2]-2, nrow = 5 ) # matrix to store the indices boolean value
ivec <- seq(0.01,threshold, by = 0.01)
# Loop to keep "common" variants
for (i in seq(1,5))
{
temp <- allMean >= ivec[i]
keepAll[i,] <- temp
}
data_noQual <- data[,-c(1,2)] # Need to delete the first two columns that contain qualitative variables
dataFiltered <- data_noQual[,(keepAll[5,]==1)]
res.pca <- prcomp(dataFiltered, center = TRUE)
fviz_pca_ind(res.pca,
label = "none", # hide individual labels
habillage = data[,2], # color by groups
palette = c("#af8dc3", "#E7B800", "#5ab4ac"),
#addEllipses = TRUE # Concentration ellipses
pointsize = 2
)
# Loop to keep "rare" variants
for (i in seq(1,5))
{
temp <- allMean < ivec[i]
keepAll[i,] <- temp
}
#data_noQual <- data[,-c(1,2)] # Need to delete the first two columns that contain qualitative variables
dataFiltered <- data_noQual[,(keepAll[1,]==1)]
res.pca <- prcomp(dataFiltered, center = TRUE)
fviz_pca_ind(res.pca,
label = "none", # hide individual labels
habillage = data[,2], # color by groups
palette = c("#af8dc3", "#E7B800", "#5ab4ac"),
#addEllipses = TRUE # Concentration ellipses
)
##################################################################################
Plots can be found here.
!!! I used "naive"allele frequency estimates when I should have used the estimated allele frequencies that are in the mle_p_pando_plus.txt file.
I first choose every frequency that is more than 0.5, then subtract 1 from them to make alleles from both extremes of the distribution variants with low frequencies.
Then, as a first step, I plot all the individuals with different allele frequency thresholds: 0.01, between 0.01 and 0.05, and 0.05.
Plots can be found here.
Code used:
########################################################################################
# Allele frequency estimates
mle <- as.matrix(read.table("mle_p_pando_plus.txt"))
# These are allele frequencies
hist(mle,
main="Allele frequency estimate",
xlab="Allele fequency estimate",
xlim=c(0,1),
col="darkmagenta",
freq=TRUE,
breaks=20
)
# To store the rare variants we need to keep the extremes right and left of the distribution
# For this we can select every frequency above .5 (x), calculate 1-x so that these extremes all
# shift to the low values
high <- mle>.5
# Store in a new vectot to keep the original safe
mirror <- mle
mirror[high==TRUE] <- 1-mirror[high==TRUE]
hist(mirror,
main="Allele frequency estimate",
xlab="Allele fequency estimate",
xlim=c(0,.5),
col="darkmagenta",
freq=TRUE,
breaks = 20
)
# Subset the data for PCA according to the allele frequency
data_noQual <- data[,-c(1,2)]
keepCommon <- data_noQual[,mirror>0.01]
res.pca <- prcomp(keepCommon, center = TRUE)
fviz_pca_ind(res.pca,
label = "none", # hide individual labels
habillage = data[,2], # color by groups
palette = c("#af8dc3", "#E7B800", "#5ab4ac"),
point.size = 2,
title = "p>0.01"
#addEllipses = TRUE # Concentration ellipses
)
keepRare <- data_noQual[,mirror>0.01 & mirror<0.05]
res.pca <- prcomp(keepRare, center = TRUE)
fviz_pca_ind(res.pca,
label = "none", # hide individual labels
habillage = data[,2], # color by groups
palette = c("#af8dc3", "#E7B800", "#5ab4ac"),
point.size = 2,
title = "0.01>p>.05"
#addEllipses = TRUE # Concentration ellipses
)
################################################################################