Visualization - PCA - 2

Post date: Nov 12, 2019 6:40:41 PM

PCA with every individual, before filtering: here.

I am now looking at the filtered raw depths (depth_matrix.txt).

Filter by read depth.

####################################################################

######## Filter by read depth ##############

# After running "Prepare the data"

# Take the raw depths files and visualize

dp <- as.matrix(read.table("depth_matrix.txt"))

# transpose to have individuals as lines and variables (SNVs) as columns

dp <- t(dp)

indMean <- rowMeans(dp)

plot(sort(indMeans))

# Individuals with less than 5 reads total

threshold = 5 # number of thresholds

keepInd = matrix(0,nrow = 296, ncol = threshold ) # matrix to store the indices boolean value

for (i in seq(1:threshold))

{

temp <- t(indMean > i)

keepInd[,i] <- temp

}

# Use these indices to only plot the "TRUE" values

dataFiltered <- data[keepInd[,5]==1,]

res.pca <- prcomp(dataFiltered[,-c(1,2)], center = TRUE)

fviz_pca_ind(res.pca,

label = "none", # hide individual labels

habillage = dataFiltered[,2], # color by groups

palette = c("#af8dc3", "#E7B800", "#5ab4ac"),

#addEllipses = TRUE # Concentration ellipses

)

###############################################################################

> quantile(indMean)

0% 25% 50% 75% 100%

0.03298948 4.36446098 8.45310745 13.59518308 37.64490348

Plot can be found here.

Filter by allele frequency.

#######################################################################

######## Filter by allele frequency ##############

# Alleles with frequencies less than 1-5% are considered "rare variants"

# Mean by column to get the allele frequency for each allele accorss all individuals

allMean <- colMeans(data[,-c(1,2)])

# allMean <- cbind(data[])

threshold <- 0.05 # number of thresholds

keepAll <- matrix(0,ncol = dim(data)[2]-2, nrow = 5 ) # matrix to store the indices boolean value

ivec <- seq(0.01,threshold, by = 0.01)

# Loop to keep "common" variants

for (i in seq(1,5))

{

temp <- allMean >= ivec[i]

keepAll[i,] <- temp

}

data_noQual <- data[,-c(1,2)] # Need to delete the first two columns that contain qualitative variables

dataFiltered <- data_noQual[,(keepAll[5,]==1)]

res.pca <- prcomp(dataFiltered, center = TRUE)

fviz_pca_ind(res.pca,

label = "none", # hide individual labels

habillage = data[,2], # color by groups

palette = c("#af8dc3", "#E7B800", "#5ab4ac"),

#addEllipses = TRUE # Concentration ellipses

pointsize = 2

)

# Loop to keep "rare" variants

for (i in seq(1,5))

{

temp <- allMean < ivec[i]

keepAll[i,] <- temp

}

#data_noQual <- data[,-c(1,2)] # Need to delete the first two columns that contain qualitative variables

dataFiltered <- data_noQual[,(keepAll[1,]==1)]

res.pca <- prcomp(dataFiltered, center = TRUE)

fviz_pca_ind(res.pca,

label = "none", # hide individual labels

habillage = data[,2], # color by groups

palette = c("#af8dc3", "#E7B800", "#5ab4ac"),

#addEllipses = TRUE # Concentration ellipses

)

##################################################################################

Plots can be found here.

!!! I used "naive"allele frequency estimates when I should have used the estimated allele frequencies that are in the mle_p_pando_plus.txt file.

I first choose every frequency that is more than 0.5, then subtract 1 from them to make alleles from both extremes of the distribution variants with low frequencies.

Then, as a first step, I plot all the individuals with different allele frequency thresholds: 0.01, between 0.01 and 0.05, and 0.05.

Plots can be found here.

Code used:

########################################################################################

# Allele frequency estimates

mle <- as.matrix(read.table("mle_p_pando_plus.txt"))

# These are allele frequencies

hist(mle,

main="Allele frequency estimate",

xlab="Allele fequency estimate",

xlim=c(0,1),

col="darkmagenta",

freq=TRUE,

breaks=20

)

# To store the rare variants we need to keep the extremes right and left of the distribution

# For this we can select every frequency above .5 (x), calculate 1-x so that these extremes all

# shift to the low values

high <- mle>.5

# Store in a new vectot to keep the original safe

mirror <- mle

mirror[high==TRUE] <- 1-mirror[high==TRUE]

hist(mirror,

main="Allele frequency estimate",

xlab="Allele fequency estimate",

xlim=c(0,.5),

col="darkmagenta",

freq=TRUE,

breaks = 20

)

# Subset the data for PCA according to the allele frequency

data_noQual <- data[,-c(1,2)]

keepCommon <- data_noQual[,mirror>0.01]

res.pca <- prcomp(keepCommon, center = TRUE)

fviz_pca_ind(res.pca,

label = "none", # hide individual labels

habillage = data[,2], # color by groups

palette = c("#af8dc3", "#E7B800", "#5ab4ac"),

point.size = 2,

title = "p>0.01"

#addEllipses = TRUE # Concentration ellipses

)

keepRare <- data_noQual[,mirror>0.01 & mirror<0.05]

res.pca <- prcomp(keepRare, center = TRUE)

fviz_pca_ind(res.pca,

label = "none", # hide individual labels

habillage = data[,2], # color by groups

palette = c("#af8dc3", "#E7B800", "#5ab4ac"),

point.size = 2,

title = "0.01>p>.05"

#addEllipses = TRUE # Concentration ellipses

)

################################################################################

Page updated

Google Sites

Report abuse