Post date: Nov 18, 2019 2:59:36 AM
I kept the PC1 score for every sample for which we have a coordinate (274 out of 296, check with Karen), and map them according to their PC1 value (no filtering).
The map, rather ugly but informative, can be foundhere.
It looks like the Pando clone clusters on positive values of PC1, and separates pretty well from the other genotypes.
Updated post: also look when read depth is constrained to be > 4.
###############################################################
# R code used
# Draw the genetic variance on the spatial map.
# The objective of this script is to keep the PC score for each tree and to map it in space.
# What the PCs tell you is where the largest variance in the dataset lies.
# PCA to visualize the data
#install.packages('FactoMineR')
library(jpeg)
library(Cairo)
library(FactoMineR)
library(factoextra)
library(dplyr)
# Work with FactoMineR
#http://factominer.free.fr/factomethods/principal-components-analysis.html
######## Prepare the data ##############
pntest <- as.matrix(read.table("pntest_filtered2xHiCov_pando_variants.txt"))
names <- as.matrix(read.table("sampleID.txt"))
type <- as.matrix(read.table("type_clean.txt"))
# Specify which individuals are Pando
track_type = rep("aroundPando",each=length(type))
for (i in c(which(type == "potr-017-B"):(which(type == "potr-277-B"))))
{
track_type[i] <- "Pando"
}
for (i in c(which(type == "potr-603-B"):(which(type == "potr-659-B"))))
{
track_type[i] <- "nearbyClone"
}
dp <- as.matrix(read.table("depth_matrix.txt"))
dp <- t(dp)
indMean <- rowMeans(dp)
size <- as.matrix(read.table("size.txt"))
size <- as.factor(size)
pairs <- as.matrix(read.table("pairs.txt"))
pairs <- as.factor(pairs)
data <- rbind(indMean,size,pairs,names,track_type,pntest)
data <- as.data.frame(t(data))
data[1:10,1:10]
#data <- rbind(type,pntest)
# Transpose matrix to have it with individuals as rows and variables as columns
#data[,c(1,2)] <- as.character(data[,c(1,2)])
for (i in c(6:dim(data)[2]))
{
data[,i] <- as.numeric(as.character(unlist(data[[i]])))
}
###################################################
# Get the pairs that should be deleted because there is not enough reads
##### read depth
rdp <- 4
ind <- which(data$indMean>rdp)
data_1 <- data[ind,]
##################################################
# Do the pca now
res.pca <- prcomp(data[,-c(1:5)], center = TRUE)
# Store PC1 coordinates
PC1coor <- res.pca$x[,1]
###############################################
# Mapping
samps <- read.csv("PandoSamples.csv")
samps[1,]
samps <- samps[order(samps[,1]),]
# Associate the samples PC1 with their coordinates
common <- intersect(samps[,1],data$V4)
# Keep the samples that pairs up
idx_coor <- c()
idx_PC1 <- c()
pairvec <- common
for (i in c(1:length(pairvec)))
{
tempA <- which(data$V4==pairvec[i])
tempB <- which(samps[,1]==pairvec[i])
idx_coor[i] <- tempB
idx_PC1[i] <- tempA
tempA <- c()
tempB <- c()
}
# There is 274 individuals with coordinate data (why not 296?)
data_2 <- data[idx_PC1,]
PC1_map <- PC1coor[idx_PC1]
coor <- samps[idx_coor,]
# Create a unique dataset containing coordinates, PC1
# They should be the same order, they are the same length
data_3 <- cbind(coor,PC1_map)
# Assign a color according to the value of the coordinate
#Create a function to generate a continuous color palette
rbPal <- colorRampPalette(c('purple4','deeppink'))
rbPal <- colorRampPalette(c('yellow','red'))
#This adds a column of color values
# based on the y values
data_3$color <- rbPal(100)[as.numeric(cut(data_3[,4],breaks = 100))]
#h <- ggmap(base)
h <- ggplot(data_3,aes(x=data_3[,2], y=data_3[,3]))
h <- h + geom_point(aes(colour = data_3[,4]), cex = 3.5) + scale_colour_gradient2(guide="colourbar", low = 'lightblue', high = 'darkblue')
h <- h + theme_dark() + theme(legend.position="bottom", axis.text = element_text(size = rel(0.75)))# # tweak the plot's appearance and legend position
h <- h + labs(x="Longitude", y="Latitude", title="Collection sites")
h
#####################################################
Code used to filter read depth
###################################################
# Get the pairs that should be deleted because there is not enough reads
##### read depth
rdp <- 4
data_1 <- data[indMean>rdp,]
dim(data_1)
data_1[1:10,1:10]
##################################################
PCA was done after filtering.