Post date: Mar 16, 2020 9:6:50 PM
To check with Zach:
- Nei's D higher than 1?
- I am missing 6 tree's coordinate data. For 5 of them, they were the small/big pair of another tree so I extended the same coordinates, but one is missing.
To read!!
https://www.datanovia.com/en/lessons/clustering-distance-measures/
########################################################################
R script used to calculate genetic distance matrices
# R script
# 20200315
# 20200316
# ------------------------------------------------------------------------- #
# GOAL: Create a distance plot based on probability data
# ------------------------------------------------------------------------- #
# Description of dataset names
# map - dataset with id, long, lat (108 x 3)
# coor - dataset with id, long (108 x 2)
# data - probability of heterozygosity at each SNP for each individual (7715 x 108)
# spd - spatial distance matrix
# eucl - distance matrix based on Euclidian distance
# neisd - genetic distance matrix based on Nei's distance
# GSvec - Gini-Simpson diversity index vector
# ------------------------------------------------------------------------- #
# Load genetic and spatial data.
# ------------------------------------------------------------------------- #
setwd("/Volumes/Data/Doc_Rozenn/Education/GaTech/Year2/Spring2020/Research/data/Pando/findSNPs/")
data <- read.table("hets.txt")
space <- read.table("Pando_ID.txt")
summary(data)
summary(space)
data <- data[,-87] # we do not have coordinates for this guy - check with Karen if she has it
dim(data)
# ------------------------------------------------------------------------- #
# Calculate distance matrix for spatial data.
# ------------------------------------------------------------------------- #
# Convert to numeric
long <- as.numeric(as.character(space[,2]))
lat <- as.numeric(as.character(space[,3]))
id <- as.factor(space[,1])
coor <- cbind(long,lat)
map <- data.frame(id,long,lat)
# Calculate Metric distance using Vincenty Ellipsoid distance (precise for small scale)
library(geosphere)
list1 <- data.frame(longitude = c(long), latitude = c(lat))
list2 <- data.frame(longitude = c(long), latitude = c(lat))
# Spatial distance matrix
spd <- distm(list1[,c('longitude','latitude')], list2[,c('longitude','latitude')], fun=distVincentyEllipsoid)
# Calculate distance of all trees based on one tree
# 017-S (North)
list1 <- coor[which(map=="017-S"),]
spd_N <- matrix(-1,108,1)
for (i in 1:dim(coor)[1]){
list2 <- coor[i,]
temp <- distm(list1, list2, fun=distVincentyEllipsoid)
spd_N[i] <- temp
}
# 277-S (South)
list1 <- coor[which(map=="277-S"),]
spd_S <- matrix(-1,108,1)
for (i in 1:dim(coor)[1]){
list2 <- coor[i,]
temp <- distm(list1, list2, fun=distVincentyEllipsoid)
spd_S[i] <- temp
}
# 155-S (Middle)
list1 <- coor[which(map=="155-S"),]
spd_M <- matrix(-1,108,1)
for (i in 1:dim(coor)[1]){
list2 <- coor[i,]
temp <- distm(list1, list2, fun=distVincentyEllipsoid)
spd_M[i] <- temp
}
# ------------------------------------------------------------------------- #
# Create distance matrix based on Euclidian distance
# ------------------------------------------------------------------------- #
library("fields")
eucl <- rdist(t(data))
# ------------------------------------------------------------------------- #
# Create distance matrix based on Nei's distance
# ------------------------------------------------------------------------- #
# Initialization
neisd <- matrix(-1,108,108) # Everything that will stay -1 after pertains to the first matrix - should not be kept
c1 <- 1 # column 1
start <- 1 # moves along the columns to follow the symmetric matrix
temp <- 0 # stores the distance value for each comparison
n <- dim(neisd)[1] # number of individuals
# Double loop: one column of "reference" against which eevery other column is being compared
# Symmetry in the calculation, second loop to move ref column and avoid unnecessary calculations
# Matrix multiplication in R uses %*%
for (c1 in 1:n) {
for (c2 in start:n){
temp <- -log((data[,c1]%*%data[,c2])/(sqrt(sum(data[,c1]^2)*sum(data[,c2]^2))))
neisd[c1,c2] <- temp
temp <- 0
}
start <- c1 + 1
}
# ------------------------------------------------------------------------- #
# Create distance matrix based on Gini-Simpson distance
# ------------------------------------------------------------------------- #
# Gini-Simpson index gives a diversity value for each individual
# I thus use a single value for the distance of each tree from a "reference" point: a tree
# in the middle, at the southern or the northern tip of the stand
# Initialization
GSvec <- matrix(100,1,108)
c <- 1 # column 1
temp <- 0 # stores the distance value for each comparison
n <- dim(GSvec)[2] # number of individuals
for (c in 1:n){
temp <- data[,c]%*%(1-data[,c])
GSvec[c] <- temp
}
# ------------------------------------------------------------------------- #
# Map : genetic versus spatial
# ------------------------------------------------------------------------- #
# Change plot parameters for each matrix drawn
# Empty plot
plot(x = 1,
xlab = "Spatial distance (meters)",
ylab = "Euclidian distance",
xlim = c(0, max(spd)),
ylim = c(10, max(eucl)),
main = "",
type = "n")
# loop to plot each pair from the genetic distance matrix, and the spatial distance matrix
# EUCLIDIAN's D
l <- 2
max <- dim(eucl)[1]
for (i in c(1:max))
{
for (j in c(l:max-1))
{
points(spd[i,j],eucl[i,j],pch=19)
}
l <- l + 1
j <- l
}
# NEI's D
l <- 2
max <- dim(neisd)[1]
for (i in c(1:max))
{
for (j in c(l:max))
{
points(spd[i,j],neisd[i,j],pch=19)
}
l <- l + 1
j <- l
}
# Gini-Simpson diversity metrics
plot(spd_M,GSvec,
xlab = "Spatial distance from middle point (meters)",
ylab = "Gini-Simpson's div. index",
main ="",
xlim = c(0,1000),
ylim = c(0,1500),
pch = 19)