##Average and standard deviation of reads per individual
depth<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/variants/ExpElevDepthMat.txt")
dim(depth)
[1] 430 21581
depth_mean<-apply(depth,1,mean)
mean(depth_mean)
[1] 3.359778
sd(depth_mean)
[1] 2.280689
##Script includes: allele frequencies, heterozygosity, and Fst
##Revised code and re-do of data analysis after defense
##April-May 2018
#read in ids of all plants
#the full file has 617 plant individuals, consisting of control (big exclosure), drought (big exclosure), irrigation (big exclosure), grass removal (big exclosure), shrub removal (big exclosure), LE (low elevation), HE (high elevation), P10 (separate small exclosure), P7 (separate small exclosure)
#sample size for each of these treatments: control = 66, drought = 103, irrigation = 62, grass removal = 1, shrub removal = 75, LE = 100, HE = 99, P10E1 = 54, P7E1 = 55
IDs <- read.csv("/uufs/chpc.utah.edu/common/home/u1065430/variants/pops_split_trt.txt",header=F)
names(IDs)[1] <- "fullname"
IDs$fullname <- as.character((IDs$fullname))
IDs$trt <- sapply(strsplit(IDs$fullname," "),"[[",2)
#remove shrub and grass removals, P10, and P7
IDs <- subset(IDs,trt !="GrassRemoval" & trt!="ShrubRemoval" & trt!="P7E1" & trt!="P10E1")
#remove duplicate PSSP_HE_Control_NA_NA_94
#the sample size of the remaining individuals are 430 of control, drought, irrigation, LE, and HE
IDs<-IDs[-c(401:402),]
pop_ids<-unique(IDs$trt) #pull out the unique treatment labels
#read in allele frequencies for each treatment
#each text file has 21,581 rows for each SNP and 4 rows for the mean, median, 95 CI lower bound, and 95 CI upper bound
drought<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_Drought.txt",header=F,sep=",")
control<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_Control.txt",header=F,sep=",")
irrigation<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_Irrigation.txt",header=F,sep=",")
LE<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_LE.txt",header=F,sep=",")
HE<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_HE.txt",header=F,sep=",")
#combind all treatement mean allele frequencies into a matrix
allele_freq<-rbind(control[,1],drought[,1],irrigation[,1],LE[,1],HE[,1])
L<-dim(allele_freq)[2] #match with allele frequency matrix, dim = 21,581
N<-length(pop_ids) #match with plant individuals, dim = 430
### Minor allele frequencies-----------------------------------
#minor allele frequency across all treatments
maf_all <- colMeans(allele_freq)
maf_all[maf_all > 0.5] <- 1 - maf_all[maf_all > 0.5]
pdf("/uufs/chpc.utah.edu/common/home/u1065430/MAF_all.pdf")
hist(maf_all,xlab="Minor Allele Frequency",main="",cex.lab=1.5,cex.axis=1.25,col="darkgray",ylim=c(0,15000),ylab="",)
mtext(text = "Frequency",side = 2,line = 2.5,cex=1.5)
dev.off()
#### heterozygoisty and Fst--------------------------------------------
##function for calculating expected heterozygosity
#heterozygosity determines the level of genetic variation, which ranges from 0 to 1.
#if he = 0, no genetic variation
#if he = 1, genetic variation
He <- function(x){
He <- 2*x*(1-x)
return(He)
}
#heterozygosity for each treatment
#calculate heterozygosity for each loci
all_He<-He(allele_freq)
#calculate average heterozygosity for each treatment
ave_He<-rowMeans(all_He)
#standard deviations
sd_He<-apply(all_He,1,sd)
#function to calculate confidence intervals
popsize<-c(66,103,62,100,99)
CI<-function(sd_He,popsize){
CI<-qnorm(0.975)*sd_He/sqrt(popsize)
return(CI)
}
LB<-ave_He-CI(sd_He,popsize)
UB<-ave_He+CI(sd_He,popsize)
#average heterozygosity for control, drought, irrigation, HE, and LE
0.10296029, 0.10878762, 0.10865939, 0.10040517, and 0.09308721
#average heterozygosity barplot with CI's
#red = drought/less precip.
#blue = irrigation/more precip.
pdf("/uufs/chpc.utah.edu/common/home/u1065430/He_all.pdf")
He_plot<-barplot(ave_He,names=pop_ids,ylim=c(0,0.16),ylab="",col=c("darkgray","coral2","cadetblue3","cadetblue3","coral2"),cex.axis=1.25,cex.names=1.25)
arrows(x0=He_plot,y0=LB,x1=He_plot,y1=UB,code=3,angle=90,length=0.1,lwd=1.5)
mtext(text = "Average Heterozygosity",side = 2,line = 2.5,cex=1.5)
mtext(text = "Precipitation Treatment",side = 1,line = 3,cex=1.5)
dev.off()
#Fst function
#Fst is a fixation index developed by Sewell Wright and determines population differeniation, which ranges from 0 to 1
#Fst = (Ht - Hs)/Ht
#Fst = (heterozygosity between population - heterozygosity within population)/heterozygosity between population
#if Fst = 0, then no genetic differeniation between populations or treatments
#if Fst = 1, then there is high genetic differeniation between populations or treatments
#Fst function is of the allele frequency matrix
FST <- function(allele_mat){
#calculates allele frequency estimate for the sample as a whole (across populations)
Avg_alfreq_acrosspop <- apply(allele_mat, 2, mean)
#calculates expected heterozygosity for the sample as a whole:
Ht <- He(Avg_alfreq_acrosspop)
#calculates expected heterozygosity for each population individually:
individual_He <- He(allele_mat)
#calculates the mean expected heterozygosity for an population:
Hs <- apply(individual_He, 2, mean)
#calculates Fst:
FST <- mean(Ht-Hs)/mean(Ht)
return(FST)
}
#returns one Fst value across all treatments
Fst_all<-FST(allele_freq)
#Fst_all = 0.02518872, which indicates low genetic differeniation between all 5 treatments
#Fst between drought and irrigation
Fst_exp<-FST(allele_freq[2:3,])
#Fst_DI = 0.01290937, which indicates low genetic differeniation between drought and irrigation
#Fst between low and high elevations
Fst_elev<-FST(allele_freq[4:5,])
#Fst_elev = 0.01333165, which indicates low genetic differeniation between low and high elevation sites
#Overall, there is low genetic differeniation and we are seeing similar patterns of this between experimental and elevataion treatments. Even though the magnitude of these values are small, there appears to be more genetic differeniation in nature (elevation treatments).
#Fst across all loci for each treatment
#Fst function is of the allele frequency matrix, which is what mean_mat represents
FST_loci <- function(allele_mat){
#calculates allele frequency estimate for the sample as a whole (across populations)
Avg_alfreq_acrosspop <- apply(allele_mat, 2, mean)
#calculates expected heterozygosity for the sample as a whole:
Ht <- He(Avg_alfreq_acrosspop)
#calculates expected heterozygosity for each population individually:
individual_He <- He(allele_mat)
#calculates the mean expected heterozygosity for an population:
Hs <- apply(individual_He, 2, mean)
#calculates Fst:
FST_loci <- (Ht-Hs)/Ht
return(FST_loci)
}
#Fst across loci for drought and irrigation
FST_loci_DI<-FST_loci(allele_freq[2:3,])
FST_loci_LH<-FST_loci(allele_freq[4:5,])
#Fst across loci plots
pdf("/uufs/chpc.utah.edu/common/home/u1065430/DIFst.pdf")
DI<-plot(density(FST_loci_DI),xlim=c(0,0.5),xlab="Fst",main="",lwd=2,cex.lab=1.5,cex.axis=1.5,ylab="")
mtext(text = "Density",side = 2,line = 2.5,cex=1.5)
title(main="Drought vs. Irrig.",cex=2)
dev.off()
pdf("/uufs/chpc.utah.edu/common/home/u1065430/LHFst.pdf")
LH<-plot(density(FST_loci_LH),xlim=c(0,0.5),ylim=c(0,150),xlab="Fst",main="",lwd=2,cex.lab=1.5,cex.axis=1.5,ylab="")
mtext(text="Density",side=2,line=2.5,cex=1.5)
title(main="LE vs. HE",cex=2)
dev.off()
#both Fst plots together
pdf("/uufs/chpc.utah.edu/common/home/u1065430/ExpElevFst.pdf")
par(mfrow=c(1,2))
DI<-plot(density(FST_loci_DI),xlim=c(0,0.5),xlab="Fst",main="",lwd=2,cex.lab=1.5,cex.axis=1.5,ylab="")
mtext(text = "Density",side = 2,line = 2.5,cex=1.5)
title(main="Drought vs. Irrig.",cex=2)
LH<-plot(density(FST_loci_LH),xlim=c(0,0.5),ylim=c(0,150),xlab="Fst",main="",lwd=2,cex.lab=1.5,cex.axis=1.5,ylab="")
mtext(text="Density",side=2,line=2.5,cex=1.5)
title(main="LE vs. HE",cex=2)
dev.off()
#top SNPs
SIM<-10000
## define top quantiles
qs<-seq(0.95,0.995,0.005)
L<-length(FST_loci_DI)
N<-length(qs)
null<-matrix(NA,nrow=N,ncol=SIM)
out<-matrix(NA,nrow=N,ncol=4)
for(i in 1:N){
q<-qs[i]
X<-which(FST_loci_DI > quantile(FST_loci_DI,q))
Y<-which(FST_loci_LH > quantile(FST_loci_LH,q))
XY<-which((FST_loci_DI > quantile(FST_loci_DI,q)) & (FST_loci_LH > quantile(FST_loci_LH,q)))
obs<-length(XY)
NX<-length(X)
NY<-length(Y)
for(j in 1:SIM){
samX<-sample(1:L,NX,replace=FALSE)
samY<-sample(1:L,NY,replace=FALSE)
null[i,j]<-length(which(samX %in% samY))
}
out[i,]<-c(obs,mean(null[i,]),obs/mean(null[i,]),1-mean(obs > null[i,]) )
}
out<-out[1:8,]
sig<-(out[,4] < 0.05)+20 #+20 is pch
#summary of the top SNPs for the first 8 top quantiles (0.95, 0.955, 0.96, 0.965, 0.97, 0.975, 0.98, 0.985) which are the rows
#the columns are the number of top SNPs, mean, x-fold enrichment, and P-value
> out
[,1] [,2] [,3] [,4]
[1,] 105 53.9858 1.944956 0.0000
[2,] 88 43.7499 2.011433 0.0000
[3,] 71 34.5728 2.053638 0.0000
[4,] 62 26.4919 2.340338 0.0000
[5,] 44 19.4710 2.259771 0.0000
[6,] 30 13.4538 2.229853 0.0000
[7,] 23 8.6347 2.663671 0.0000
[8,] 12 4.8767 2.460680 0.0034
#plot of the top SNps
pdf("/uufs/chpc.utah.edu/common/home/u1065430/topSNPs.pdf")
plot(out[,3],type='b',pch=sig,axes=F,xlab="Top SNPs quantile",ylab="",cex.lab=1.5,cex.axis=1.5)
axis(1,at=1:8,as.character(qs[1:8]))
axis(2)
box()
abline(h=2,lty=2,col="blue")
mtext(text = "X-fold enrichment",side = 2,line = 2.5,cex=1.5)
dev.off()
## plot with 95th qs added
diHi<-quantile(FST_loci_DI,q)
lhHi<-quantile(FST_loci_LH,q)
pdf("/uufs/chpc.utah.edu/common/home/u1065430/DIxHLFstQuant.pdf")
plot(FST_loci_DI,FST_loci_LH,main="Experimental vs. Elevation",xlab="Fst D*I",ylab="",cex.lab=1.5,cex.axis=1.5)
mtext(text = "Fst H*L",side = 2,line = 2.5,cex=1.5)
abline(v=quantile(FST_loci_DI,0.95),col="purple",lty=2,lwd=2)
abline(h=quantile(FST_loci_LH,0.95),col="purple",lty=2,lwd=2)
dev.off()
#save workspace
save(list=ls(),file="analysisredo.rdat")
#load saved workspace
load(file="analysisredo.rdat")
###########################################################################################################################
first draft of code
April-May 2018
the data sets
#subset out data
IDs <- read.csv("/uufs/chpc.utah.edu/common/home/u1065430/variants/pops_split_trt.txt",header=F)
names(IDs)[1] <- "fullname"
IDs$fullname <- as.character((IDs$fullname))
IDs$trt <- sapply(strsplit(IDs$fullname," "),"[[",2)
IDs <- subset(IDs,trt !="GrassRemoval" & trt!="ShrubRemoval")
#remove duplicate PSSP_HE_Control_NA_NA_94
IDs<-IDs[-c(401:402),]
setwd("/uufs/chpc.utah.edu/common/home/u1065430/Data/")
#read in genotype file
genotypes<-read.table("PSSP_gen.txt",header=F,sep=",")
#transpose data to have individuals as rows and get rid of shrub removals, grass removals, and duplicate PSSP_HE_Control_NA_NA_94
genotypes2<-t(genotypes[-c(232:307,401,402)])
separate vector of allele freq across everyone by taking everyone in dopop
## population allele freqs, simple ML estimate
pop_ids<-unique(IDs$trt)
L<-dim(genotypes2)[2]
N<-length(pop_ids)
allele_freq<-matrix(NA,nrow=N,ncol=L)
for(i in 1:N){
dopop<-pop_ids[i]
dorows<-which(IDs$trt==dopop)
domat<-genotypes2[dorows,]
locimean<-colSums(domat)/(nrow(domat)*2)
allele_freq[i,]<-locimean
}
### Minor allele frequencies-------------------------------------------
#population order from pop_ids: control, drought, irrigation, HE, LE, P10, P7
#minor allele frequencies (maf) for each treatment
#subset by treatment
control<-allele_freq[1,]
drought<-allele_freq[2,]
irrigation<-allele_freq[3,]
he<-allele_freq[4,]
le<-allele_freq[5,]
#maf for experimental treatments
maf_all <- put in whatever the vector is
everything in brackets need to be maf_all
maf_control <- control
maf_control[maf_all > 0.5] <- 1 - maf_control[maf_all > 0.5]
maf_drought <- drought
maf_drought[maf_drought > 0.5] <- 1 - maf_drought[maf_drought > 0.5]
maf_irrigation <- irrigation
maf_irrigation[maf_irrigation > 0.5] <- 1 - maf_irrigation[maf_irrigation > 0.5]
#plots of maf for experimental treatments
png("ExperimentalMAF.png")
hist(maf_control, main="Control",ylim=c(0,15000),xlab="Minor Allele Frequency")
hist(maf_drought,main="Drought",ylim=c(0,15000),xlab="Minor Allele Frequency")
hist(maf_irrigation,main="Irrigation",ylim=c(0,15000),xlab="Minor Allele Frequency")
dev.off()
#maf for elevation treatments
maf_low <- le
maf_low[maf_low > 0.5] <- 1 - maf_low[maf_low > 0.5]
maf_high <- he
maf_high[maf_high > 0.5] <- 1 - maf_high[maf_high > 0.5]
#plots of maf for elevation treatments
png("ElevationMAF.png")
hist(maf_low,main="Low Elevation",xlab="Minor Allele Frequency")
hist(maf_high,main="High Elevation",xlab="Minor Allele Frequency")
dev.off()
#### heterozygoisty and Fst--------------------------------------------
##function for calculating expected heterozygosity
He <- function(x){
He <- 2*x*(1-x)
return(He)
}
take out p10 and p7 treatments and then rerun
how much variation among groups
Fst of h and l
Fst of d and i
Fst of d, i, and control
FST <- function(mean.mat){
#calculates allele frequency estimate for the sample as a whole (across populations):
Avg.alfeq.acrosspop <- apply(mean.mat, 2, mean)
#calculates expected heterozygosity for the sample as a whole:
piT <- He(Avg.alfeq.acrosspop)
#calculates expected heterozygosity for each population individually:
individual.He <- apply(mean.mat, 1 or 2, He) it could be just taking the 1, I need 1 or 2
#calculates the mean expected heterozygosity for an population:
piS <- apply(individual.He, 2, mean)
#calculates Fst:
FST <- mean(piT-piS)/mean(piT)
return(FST)
}
myfst<-FST(allele_freq)
repeat fst script above for loci
FST <- (piT-piS)/piT
#heterozygosity for each treatment across all loci
all.he<-apply(allele_freq,c(1,2),He)
pdf instead or eps
#he boxplot
png("HeBoxPlot.png")
boxplot(all.he[1,],all.he[2,],all.he[3,],all.he[4,],all.he[5,],names=c("Control","Drought","Irrigation","LE","HE"),ylab="Heterozygosity")
dev.off()
#hudson Fst?
hudsonFst2<-function(p1=NA,p2=NA,n1=NA,n2=NA){
numerator<-p1 * (1 - p1) + p2 * (1 - p2)
denominator<-p1 * (1 - p2) + p2 * (1 - p1)
fst<-1 - numerator/denominator
out<-cbind(numerator,denominator,fst)
return(out)
}
N<-table(IDs$trt)
droughtids<-which(pop_ids=="Drought")
irrigationids<-which(pop_ids=="Irrigation")
HEids<-which(pop_ids=="HE")
LEids<-which(pop_ids=="LE")
## fst for each snp
fstDxIhu<-hudsonFst2(p1=allele_freq[droughtids,],p2=allele_freq[irrigationids,],n1=N[droughtids],n2=N[irrigationids])
fstLExHEhu<-hudsonFst2(p1=allele_freq[LEids,],p2=allele_freq[HEids,],n1=N[LEids],n2=N[HEids])
png("Fst_Plots.png")
par(mfrow=c(1,3))
hist(fstDxIhu,xlab="Fst",main="Drought vs. Irrig.")
hist(fstLExHEhu,xlab="Fst",main="LE vs. HE")
plot(fstDxIhu,fstLExHEhu,bty="n",xlab="Fst D*I",ylab="Fst L*H",pch=".")
abline(0,1,col="red")
dev.off()