Allele Freqs., He, & Fst

##Average and standard deviation of reads per individual

depth<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/variants/ExpElevDepthMat.txt")

dim(depth)

[1] 430 21581

depth_mean<-apply(depth,1,mean)

mean(depth_mean)

[1] 3.359778

sd(depth_mean)

[1] 2.280689

##Script includes: allele frequencies, heterozygosity, and Fst

##Revised code and re-do of data analysis after defense

##April-May 2018

#read in ids of all plants

#the full file has 617 plant individuals, consisting of control (big exclosure), drought (big exclosure), irrigation (big exclosure), grass removal (big exclosure), shrub removal (big exclosure), LE (low elevation), HE (high elevation), P10 (separate small exclosure), P7 (separate small exclosure)

#sample size for each of these treatments: control = 66, drought = 103, irrigation = 62, grass removal = 1, shrub removal = 75, LE = 100, HE = 99, P10E1 = 54, P7E1 = 55

IDs <- read.csv("/uufs/chpc.utah.edu/common/home/u1065430/variants/pops_split_trt.txt",header=F)

names(IDs)[1] <- "fullname"

IDs$fullname <- as.character((IDs$fullname))

IDs$trt <- sapply(strsplit(IDs$fullname," "),"[[",2)

#remove shrub and grass removals, P10, and P7

IDs <- subset(IDs,trt !="GrassRemoval" & trt!="ShrubRemoval" & trt!="P7E1" & trt!="P10E1")

#remove duplicate PSSP_HE_Control_NA_NA_94

#the sample size of the remaining individuals are 430 of control, drought, irrigation, LE, and HE

IDs<-IDs[-c(401:402),]

pop_ids<-unique(IDs$trt) #pull out the unique treatment labels

#read in allele frequencies for each treatment

#each text file has 21,581 rows for each SNP and 4 rows for the mean, median, 95 CI lower bound, and 95 CI upper bound

drought<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_Drought.txt",header=F,sep=",")

control<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_Control.txt",header=F,sep=",")

irrigation<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_Irrigation.txt",header=F,sep=",")

LE<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_LE.txt",header=F,sep=",")

HE<-read.table("/uufs/chpc.utah.edu/common/home/u1065430/AlleleFreq/p_o_PSSP_HE.txt",header=F,sep=",")

#combind all treatement mean allele frequencies into a matrix

allele_freq<-rbind(control[,1],drought[,1],irrigation[,1],LE[,1],HE[,1])

L<-dim(allele_freq)[2] #match with allele frequency matrix, dim = 21,581

N<-length(pop_ids) #match with plant individuals, dim = 430

### Minor allele frequencies-----------------------------------

#minor allele frequency across all treatments

maf_all <- colMeans(allele_freq)

maf_all[maf_all > 0.5] <- 1 - maf_all[maf_all > 0.5]

pdf("/uufs/chpc.utah.edu/common/home/u1065430/MAF_all.pdf")

hist(maf_all,xlab="Minor Allele Frequency",main="",cex.lab=1.5,cex.axis=1.25,col="darkgray",ylim=c(0,15000),ylab="",)

mtext(text = "Frequency",side = 2,line = 2.5,cex=1.5)

dev.off()

#### heterozygoisty and Fst--------------------------------------------

##function for calculating expected heterozygosity

#heterozygosity determines the level of genetic variation, which ranges from 0 to 1.

#if he = 0, no genetic variation

#if he = 1, genetic variation

He <- function(x){

He <- 2*x*(1-x)

return(He)

}

#heterozygosity for each treatment

#calculate heterozygosity for each loci

all_He<-He(allele_freq)

#calculate average heterozygosity for each treatment

ave_He<-rowMeans(all_He)

#standard deviations

sd_He<-apply(all_He,1,sd)

#function to calculate confidence intervals

popsize<-c(66,103,62,100,99)

CI<-function(sd_He,popsize){

CI<-qnorm(0.975)*sd_He/sqrt(popsize)

return(CI)

}

LB<-ave_He-CI(sd_He,popsize)

UB<-ave_He+CI(sd_He,popsize)

#average heterozygosity for control, drought, irrigation, HE, and LE

0.10296029, 0.10878762, 0.10865939, 0.10040517, and 0.09308721

#average heterozygosity barplot with CI's

#red = drought/less precip.

#blue = irrigation/more precip.

pdf("/uufs/chpc.utah.edu/common/home/u1065430/He_all.pdf")

He_plot<-barplot(ave_He,names=pop_ids,ylim=c(0,0.16),ylab="",col=c("darkgray","coral2","cadetblue3","cadetblue3","coral2"),cex.axis=1.25,cex.names=1.25)

arrows(x0=He_plot,y0=LB,x1=He_plot,y1=UB,code=3,angle=90,length=0.1,lwd=1.5)

mtext(text = "Average Heterozygosity",side = 2,line = 2.5,cex=1.5)

mtext(text = "Precipitation Treatment",side = 1,line = 3,cex=1.5)

dev.off()

#Fst function

#Fst is a fixation index developed by Sewell Wright and determines population differeniation, which ranges from 0 to 1

#Fst = (Ht - Hs)/Ht

#Fst = (heterozygosity between population - heterozygosity within population)/heterozygosity between population

#if Fst = 0, then no genetic differeniation between populations or treatments

#if Fst = 1, then there is high genetic differeniation between populations or treatments

#Fst function is of the allele frequency matrix

FST <- function(allele_mat){

#calculates allele frequency estimate for the sample as a whole (across populations)

Avg_alfreq_acrosspop <- apply(allele_mat, 2, mean)

#calculates expected heterozygosity for the sample as a whole:

Ht <- He(Avg_alfreq_acrosspop)

#calculates expected heterozygosity for each population individually:

individual_He <- He(allele_mat)

#calculates the mean expected heterozygosity for an population:

Hs <- apply(individual_He, 2, mean)

#calculates Fst:

FST <- mean(Ht-Hs)/mean(Ht)

return(FST)

}

#returns one Fst value across all treatments

Fst_all<-FST(allele_freq)

#Fst_all = 0.02518872, which indicates low genetic differeniation between all 5 treatments

#Fst between drought and irrigation

Fst_exp<-FST(allele_freq[2:3,])

#Fst_DI = 0.01290937, which indicates low genetic differeniation between drought and irrigation

#Fst between low and high elevations

Fst_elev<-FST(allele_freq[4:5,])

#Fst_elev = 0.01333165, which indicates low genetic differeniation between low and high elevation sites

#Overall, there is low genetic differeniation and we are seeing similar patterns of this between experimental and elevataion treatments. Even though the magnitude of these values are small, there appears to be more genetic differeniation in nature (elevation treatments).

#Fst across all loci for each treatment

#Fst function is of the allele frequency matrix, which is what mean_mat represents

FST_loci <- function(allele_mat){

#calculates allele frequency estimate for the sample as a whole (across populations)

Avg_alfreq_acrosspop <- apply(allele_mat, 2, mean)

#calculates expected heterozygosity for the sample as a whole:

Ht <- He(Avg_alfreq_acrosspop)

#calculates expected heterozygosity for each population individually:

individual_He <- He(allele_mat)

#calculates the mean expected heterozygosity for an population:

Hs <- apply(individual_He, 2, mean)

#calculates Fst:

FST_loci <- (Ht-Hs)/Ht

return(FST_loci)

}

#Fst across loci for drought and irrigation

FST_loci_DI<-FST_loci(allele_freq[2:3,])

FST_loci_LH<-FST_loci(allele_freq[4:5,])

#Fst across loci plots

pdf("/uufs/chpc.utah.edu/common/home/u1065430/DIFst.pdf")

DI<-plot(density(FST_loci_DI),xlim=c(0,0.5),xlab="Fst",main="",lwd=2,cex.lab=1.5,cex.axis=1.5,ylab="")

mtext(text = "Density",side = 2,line = 2.5,cex=1.5)

title(main="Drought vs. Irrig.",cex=2)

dev.off()

pdf("/uufs/chpc.utah.edu/common/home/u1065430/LHFst.pdf")

LH<-plot(density(FST_loci_LH),xlim=c(0,0.5),ylim=c(0,150),xlab="Fst",main="",lwd=2,cex.lab=1.5,cex.axis=1.5,ylab="")

mtext(text="Density",side=2,line=2.5,cex=1.5)

title(main="LE vs. HE",cex=2)

dev.off()

#both Fst plots together

pdf("/uufs/chpc.utah.edu/common/home/u1065430/ExpElevFst.pdf")

par(mfrow=c(1,2))

DI<-plot(density(FST_loci_DI),xlim=c(0,0.5),xlab="Fst",main="",lwd=2,cex.lab=1.5,cex.axis=1.5,ylab="")

mtext(text = "Density",side = 2,line = 2.5,cex=1.5)

title(main="Drought vs. Irrig.",cex=2)

LH<-plot(density(FST_loci_LH),xlim=c(0,0.5),ylim=c(0,150),xlab="Fst",main="",lwd=2,cex.lab=1.5,cex.axis=1.5,ylab="")

mtext(text="Density",side=2,line=2.5,cex=1.5)

title(main="LE vs. HE",cex=2)

dev.off()

#top SNPs

SIM<-10000

## define top quantiles

qs<-seq(0.95,0.995,0.005)

L<-length(FST_loci_DI)

N<-length(qs)

null<-matrix(NA,nrow=N,ncol=SIM)

out<-matrix(NA,nrow=N,ncol=4)

for(i in 1:N){

q<-qs[i]

X<-which(FST_loci_DI > quantile(FST_loci_DI,q))

Y<-which(FST_loci_LH > quantile(FST_loci_LH,q))

XY<-which((FST_loci_DI > quantile(FST_loci_DI,q)) & (FST_loci_LH > quantile(FST_loci_LH,q)))

obs<-length(XY)

NX<-length(X)

NY<-length(Y)

for(j in 1:SIM){

samX<-sample(1:L,NX,replace=FALSE)

samY<-sample(1:L,NY,replace=FALSE)

null[i,j]<-length(which(samX %in% samY))

}

out[i,]<-c(obs,mean(null[i,]),obs/mean(null[i,]),1-mean(obs > null[i,]) )

}

out<-out[1:8,]

sig<-(out[,4] < 0.05)+20 #+20 is pch

#summary of the top SNPs for the first 8 top quantiles (0.95, 0.955, 0.96, 0.965, 0.97, 0.975, 0.98, 0.985) which are the rows

#the columns are the number of top SNPs, mean, x-fold enrichment, and P-value

> out

[,1] [,2] [,3] [,4]

[1,] 105 53.9858 1.944956 0.0000

[2,] 88 43.7499 2.011433 0.0000

[3,] 71 34.5728 2.053638 0.0000

[4,] 62 26.4919 2.340338 0.0000

[5,] 44 19.4710 2.259771 0.0000

[6,] 30 13.4538 2.229853 0.0000

[7,] 23 8.6347 2.663671 0.0000

[8,] 12 4.8767 2.460680 0.0034

#plot of the top SNps

pdf("/uufs/chpc.utah.edu/common/home/u1065430/topSNPs.pdf")

plot(out[,3],type='b',pch=sig,axes=F,xlab="Top SNPs quantile",ylab="",cex.lab=1.5,cex.axis=1.5)

axis(1,at=1:8,as.character(qs[1:8]))

axis(2)

box()

abline(h=2,lty=2,col="blue")

mtext(text = "X-fold enrichment",side = 2,line = 2.5,cex=1.5)

dev.off()

## plot with 95th qs added

diHi<-quantile(FST_loci_DI,q)

lhHi<-quantile(FST_loci_LH,q)

pdf("/uufs/chpc.utah.edu/common/home/u1065430/DIxHLFstQuant.pdf")

plot(FST_loci_DI,FST_loci_LH,main="Experimental vs. Elevation",xlab="Fst D*I",ylab="",cex.lab=1.5,cex.axis=1.5)

mtext(text = "Fst H*L",side = 2,line = 2.5,cex=1.5)

abline(v=quantile(FST_loci_DI,0.95),col="purple",lty=2,lwd=2)

abline(h=quantile(FST_loci_LH,0.95),col="purple",lty=2,lwd=2)

dev.off()

#save workspace

save(list=ls(),file="analysisredo.rdat")

#load saved workspace

load(file="analysisredo.rdat")

###########################################################################################################################

first draft of code

April-May 2018

the data sets

#subset out data

IDs <- read.csv("/uufs/chpc.utah.edu/common/home/u1065430/variants/pops_split_trt.txt",header=F)

names(IDs)[1] <- "fullname"

IDs$fullname <- as.character((IDs$fullname))

IDs$trt <- sapply(strsplit(IDs$fullname," "),"[[",2)

IDs <- subset(IDs,trt !="GrassRemoval" & trt!="ShrubRemoval")

#remove duplicate PSSP_HE_Control_NA_NA_94

IDs<-IDs[-c(401:402),]

setwd("/uufs/chpc.utah.edu/common/home/u1065430/Data/")

#read in genotype file

genotypes<-read.table("PSSP_gen.txt",header=F,sep=",")

#transpose data to have individuals as rows and get rid of shrub removals, grass removals, and duplicate PSSP_HE_Control_NA_NA_94

genotypes2<-t(genotypes[-c(232:307,401,402)])

separate vector of allele freq across everyone by taking everyone in dopop

## population allele freqs, simple ML estimate

pop_ids<-unique(IDs$trt)

L<-dim(genotypes2)[2]

N<-length(pop_ids)

allele_freq<-matrix(NA,nrow=N,ncol=L)

for(i in 1:N){

dopop<-pop_ids[i]

dorows<-which(IDs$trt==dopop)

domat<-genotypes2[dorows,]

locimean<-colSums(domat)/(nrow(domat)*2)

allele_freq[i,]<-locimean

}

### Minor allele frequencies-------------------------------------------

#population order from pop_ids: control, drought, irrigation, HE, LE, P10, P7

#minor allele frequencies (maf) for each treatment

#subset by treatment

control<-allele_freq[1,]

drought<-allele_freq[2,]

irrigation<-allele_freq[3,]

he<-allele_freq[4,]

le<-allele_freq[5,]

#maf for experimental treatments

maf_all <- put in whatever the vector is

everything in brackets need to be maf_all

maf_control <- control

maf_control[maf_all > 0.5] <- 1 - maf_control[maf_all > 0.5]

maf_drought <- drought

maf_drought[maf_drought > 0.5] <- 1 - maf_drought[maf_drought > 0.5]

maf_irrigation <- irrigation

maf_irrigation[maf_irrigation > 0.5] <- 1 - maf_irrigation[maf_irrigation > 0.5]

#plots of maf for experimental treatments

png("ExperimentalMAF.png")

hist(maf_control, main="Control",ylim=c(0,15000),xlab="Minor Allele Frequency")

hist(maf_drought,main="Drought",ylim=c(0,15000),xlab="Minor Allele Frequency")

hist(maf_irrigation,main="Irrigation",ylim=c(0,15000),xlab="Minor Allele Frequency")

dev.off()

#maf for elevation treatments

maf_low <- le

maf_low[maf_low > 0.5] <- 1 - maf_low[maf_low > 0.5]

maf_high <- he

maf_high[maf_high > 0.5] <- 1 - maf_high[maf_high > 0.5]

#plots of maf for elevation treatments

png("ElevationMAF.png")

hist(maf_low,main="Low Elevation",xlab="Minor Allele Frequency")

hist(maf_high,main="High Elevation",xlab="Minor Allele Frequency")

dev.off()

#### heterozygoisty and Fst--------------------------------------------

##function for calculating expected heterozygosity

He <- function(x){

He <- 2*x*(1-x)

return(He)

}

take out p10 and p7 treatments and then rerun

how much variation among groups

Fst of h and l

Fst of d and i

Fst of d, i, and control

FST <- function(mean.mat){

#calculates allele frequency estimate for the sample as a whole (across populations):

Avg.alfeq.acrosspop <- apply(mean.mat, 2, mean)

#calculates expected heterozygosity for the sample as a whole:

piT <- He(Avg.alfeq.acrosspop)

#calculates expected heterozygosity for each population individually:

individual.He <- apply(mean.mat, 1 or 2, He) it could be just taking the 1, I need 1 or 2

#calculates the mean expected heterozygosity for an population:

piS <- apply(individual.He, 2, mean)

#calculates Fst:

FST <- mean(piT-piS)/mean(piT)

return(FST)

}

myfst<-FST(allele_freq)

repeat fst script above for loci

FST <- (piT-piS)/piT

#heterozygosity for each treatment across all loci

all.he<-apply(allele_freq,c(1,2),He)

pdf instead or eps

#he boxplot

png("HeBoxPlot.png")

boxplot(all.he[1,],all.he[2,],all.he[3,],all.he[4,],all.he[5,],names=c("Control","Drought","Irrigation","LE","HE"),ylab="Heterozygosity")

dev.off()

#hudson Fst?

hudsonFst2<-function(p1=NA,p2=NA,n1=NA,n2=NA){

numerator<-p1 * (1 - p1) + p2 * (1 - p2)

denominator<-p1 * (1 - p2) + p2 * (1 - p1)

fst<-1 - numerator/denominator

out<-cbind(numerator,denominator,fst)

return(out)

}

N<-table(IDs$trt)

droughtids<-which(pop_ids=="Drought")

irrigationids<-which(pop_ids=="Irrigation")

HEids<-which(pop_ids=="HE")

LEids<-which(pop_ids=="LE")

## fst for each snp

fstDxIhu<-hudsonFst2(p1=allele_freq[droughtids,],p2=allele_freq[irrigationids,],n1=N[droughtids],n2=N[irrigationids])

fstLExHEhu<-hudsonFst2(p1=allele_freq[LEids,],p2=allele_freq[HEids,],n1=N[LEids],n2=N[HEids])

png("Fst_Plots.png")

par(mfrow=c(1,3))

hist(fstDxIhu,xlab="Fst",main="Drought vs. Irrig.")

hist(fstLExHEhu,xlab="Fst",main="LE vs. HE")

plot(fstDxIhu,fstLExHEhu,bty="n",xlab="Fst D*I",ylab="Fst L*H",pch=".")

abline(0,1,col="red")

dev.off()

Page updated

Google Sites

Report abuse