PSSP Genotype Matrix

Made copy of PSSP_BayGen_metadata to hard drive

scp u1065430@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/variants/PSSP_BayGen_metadata /home/jacqueline/Documents/PSSPGenMatrix/

SLURM info: https://www.chpc.utah.edu/documentation/software/slurm.php#submit

Below are the scripts for the posterior probabilities for genotype estimates using Bayesian. They are found in this directory: /uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/scripts

This script returns a genotype matirx (locus by ind) with genotype means (point estimates) from a genotype likelihood file; a HW prior is used based on allele frequencies provided in a separate file

#!/usr/bin/perl

## USAGE: perl gl2genest.pl af_file.txt file.gl

use warnings;

$af = shift (@ARGV);

$in = shift (@ARGV);

## read in and store maf's

open (IN, $af) or die "read failed: $af\n";

while (<IN>){

chomp;

push (@af,$_);

}

close (IN);

## read through gl file and estimate genotypes

open (IN, $in) or die "read failed: $in\n";

$out = $in;

$out =~ s/gl$/txt/;

open (OUT, "> pntest_$out") or die;

while (<IN>){

This scripts conversts a vcf file to a simpler format for downstream analysis. I am calling this format genetoype likelihood (gl). The first line lists: the number of individuals and loci. The next line has individual ids. This if followed by on line per SNP that gives the SNP id (scaffold, position) and the phred-scaled genotype likelihoods, three per individual. The script prints the non-reference allele frequency to a file so that they can be used to estimate genotypes

#!/usr/bin/perl

# USAGE: perl vcf2gl.pl myfile.vcf

my $in = shift(@ARGV);

my @line = ();

my $word;

my $nind = 603;

my $nloc = 52039;

my $out = $in;

$out =~ s/vcf/gl/ or die "failed file name sub\n";

open (OUT, "> $out") or die "Could not write the outfile\n";

if ($out =~ s/gl/txt/){

open (OUT2, "> af_$out") or die "Count not write the alt. af file\n";

}

open (IN, $in) or die "Could not read the vcf file\n";

while (<IN>){

chomp;

To the vcf: /uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/variants

filtered2x70_PSSP_variants.vcf

command:

module load perl

perl $filepath/JJP_vcf2gl.pl filtered2x70_PSSP_variants.vcf

/uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/variants/PSSP_BayGen_metadata

PSSP sample ids reformatted to split quad names and treatments:

setwd("/uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/variants/")

genotypeest <- read.table("pntest_filtered2x70_PSSP_variants.txt", header=FALSE)

ids <- read.table("ids.txt", header=FALSE)

ids2 <- ids$V1 #assign as vector

ids2

ids_char <- as.character(ids2)

ids_char

ids_split <- strsplit(ids_char, "_")

tx <- character()

for(i in 1:length(genotypeest[1,])){

tx[i] <- as.numeric(ids_split[[i]][3])

}

quad <- character()

for(i in 1:length(genotypeest[1,])){

tx[i] <- ids_split[[i]][5]

}

factors <- rbind(tx, quad)

head(factors)

genotypeest_meta <- rbind(genotypeest, factors)

tail(genotypeest_meta)

write.table(genotypeest_meta, "PSSP_BayGen_metadata")

###PCA for PSSP###

###12/12/17###

###Using the princomp command which is a Spectral decomposition

###which examines the covariances / correlations between variables

#I wrote a csv file to get rid of quotes

write.csv(GG, file = "PSSP_BayGen.csv")

Gen<-as.matrix(read.csv("PSSP_BayGen.csv",header=TRUE))

ids<-read.csv("PSSP_ids_2017.csv",header=TRUE)

#transpose the data to get SNP's as rows and plant individuals as columns

Gen.trans<-t(Gen)

#do PCA

myPCA = princomp(Gen.trans[,2:NCOL(Gen.trans)])

plot(myPCA)

summary(myPCA)

loadings(myPCA)

names<-ids$tx

plot(loadings(myPCA),col=factor(names),pch=16)

col=as.numeric(unique(factor(names)))

legend("topleft", legend=levels(factor(names)),

col=as.numeric(unique(factor(names))),

pch=19)

#Getting rid of grass and removal treatments

Gen2<-as.matrix(read.csv("Gen_WithoutRemovals.csv",header=TRUE))

ids2<-read.csv("ids_WithoutRemovals.csv",header=TRUE)

#transpose the data to get SNP's as rows and plant individuals as columns

Gen.trans2<-t(Gen2)

#do PCA without grass and shrub removal

myPCA = princomp(Gen2[,2:NCOL(Gen2)])

plot(myPCA)

summary(myPCA)

loadings(myPCA)

PCA=prcomp(Gen2, scale = FALSE)$x

plot(PCA)

loadings(PCA)

names<-ids2$tx

lanes<-ids2$lane

plot(loadings(myPCA),col=factor(lanes),pch=16)

col=as.numeric(unique(factor(lanes)))

legend("topleft", legend=levels(factor(lanes)),

col=as.numeric(unique(factor(lanes))),

pch=19,inset=c(0.0,0.0),ncol=2)

myPCA22=prcomp(Gen2[,2:NCOL(Gen2)])

plot(myPCA22$x[,1],myPCA22$x[,2],type='n',xlab="pc1",ylab="pc2",col=factor(names),pch=20)

text(myPCA22$x[,1],myPCA22$x[,2],names)

#plot of treatments

col=as.numeric(unique(factor(names)))

plot(loadings(myPCA),col=factor(lanes),pch=20)

legend("topleft", legend=levels(factor(names)),

col=as.numeric(unique(factor(lanes))),

pch=20)

#plot of lanes

col=as.numeric(unique(factor(lanes)))

plot(loadings(myPCA),col=factor(lanes),pch=20)

legend("topleft", legend=levels(factor(lanes)),

col=as.numeric(unique(factor(lanes))),

pch=20,inset=c(0.0,0.0),ncol=2)

##PCA of just control, drought, and irrigation##

Gen3<-as.matrix(read.csv("Gen_PrecipTrt.csv",header=TRUE))

ids3<-read.csv("ids_PrecipTrt.csv",header=TRUE)

#transpose the data to get SNP's as rows and plant individuals as columns

Gen.trans3<-t(Gen3)

myPCA3 = princomp(Gen3[,2:NCOL(Gen3)])

plot(myPCA3)

summary(myPCA3)

loadings(myPCA3)

names3<-ids3$quad

col=as.numeric(unique(factor(names3)))

plot(loadings(myPCA3),col=factor(names3),pch=16)

legend("topleft", legend=levels(factor(names3)),

col=as.numeric(unique(factor(names3))),

pch=19,inset=c(0.0,0.0),ncol=2)

##PCA of just low and high elvations##

Gen4<-as.matrix(read.csv("Gen_Elevation.csv",header=TRUE))

ids4<-read.csv("ids_Elevation.csv",header=TRUE)

#transpose the data to get SNP's as rows and plant individuals as columns

Gen.trans4<-t(Gen4)

myPCA4 = princomp(Gen4[,2:NCOL(Gen4)])

plot(myPCA4)

summary(myPCA4)

loadings(myPCA4)

names4<-ids4$tx

col=as.numeric(unique(factor(names4)))

plot(loadings(myPCA3),col=factor(names4),pch=16)

legend("topleft", legend=levels(factor(names4)),

col=as.numeric(unique(factor(names4))),

pch=19)

##Code from meeting with Zach######

o<-prcomp(t(as.matrix(Gen[,-1])),center=T,scale=F)

plot(o$x[,1],o$x[,2])

plot(o$x[,1],o$x[,2],col=c("blue","green","orange","red","brown","purple","black")[as.numeric(lanes)])

names<-ids2$tx

lanes<-ids2$lane

quad<-ids2$quad

plot(o$x[,1],o$x[,2],col=c(as.numeric(unique(factor(quad))))[as.numeric(quad)])

plot(o$x[,1],o$x[,2],col=col.rainbow,pch=19)

legend("topleft",legend=levels(factor(quad)),fill=col.rainbow,inset=c(0.0,0.0),ncol=4)

par(mar=c(5, 5, 5, 5) + 0.1)

qcolor <- as.factor(ids2$quad)

col.rainbow <- rainbow(30)

par(mar=c(5, 4, 4, 2) + 0.1)

##Reading in csv that only has Control, Drought, and Irrigation

Gen6<-as.matrix(read.csv("Gen_CDI.csv",header=TRUE))

ids6<-read.csv("ids_CDI.csv",header=TRUE)

k<-prcomp(t(as.matrix(Gen6[,-1])),center=T,scale=F)

plot(k$x[,1],k$x[,2])

names6<-ids6$tx

lanes6<-ids6$lane

quad6<-ids6$quad

plot(k$x[,1],k$x[,2],xlab="PCA 1",ylab="PCA 2",col=as.numeric(unique(factor(names6),pch=16)))

legend("topleft", legend=levels(factor(names6)),

col=as.numeric(unique(factor(names6))),

pch=19)

legend("topleft",legend=levels(factor(quad)),fill=col.rainbow,inset=c(0.0,0.0),ncol=4)

plot(k$x[,1],k$x[,2],xlab="PCA 1",ylab="PCA 2",col=as.numeric(unique(factor(lanes6),pch=19)))

legend("topleft", legend=levels(factor(lanes6)),

col=as.numeric(unique(factor(lanes6))),

pch=19)

plot(k$x[,1],k$x[,2],xlab="PCA 1",ylab="PCA 2",col=col.rainbow,pch=19)

legend("topleft", legend=levels(factor(quad6)),

fill=col.rainbow,

pch=19,ncol=3)