Made copy of PSSP_BayGen_metadata to hard drive
scp u1065430@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/variants/PSSP_BayGen_metadata /home/jacqueline/Documents/PSSPGenMatrix/
SLURM info: https://www.chpc.utah.edu/documentation/software/slurm.php#submit
Below are the scripts for the posterior probabilities for genotype estimates using Bayesian. They are found in this directory: /uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/scripts
This script returns a genotype matirx (locus by ind) with genotype means (point estimates) from a genotype likelihood file; a HW prior is used based on allele frequencies provided in a separate file
#!/usr/bin/perl
## USAGE: perl gl2genest.pl af_file.txt file.gl
use warnings;
$af = shift (@ARGV);
$in = shift (@ARGV);
## read in and store maf's
open (IN, $af) or die "read failed: $af\n";
while (<IN>){
chomp;
push (@af,$_);
}
close (IN);
## read through gl file and estimate genotypes
open (IN, $in) or die "read failed: $in\n";
$out = $in;
$out =~ s/gl$/txt/;
open (OUT, "> pntest_$out") or die;
while (<IN>){
This scripts conversts a vcf file to a simpler format for downstream analysis. I am calling this format genetoype likelihood (gl). The first line lists: the number of individuals and loci. The next line has individual ids. This if followed by on line per SNP that gives the SNP id (scaffold, position) and the phred-scaled genotype likelihoods, three per individual. The script prints the non-reference allele frequency to a file so that they can be used to estimate genotypes
#!/usr/bin/perl
#
#
# USAGE: perl vcf2gl.pl myfile.vcf
#
my $in = shift(@ARGV);
my @line = ();
my $word;
my $nind = 603;
my $nloc = 52039;
my $out = $in;
$out =~ s/vcf/gl/ or die "failed file name sub\n";
open (OUT, "> $out") or die "Could not write the outfile\n";
if ($out =~ s/gl/txt/){
open (OUT2, "> af_$out") or die "Count not write the alt. af file\n";
}
open (IN, $in) or die "Could not read the vcf file\n";
while (<IN>){
chomp;
To the vcf: /uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/variants
filtered2x70_PSSP_variants.vcf
command:
module load perl
perl $filepath/JJP_vcf2gl.pl filtered2x70_PSSP_variants.vcf
/uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/variants/PSSP_BayGen_metadata
PSSP sample ids reformatted to split quad names and treatments:
setwd("/uufs/chpc.utah.edu/common/home/u6010445/data/grasses_JJP/variants/")
genotypeest <- read.table("pntest_filtered2x70_PSSP_variants.txt", header=FALSE)
ids <- read.table("ids.txt", header=FALSE)
ids2 <- ids$V1 #assign as vector
ids2
ids_char <- as.character(ids2)
ids_char
ids_split <- strsplit(ids_char, "_")
tx <- character()
for(i in 1:length(genotypeest[1,])){
tx[i] <- as.numeric(ids_split[[i]][3])
}
quad <- character()
for(i in 1:length(genotypeest[1,])){
tx[i] <- ids_split[[i]][5]
}
factors <- rbind(tx, quad)
head(factors)
genotypeest_meta <- rbind(genotypeest, factors)
tail(genotypeest_meta)
write.table(genotypeest_meta, "PSSP_BayGen_metadata")
###PCA for PSSP###
###12/12/17###
###Using the princomp command which is a Spectral decomposition
###which examines the covariances / correlations between variables
#I wrote a csv file to get rid of quotes
write.csv(GG, file = "PSSP_BayGen.csv")
Gen<-as.matrix(read.csv("PSSP_BayGen.csv",header=TRUE))
ids<-read.csv("PSSP_ids_2017.csv",header=TRUE)
#transpose the data to get SNP's as rows and plant individuals as columns
Gen.trans<-t(Gen)
#do PCA
myPCA = princomp(Gen.trans[,2:NCOL(Gen.trans)])
plot(myPCA)
summary(myPCA)
loadings(myPCA)
names<-ids$tx
plot(loadings(myPCA),col=factor(names),pch=16)
col=as.numeric(unique(factor(names)))
legend("topleft", legend=levels(factor(names)),
col=as.numeric(unique(factor(names))),
pch=19)
#Getting rid of grass and removal treatments
Gen2<-as.matrix(read.csv("Gen_WithoutRemovals.csv",header=TRUE))
ids2<-read.csv("ids_WithoutRemovals.csv",header=TRUE)
#transpose the data to get SNP's as rows and plant individuals as columns
Gen.trans2<-t(Gen2)
#do PCA without grass and shrub removal
myPCA = princomp(Gen2[,2:NCOL(Gen2)])
plot(myPCA)
summary(myPCA)
loadings(myPCA)
PCA=prcomp(Gen2, scale = FALSE)$x
plot(PCA)
loadings(PCA)
names<-ids2$tx
lanes<-ids2$lane
plot(loadings(myPCA),col=factor(lanes),pch=16)
col=as.numeric(unique(factor(lanes)))
legend("topleft", legend=levels(factor(lanes)),
col=as.numeric(unique(factor(lanes))),
pch=19,inset=c(0.0,0.0),ncol=2)
myPCA22=prcomp(Gen2[,2:NCOL(Gen2)])
plot(myPCA22$x[,1],myPCA22$x[,2],type='n',xlab="pc1",ylab="pc2",col=factor(names),pch=20)
text(myPCA22$x[,1],myPCA22$x[,2],names)
#plot of treatments
col=as.numeric(unique(factor(names)))
plot(loadings(myPCA),col=factor(lanes),pch=20)
legend("topleft", legend=levels(factor(names)),
col=as.numeric(unique(factor(lanes))),
pch=20)
#plot of lanes
col=as.numeric(unique(factor(lanes)))
plot(loadings(myPCA),col=factor(lanes),pch=20)
legend("topleft", legend=levels(factor(lanes)),
col=as.numeric(unique(factor(lanes))),
pch=20,inset=c(0.0,0.0),ncol=2)
##PCA of just control, drought, and irrigation##
Gen3<-as.matrix(read.csv("Gen_PrecipTrt.csv",header=TRUE))
ids3<-read.csv("ids_PrecipTrt.csv",header=TRUE)
#transpose the data to get SNP's as rows and plant individuals as columns
Gen.trans3<-t(Gen3)
myPCA3 = princomp(Gen3[,2:NCOL(Gen3)])
plot(myPCA3)
summary(myPCA3)
loadings(myPCA3)
names3<-ids3$quad
col=as.numeric(unique(factor(names3)))
plot(loadings(myPCA3),col=factor(names3),pch=16)
legend("topleft", legend=levels(factor(names3)),
col=as.numeric(unique(factor(names3))),
pch=19,inset=c(0.0,0.0),ncol=2)
##PCA of just low and high elvations##
Gen4<-as.matrix(read.csv("Gen_Elevation.csv",header=TRUE))
ids4<-read.csv("ids_Elevation.csv",header=TRUE)
#transpose the data to get SNP's as rows and plant individuals as columns
Gen.trans4<-t(Gen4)
myPCA4 = princomp(Gen4[,2:NCOL(Gen4)])
plot(myPCA4)
summary(myPCA4)
loadings(myPCA4)
names4<-ids4$tx
col=as.numeric(unique(factor(names4)))
plot(loadings(myPCA3),col=factor(names4),pch=16)
legend("topleft", legend=levels(factor(names4)),
col=as.numeric(unique(factor(names4))),
pch=19)
##Code from meeting with Zach######
o<-prcomp(t(as.matrix(Gen[,-1])),center=T,scale=F)
plot(o$x[,1],o$x[,2])
plot(o$x[,1],o$x[,2],col=c("blue","green","orange","red","brown","purple","black")[as.numeric(lanes)])
names<-ids2$tx
lanes<-ids2$lane
quad<-ids2$quad
plot(o$x[,1],o$x[,2],col=c(as.numeric(unique(factor(quad))))[as.numeric(quad)])
plot(o$x[,1],o$x[,2],col=col.rainbow,pch=19)
legend("topleft",legend=levels(factor(quad)),fill=col.rainbow,inset=c(0.0,0.0),ncol=4)
par(mar=c(5, 5, 5, 5) + 0.1)
qcolor <- as.factor(ids2$quad)
col.rainbow <- rainbow(30)
par(mar=c(5, 4, 4, 2) + 0.1)
##Reading in csv that only has Control, Drought, and Irrigation
Gen6<-as.matrix(read.csv("Gen_CDI.csv",header=TRUE))
ids6<-read.csv("ids_CDI.csv",header=TRUE)
k<-prcomp(t(as.matrix(Gen6[,-1])),center=T,scale=F)
plot(k$x[,1],k$x[,2])
names6<-ids6$tx
lanes6<-ids6$lane
quad6<-ids6$quad
plot(k$x[,1],k$x[,2],xlab="PCA 1",ylab="PCA 2",col=as.numeric(unique(factor(names6),pch=16)))
legend("topleft", legend=levels(factor(names6)),
col=as.numeric(unique(factor(names6))),
pch=19)
legend("topleft",legend=levels(factor(quad)),fill=col.rainbow,inset=c(0.0,0.0),ncol=4)
plot(k$x[,1],k$x[,2],xlab="PCA 1",ylab="PCA 2",col=as.numeric(unique(factor(lanes6),pch=19)))
legend("topleft", legend=levels(factor(lanes6)),
col=as.numeric(unique(factor(lanes6))),
pch=19)
plot(k$x[,1],k$x[,2],xlab="PCA 1",ylab="PCA 2",col=col.rainbow,pch=19)
legend("topleft", legend=levels(factor(quad6)),
fill=col.rainbow,
pch=19,ncol=3)