Post date: May 16, 2014 9:48:49 PM
#I checked the barcode file; it looks fine.
#So, I'll do a PCA with all Eurycea to see if all Eurycea come out as three groups.
#I copied euryceacontigs.fas (the unassembled reference contigs) and mod_parsed_s_1_UW5_1_Replacement.fastq into a new directory, eurycea. I split the fastq file by individual. I used the fas2fasta.pl script. I used the jobsubIndex.sh script. And I did the BWA alignment:
perl wrap_qsub_rc_bwa.pl E-*fastq
#257159-257542.torque
#May 19
perl wrap_qsub_rc_sam2bam.pl alnE-*sam
cd /labs/evolution/projects/eurycea/
samtools view -b -S -o alnE-SS-W-SS-354.bam alnE-SS-W-SS-354.sam
samtools sort alnE-SS-W-SS-354.bam alnE-SS-W-SS-354.sorted
samtools index alnE-SS-W-SS-354.sorted.bam
#257782-258165.torque
#May 20
samtools faidx *.fasta
cp ../sosorum/jobsubVariantCall.sh ./
vi jobsubVariantCall.sh
qsub jobsubVariantCall.sh
#258184.torque.rc.usu.edu
#May 21
#589 variable loci, average coverage is 8.12x
#after vcf2genotype.pl, 320 loci, 384 individuals
#See PCA: there is still the same weird spoke / 3 group pattern. Should I just drop this dataset and conclude it's just crap?
#May 23: So, do I get this weird pattern with the other lane that was prepped at the same time as the Eurycea lane? How about with the new Stygoparnus lane?
#new directories in /labs/evolution/projects: heterelmis/ stygobromus/ stygoparnus/
perl wrap_qsub_rc_bwa.pl H-*fastq
bwa aln -n 4 -l 20 -k 2 -t 8 -q 10 -f alnH-U-IS-IS-461.sai heterNoVcontigs.fasta H-U-IS-IS-461.fastq
bwa samse -n 1 -r \'@RG\\tID:H-U-IS-IS-461\' -f alnH-U-IS-IS-461.sam heterNoVcontigs.fasta alnH-U-IS-IS-461.sai H-U-IS-IS-461.fastq
#259796-259941.torque -- nevermind, these never ran! Re-did these May 28: 2349-2534
qsub jobsubVariantCall.sh #with -d 0.5
#2536
#interactive session: perl vcfFilterMinMax.pl *vcf #specifies min (2x) and max (6x) coverage
#Retained 2134 variable loci
#ave cov. = 3x
perl vcf2genotype.pl filtered_varHet05.vcf
#Number of loci: 1630; number of individuals 186
head -n 1 filtered_varHet05.genest | perl -p -i -e 's/ /\n/g' | perl -p -i -e 's/^(aln[A-Z]\-[A-Z]\-[A-Z]+)\-/\1,/' > indIDs.txt
#in R
popinfo<-read.csv("indIDs.txt",header=F)
gendata<-read.table("filtered_varHet05NoHdr.genest",header=F)
pcaout<-prcomp(t(as.matrix(gendata)),center=T,scale.=F)
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7
#Standard deviation 2.22369 2.10802 1.90781 1.73510 1.41230 1.33597 1.2709
#Proportion of Variance 0.03887 0.03493 0.02861 0.02366 0.01568 0.01403 0.0127
#Cumulative Proportion 0.03887 0.07380 0.10241 0.12607 0.14175 0.15578 0.1685
pdf("het05_PCA.pdf",width=6,height=6)
plot(pcaout$x[,1],pcaout$x[,2],type="n",ylab="PC2 (3.5%)",xlab="PC1 (3.9%)")
title(main="Heterelmis",adj=0)
text(pcaout$x[popinfo[,1] == 'alnH-C-CS',1],pcaout$x[popinfo[,1] == 'alnH-C-CS',2],col="purple",labels=("CS"))
text(pcaout$x[popinfo[,1] == 'alnH-C-SM',1],pcaout$x[popinfo[,1] == 'alnH-C-SM',2],col="blue",labels=("SM"))
text(pcaout$x[popinfo[,1] == 'alnH-G-CA',1],pcaout$x[popinfo[,1] == 'alnH-G-CA',2],col="gray",labels=("CA"))
text(pcaout$x[popinfo[,1] == 'alnH-G-DS',1],pcaout$x[popinfo[,1] == 'alnH-G-DS',2],col="black",labels=("DS"))
text(pcaout$x[popinfo[,1] == 'alnH-G-FB',1],pcaout$x[popinfo[,1] == 'alnH-G-FB',2],col="red",labels=("FB"))
text(pcaout$x[popinfo[,1] == 'alnH-G-SS',1],pcaout$x[popinfo[,1] == 'alnH-G-SS',2],col="orange",labels=("SS"))
text(pcaout$x[popinfo[,1] == 'alnH-U-IS',1],pcaout$x[popinfo[,1] == 'alnH-U-IS',2],col="green",labels=("IS"))
dev.off()
#Heterelmis looks fine!
#re-did nana variant calling with -d 0.3 instead of 0.8 or 0.5
259730.torque.rc.usu.edu -- nevermind, this never ran, I guess!
#May 27 redo: 1877
#will run vcfFilterMinMax.pl when finished because it specifies min (2x) and max (5x) coverage. Will need to start an interactive session to run this.
Finished filtering varEnana03.vcf
Retained 2077 variable loci
Ave. cov per locus per individual = 2.82
perl vcf2genotype.pl filtered_varEnana03.vcf
Number of loci: 1408; number of individuals 195
> nanapopinfo<-read.csv("indIds.csv",header=F)
> nanagendata<-read.table("filtered_varEnana03NoHdr.genest",header=F)
> nanapcaout<-prcomp(t(as.matrix(nanagendata)),center=T,scale.=F)
> summary(nanapcaout)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.76115 2.50566 1.78163 1.0767 1.04403 1.03565 1.0159
Proportion of Variance 0.07828 0.06446 0.03259 0.0119 0.01119 0.01101 0.0106
Cumulative Proportion 0.07828 0.14274 0.17533 0.1872 0.19843 0.20944 0.2200
> summary(pcgcov)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 0.07527 0.06179 0.02074 0.01145 0.01078 0.01062 0.01021
Proportion of Variance 0.38451 0.25917 0.02920 0.00890 0.00789 0.00765 0.00707
Cumulative Proportion 0.38451 0.64368 0.67288 0.68178 0.68968 0.69733 0.70440
#May 28. So, the Heterelmis pattern looks normal. I'll try one more thing to figure out what the he** is up with the Eurycea PCA pattern. I copied over var_hicov_fin_m2_mod_snpcnt.txt from greenhouse. This is the file I used to estimate allele frequencies for Eurycea while in TX. Zach worked his magic to get genotype probabilities with a global prior for allele freq (not one for each subpop). Let's see if the pattern is the same!
#in R
> popinfo<-read.csv("ids.txt",header=F)
> gendata<-read.table("oldEurycea.genest",header=F)
> pcaout<-prcomp(t(as.matrix(gendata)),center=T,scale.=F)
> summary(pcaout)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.25260 1.79611 1.65007 1.55185 1.52729 1.51877 1.48095
Proportion of Variance 0.01486 0.00944 0.00797 0.00705 0.00683 0.00675 0.00642
Cumulative Proportion 0.01486 0.02430 0.03227 0.03932 0.04615 0.05290 0.05932
pdf("oldEurycea_PCA.pdf",width=6,height=6)
plot(pcaout$x[,1],pcaout$x[,2],type="n",ylab="PC2 (0.9%)",xlab="PC1 (1.5%)")
title(main="old Eurycea",adj=0)
text(pcaout$x[popinfo[,1] == 'E-CS-W',1],pcaout$x[popinfo[,1] == E-CS-W',2],col="purple",labels=("CS"))
text(pcaout$x[popinfo[,1] == 'E-DB-W',1],pcaout$x[popinfo[,1] == 'E-DB-W',2],col="blue",labels=("DB"))
text(pcaout$x[popinfo[,1] == 'E-FB-W',1],pcaout$x[popinfo[,1] == 'E-FB-W',2],col="gray",labels=("FB"))
text(pcaout$x[popinfo[,1] == 'E-HS-W',1],pcaout$x[popinfo[,1] == 'E-HS-W',2],col="black",labels=("HS"))
text(pcaout$x[popinfo[,1] == 'E-JW-W',1],pcaout$x[popinfo[,1] == 'E-JW-W',2],col="red",labels=("JW"))
text(pcaout$x[popinfo[,1] == 'E-OS-W',1],pcaout$x[popinfo[,1] == 'E-OS-W',2],col="orange",labels=("OS"))
text(pcaout$x[popinfo[,1] == 'E-SM-W',1],pcaout$x[popinfo[,1] == 'E-SM-W',2],col="green",labels=("SM"))
text(pcaout$x[popinfo[,1] == 'E-SS-W',1],pcaout$x[popinfo[,1] == 'E-SS-W',2],col="darkblue",labels=("SS"))
dev.off()
#This pattern is better. My old assembly / data is not crap.
> hicov<-which(cov > 5)
> length(hicov)
[1] 68
> pcaoutH<-prcomp(t(as.matrix(gendata[hicov,])),center=T,scale.=F)
> pdf("oldEuryceaHicov_PCA.pdf",width=6,height=6)
> plot(pcaoutH$x[,1],pcaoutH$x[,2],type="n",ylab="PC2 (0.9%)",xlab="PC1 (1.5%)")
> title(main="old Eurycea",adj=0)
> text(pcaoutH$x[popinfo[,1] == 'E-CS-W',1],pcaoutH$x[popinfo[,1] == 'E-CS-W',2],col="purple",labels=("CS"))
> text(pcaoutH$x[popinfo[,1] == 'E-DB-W',1],pcaoutH$x[popinfo[,1] == 'E-DB-W',2],col="blue",labels=("DB"))
> text(pcaoutH$x[popinfo[,1] == 'E-FB-W',1],pcaoutH$x[popinfo[,1] == 'E-FB-W',2],col="gray",labels=("FB"))
> text(pcaoutH$x[popinfo[,1] == 'E-HS-W',1],pcaoutH$x[popinfo[,1] == 'E-HS-W',2],col="black",labels=("HS"))
> text(pcaoutH$x[popinfo[,1] == 'E-JW-W',1],pcaoutH$x[popinfo[,1] == 'E-JW-W',2],col="red",labels=("JW"))
> text(pcaoutH$x[popinfo[,1] == 'E-OS-W',1],pcaoutH$x[popinfo[,1] == 'E-OS-W',2],col="orange",labels=("OS"))
> text(pcaoutH$x[popinfo[,1] == 'E-SM-W',1],pcaoutH$x[popinfo[,1] == 'E-SM-W',2],col="green",labels=("SM"))
> text(pcaoutH$x[popinfo[,1] == 'E-SS-W',1],pcaoutH$x[popinfo[,1] == 'E-SS-W',2],col="darkblue",labels=("SS"))
> dev.off()
null device
1
> summary(pcaoutH)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 0.7303 0.49752 0.43599 0.41698 0.41182 0.39390 0.37218
Proportion of Variance 0.1596 0.07406 0.05687 0.05202 0.05074 0.04642 0.04144
Cumulative Proportion 0.1596 0.23363 0.29050 0.34252 0.39326 0.43968 0.48112
#This pattern with my old assembly / data is even better. But this PCA is only with 68 loci (cov > 5).