Post date: May 09, 2014 10:46:42 PM
#So, I need to do a nana-only assembly, since I called nana variants with all TX hill country Eurycea in the file last year.
#I copied the parsed fastq file from greenhouse to USU using:
rsync -avz llucas@147.26.169.195:/Volumes/data/eurycea/parsedreads/mod_parsed_s_1_UW5_1_Replacement.fastq ./
#Then I split up this fastq file by individual, using splitFastq.pl (I had to change the regular expression slightly). I also made the nanaids.txt file by deleting all columns but one in the original barcode file (using OpenOffice).
perl splitFastq.pl nanaids.txt mod_parsed_s_1_UW5_1_Replacement.fastq
#Then I'll concatenate all nana fastq files together on the USU cluster and copy the combined file over to sunflower (/data/local/august13_ut/) for a de novo assembly.
cat E-SM-* > combinednana.fastq
rsync -avz ./combinednana.fastq lauren@sunflower.uwyo.edu:/data/local/august13_ut/nana/
#On node4, de novo assembly:
head -n 160000000 combinednana.fastq > Enana40mil.fastq &
cp /data/local/august13_ut/Esosorum_40mil_denovo.smng.txt Enana_40mil_denovo.smng.txt
#only changed directory names:
emacs Enana_40mil_denovo.smng.txt
smng Enana_40mil_denovo.smng.txt &
#Results: 20.5% assembled.
Assembly Totals
Contigs: 198536
Contigs > 2K: 0
Assembled Sequences: 4184725
Unassembled Sequences: 16227787
Sequences not assembled due to complete trimming: 587145
All Sequences: 20412512
Contig N50: 86 bases
#Made consensus sequence (I chose contigs that were 80 to 96 bases in length because that's what I did with all three other taxa):
cp ../pruneContigs.pl ./
perl pruneContigs.pl Enana_40mil_denovo_mmp92.ace 80 96
grep Contig pruned_Enana_40mil_denovo_mmp92.ace | wc
Contigs: 198536
mv pruned_Enana_40mil_denovo_mmp92.ace pruned_Enana_40mil_denovo_mmp92.fasta
#Then I tried to assemble these sequences against themselves to identify similar, potentially repetitive contigs.
cp Enana_40mil_denovo.smng.txt Enana_40mil_qc.smng.txt
emacs Enana_40mil_qc.smng.txt
#I changed:
loadSeq file:
"/data/local/august13_ut/pruned_Enana_40mil_denovo_mmp92.fasta"
setParam minMatchPercent:84
RealignContigs
saveProject file: "/data/local/august13_ut/nana/Enana_40mil_qc_mmp84.fasta"
format:Phrap
saveReport file: "/data/local/august13_ut/nana/Enana_40mil_qc_mmp84.report.txt"
writeUnassembledSeqs file: "/data/local/august13_ut/nana/Enana_40mil_qc_mmp84.fasta"
closeProject
smng Enana_40mil_qc.smng.txt
Unassembled Sequences: 195573
#Then I copied Enana_40mil_qc_mmp84.fasta to a new directory at USU: /labs/evolution/projects/nana/
mv Enana_40mil_qc_mmp84.fasta Enana_40mil_qc_mmp84.fastq
# First, I needed to convert my reference file (from the de novo assembly) into true fasta format, because its contents were actually in fastq format. See script fastq2fasta.pl in /labs/evolution/projects/sosorum at USU.
perl fastq2fasta.pl Enana_40mil_qc_mmp84.fastq
# Then I indexed my reference sequence on the cluster here. See jobsubIndex.sh. To run it:
qsub jobsubIndex.sh
#254024.torque
#To run BWA for the alignment, I modified some things in wrap_qsub_rc_bwa.pl and then ran it with:
perl wrap_qsub_rc_bwa.pl E-SM-*fastq
cd /labs/evolution/projects/nana/
bwa aln -n 4 -l 20 -k 2 -t 8 -q 10 -f alnE-SM-W-HO-258.sai Enana_40mil_qc_mmp84.fasta E-SM-W-HO-258.fastq
bwa samse -n 1 -r \'@RG\\tID:E-SM-W-HO-258\' -f alnE-SM-W-HO-258.sam Enana_40mil_qc_mmp84.fasta alnE-SM-W-HO-258.sai E-SM-W-HO-258.fastq
#195 Job #s: 254047 - 254241.torque