1. Kojima,K.K. and Kanehisa,M. (2008) Systematic Survey for Novel Types of Prokaryotic Retroelements Based on Gene Neighborhood and Protein Architecture. Molecular Biology and Evolution, 25, 1395–1404.
2. Simon,D.M. and Zimmerly,S. (2008) A diversity of uncharacterized reverse transcriptases in bacteria. Nucleic Acids Res., 36, 7219–7229.
3. Zimmerly,S. and Wu,L. (2015) An Unexplored Diversity of Reverse Transcriptases in Bacteria. Microbiology Spectrum, 3, MDNA3–0058–2014.
4. Kent,T., Mateos-Gomez,P.A., Sfeir,A., Elife,R.P.2016 Polymerase θ is a robust terminal transferase that oscillates between three different mechanisms during end-joining. cdn.elifesciences.org.
5. Stamos,J.L., Lentzsch,A.M. and Lambowitz,A.M. (2017) Structure of a Thermostable Group II Intron Reverse Transcriptase with Template-Primer and Its Functional and Evolutionary Implications. Mol. Cell, 68, 926–939.e4.
6. Zheng,G., Qin,Y., Clark,W.C., Dai,Q., Yi,C., He,C., Lambowitz,A.M. and Pan,T. (2015) Efficient and quantitative high-throughput tRNA sequencing. Nat. Methods, 12, 835–837.
7. Cock,P., Antao,T., Chang,J.T., Chapman,B.A.2009 Biopython: freely available Python tools for computational molecular biology and bioinformatics. academic.oup.com.
8. Larkin,M.A., Blackshields,G., Brown,N.P., Chenna,R.2007 Clustal W and Clustal X version 2.0. academic.oup.com.
Reverse-complement DNA or RNA sequences
from Bio import SeqIO
records = (rec.reverse_complement(id="rc_"+rec.id, description= "reverse complement")
for rec in SeqIO.parse("YIDDMgRNA_S18_L001_R2_001.fastq","fastq"))
SeqIO.write(records,"YIDD_Mg_RNA_R2_rc.fastq","fastq")
2. Input sequence trimming
from Bio import SeqIO
def trim_adaptors(records, adaptor):
len_adaptor = len(adaptor)
for record in records:
index = record.seq.find(adaptor)
if index == -1:
yield record
else:
yield record[index + len_adaptor:]
original_read = SeqIO.parse("YIDD_Mg_RNA_R2_rc.fastq","fastq")
trimmed_reads = trim_adaptors(original_read, "CCCAA")
count = SeqIO.write(trimmed_reads, "YIDD_Mg_RNA_R2_rc_trimmed.fastq","fastq")
print("Saved %i reads"% count)
3. Sorting FASTQ files by sequence length
from Bio import SeqIO
records = list(SeqIO.parse("YIDD_Mg_RNA_R2_rc_trimmed.fastq", "fastq"))
records.sort(key=lambda r: len(r))
SeqIO.write(records, "YIDD_Mg_RNA_R2_rc_trimmed_sort.fastq", "fastq")
4. Sequences length cutoff (save 1-70nt)
from Bio import SeqIO
short_read = (rec for rec in SeqIO.parse("YIDD_Mg_RNA_R2_rc_trimmed_sort.fastq","fastq")
if 0< len(rec) <=70)
count= SeqIO.write(short_read,"YIDD_Mg_RNA_R2_rc_trimmed_sort_cutoff.fastq","fastq")
5. Quality filtering with a minimum PHRED quality of 20
from Bio import SeqIO
good_reads=(
rec
for rec in SeqIO.parse("YIDD_Mg_RNA_R2_rc_trimmed_sort_cutoff.fastq", "fastq")
if min(rec.letter_annotations["phred_quality"]) >=20
)
count = SeqIO.write(good_reads, "YIDD_Mg_RNA_R2_rc_trimmed_sort_cutoff_qulity.fastq","fastq")
print("Saved %i reads"% count)
6. Convert FASTQ file to FASTA file
from Bio import SeqIO
SeqIO.convert("YIDD_Mg_RNA_R2_rc_trimmed_sort_cutoff_qulity.fastq","fastq", "YIDD_Mg_RNA_R2_rc_trimmed_sort_cutoff_qulity.fasta","fasta")
7. Running statistics and plotting length distribution of extension
from Bio import SeqIO
sizes = [len(rec) for rec in SeqIO.parse("YIDD_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity.fasta","fasta")]
print(sizes)
import pylab
pylab.hist(sizes, bins=50)
pylab.title(
"%i YIDD_Mn_RNA sequence\nLengths %int to %int"% (len(sizes), min(sizes), max(sizes))
)
pylab.xlabel("Sequence length (nt)")
pylab.ylabel("Count")
pylab.savefig("YIDD_Mn_RNA.pdf")
8. Analyzing the occurrence of each sequences (single nucleotides)
output_file = open("YIDD_Mg_RNA_R2_rc_trimmed_sort_cutoff_qulity_nucleotide_con.tsv","w")
output_file.write('Gene\tseq\tA\tA%\tC\tC%\tG\tG%\tT\tT%\tLength\tAT%\tCG%\n')
from Bio import SeqIO
for cur_record in SeqIO.parse("YIDD_Mg_RNA_R2_rc_trimmed_sort_cutoff_qulity.fasta","fasta"):
gene_name=cur_record.name
sequence = cur_record.seq
A_count = cur_record.seq.count("A")
C_count = cur_record.seq.count('C')
G_count = cur_record.seq.count('G')
T_count = cur_record.seq.count('T')
length = len(cur_record.seq)
A_percentage = A_count / length *100
C_percentage = C_count / length *100
G_percentage = G_count / length *100
T_percentage = T_count / length *100
at_percentage = float(A_count + T_count) / length *100
cg_percentage = float(C_count + G_count) / length *100
output_line = '%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
(gene_name, sequence, A_count, A_percentage, C_count, C_percentage, G_count,G_percentage, T_count,T_percentage, length, at_percentage, cg_percentage)
output_file.write(output_line)
9. Analyzing the occurrence of each sequences (oligonucleotides)
from Bio import SeqIO
trimmed_primer_reads1 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("CCTTA")
)
trimmed_primer_reads2 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("AGAAT")
)
trimmed_primer_reads3 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("GAATT")
)
trimmed_primer_reads4 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("ATTTG")
)
trimmed_primer_reads5 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("TTTGT")
)
trimmed_primer_reads6 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("TGTGT")
)
trimmed_primer_reads7 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("TTGTA")
)
trimmed_primer_reads8 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("TAGAT")
)
trimmed_primer_reads9 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("GATTA")
)
trimmed_primer_reads10 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("TTATT")
)
trimmed_primer_reads11 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.startswith("TTGC")
)
total_reads = list(SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta","fasta"))
print("Total reads is %i reads" % len(total_reads))
count = SeqIO.write(trimmed_primer_reads1, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_CCTTA.fasta", "fasta")
print("Starting CCTTA from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads2, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_AGAAT.fasta", "fasta")
print("Starting AGAAT from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads3, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_GAATT.fasta", "fasta")
print("Starting GAATT from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads4, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_ATTTG.fasta", "fasta")
print("Starting ATTTG from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads5, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_TTTGT.fasta", "fasta")
print("Starting TTTGT from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads6, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_TGTGT.fasta", "fasta")
print("Starting TGTGT from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads7, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_TTGTA.fasta", "fasta")
print("Starting TTGTA from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads8, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_TAGAT.fasta", "fasta")
print("Starting TAGAT from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads9, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_GATTA.fasta", "fasta")
print("Starting GATTA from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads10, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_TTATT.fasta", "fasta")
print("Starting TTATT from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads11, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_TTGC.fasta", "fasta")
print("Starting TTGC from saved %i reads" % count,count/len(total_reads)*100,"%")
10. Analyzing the occurrence of each sequences (ends with specific single nucleotide)
from Bio import SeqIO
trimmed_primer_reads1 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("A")
)
trimmed_primer_reads2 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("AA")
)
trimmed_primer_reads3 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("AAA")
)
trimmed_primer_reads4 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("T")
)
trimmed_primer_reads5 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("TT")
)
trimmed_primer_reads6 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("TTT")
)
trimmed_primer_reads7 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("C")
)
trimmed_primer_reads8 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("CC")
)
trimmed_primer_reads9 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("CCC")
)
trimmed_primer_reads10 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("G")
)
trimmed_primer_reads11 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("GG")
)
trimmed_primer_reads12 = (
rec[:]
for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if rec.seq.endswith("GGG")
)
total_reads = list(SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta","fasta"))
print("Total reads is %i reads" % len(total_reads))
count = SeqIO.write(trimmed_primer_reads1, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eA.fasta", "fasta")
print("Ending A from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads2, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eAA.fasta", "fasta")
print("Ending AA from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads3, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eAAA.fasta", "fasta")
print("Ending AAA from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads4, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eT.fasta", "fasta")
print("Ending T from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads5, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eTT.fasta", "fasta")
print("Ending TT from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads6, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eTTT.fasta", "fasta")
print("Ending TTT from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads7, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eC.fasta", "fasta")
print("Ending C from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads8, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eCC.fasta", "fasta")
print("Ending CC from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads9, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eCCC.fasta", "fasta")
print("Ending CCC from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads10, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eG.fasta", "fasta")
print("Ending G from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads11, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eGG.fasta", "fasta")
print("Ending GG from saved %i reads" % count,count/len(total_reads)*100,"%")
count = SeqIO.write(trimmed_primer_reads11, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity_eGGG.fasta", "fasta")
print("Ending GGG from saved %i reads" % count,count/len(total_reads)*100,"%")
11. Analyzing specific length range of FASTA file
from Bio import SeqIO
short_read = (rec for rec in SeqIO.parse("YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity.fasta", "fasta")
if 30 <= len(rec) <= 50)
count = SeqIO.write(short_read, "YIDD_Mg_DNA_R2_rc_trimmed_sort_cutoff_qulity(30-50).fasta", "fasta")
print("Saved %i reads" % count)
12. Individual length distribution
from Bio import SeqIO
sizes1 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==1]
total_count1=list(sizes1)
print("%i"%len(total_count1))
sizes2 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==2]
total_count2=list(sizes2)
print("%i"%len(total_count2))
sizes3 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==3]
total_count3=list(sizes3)
print("%i"%len(total_count3))
sizes4 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==4]
total_count4=list(sizes4)
print("%i"%len(total_count4))
sizes5 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==5]
total_count5=list(sizes5)
print("%i"%len(total_count5))
sizes6 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==6]
total_count6=list(sizes6)
print("%i"%len(total_count6))
sizes7 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==7]
total_count7=list(sizes7)
print("%i"%len(total_count7))
sizes8 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==8]
total_count8=list(sizes8)
print("%i"%len(total_count8))
sizes9 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==9]
total_count9=list(sizes9)
print("%i"%len(total_count9))
sizes10 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==10]
total_count10=list(sizes10)
print("%i"%len(total_count10))
sizes11 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==11]
total_count11=list(sizes11)
print("%i"%len(total_count11))
sizes12 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==12]
total_count12=list(sizes12)
print("%i"%len(total_count12))
sizes13 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==13]
total_count13=list(sizes13)
print("%i"%len(total_count13))
sizes14 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==14]
total_count14=list(sizes14)
print("%i"%len(total_count14))
sizes15 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==15]
total_count15=list(sizes15)
print("%i"%len(total_count15))
sizes16 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==16]
total_count16=list(sizes16)
print("%i"%len(total_count16))
sizes17 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==17]
total_count17=list(sizes17)
print("%i"%len(total_count17))
sizes18 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==18]
total_count18=list(sizes18)
print("%i"%len(total_count18))
sizes19 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==19]
total_count19=list(sizes19)
print("%i"%len(total_count19))
sizes20 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==20]
total_count20=list(sizes20)
print("%i"%len(total_count20))
sizes21 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==21]
total_count21=list(sizes21)
print("%i"%len(total_count21))
sizes22 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==22]
total_count22=list(sizes22)
print("%i"%len(total_count22))
sizes23 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==23]
total_count23=list(sizes23)
print("%i"%len(total_count23))
sizes24 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==24]
total_count24=list(sizes24)
print("%i"%len(total_count24))
sizes25 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==25]
total_count25=list(sizes25)
print("%i"%len(total_count25))
sizes26 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==26]
total_count26=list(sizes26)
print("%i"%len(total_count26))
sizes27 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==27]
total_count27=list(sizes27)
print("%i"%len(total_count27))
sizes28 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==28]
total_count28=list(sizes28)
print("%i"%len(total_count28))
sizes29 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==29]
total_count29=list(sizes29)
print("%i"%len(total_count29))
sizes30 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==30]
total_count30=list(sizes30)
print("%i"%len(total_count30))
sizes31 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==31 ]
total_count31 =list(sizes31 )
print("%i"%len(total_count31))
sizes32 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==32]
total_count32=list(sizes32)
print("%i"%len(total_count32))
sizes33 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==33]
total_count33=list(sizes33)
print("%i"%len(total_count33))
sizes34 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==34]
total_count34=list(sizes34)
print("%i"%len(total_count34))
sizes35 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==35]
total_count35=list(sizes35)
print("%i"%len(total_count35))
sizes36 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==36]
total_count36=list(sizes36)
print("%i"%len(total_count36))
sizes37 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==37]
total_count37=list(sizes37)
print("%i"%len(total_count37))
sizes38 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==38]
total_count38=list(sizes38)
print("%i"%len(total_count38))
sizes39 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==39]
total_count39=list(sizes39)
print("%i"%len(total_count39))
sizes40 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==40]
total_count40=list(sizes40)
print("%i"%len(total_count40))
sizes41 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==41]
total_count41=list(sizes41)
print("%i"%len(total_count41))
sizes42 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==42]
total_count42=list(sizes42)
print("%i"%len(total_count42))
sizes43 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==43]
total_count43=list(sizes43)
print("%i"%len(total_count43))
sizes44 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==44]
total_count44=list(sizes44)
print("%i"%len(total_count44))
sizes45 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==45]
total_count45=list(sizes45)
print("%i"%len(total_count45))
sizes46 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==46]
total_count46=list(sizes46)
print("%i"%len(total_count46))
sizes47 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==47]
total_count47=list(sizes47)
print("%i"%len(total_count47))
sizes48 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==48]
total_count48=list(sizes48)
print("%i"%len(total_count48))
sizes49 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==49]
total_count49=list(sizes49)
print("%i"%len(total_count49))
sizes50 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==50]
total_count50=list(sizes50)
print("%i"%len(total_count50))
sizes51 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==51]
total_count51=list(sizes51)
print("%i"%len(total_count51))
sizes52 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==52]
total_count52=list(sizes52)
print("%i"%len(total_count52))
sizes53 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==53]
total_count53=list(sizes53)
print("%i"%len(total_count53))
sizes54 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==54]
total_count54=list(sizes54)
print("%i"%len(total_count54))
sizes55 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==55]
total_count55=list(sizes55)
print("%i"%len(total_count55))
sizes56 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==56]
total_count56=list(sizes56)
print("%i"%len(total_count56))
sizes57 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==57]
total_count57=list(sizes57)
print("%i"%len(total_count57))
sizes58 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==58]
total_count58=list(sizes58)
print("%i"%len(total_count58))
sizes59 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==59]
total_count59=list(sizes59)
print("%i"%len(total_count59))
sizes60 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==60]
total_count60=list(sizes60)
print("%i"%len(total_count60))
sizes61 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==61]
total_count61=list(sizes61)
print("%i"%len(total_count61))
sizes62 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==62]
total_count62=list(sizes62)
print("%i"%len(total_count62))
sizes63 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==63]
total_count63=list(sizes63)
print("%i"%len(total_count63))
sizes64 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==64]
total_count64=list(sizes64)
print("%i"%len(total_count64))
sizes65 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==65]
total_count65=list(sizes65)
print("%i"%len(total_count65))
sizes66 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==66]
total_count66=list(sizes66)
print("%i"%len(total_count66))
sizes67 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==67]
total_count67=list(sizes67)
print("%i"%len(total_count67))
sizes68 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==68]
total_count68=list(sizes68)
print("%i"%len(total_count68))
sizes69 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==69]
total_count69=list(sizes69)
print("%i"%len(total_count69))
sizes70 = [rec for rec in SeqIO.parse("RTcas_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta","fasta") if len(rec)==70]
total_count70=list(sizes70)
print("%i"%len(total_count70))
13. Searching the specific sequence (GAATTTA)
def readFastq(filename):
sequences = []
with open(filename) as fh:
while True:
fh.readline()
seq = fh.readline().rstrip()
if len(seq)== 0:
break
sequences.append(seq)
return sequences
seq = readFastq('YIDD_Mn_RNA_R2_rc_trimmed_sort_cutoff_qulity_trimmed_sort_cutoff_GAATT.fasta')
n_X = float(seq.count("GAATTTA"))
print (" TGTTTG TATGTGTTGTATTGTATAGATTATTGC occurred", int(n_X), "times")
print(seq[:10])
#print(seq[:5]) can check the first five strings of sequencing results
#print(qual[:5]) can check the qualities for the first five reads