library(dada2);library(Biostrings); library(ShortRead); # From Bioconductor
library(ggplot2); library(reshape2); library(gridExtra) # installed from CRAN
ref. DADA2 & Bioconductor
Install by Binary from Bioconductor
@R$ if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")@R$ BiocManager::install("dada2", version = "3.21") # for R ver. 4.5#@R$ BiocManager::install("dada2", version = "3.19") # for R ver. 4.4@R$ BiocManager::install("Biostrings")@R$ BiocManager::install("ShortRead")Technology = PacBio SeqWell2
Amplicon = 16S rRNA gene long read with Primers(27F/1492R)
生物技研使用プライマー:
input file = [Unziped.fastq]
ref. benjjneb.github
いずれの有力なtoolもアダプター削除用なので縮重塩基には弱い。
Primer部分を認識しているようだが、プライマー部分をカットしない
残ったプライマー領域を対象に何通りも繰り返す必要がある。
▶︎ cutadapt ... 縮重塩基には非対応
NNN+Primerも改善せず。。。
PrimerのReverseComplementを考慮して与える必要あり
▶︎ seqkit amplicon .... 縮重塩基には非対応だが「ミスマッチ許容無し」設定でプライマー内ミスマッチを除外するためプライマーの長さが統一される。そのため、fastpにより塩基数を指定して前後の配列を除去することでプライマー領域を取り除くことができる。
--max-mismatchと縮重プライマーを同時に使用すると-mオプションは無視されるらしい (Ushio's)
←そうでもなさそう。-mの数字により結果が異なる
NNN+Primerも改善せず。。。
PrimerのReverseComplementを考慮する必要なし(F, Rともに5'→3')
-m 4で良さそう
プライマーミスマッチを除去 & 断片の方向を揃える
% seqkit amplicon
-F AGRGTTYGATYMTGGCTCAG
-R RGYTACCTTGTTACGACTT # -m 4 ⬅︎ max-mismatchオプション無し=ミスマッチ許容しない
mao7_trmd_max1600.fastq.gz
> mao7_trm_m16_rmp_m4v10.fastq
fastpによりプライマーの塩基数を両端から削除する
% fastp -i [FILE: mao7.fastq] -o [FILE: mao7_rmfr.fastq] -f 20 -t 19
▶︎ clsplitseq (in claident) ... 縮重塩基には非対応
$ clsplitseq --runname=MAO7 --indexname=M --primerfile=/home/tak-gk/local/share/primers/Fprimer_BACFULL.fas --tagname=M --reverseprimerfile=/home/tak-gk/local/share/primers/Rprimer_BACFULL2.fas --truncateN=ENABLE --append --numthreads=4 ~/wd/WholeGenome/MAO7/Pac16S/mao7_trmd_max1600.fastq.gz ./1_rmprimers_Rev2... 縮重塩基には非対応
NNN+Primerも改善せず。。。
1st
cutadapt
~/wd/WholeGenome/MAO7/Pac16S$
% cutadapt
-g ^AGRGTTYGATYMTGGCTCAG
-a AAGTCGTAACAAGGTARCY
-o mao7_trm_max16_cutp1.fastq.gz
mao7_trmd_max1600.fastq.gz
2nd_Reverse case
~/wd/WholeGenome/MAO7/Pac16S$
% cutadapt
-g ^RGYTACCTTGTTACGACTT
-a CTGAGCCAKRATCRAACYCT
-o mao7_trm_max16_cutp_2nd.fastq.gz
mao7_trm_max16_cutp.fastq.gz
... --max-mismatchと縮重プライマーを同時に使用すると-mオプションは無視されるらしい (Ushio's)
$ seqkit amplicon
-F AGRGTTYGATYMTGGCTCAG
-R RGYTACCTTGTTACGACTT
-m 4
mao7_trmd_max1600.fastq.gz
> mao7_trm_m16_rmp_m4v10.fastq
$ clsplitseq --runname=MAO7 --indexname=M --primerfile=/home/tak-gk/local/share/primers/Fprimer_BACFULL.fas --tagname=M --reverseprimerfile=/home/tak-gk/local/share/primers/Rprimer_BACFULL2.fas --truncateN=ENABLE --append --numthreads=4 ~/wd/WholeGenome/MAO7/Pac16S/mao7_trmd_max1600.fastq.gz ./1_rmprimers_Rev2
R環境設定
R@ library(dada2); library(Biostrings); library(ShortRead);
library(ggplot2); library(reshape2); library(gridExtra); library(phyloseq) # installed from CRAN
Dereplicate:
R@ drp2 <- derepFastq('../working/', verbose=T)
Learning errors
R@ err2 <- learnErrors(drp2, errorEstimationFunction=PacBioErrfun, BAND_SIZE=32, multithread=T)
R@ saveRDS(err2, "./MAO6_err2.rds")
R@ plotErrors(err2)
Denoising
R@ dd2 <- dada(drp2, err=err2, BAND_SIZE=32, multithread=T)
R@ saveRDS(dd2, "MAO6_dd2.rds")
Wrightout sequence table
R@ st2 <- makeSequenceTable(dd2); dim(st2)
-----
Fasta形式へ変換
% cat dada_keep75v2.txt | sed "s/^/>ASV_/" | perl -pe 's/\s/\n/g' > MAO6kp75_dada2.fas
アラインメント(F/Rの方向を一方へ統一: --adjustdirection)
% mafft --thread 4 --adjustdirection MAO6kp75_dada2.fas > MAO6kp75_dada2_ali.fas
========== LOGs ==========
240831 Jn10c07-18S-Pacbio
Sample 1 - 91626 reads in 43951 unique sequences.
fastp: 統合QCツール(2 colors対応)GitHub, Kazmax's web-fastp, 本家manual
QCレポート:
% fastp -i [FILE1] -3 -o [FILE1_res]
-h [report .html] -j [report.json] -q 15 -n 10 -w 4
-q=平均Q-val-n=1readあたりのNの数。設定値を超えたreadとpair-readを除去-t=read1のtailから除去する配列数[default=0]-T=指定無しでread2に対して-tに準ずる-f=read1のfrontから除去する配列数[default=0]-F=指定無しでread2に対して-tに準ずる-l=最低リード長。設定以下のreadを除去-w=thread数Seqkit stats
$ seqkit stats Jn10c07L18S.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
Jn10c07L18S.fq.gz FASTQ DNA 103,496 265,603,042 51 2,566.3 5,272
Filtlong
% filtlong --keep_percent 90 --min_length 2000 hiyc8ont_raw.fq.gz | pigz > h8trmd.fq.gz
鎖長分布
$ perl ~/local/bin/fastq_avelength_morethan1k.perl Jn10c07L18S.fastq1st
2nd remove primer
3rd remove primer accurately
4th dada2 ver 1.36
2025/6/3
Torimming
~/wd/WholeGenome/MAO7/Pac16S$ filtlong --keep_percent 90 --min_length 1500 MAO7-16S.fastq.gz | pigz > mao7_trmd.fq.gz
~/wd/WholeGenome/MAO7/Pac16S$ filtlong --keep_percent 90 --min_length 1500 --max_length 1600 MAO7-16S.fastq.gz | pigz > mao7_trmd_max1600.fq.gz
2025/6/17
> drp2 <- derepFastq('./working',verbose=T)Dereplicating sequence entries in Fastq file: ./working/mao7_rmfr.fastqEncountered 47160 unique sequences from 65078 total sequences read.> err2 <- learnErrors(drp2, errorEstimationFunction=PacBioErrfun, BAND_SIZE=32, multithread=T)96635688 total bases in 65078 reads from 1 samples will be used for learning the error rates.The max qual score of 93 was not detected. Using standard error fitting. The max qual score of 93 was not detected. Using standard error fitting.The max qual score of 93 was not detected. Using standard error fitting.The max qual score of 93 was not detected. Using standard error fitting.The max qual score of 93 was not detected. Using standard error fitting.The max qual score of 93 was not detected. Using standard error fitting.The max qual score of 93 was not detected. Using standard error fitting.The max qual score of 93 was not detected. Using standard error fitting.> saveRDS(err2, "./mao7_err2.rds")> dd2 <- dada(drp2, err=err2, BAND_SIZE=32, multithread=T)Sample 1 - 65078 reads in 47160 unique sequences.> saveRDS(dd2, "MAO7_dd2.rds")> st2 <- makeSequenceTable(dd2); dim(st2)[1] 1 1042025/6/18
dada2=ver. 1.36, @R
Ubuntuのupdateに伴い、Bioconductorを再インストールした。
Rのライブラリ読み込み
@R% library(dada2);library(Biostrings); library(ShortRead)
@R% library(ggplot2); library(reshape2)
@R% drp <- derepFastq("./working/",verbose=T) # 重複配列除去
Dereplicating sequence entries in Fastq file: ./working//mao7_rmfr.fastq
Encountered 47160 unique sequences from 65078 total sequences read.
@R% err <- learnErrors(drp, multithread=T) # エラーモデル学習
96635688 total bases in 65078 reads from 1 samples will be used for learning the error rates.
@R% plotErrors(err)
@R% saveRDS(err, "./mao7_err.rds")
@R% dd2 <- dada(drp, err=err, multithread=T) # ASV推定: Error correction
Sample 1 - 65078 reads in 47160 unique sequences.
@R% saveRDS(dd2, "./mao7_dd2.rds")
@R% seqtable <- makeSequenceTable(dd2);dim(seqtable)
[1] 1 104
@R% seqtable_nonchim <- removeBimeraDenovo(seqtable, method="consensus",multithread=T); dim(seqtable_nonchim)
[1] 1 17
#-- Writeout nonchimera-seq
@R% asv_names <- paste("ASV_", seq_along(seqtable_nonchim))
@R% asv_dna <- DNAStringSet(colnames(seqtable_nonchim))
@R% names(asv_dna) <- asv_names
@R% writeXStringSet(asv_dna,"MAO7_nonchim.fasta")
#-- Writeout seq (untreated chimera removal)
@R% asv_names <- paste("ASV_", seq_along(seqtable))
@R% asv_dna <- DNAStringSet(colnames(seqtable))
@R% names(asv_dna) <- asv_names
@R% writeXStringSet(asv_dna,"MAO7_onlyasv.fasta")