STAR (Spliced transcript alignment to a reference) alignment
STAR aligns short sequences (reads), which are usually around 100 base pairs long, to the genes of a reference genome, which have a minimum length of ~1000 base pairs. For each read, the STAR algorithm sequentially searches for the longest portion that matches the reference, repeating the search on the still-unmapped portion of the read, and then performs clustering, stitching and scoring to get the best alignment for the read.
The clean (quality-checked) Tomato sequences or reads are aligned to the indexed Diploid Potato genome; the output is BAM (binary, non-human-readable) files.
Script below
#!/bin/tcsh
#BSUB -J Portfolio_staralign #job name
#BSUB -n 12 #number of threads
#BSUB -W 10:0 #time for job to complete
#BSUB -R span[hosts=1] #to keep tasks on one node
#BSUB -R "rusage[mem=20000]" #to reserve 20000 memory units on the node (presumably MB, i.e. ~20 GB rather than 20 MB; confirm the cluster's LSF memory unit)
#BSUB -o Portfolio_staralign_%J.out #output file
#BSUB -e Portfolio_staralign_%J.err #error file
#to align RNA-seq reads to indexed genome using STAR
#STAR cannot make use of HPC MPI, must have -R options to set 1 node & memory
#set threads under 12 on Henry2
#input of indexed genome path is /share/bitcpt/Fall2022/maharry/Portfolio/starindices
#input of sequence reads path is /share/bitcpt/Fall2022/CleanData/Solanum_lycopersicum/
#output of aligned reads will go into AlignedToTranscriptome subdirectory in working directory
module load conda
conda activate /usr/local/usrapps/bitcpt/star
# SET IN VARIABLES
# Explanatory notes are kept on their own comment lines: trailing text on a
# `set` line is parsed by tcsh as part of the command.
# IN: path to quality checked data
set IN=/share/bitcpt/Fall2022/CleanData/Solanum_lycopersicum
# index: path to the STAR genome index, relative to the working directory
set index=starindices
# out: path to tell STAR where to generate the output aligned data
set out=AlignedToTranscriptome
# Create the output directory up front; STAR may fail to open its output
# files if the directory named in --outFileNamePrefix does not exist.
mkdir -p ${out}
#################################
## Leaf Rep 1
#################################
# RNA-seq data are in format Sl_Leaf_Rep1_3X_1.fp.fq.gz
set S=Sl_Leaf_Rep1_3X
set EN=fp.fq.gz
# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}
# Align the paired-end reads (mate files _1 and _2, decompressed with zcat).
# --outSAMtype BAM Unsorted tells STAR to generate output as unsorted BAM.
# --twopassMode Basic aligns reads in two pass mode, the second pass running
# after adding the junction sites discovered in the first pass.
# --quantMode TranscriptomeSAM additionally requests alignments in
# transcript coordinates (see the STAR manual).
# The command is a single logical line joined with backslash continuations;
# options must not be split mid-word across physical lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM
################################
## Leaf Rep 2
################################
# RNA-seq data are in format Sl_Leaf_Rep2_3X_1.fp.fq.gz
set S=Sl_Leaf_Rep2_3X
set EN=fp.fq.gz
# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}
# Align the paired-end reads (mate files _1 and _2, decompressed with zcat),
# writing unsorted BAM output and running in two-pass mode.
# The command is a single logical line joined with backslash continuations;
# options must not be split mid-word across physical lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM
################################
## Leaf Rep 3
################################
# RNA-seq data are in format Sl_Leaf_Rep3_3X_1.fp.fq.gz
set S=Sl_Leaf_Rep3_3X
set EN=fp.fq.gz
# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}
# Align the paired-end reads (mate files _1 and _2, decompressed with zcat),
# writing unsorted BAM output and running in two-pass mode.
# The command is a single logical line joined with backslash continuations;
# options must not be split mid-word across physical lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM
################################
## SAM Rep 1
################################
# RNA-seq data are in format Sl_SAM_Rep1_3X_1.fp.fq.gz
set S=Sl_SAM_Rep1_3X
set EN=fp.fq.gz
# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}
# Align the paired-end reads (mate files _1 and _2, decompressed with zcat),
# writing unsorted BAM output and running in two-pass mode.
# The command is a single logical line joined with backslash continuations;
# options must not be split mid-word across physical lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM
################################
## SAM Rep 2
################################
# RNA-seq data are in format Sl_SAM_Rep2_3X_1.fp.fq.gz
set S=Sl_SAM_Rep2_3X
set EN=fp.fq.gz
# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}
# Align the paired-end reads (mate files _1 and _2, decompressed with zcat),
# writing unsorted BAM output and running in two-pass mode.
# The command is a single logical line joined with backslash continuations;
# options must not be split mid-word across physical lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM
################################
## SAM Rep 3
################################
# RNA-seq data are in format Sl_SAM_Rep3_3X_1.fp.fq.gz
set S=Sl_SAM_Rep3_3X
set EN=fp.fq.gz
# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}
# Align the paired-end reads (mate files _1 and _2, decompressed with zcat),
# writing unsorted BAM output and running in two-pass mode.
# The command is a single logical line joined with backslash continuations;
# options must not be split mid-word across physical lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM
################################
## SAM Rep 4
################################
# RNA-seq data are in format Sl_SAM_Rep4_3X_1.fp.fq.gz
set S=Sl_SAM_Rep4_3X
set EN=fp.fq.gz
# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}
# Align the paired-end reads (mate files _1 and _2, decompressed with zcat),
# writing unsorted BAM output and running in two-pass mode.
# The command is a single logical line joined with backslash continuations;
# options must not be split mid-word across physical lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM