STAR alignment code

STAR (Spliced Transcripts Alignment to a Reference) alignment

STAR aligns short sequences, or reads, which are usually around 100 base pairs long, to the genes of a reference genome, which have a minimum length of ~1000 base pairs. The STAR algorithm first performs a seed search: for each read it sequentially searches for the longest stretch that matches the reference, then repeats the search on the remaining unmapped portion of the read. It then performs clustering, stitching and scoring of these seeds to get the best alignment for the read.

The cleaned (quality-checked) Tomato sequences, or reads, are aligned to the indexed Diploid Potato genome; the output is a set of BAM files, a binary format that is not human-readable.

Script below

#!/bin/tcsh

# LSF batch directives: read by bsub when the job is submitted; tcsh itself treats them as comments.

#BSUB -J Portfolio_staralign #job name

#BSUB -n 12 #number of cores requested, one per STAR thread

#BSUB -W 10:0 #wall-clock limit for the job, 10 hours (format is hours:minutes)

#BSUB -R span[hosts=1] #to keep tasks on one node

# NOTE(review): 20000 is presumably in MB (about 20 GB, not 20 MB); the unit depends on the cluster LSF configuration - confirm.
#BSUB -R "rusage[mem=20000]" #to request a node with about 20 GB of memory

#BSUB -o Portfolio_staralign_%J.out #output file, %J is replaced by the job ID

#BSUB -e Portfolio_staralign_%J.err #error file, %J is replaced by the job ID

#to align RNA-seq reads to indexed genome using STAR

#STAR cannot make use of HPC MPI, must have -R options to set 1 node & memory

#set threads under 12 on Henry2

#input of indexed genome path is /share/bitcpt/Fall2022/maharry/Portfolio/starindices

#input of sequence reads path is /share/bitcpt/Fall2022/CleanData/Solanum_lycopersicum/

#output of aligned reads will go into the AlignedToTranscriptome subdirectory (the value of the out variable) in the working directory

# Make the conda command available through the cluster's environment-modules system.
module load conda

# Activate the shared conda environment at this path.
# NOTE(review): presumably this environment provides the STAR executable used below - confirm.
conda activate /usr/local/usrapps/bitcpt/star

# SET IN VARIABLES
# Explanatory notes must be real comments: free text in parentheses after a
# tcsh `set` is parsed as part of the command, not ignored.

# Path to the quality-checked (clean) paired-end read files.
set IN=/share/bitcpt/Fall2022/CleanData/Solanum_lycopersicum

# Directory holding the STAR genome index (relative to the working directory).
set index=starindices

# Directory where STAR is told to write the aligned output; used as the
# directory part of --outFileNamePrefix.
set out=AlignedToTranscriptome

# Make sure the output directory exists before STAR runs; STAR may not create
# a missing directory for a prefix of the form dir/sample_.
mkdir -p ${out}

#################################
## Leaf Rep 1
#################################

# RNA-seq data are in format Sl_Leaf_Rep1_3X_1.fp.fq.gz
# S is the sample name stem, EN the shared file-name ending of both mates.
set S=Sl_Leaf_Rep1_3X
set EN=fp.fq.gz

# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}

# Align the paired-end reads (mate 1 and mate 2) to the indexed genome.
# --readFilesCommand zcat : input FASTQ files are gzip-compressed
# --outSAMtype BAM Unsorted : write the alignments as an unsorted BAM file
# --twopassMode Basic : two-pass mode; the second pass is run after adding
#                       the junction sites discovered in the first pass
# --quantMode TranscriptomeSAM : also output alignments in transcript
#                                coordinates (see STAR manual)
# The command is one logical line; every continued line must end with a
# backslash, and no option name may be split across lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM

################################
## Leaf Rep 2
################################

# RNA-seq data are in format Sl_Leaf_Rep2_3X_1.fp.fq.gz
# S is the sample name stem, EN the shared file-name ending of both mates.
set S=Sl_Leaf_Rep2_3X
set EN=fp.fq.gz

# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}

# Align the paired-end reads (mate 1 and mate 2) to the indexed genome,
# reading gzip-compressed FASTQ via zcat and writing an unsorted BAM in
# two-pass mode. The command is one logical line; every continued line must
# end with a backslash, and no option name may be split across lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM

################################
## Leaf Rep 3
################################

# RNA-seq data are in format Sl_Leaf_Rep3_3X_1.fp.fq.gz
# S is the sample name stem, EN the shared file-name ending of both mates.
set S=Sl_Leaf_Rep3_3X
set EN=fp.fq.gz

# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}

# Align the paired-end reads (mate 1 and mate 2) to the indexed genome,
# reading gzip-compressed FASTQ via zcat and writing an unsorted BAM in
# two-pass mode. The command is one logical line; every continued line must
# end with a backslash, and no option name may be split across lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM

################################
## SAM Rep 1
################################

# RNA-seq data are in format Sl_SAM_Rep1_3X_1.fp.fq.gz
# S is the sample name stem, EN the shared file-name ending of both mates.
set S=Sl_SAM_Rep1_3X
set EN=fp.fq.gz

# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}

# Align the paired-end reads (mate 1 and mate 2) to the indexed genome,
# reading gzip-compressed FASTQ via zcat and writing an unsorted BAM in
# two-pass mode. The command is one logical line; every continued line must
# end with a backslash, and no option name may be split across lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM

################################
## SAM Rep 2
################################

# RNA-seq data are in format Sl_SAM_Rep2_3X_1.fp.fq.gz
# S is the sample name stem, EN the shared file-name ending of both mates.
set S=Sl_SAM_Rep2_3X
set EN=fp.fq.gz

# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}

# Align the paired-end reads (mate 1 and mate 2) to the indexed genome,
# reading gzip-compressed FASTQ via zcat and writing an unsorted BAM in
# two-pass mode. The command is one logical line; every continued line must
# end with a backslash, and no option name may be split across lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM

################################
## SAM Rep 3
################################

# RNA-seq data are in format Sl_SAM_Rep3_3X_1.fp.fq.gz
# S is the sample name stem, EN the shared file-name ending of both mates.
set S=Sl_SAM_Rep3_3X
set EN=fp.fq.gz

# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}

# Align the paired-end reads (mate 1 and mate 2) to the indexed genome,
# reading gzip-compressed FASTQ via zcat and writing an unsorted BAM in
# two-pass mode. The command is one logical line; every continued line must
# end with a backslash, and no option name may be split across lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM

################################
## SAM Rep 4
################################

# RNA-seq data are in format Sl_SAM_Rep4_3X_1.fp.fq.gz
# S is the sample name stem, EN the shared file-name ending of both mates.
set S=Sl_SAM_Rep4_3X
set EN=fp.fq.gz

# Print the file name to make sure it is right
echo ${IN}/${S}_1.${EN}

# Align the paired-end reads (mate 1 and mate 2) to the indexed genome,
# reading gzip-compressed FASTQ via zcat and writing an unsorted BAM in
# two-pass mode. The command is one logical line; every continued line must
# end with a backslash, and no option name may be split across lines.
STAR --runThreadN 12 \
    --runMode alignReads \
    --genomeDir ${index} \
    --outFileNamePrefix ${out}/${S}_ \
    --readFilesIn ${IN}/${S}_1.${EN} ${IN}/${S}_2.${EN} \
    --readFilesCommand zcat \
    --outSAMtype BAM Unsorted \
    --twopassMode Basic \
    --quantMode TranscriptomeSAM

Page updated

Report abuse