#GTF Preparation and Merging
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.long_noncoding_RNAs.gtf.gz
perl -pe 's/^chr(\S+)/$1/' gencode.v47lift37.long_noncoding_RNAs.gtf > lncRNA.gencode.hg19.chrRemoved.gtf
cat smORF.gtf lncRNA.gencode.hg19.chrRemoved.gtf > merged_smORF_lncRNA.gencode.hg19.chrRemoved.gtf
#STAR indexing - index_star_14mar25.sh
STAR --runThreadN 8 \
--runMode genomeGenerate \
--genomeDir ./lncRNA_star_index \
--genomeFastaFiles ../Homo_sapiens.GRCh37.dna.primary_assembly.fa \
--sjdbGTFfile lncRNA.gencode.hg19.chrRemoved.gtf
STAR --runThreadN 8 \
--runMode genomeGenerate \
--genomeDir ./smORF__star_index \
--genomeFastaFiles ../Homo_sapiens.GRCh37.dna.primary_assembly.fa \
--sjdbGTFfile smORF.gtf
STAR --runThreadN 8 \
--runMode genomeGenerate \
--genomeDir ./merged_lncRNA_smORF__star_index \
--genomeFastaFiles ../Homo_sapiens.GRCh37.dna.primary_assembly.fa \
--sjdbGTFfile merged_smORF_lncRNA.gencode.hg19.chrRemoved.gtf
#trimming
TRIM_GALORE="/stor/work/Sullivan/anik/project_micropeptide/tools/trim_galore/TrimGalore-0.6.10/trim_galore"
mkdir -p fixed_adapter.trim
for i in *.gz; do
$TRIM_GALORE \
--adapter CTGTAGGCACCATCAAT \
--gzip \
--cores 8 \
--output_dir fixed_adapter.trim \
"$i"
done
multiqc fixed_adapter.trim -o fixed_adapter.trim
mkdir -p fixed_adapter.trim
TRIM_GALORE="/stor/work/Sullivan/anik/project_micropeptide/tools/trim_galore/TrimGalore-0.6.10/trim_galore"
for i in *.gz; do
$TRIM_GALORE \
--adapter AGATCGGAAGAGCACACGTCT \
--gzip \
--cores 8 \
--output_dir fixed_adapter.trim \
"$i"
done
#multiqc . -o fixed_adapter.trim
multiqc fixed_adapter.trim -o fixed_adapter.trim
TRIM_GALORE="/stor/work/Sullivan/anik/project_micropeptide/tools/trim_galore/TrimGalore-0.6.10/trim_galore"
for i in *.gz; do
$TRIM_GALORE \
--adapter AAAAAAAAAA \
--gzip \
--cores 8 \
--output_dir fixed_adapter.trim \
"$i"
done
#multiqc . -o fixed_adapter.trim
multiqc fixed_adapter.trim -o fixed_adapter.trim
#STAR Alignment and Count – copy.star.alignment.count.combined.sh
TRIMMED_DIR=$1
OUTPUT_DIR=$2
ALIGNMENT_DIR="${OUTPUT_DIR}/alignments_lncRNA"
STAR_INDEX_DIR="/stor/work/Sullivan/anik/project_micropeptide/ref/gencode_lncrna/lncRNA_star_index"
for TRIMMED_FILE in $TRIMMED_DIR/*.fq.gz; do
BASE_NAME=$(basename $TRIMMED_FILE _trimmed.fq.gz)
echo "Processing $BASE_NAME"
STAR --runThreadN 20 \
--genomeDir $STAR_INDEX_DIR \
--readFilesIn $TRIMMED_FILE \
--readFilesCommand zcat \
--outFileNamePrefix $ALIGNMENT_DIR/$BASE_NAME \
--outSAMtype BAM SortedByCoordinate \
--quantMode TranscriptomeSAM GeneCounts \
--outFilterType BySJout \
--outFilterMismatchNmax 2 \
--outFilterMultimapNmax 1 \
--outFilterMatchNmin 16 \
--alignEndsType EndToEnd
samtools quickcheck $ALIGNMENT_DIR/${BASE_NAME}Aligned.sortedByCoord.out.bam
if [ $? -ne 0 ]; then
echo "BAM file is corrupted. Re-running STAR alignment."
STAR --runThreadN 20 \
--genomeDir $STAR_INDEX_DIR \
--readFilesIn $TRIMMED_FILE \
--readFilesCommand zcat \
--outFileNamePrefix $ALIGNMENT_DIR/$BASE_NAME \
--outSAMtype BAM SortedByCoordinate \
--quantMode TranscriptomeSAM GeneCounts \
--outFilterType BySJout \
--outFilterMismatchNmax 2 \
--outFilterMultimapNmax 1 \
--outFilterMatchNmin 16 \
--alignEndsType EndToEnd
fi
done
COUNT_MATRIX="${OUTPUT_DIR}/gene_counts_matrix.tsv"
SAMPLES=$(ls $ALIGNMENT_DIR/*ReadsPerGene.out.tab | sed 's/_ReadsPerGene.out.tab//g')
echo -e "GeneID\t$(echo $SAMPLES | tr ' ' '\t')" > $COUNT_MATRIX
awk 'FNR==1 {ARGIND++}
ARGIND==1 {genes[$1]}
ARGIND==2 {counts[$1]=$4}
END {for (gene in genes) {printf gene; for (s=1; s<=ARGC-1; s++) printf "\t%s", counts[gene]; print}}' $ALIGNMENT_DIR/*ReadsPerGene.out.tab >> $COUNT_MATRIX
#3-File Automation Script – copy.3file.automate.sh
ulimit -n 8192
./copy.star.alignment.count.combined.sh ../data/razooki_influenza/fixed_adapter.trim ../data/razooki_influenza/fixed_adapter.trim/mar17
./copy.star.alignment.count.combined.sh ../data/finkel.sarscov2/fixed_adapter.trim ../data/finkel.sarscov2/fixed_adapter.trim/mar17
./copy.star.alignment.count.combined.sh ../data/influenza_machkovech/fixed_adapter.trim ../data/influenza_machkovech/fixed_adapter.trim/mar17
#BAM Length Distribution Analysis
ls /stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/alignments_lncRNA/*toTranscriptome.out.bam
BAM="/stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/alignments_lncRNA/SRR13165891_GSM4949730_Calu3_fp_uninf_1_Homo_sapiens_OTHERAligned.toTranscriptome.out.bam"
samtools view "$BAM" | awk '{print length($10)}' | sort | uniq -c
for BAM in /stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/alignments_lncRNA/*toTranscriptome.out.bam; do
echo "Processing $BAM..."
samtools view "$BAM" | awk '{print length($10)}' | sort | uniq -c
done
for BAM in /stor/work/Sullivan/anik/project_micropeptide/data/razooki_influenza/fixed_adapter.trim/mar17/alignments_lncRNA/*toTranscriptome.out.bam; do
echo "Processing $BAM..."
samtools view "$BAM" | awk '{print length($10)}' | sort | uniq -c
done > razooki.bam.length.dist.txt
for BAM in /stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/alignments_lncRNA/*toTranscriptome.out.bam; do
echo "Processing $BAM..."
samtools view "$BAM" | awk '{print length($10)}' | sort | uniq -c
done > finkel.bam.length.dist.txt
for BAM in /stor/work/Sullivan/anik/project_micropeptide/data/influenza_machkovech/fixed_adapter.trim/mar17/alignments_lncRNA/*toTranscriptome.out.bam; do
echo "Processing $BAM..."
samtools view "$BAM" | awk '{print length($10)}' | sort | uniq -c
done > influenza_machkovech.bam.length.dist.txt
#ribocode_analysis_pilot.sh for Finkel
#!/bin/bash
INPUT_BAM_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/alignments_lncRNA"
OUTPUT_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/ribocode_result_lncRNA"
RIBOCODE_ANNOT="/stor/work/Sullivan/anik/project_micropeptide/ref/gencode_lncrna/lncRNA_ribocode_annot"
mkdir -p "$OUTPUT_DIR"
for BAM in "$INPUT_BAM_DIR"/*.toTranscriptome.out.bam; do
SAMPLE_NAME=$(basename "$BAM" .Aligned.toTranscriptome.out.bam)
SAMPLE_OUTDIR="$OUTPUT_DIR/$SAMPLE_NAME"
mkdir -p "$SAMPLE_OUTDIR"
CONFIG_FILE="$SAMPLE_OUTDIR/config.txt"
{
echo "# SampleName AlignmentFile Stranded P-siteReadLength P-siteLocations"
echo -e "$SAMPLE_NAME\t$BAM\tno\t26,27,28,29,30,31,32,33,34\t12,12,12,13,13,14,14,14,14"
} > "$CONFIG_FILE"
RiboCode -a "$RIBOCODE_ANNOT" \
-c "$CONFIG_FILE" \
-l no \
-g \
-o "$SAMPLE_OUTDIR/RiboCode_ORFs"
done
#ribocode_analysis_pilot.sh for Razooki
#!/bin/bash
INPUT_BAM_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/razooki_influenza/fixed_adapter.trim/mar17/alignments_lncRNA"
OUTPUT_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/razooki_influenza/fixed_adapter.trim/mar17/ribocode_result_lncRNA"
RIBOCODE_ANNOT="/stor/work/Sullivan/anik/project_micropeptide/ref/gencode_lncrna/lncRNA_ribocode_annot"
mkdir -p "$OUTPUT_DIR"
for BAM in "$INPUT_BAM_DIR"/*.toTranscriptome.out.bam; do
SAMPLE_NAME=$(basename "$BAM" .Aligned.toTranscriptome.out.bam)
SAMPLE_OUTDIR="$OUTPUT_DIR/$SAMPLE_NAME"
mkdir -p "$SAMPLE_OUTDIR"
CONFIG_FILE="$SAMPLE_OUTDIR/config.txt"
{
echo "# SampleName AlignmentFile Stranded P-siteReadLength P-siteLocations"
echo -e "$SAMPLE_NAME\t$BAM\tno\t26,27,28,29,30,31,32,33,34\t12,12,12,13,13,14,14,14,14"
} > "$CONFIG_FILE"
RiboCode -a "$RIBOCODE_ANNOT" \
-c "$CONFIG_FILE" \
-l no \
-g \
-o "$SAMPLE_OUTDIR/RiboCode_ORFs"
done
#ribocode_analysis_pilot.sh for Influenza Machkovech
#!/bin/bash
INPUT_BAM_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/influenza_machkovech/fixed_adapter.trim/mar17/alignments_lncRNA"
OUTPUT_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/influenza_machkovech/fixed_adapter.trim/mar17/ribocode_result_lncRNA"
RIBOCODE_ANNOT="/stor/work/Sullivan/anik/project_micropeptide/ref/gencode_lncrna/lncRNA_ribocode_annot"
mkdir -p "$OUTPUT_DIR"
for BAM in "$INPUT_BAM_DIR"/*.toTranscriptome.out.bam; do
SAMPLE_NAME=$(basename "$BAM" .Aligned.toTranscriptome.out.bam)
SAMPLE_OUTDIR="$OUTPUT_DIR/$SAMPLE_NAME"
mkdir -p "$SAMPLE_OUTDIR"
CONFIG_FILE="$SAMPLE_OUTDIR/config.txt"
{
echo "# SampleName AlignmentFilt_lncRNA
le Stranded P-siteReadLength P-siteLocations"
echo -e "$SAMPLE_NAME\t$BAM\tno\t26,27,28,29,30,31,32,33,34\t12,12,12,13,13,14,14,14,14"
} > "$CONFIG_FILE"
RiboCode -a "$RIBOCODE_ANNOT" \
-c "$CONFIG_FILE" \
-l no \
-g \
-o "$SAMPLE_OUTDIR/RiboCode_ORFs"
done
#Combined Config File Generation
cat *Aligned.toTranscriptome.out.bam/config.txt > finkel_combined_config.txt
cat *Aligned.toTranscriptome.out.bam/config.txt > mack_combined_config.txt
cat *Aligned.toTranscriptome.out.bam/config.txt > razooki_combined_config.txt
cat /stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/ribocode_result_lncRNA/*combined_config.txt /stor/work/Sullivan/anik/project_micropeptide/data/razooki_influenza/fixed_adapter.trim/mar17/ribocode_result_lncRNA/*combined_config.txt | grep -v "#" > combined_config.txt
#ribocode_apr8_single_orf_file_output.sh
#!/bin/bash
RIBOCODE_ANNOT="/stor/work/Sullivan/anik/project_micropeptide/ref/gencode_lncrna/lncRNA_ribocode_annot"
COMBINED_CONFIG="/stor/work/Sullivan/anik/project_micropeptide/scripts/config/combined_config.txt"
OUTPUT_DIR="./config"
mkdir -p "$OUTPUT_DIR"
RiboCode -a "$RIBOCODE_ANNOT" \
-c "$COMBINED_CONFIG" \
-l no \
-g \
-o "$OUTPUT_DIR/RiboCode_ORFs"
#RiboCode_ORFs Analysis
cut -f10 RiboCode_ORFs.txt
head -1 RiboCode_ORFs.txt | tr '\t' '\n' | nl -v 1
head -1 RiboCode_ORFs.txt | tr '\t' '\n' | nl -v 1
awk -F '\t' '{print $1, $10}' RiboCode_ORFs.txt | column -t
awk -F '\t' '{print $1, $10}' RiboCode_ORFs.txt | sort -k2,2nr | column -t
#Collapsed ORF Analysis and Filtering
head -1 RiboCode_ORFs_collapsed.txt | tr '\t' '\n' | nl -v 1
awk -F '\t' '$11 >= 30 && $20 < 0.05' RiboCode_ORFs_collapsed.txt > high_confidence_ORFs.txt
#Filtering Candidate Micropeptides
awk -F '\t' '$11 <= 150' RiboCode_ORFs_collapsed.txt > ./picropeptide_lessORequal_than_100/candidate_micropeptides.txt
mv picropeptide_lessORequal_than_100/ micropeptide_lessORequal_than_150/
cut -f1 candidate_micropeptides.txt > candidate_orf_ids.txt
awk -F '\t' 'NR==FNR {ids[$1]; next} $9 ~ /orf_id/ {split($9, a, ";"); split(a[1], b, "\""); if (b[2] in ids) print}' candidate_orf_ids.txt RiboCode_ORFs_collapsed.gtf > candidate_micropeptides.gtf
#ORFcount_apr8.sh for Finkel
#!/bin/bash
INPUT_BAM_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/alignments_lncRNA"
OUTPUT_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/apr8_orfcount_result"
GTF="/stor/work/Sullivan/anik/project_micropeptide/scripts/config/micropeptide_lessORequal_than_150/candidate_micropeptides.gtf"
mkdir -p "$OUTPUT_DIR"
for BAM in "$INPUT_BAM_DIR"/*Aligned.sortedByCoord.out.bam; do
SAMPLE_NAME=$(basename "$BAM" Aligned.sortedByCoord.out.bam)
OUTPUT_FILE="$OUTPUT_DIR/${SAMPLE_NAME}_ORF.counts"
ORFcount -g "$GTF" \
-r "$BAM" \
-f 5 \
-l 3 \
-e 30 \
-m 25 \
-M 35 \
-o "$OUTPUT_FILE"
echo "Processed: $SAMPLE_NAME ^f^r $(basename "$OUTPUT_FILE")"
done
#ORFcount_apr8.sh for Influenza Machkovech
#!/bin/bash
INPUT_BAM_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/influenza_machkovech/fixed_adapter.trim/mar17/alignments_lncRNA"
OUTPUT_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/influenza_machkovech/fixed_adapter.trim/mar17/apr8_orfcount_result"
GTF="/stor/work/Sullivan/anik/project_micropeptide/scripts/config/micropeptide_lessORequal_than_150/candidate_micropeptides.gtf"
mkdir -p "$OUTPUT_DIR"
for BAM in "$INPUT_BAM_DIR"/*Aligned.sortedByCoord.out.bam; do
SAMPLE_NAME=$(basename "$BAM" Aligned.sortedByCoord.out.bam)
OUTPUT_FILE="$OUTPUT_DIR/${SAMPLE_NAME}_ORF.counts"
ORFcount -g "$GTF" \
-r "$BAM" \
-f 5 \
-l 3 \
-e 30 \
-m 25 \
-M 35 \
-o "$OUTPUT_FILE"
echo "Processed: $SAMPLE_NAME ^f^r $(basename "$OUTPUT_FILE")"
done
#ORFcount_apr8.sh for Razooki
#!/bin/bash
INPUT_BAM_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/razooki_influenza/fixed_adapter.trim/mar17/alignments_lncRNA"
OUTPUT_DIR="/stor/work/Sullivan/anik/project_micropeptide/data/razooki_influenza/fixed_adapter.trim/mar17/apr8_orfcount_result"
GTF="/stor/work/Sullivan/anik/project_micropeptide/scripts/config/micropeptide_lessORequal_than_150/candidate_micropeptides.gtf"
mkdir -p "$OUTPUT_DIR"
for BAM in "$INPUT_BAM_DIR"/*Aligned.sortedByCoord.out.bam; do
SAMPLE_NAME=$(basename "$BAM" Aligned.sortedByCoord.out.bam)
OUTPUT_FILE="$OUTPUT_DIR/${SAMPLE_NAME}_ORF.counts"
ORFcount -g "$GTF" \
-r "$BAM" \
-f 5 \
-l 3 \
-e 30 \
-m 25 \
-M 35 \
-o "$OUTPUT_FILE"
echo "Processed: $SAMPLE_NAME ^f^r $(basename "$OUTPUT_FILE")"
done
library(DESeq2)
library(ggplot2)
library(pheatmap)
count_dir_finkel <- "/stor/work/Sullivan/anik/project_micropeptide/data/finkel.sarscov2/fixed_adapter.trim/mar17/apr12_orfcount_result_update_fresh"
count_files_finkel <- list.files(count_dir_finkel, pattern = "_ORF.counts$", full.names = TRUE)
count_data_finkel <- lapply(count_files_finkel, function(file) {
sample_name <- gsub("_ORF.counts", "", basename(file))
df <- read.delim(file, header = FALSE, col.names = c("ORF_ID", sample_name))
return(df)
})
count_matrix_finkel <- Reduce(function(x, y) merge(x, y, by = "ORF_ID", all = TRUE), count_data_finkel)
rownames(count_matrix_finkel) <- count_matrix_finkel$ORF_ID
count_matrix_finkel <- count_matrix_finkel[, -1]
count_matrix_finkel[is.na(count_matrix_finkel)] <- 0
write.csv(count_matrix_finkel, file = "finkel_count_matrix_round3.csv", quote = FALSE, row.names = TRUE)
count_dir_razooki <- "/stor/work/Sullivan/anik/project_micropeptide/data/razooki_influenza/fixed_adapter.trim/mar17/apr12_orfcount_result_update_fresh"
count_files_razooki <- list.files(count_dir_razooki, pattern = "_ORF.counts$", full.names = TRUE)
count_data_razooki <- lapply(count_files_razooki, function(file) {
sample_name <- gsub("_ORF.counts", "", basename(file))
df <- read.delim(file, header = FALSE, col.names = c("ORF_ID", sample_name))
return(df)
})
count_matrix_razooki <- Reduce(function(x, y) merge(x, y, by = "ORF_ID", all = TRUE), count_data_razooki)
rownames(count_matrix_razooki) <- count_matrix_razooki$ORF_ID
count_matrix_razooki <- count_matrix_razooki[, -1]
count_matrix_razooki[is.na(count_matrix_razooki)] <- 0
write.csv(count_matrix_razooki, file = "razooki_count_matrix_round3.csv", quote = FALSE, row.names = TRUE)
count_dir_machkovech <- "/stor/work/Sullivan/anik/project_micropeptide/data/influenza_machkovech/fixed_adapter.trim/mar17/apr12_orfcount_result_update_fresh"
count_files_machkovech <- list.files(count_dir_machkovech, pattern = "_ORF.counts$", full.names = TRUE)
count_data_machkovech <- lapply(count_files_machkovech, function(file) {
sample_name <- gsub("_ORF.counts", "", basename(file))
df <- read.delim(file, header = FALSE, col.names = c("ORF_ID", sample_name))
return(df)
})
count_matrix_machkovech <- Reduce(function(x, y) merge(x, y, by = "ORF_ID", all = TRUE), count_data_machkovech)
rownames(count_matrix_machkovech) <- count_matrix_machkovech$ORF_ID
count_matrix_machkovech <- count_matrix_machkovech[, -1]
count_matrix_machkovech[is.na(count_matrix_machkovech)] <- 0
write.csv(count_matrix_machkovech, file = "machkovech_count_matrix_round3.csv", quote = FALSE, row.names = TRUE)
filtered_count_matrix_finkel <- count_matrix_finkel[!grepl("^__", rownames(count_matrix_finkel)), ]
write.csv(filtered_count_matrix_finkel, file = "finkel_count_matrix_filtered.csv", quote = FALSE, row.names = TRUE)
filtered_count_matrix_razooki <- count_matrix_razooki[!grepl("^__", rownames(count_matrix_razooki)), ]
write.csv(filtered_count_matrix_razooki, file = "razooki_count_matrix_filtered.csv", quote = FALSE, row.names = TRUE)
filtered_count_matrix_machkovech <- count_matrix_machkovech[!grepl("^__", rownames(count_matrix_machkovech)), ]
write.csv(filtered_count_matrix_machkovech, file = "machkovech_count_matrix_filtered.csv", quote = FALSE, row.names = TRUE)
finkel_sample_names <- colnames(filtered_count_matrix_finkel)
cat("Finkel Sample Names:\n")
print(finkel_sample_names)
razooki_sample_names <- colnames(filtered_count_matrix_razooki)
cat("\nRazooki Sample Names:\n")
print(razooki_sample_names)
machkovech_sample_names <- colnames(filtered_count_matrix_machkovech)
cat("\nMachkovech Sample Names:\n")
print(machkovech_sample_names)
selected_samples_finkel <- finkel_sample_names[grepl("uninf", finkel_sample_names) | grepl(" hpi", finkel_sample_names)]
cat("Selected Finkel Samples for Differential Analysis:\n")
print(selected_samples_finkel)
count_matrix_finkel_subset <- filtered_count_matrix_finkel[, selected_samples_finkel]
meta_finkel <- data.frame(
sample = selected_samples_finkel,
condition = ifelse(grepl("uninf", selected_samples_finkel), "uninfected", "infected"),
stringsAsFactors = FALSE
)
rownames(meta_finkel) <- selected_samples_finkel
cat("Finkel Metadata:\n")
print(meta_finkel)
dds_finkel <- DESeqDataSetFromMatrix(countData = count_matrix_finkel_subset,
colData = meta_finkel,
design = ~ condition)
dds_finkel <- DESeq(dds_finkel)
res_finkel <- results(dds_finkel, contrast = c("condition", "infected", "uninfected"))
cat("Summary of DESeq2 Results (infected vs uninfected):\n")
summary(res_finkel)
vsd_finkel <- varianceStabilizingTransformation(dds_finkel, blind = FALSE)
pcaData_finkel <- plotPCA(vsd_finkel, intgroup = "condition", returnData = TRUE)
percentVar_finkel <- round(100 * attr(pcaData_finkel, "percentVar"))
p_pca_finkel <- ggplot(pcaData_finkel, aes(PC1, PC2, color = condition)) +
geom_point(size = 3) +
xlab(paste0("PC1: ", percentVar_finkel[1], "% variance")) +
ylab(paste0("PC2: ", percentVar_finkel[2], "% variance")) +
ggtitle("Finkel Dataset: PCA (Uninfected vs Infected)")
print(p_pca_finkel)
de_genes_finkel <- rownames(res_finkel)[which(res_finkel$padj < 0.05 & abs(res_finkel$log2FoldChange) > 1)]
cat("Number of DE genes (padj < 0.05, |log2FC| > 1): ", length(de_genes_finkel), "\n")
if (length(de_genes_finkel) > 0) {
mat_finkel <- assay(vsd_finkel)[de_genes_finkel, ]
mat_finkel_scaled <- t(scale(t(mat_finkel)))
pheatmap(mat_finkel_scaled, annotation_col = meta_finkel, main = "Finkel: DE Genes Heatmap")
} else {
cat("No genes passed the filtering criteria for the heatmap.\n")
}
volcano_data_finkel <- data.frame(
log2FoldChange = res_finkel$log2FoldChange,
negLog10Padj = -log10(res_finkel$padj),
gene = rownames(res_finkel)
)
p_volcano_finkel <- ggplot(volcano_data_finkel, aes(x = log2FoldChange, y = negLog10Padj)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = -log10(0.05), color = "red", linetype = "dashed") +
xlab("Log2 Fold Change") +
ylab("-Log10 Adjusted p-value") +
ggtitle("Finkel: Volcano Plot (infected vs Uninfected)")
print(p_volcano_finkel)
plotMA(res_finkel, main = "Finkel: MA Plot (infected vs Uninfected)", ylim = c(-5, 5))
selected_samples <- razooki_sample_names[
grepl("RPF_control", razooki_sample_names) | (grepl("RPF_PR8", razooki_sample_names) & !grepl("delta_NS1", razooki_sample_names))
]
cat("Selected samples for analysis:\n")
print(selected_samples)
count_matrix_selected <- filtered_count_matrix_razooki[, selected_samples]
condition <- ifelse(grepl("control", selected_samples), "control", "PR8")
meta <- data.frame(
sample = selected_samples,
condition = factor(condition)
)
rownames(meta) <- selected_samples
cat("Metadata for differential analysis:\n")
print(meta)
dds <- DESeqDataSetFromMatrix(countData = count_matrix_selected, colData = meta, design = ~ condition)
dds <- DESeq(dds)
res <- results(dds, contrast = c("condition", "PR8", "control"))
cat("Summary of DE results:\n")
summary(res)
vsd <- varianceStabilizingTransformation(dds, blind = FALSE)
pcaData <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p_pca <- ggplot(pcaData, aes(PC1, PC2, color = condition)) +
geom_point(size = 3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
ggtitle("Razooki dataset: PCAPCA Plot: PR8 vs Control (Razooki RPF samples)")
print(p_pca)
de_genes <- rownames(res)[which(res$padj < 0.05 & abs(res$log2FoldChange) > 1)]
cat("Number of differentially expressed genes (padj < 0.05, |log2FC| > 1): ", length(de_genes), "\n")
if(length(de_genes) > 0) {
mat <- assay(vsd)[de_genes, ]
mat_scaled <- t(scale(t(mat)))
pheatmap(mat_scaled, annotation_col = meta, main = "Heatmap of Differentially Expressed Genes")
} else {
cat("No genes passed the filtering criteria for the heatmap.\n")
}
volcano_data <- data.frame(
log2FoldChange = res$log2FoldChange,
negLog10Padj = -log10(res$padj),
gene = rownames(res)
)
p_volcano <- ggplot(volcano_data, aes(x = log2FoldChange, y = negLog10Padj)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = -log10(0.05), color = "red", linetype = "dashed") +
xlab("Log2 Fold Change") +
ylab("-Log10 Adjusted p-value") +
ggtitle("Volcano Plot: PR8 vs Control")
print(p_volcano)
plotMA(res, main = "MA Plot: PR8 vs Control", ylim = c(-5, 5))
selected_samples_razooki <- razooki_sample_names[
grepl("RPF_control", razooki_sample_names) | grepl("RPF_PR8", razooki_sample_names)
]
cat("selected razooki samples for differential analysis:\n")
print(selected_samples_razooki)
count_matrix_razooki_subset <- filtered_count_matrix_razooki[, selected_samples_razooki]
meta_razooki <- data.frame(
sample = selected_samples_razooki,
condition = ifelse(grepl("control", selected_samples_razooki), "uninfected", "infected"),
stringsAsFactors = FALSE
)
rownames(meta_razooki) <- selected_samples_razooki
cat("razooki metadata:\n")
print(meta_razooki)
dds_razooki <- DESeqDataSetFromMatrix(countData = count_matrix_razooki_subset, colData = meta_razooki, design = ~ condition)
dds_razooki <- DESeq(dds_razooki)
res_razooki <- results(dds_razooki, contrast = c("condition", "infected", "uninfected"))
cat("summary of deseq2 results (infected vs uninfected):\n")
summary(res_razooki)
vsd_razooki <- varianceStabilizingTransformation(dds_razooki, blind = FALSE)
pcaData_razooki <- plotPCA(vsd_razooki, intgroup = "condition", returnData = TRUE)
percentVar_razooki <- round(100 * attr(pcaData_razooki, "percentVar"))
p_pca_razooki <- ggplot(pcaData_razooki, aes(PC1, PC2, color = condition)) +
geom_point(size = 3) +
xlab(paste0("PC1: ", percentVar_razooki[1], "% variance")) +
ylab(paste0("PC2: ", percentVar_razooki[2], "% variance")) +
ggtitle("Razooki Dataset: PCA (Uninfected vs Infected)")
print(p_pca_razooki)
de_genes_razooki <- rownames(res_razooki)[which(res_razooki$padj < 0.05 & abs(res_razooki$log2FoldChange) > 1)]
cat("number of de genes (padj < 0.05, |log2fc| > 1): ", length(de_genes_razooki), "\n")
if (length(de_genes_razooki) > 0) {
mat_razooki <- assay(vsd_razooki)[de_genes_razooki, ]
mat_razooki_scaled <- t(scale(t(mat_razooki)))
pheatmap(mat_razooki_scaled, annotation_col = meta_razooki, main = "razooki: DE Genes Heatmap")
} else {
cat("no genes passed the filtering criteria for the heatmap.\n")
}
volcano_data_razooki <- data.frame(
log2FoldChange = res_razooki$log2FoldChange,
negLog10Padj = -log10(res_razooki$padj),
gene = rownames(res_razooki)
)
p_volcano_razooki <- ggplot(volcano_data_razooki, aes(x = log2FoldChange, y = negLog10Padj)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = -log10(0.05), color = "red", linetype = "dashed") +
xlab("Log2 Fold Change") +
ylab("-Log10 Adjusted p-value") +
ggtitle("Razooki: Volcano Plot (Infected vs Uninfected)")
print(p_volcano_razooki)
plotMA(res_razooki, main = "Razooki: MA Plot (Infected vs Uninfected)", ylim = c(-5, 5))