---
title: "02-RNA-seq"
author: "Claudia-Halibut"
format: html
editor: visual
---

4/9/2025 Objective: Installing Kallisto and running DESeq2
```{bash}
## the following code downloads and runs the kallisto software

/home/shared/kallisto/kallisto
```
```{bash}
## check working directory
pwd
cd data/
pwd
```

```{bash}
## the following code downloads the Pacific oyster fasta file of genes and does so ignoring the fact that gannet does not have a security certificate to authenticate via 'insecure'

cd data/
curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna
```
```{bash}
## calling on the rna file, while renaming it under cgigas

/home/shared/kallisto/kallisto \
index -i \
data/cgigas_roslin_rna.index \
data/rna.fna
```
```{bash}
## the following code will download the sequence reads 
cd data 
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
```
```{bash}
## check directory and print

pwd
```

```{bash}
## the following code will find files, remove L001, runs quant() function on each, locates index file, writes output file, and create sub-directory. 40 threads for computation and single end reads 
#NOTE: make sure files are routed correctly

cd data
mkdir ../output/kallisto_01

find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz
```
```{bash}
## runs the abundance_estimates_to_matrix.pl script from the Trinity RNA-seq assembly software package to create a gene expression matrix from kallisto output files
#NOTE: Use ../output/kallisto_01 as the output directory and prefix for the gene expression matrix file

cd data 
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
    --gene_trans_map none \
    --out_prefix ../output/kallisto_01 \
    --name_sample_by_basedir \
    ../output/kallisto_01/D54_S145/abundance.tsv \
    ../output/kallisto_01/D56_S136/abundance.tsv \
    ../output/kallisto_01/D58_S144/abundance.tsv \
    ../output/kallisto_01/M45_S140/abundance.tsv \
    ../output/kallisto_01/M48_S137/abundance.tsv \
    ../output/kallisto_01/M89_S138/abundance.tsv \
    ../output/kallisto_01/D55_S146/abundance.tsv \
    ../output/kallisto_01/D57_S143/abundance.tsv \
    ../output/kallisto_01/D59_S142/abundance.tsv \
    ../output/kallisto_01/M46_S141/abundance.tsv \
    ../output/kallisto_01/M49_S139/abundance.tsv \
    ../output/kallisto_01/M90_S147/abundance.tsv \
    ../output/kallisto_01/N48_S194/abundance.tsv \
    ../output/kallisto_01/N50_S187/abundance.tsv \
    ../output/kallisto_01/N52_S184/abundance.tsv \
    ../output/kallisto_01/N54_S193/abundance.tsv \
    ../output/kallisto_01/N56_S192/abundance.tsv \
    ../output/kallisto_01/N58_S195/abundance.tsv \
    ../output/kallisto_01/N49_S185/abundance.tsv \
    ../output/kallisto_01/N51_S186/abundance.tsv \
    ../output/kallisto_01/N53_S188/abundance.tsv \
    ../output/kallisto_01/N55_S190/abundance.tsv \
    ../output/kallisto_01/N57_S191/abundance.tsv \
    ../output/kallisto_01/N59_S189/abundance.tsv
```

```{r}
## load and install necessary packages and rearrange some lines because it wont run properly

BiocManager::install("DESeq2")
library(DESeq2)
library(tidyverse)
install.packages("RColorBrewer")
install.packages("pheatmap")
library(pheatmap)
library(RColorBrewer)
library(data.table)
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
```
```{r}
getwd()
```

```{r}
## read in the count matrix (cell-by-gene matrix where each cell entry represents the number of RNA reads)

setwd("~/Claudia-Halibut/assignments /output")

countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)
```

```{r}
## round integers up to whole numbers for further analysis:
#NOTE: 73,307 observations

countmatrix <- round(countmatrix, 0)
str(countmatrix)
```

```{r}
## get DEGs based on Desication
#NOTE: the function DESeqData..(), will not run without being in the same block as DESeq2 library. 

BiocManager::install("DESeq2")
library(DESeq2)

deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), 
                             type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)

deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
```
```{r}
## stop here for the night and assess data  results tomorrow. 4/10/2025

deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
```
```{r}

head(deseq2.res)
```
```{r}
## Count number of hits with adjusted p-value less then 0.05
#NOTE: 607 condition desicated vs 6 control?? 

dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```
```{r}
##Decipher this 

tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
```
```{r}
#sent file to output, but double check tomorrow

setwd("~/Claudia-Halibut/assignments /output")
write.table(tmp.sig, "output/DEGlist.tab", row.names = T)
```