---
title: "3.0-RNASeq"
author: Susan Garcia
date: "2023-04-14"
output: 
  html_document:
    theme: cosmo
    highlight: tango
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
library(Biostrings)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```


#Kallisto Assignment: 

Creating a workflow for RNA Seq

##Downloading reference

```{r, engine='bash'}
# Setting Directory and downloading data
cd ../data
curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna
```

##Kallisto is installed on the Raven server.

```{r, engine='bash'}
/home/shared/kallisto/kallisto \
index -i \
../data/cgigas_roslin_rna.index \
../data/rna.fna
```

##Downloading sequence reads from the server.

```{r, engine='bash'}
#Setting Directory
cd ../data 

#Downloading fastq files using wget
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
```

#Running Kallisto on fastq for alignment.
```{r, engine='bash'}
#Creating kallisto_01 folder before alignment/run.
mkdir ../output/kallisto_01

find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz
```

##COmbining output files for visualization

```{r, engine='bash'}
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
    --gene_trans_map none \
    --out_prefix ../output/kallisto_01 \
    --name_sample_by_basedir \
    ../output/kallisto_01/D54_S145/abundance.tsv \
    ../output/kallisto_01/D56_S136/abundance.tsv \
    ../output/kallisto_01/D58_S144/abundance.tsv \
    ../output/kallisto_01/M45_S140/abundance.tsv \
    ../output/kallisto_01/M48_S137/abundance.tsv \
    ../output/kallisto_01/M89_S138/abundance.tsv \
    ../output/kallisto_01/D55_S146/abundance.tsv \
    ../output/kallisto_01/D57_S143/abundance.tsv \
    ../output/kallisto_01/D59_S142/abundance.tsv \
    ../output/kallisto_01/M46_S141/abundance.tsv \
    ../output/kallisto_01/M49_S139/abundance.tsv \
    ../output/kallisto_01/M90_S147/abundance.tsv \
    ../output/kallisto_01/N48_S194/abundance.tsv \
    ../output/kallisto_01/N50_S187/abundance.tsv \
    ../output/kallisto_01/N52_S184/abundance.tsv \
    ../output/kallisto_01/N54_S193/abundance.tsv \
    ../output/kallisto_01/N56_S192/abundance.tsv \
    ../output/kallisto_01/N58_S195/abundance.tsv \
    ../output/kallisto_01/N49_S185/abundance.tsv \
    ../output/kallisto_01/N51_S186/abundance.tsv \
    ../output/kallisto_01/N53_S188/abundance.tsv \
    ../output/kallisto_01/N55_S190/abundance.tsv \
    ../output/kallisto_01/N57_S191/abundance.tsv \
    ../output/kallisto_01/N59_S189/abundance.tsv
```

#Run DESeq2

```{r, eval=T}
countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]


head(countmatrix)
```

Rounding to whole numbers.

```{r, eval=T}
countmatrix <- round(countmatrix, 0)
str(countmatrix)
```

#GET DEGs BASED ON DESICATION
```{r, eval=T}
deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), 
                             type=factor(rep("single-read", 24)))

rownames(deseq2.colData) <- colnames(data)
#Creates the DESeq dataframe
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)
```

```{r, eval=T}
#Runs the DESeq
deseq2.dds <- DESeq(deseq2.dds)

#Makes a dataframe for the results
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
```

```{r PCA, eval=TRUE}
vsd <- vst(deseq2.dds, blind = FALSE)
#Plots Graph based on condition
plotPCA(vsd, intgroup = "condition")
```

```{r heatmap, eval=TRUE}
#Selecting the top 50 differentially expressed genes
res <- results(deseq2.dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]

#Extracting the counts and normalizing
counts <- counts(deseq2.dds, normalized = TRUE)
counts_top <- counts[top_genes, ]

#Log-transform counts
log_counts_top <- log2(counts_top + 1)

#Generate a heat map (assure package is installed)
pheatmap(log_counts_top, scale = "row")

head(deseq2.res)
```

```{r, eval=TRUE}

# Count number of hits with adjusted p-value less then 0.05

dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```

#Visualizing Data

```{r, eval=TRUE}

temp <- deseq2.res

# The main plot
plot(temp$baseMean, temp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="gray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")

# Getting the significant points and plotting them again so they're a different color
temp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(temp.sig$baseMean, temp.sig$log2FoldChange, pch=20, cex=0.45, col="green")

# 2 FC lines
abline(h=c(-1,1), col="purple")
```

#Create a new plot.
```{r, eval=TRUE}

# Prepping data
res_df <- as.data.frame(deseq2.res)
res_df$gene <- row.names(res_df)

#Volcano Plot
volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("yellow", "red")) +
  labs(title = "Volcano Plot",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-value",
       color = "Significantly\nDifferentially Expressed") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "top")

#See your plot
print(volcano_plot)

```

```{r, eval=TRUE}
#Create a table for output
write.table(temp.sig, "../output/DEGlist.tab", sep = '\t', row.names = T)
```

```{r, eval=TRUE}
deglist <- read.csv("../output/DEGlist.tab", sep = '\t', header = TRUE)
deglist$RowName <- rownames(deglist)
deglist2 <- deglist[, c("RowName", "pvalue")] 

#See your plot
print(deglist)

#Check datable
datatable(deglist)
```

# What genes are these?
## The gene enrichment analysis

```{r, eval=TRUE}
gene_deg_stat <- res_df %>%
  mutate(degstaus = ifelse(padj < 0.05, 1, 0))
```

```{r, eval=TRUE}

# Read the FASTA file
fasta_data <- readDNAStringSet("../data/rna.fna")


# Calculate gene lengths
gene_lengths <- width(fasta_data)


# Extract gene names/IDs from sequence IDs
gene_names <- sapply(names(fasta_data), function(x) strsplit(x, " ")[[1]][1])


# Create a data frame with gene IDs and lengths
gene_lengths_df <- data.frame(geneID = gene_names, length = gene_lengths)
```