---
title: "Differential gene expression analysis & PCA using Kallisto"
output: md_document
date: "2023-04-22"
---
```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Setting this to false will allow us to knit the code without evaluating code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

# Accessing a reference to produce an index
Download the 99k mRNA "complete records" for *Botryllus schlosseri* from NCBI as FASTA file and store in data folder. 

Then run the following code chunk which will use the index command in Kallisto to create a reference index of all 99k records. It renames that fasta file you upload to 'bsc_rna.index'.

```{r, engine = 'bash'}
/home/shared/kallisto/kallisto \
index -i \
/home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/data/bsc_rna.index \
/home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/data/sequence.fasta
```

Running the following code chunks produces a bunch of errors regarding file pathways. I think I am running into issues because my RNAseq data is in a different hard drive than my current working directory. However, If I simplify the code chunk to ask it for just one of my files (thus skipping the find command). It's possible to run the Kallisto quant command. Eventually, I may come back to this code chunk to troubleshoot.  
```{r, engine = 'bash'}
find /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/*_1.fastq \
| sed 's/_1.fastq$//' \
| xargs -I{} /home/shared/kallisto/kallisto \
quant -i /home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/data/bsc_rna.index \
-o /home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/output/kallisto_01/{} \
-t 40 \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/{}_1.fastq /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/{}_2.fastq

```

# Conducting psueo-alignments one query at a time...

### Quant SRR10352205
```{r, engine = 'bash'}
/home/shared/kallisto/kallisto quant -i ../data/bsc_rna.index \
-o ../output/kallisto_01/SRR10352205 \
-t 40 \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352205_1.fastq \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352205_2.fastq
```
### Quant SRR10352206

```{r, engine = 'bash'}
/home/shared/kallisto/kallisto quant -i ../data/bsc_rna.index \
-o ../output/kallisto_01/SRR10352206 \
-t 40 \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352206_1.fastq \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352206_2.fastq
```

### Quant SRR10352224

```{r, engine = 'bash'}
/home/shared/kallisto/kallisto quant -i ../data/bsc_rna.index \
-o ../output/kallisto_01/SRR10352224 \
-t 40 \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352224_1.fastq \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352224_2.fastq
```

### Quant SRR10352254
```{r, engine = 'bash'}
/home/shared/kallisto/kallisto quant -i ../data/bsc_rna.index \
-o ../output/kallisto_01/SRR10352254 \
-t 40 \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352254_1.fastq \
/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352254_2.fastq
```

# Creating gene expression matracies from kallisto output files

This command runs the abundance_estimates_to_matrix.pl script from the Trinity RNA-seq assembly software package to create a gene expression matrix from kallisto output files.
```{r, engine = 'bash'}
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
    --gene_trans_map none \
    --out_prefix ../output/kallisto_01 \
    --name_sample_by_basedir \
    ../output/kallisto_01/SRR10352205/abundance.tsv \
    ../output/kallisto_01/SRR10352206/abundance.tsv \
    ../output/kallisto_01/SRR10352224/abundance.tsv \
    ../output/kallisto_01/SRR10352254/abundance.tsv 

```


# Using DESeq2 

```{r}
countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X #this line renames the row names from 1-n to the value of the first column called x
countmatrix <- countmatrix[,-1] #this gets rid of the first column
countmatrix <- round(countmatrix, 0) #rounds values in count matrix
head(countmatrix)
```

The following code chunk will make a new data frame that will contain information about the treatment groups and the type of RNAseq data produced. This will be called the object "deseq2.colData". We then use the DESeqDataSetFromMatrix function from the DESeq2 package
```{r}
deseq2.colData <- data.frame(condition=factor(c(rep("b5-b6", 2), rep("TO", 2))), 
                             type=factor(rep("paired-end", 4)))
rownames(deseq2.colData) <- colnames(data) #not sure what the purpose of this line is, it doesn't seem to do anything
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)
```

```{r}
deseq2.dds <- DESeq(deseq2.dds) #takes the DESeq data set we created and conducts a DESeq analysis
deseq2.res <- results(deseq2.dds) #here we take a look at the results of the analysis
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
head(deseq2.res)
```

```{r}
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```

```{r}
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
```

PCA plot:
```{r}
vsd <- vst(deseq2.dds, blind = FALSE)
plotPCA(vsd, intgroup = "condition")
```

```{r}
# Select top 50 differentially expressed genes
res <- results(deseq2.dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]

# Extract counts and normalize
counts <- counts(deseq2.dds, normalized = TRUE)
counts_top <- counts[top_genes, ]

# Log-transform counts
log_counts_top <- log2(counts_top + 1)

# Generate heatmap
pheatmap(log_counts_top, scale = "row")
```

```{r}
# Prepare the data for plotting
res_df <- as.data.frame(deseq2.res)
res_df$gene <- row.names(res_df)

# Create volcano plot
volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("grey", "red")) +
  labs(title = "Volcano Plot",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-value",
       color = "Significantly\nDifferentially Expressed") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "top")

print(volcano_plot)
```

# Improving look of initial exploratory plots
Plotting only the significant points on deg:
```{r}
mp <- deseq2.res
sigPoints <- !is.na(mp$padj) & mp$padj <= 0.05
# Define color palette
colorPalette <- c(gray(seq(0.8, 0.2, length.out = 100)), "red")

# The main plot
plot(mp$baseMean[sigPoints], mp$log2FoldChange[sigPoints], pch=10, cex=0.45, ylim=c(-3, 3), log="x", col=colorPalette[1],
     main="Differentially Expressed Genes in Blastogenic Stage (pval <= 0.05)",
     xlab="Mean Expression Level", ylab="Log2 Fold Change (vs Control)", cex.lab=1.5, cex.main=2)
# Getting the significant points and plotting them again so they're a different color

points(mp$baseMean[sigPoints], mp$log2FoldChange[sigPoints], pch=20, cex=0.45, col=colorPalette[length(colorPalette)])
# 2 FC lines
abline(h=c(-1,1), col="blue")
# Add legend
```

```{r}
write.table(tmp.sig, "../output/DEGlist.tab", row.names = T)
```