---
title: "03.10-D-Apul-sRNAseq-expression-DESeq2"
author: "Kathleen Durkin"
date: "2024-12-19"
output: 
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  github_document:
    toc: true
    number_sections: true
bibliography: references.bib
---

Gene expression summary for *Acropora pulchra* sRNA-seq data.

-   trimmed reads generated in `deep-dive` project

-   Reads aligned to *Acropora pulchra* genome 

### Install and load packages

```{r load_libraries, inlcude = TRUE}
library(tidyverse)
library(ggplot2)
library(reshape2)
library(pheatmap)
library(RColorBrewer)
library(DESeq2)
library(ComplexHeatmap)
```


# sRNA

## Load count data and coldata

Load in the sRNA count matrix generated using ShortStack 4.1.0. Keep in mind this data includes counts of all sRNAs, not just miRNAs
Counts generated in `04-Apul-sRNA-discovery-ShortStack`. Coldata generated in `03.00-D-Apul-RNAseq-gene-expression-DESeq2`

```{r load-sRNA-counts}
# Read in sRNA counts data
Apul_counts_sRNA_data_OG <- read_delim("../output/04-Apul-sRNA-discovery-ShortStack/ShortStack_out/Counts.txt", delim="\t") 
head(Apul_counts_sRNA_data_OG)

# Read in coldata 
coldata_OG <- read.csv(file = "../output/03.00-D-Apul-RNAseq-gene-expression-DESeq2/DESeq2-coldata.tab", row.names=1, sep = "\t")
coldata_OG$time.point <- factor(coldata_OG$time.point)
```

## Count data munging

```{r sRNA-count-data-munging}
Apul_counts_sRNA <- Apul_counts_sRNA_data_OG
coldata <- coldata_OG

# Remove excess portions of sample column names to just "sample###"
colnames(Apul_counts_sRNA) <- sub("-fastp-adapters-polyG-31bp-merged_condensed", "", colnames(Apul_counts_sRNA))

# Keep just the counts and cluster names
Apul_counts_sRNA <- Apul_counts_sRNA %>% select(-Coords, -MIRNA)

# I'm not going to be doing any removal of low-count sRNAs for now

# Make the cluster names our new row names
Apul_counts_sRNA <- Apul_counts_sRNA %>% column_to_rownames(var = "Name")

# Append colony and timepoint info to sample names
colnames(Apul_counts_sRNA) <- paste(colnames(Apul_counts_sRNA), coldata[colnames(Apul_counts_sRNA), "colony.id"], coldata[colnames(Apul_counts_sRNA), "time.point"], sep = "_")
# Make sure coldata metadata has matching rownames (for DEseq2 formatting)
rownames(coldata) <- paste(rownames(coldata), coldata$colony.id, coldata$time.point, sep = "_")


write.table(Apul_counts_sRNA, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_sRNA_ShortStack_counts_formatted.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)

head(Apul_counts_sRNA)
head(coldata)
```


## Expression levels

Plot histograms of the expression levels in each sample

```{r expression-level-histograms}
# Melt the count matrix into long format
Apul_counts_sRNA_melted <- melt(Apul_counts_sRNA, variable.name = "sample", value.name = "counts")

# Plot the expression level histograms for each sample
ggplot(Apul_counts_sRNA_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "Gene Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()
```

## Transcript counts

First let's check the total number of transcripts in each sample -- keep in mind this expression data has *not* been normalized yet, so there may be different totals for each sample
```{r transcript-counts-plot}
# Calculate the total number of transcripts for each sample
total_transcripts <- colSums(Apul_counts_sRNA)

# Create a data frame for plotting
total_transcripts_df <- data.frame(sample = names(total_transcripts),
                                   totals = total_transcripts)

# Plot the total number of transcripts for each sample
ggplot(total_transcripts_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of Transcripts per Sample",
       x = "Sample",
       y = "Total Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```
No glaring discrepancies/patterns

Now let's check the number of unique transcripts in each sample -- that is, how many unique sRNAs are expressed in each sample? This should be pretty much the same across samples, even without normalization.

```{r total-unique-transcripts-plot}
# Calculate the number of unique transcripts (non-zero counts) for each sample
unique_transcripts <- colSums(Apul_counts_sRNA > 0)

# Create a data frame for plotting
unique_transcripts_df <- data.frame(sample = names(unique_transcripts),
                                    uniques = unique_transcripts)

# Plot the total number of unique transcripts for each sample
ggplot(unique_transcripts_df, aes(x = reorder(sample, uniques), y = uniques)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = uniques), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of Unique Expressed Transcripts per Sample",
       x = "Sample",
       y = "Unique Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```
   

# miRNA

## Load miRNA metadata

The ShortStack output Results.txt includes all clusters of sRNA reads, including those not annotated as valid miRNAs. Now that we've looked at all the sRNAs a bit, let's focus in on those classified as miRNAs.

```{r miRNA-count-data-munging}
Apul_counts_miRNA <- Apul_counts_sRNA_data_OG
coldata <- coldata_OG

# Remove excess portions of sample column names to just "sample###"
colnames(Apul_counts_miRNA) <- sub("-fastp-adapters-polyG-31bp-merged_condensed", "", colnames(Apul_counts_miRNA))

# Keep only the sRNAs ID'd as valid miRNAs
Apul_counts_miRNA <- Apul_counts_miRNA %>% filter(MIRNA == "Y")

# Keep just the counts and cluster names
Apul_counts_miRNA <- Apul_counts_miRNA %>% select(-Coords, -MIRNA)

# Make the cluster names our new row names
Apul_counts_miRNA <- Apul_counts_miRNA %>% column_to_rownames(var = "Name")

# Append colony and timepoint info to sample names
colnames(Apul_counts_miRNA) <- paste(colnames(Apul_counts_miRNA), coldata[colnames(Apul_counts_miRNA), "colony.id"], coldata[colnames(Apul_counts_miRNA), "time.point"], sep = "_")
# Make sure coldata metadata has matching rownames (for DEseq2 formatting)
rownames(coldata) <- paste(rownames(coldata), coldata$colony.id, coldata$time.point, sep = "_")

write.table(Apul_counts_miRNA, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_miRNA_ShortStack_counts_formatted.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)

head(Apul_counts_miRNA)

```

## Expression levels

Plot histograms of the expression levels in each sample

```{r miRNA-expression-level-histograms}
# Melt the count matrix into long format
Apul_counts_miRNA_melted <- melt(Apul_counts_miRNA, variable.name = "sample", value.name = "counts")

# Plot the expression level histograms for each sample
ggplot(Apul_counts_miRNA_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "miRNA Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()
```

## miRNA counts

First let's check the total number of miRNAs in each sample -- keep in mind this expression data has *not* been normalized yet, so there may be different totals for each sample
```{r miRNA-counts-plot}
# Calculate the total number of transcripts for each sample
total_miRNA <- colSums(Apul_counts_miRNA)

# Create a data frame for plotting
total_miRNA_df <- data.frame(sample = names(total_miRNA),
                                   totals = total_miRNA)

# Plot the total number of transcripts for each sample
ggplot(total_miRNA_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of miRNAs per Sample",
       x = "Sample",
       y = "Total miRNAs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```

Now let's check the number of unique miRNAs in each sample -- This should be pretty much the same across samples, even without normalization.

```{r total-unique-miRNA-plot}
# Calculate the number of unique transcripts (non-zero counts) for each sample
unique_miRNA <- colSums(Apul_counts_miRNA > 0)

# Create a data frame for plotting
unique_miRNA_df <- data.frame(sample = names(unique_miRNA),
                                    uniques = unique_miRNA)

# Plot the total number of unique transcripts for each sample
ggplot(unique_miRNA_df, aes(x = reorder(sample, uniques), y = uniques)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = uniques), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of Unique Expressed miRNAs per Sample",
       x = "Sample",
       y = "Unique miRNA") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```

## Heatmap

```{r miRNA-heatmap}
pheatmap(Apul_counts_miRNA,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = colorRampPalette(c("blue", "white", "red"))(50),
         fontsize_row = 8,
         fontsize_col = 8)
```
Well... there's like 2 miRNAs with much higher expression than the others, which is making visualizing relative differences difficult. Let's redo the heatmap, normalizing each row to view relative difference in expression between samples (`scale='row'`)

```{r miRNA-heatmap-rowscale}
pheatmap(Apul_counts_miRNA,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         scale = 'row',
         color = colorRampPalette(c("blue", "white", "red"))(50),
         fontsize_row = 8,
         fontsize_col = 8)
```

# siRNA

ShortStack's primary purpose is to identify miRNAs from sRNA-seq data, but it also automatically annotates siRNA loci! Since siRNA potentially play an important role in transposon silencing in invertebrates, we should generate count matrices for siRNAs as well. 

We can see clusters annotated as siRNAs in the `Results.gff3` output file of ShortStack (sRNA ID shown in the 3rd column)

```{r siRNA-count-data-munging}
Apul_Resultsgff <- read.table("../output/04-Apul-sRNA-discovery-ShortStack/ShortStack_out/Results.gff3")
# Separate last column info into multiple columns for filtering
Apul_Resultsgff <- Apul_Resultsgff %>%
  separate(V9, into = c("Name", "DicerCall", "MIRNA"), sep = ";") %>%
  mutate(Name = sub("ID=", "", Name),
         DicerCall = sub("DicerCall=", "", DicerCall),
         MIRNA = sub("MIRNA=", "", MIRNA))
head(Apul_Resultsgff)

# keep just the sRNA category column (V3), and the cluster names (Name)
# filter to only keep clusters ID'd as siRNAs
Apul_siRNA_clusters <- Apul_Resultsgff %>%
  select(V3, Name) %>%
  filter(str_detect(V3, regex("siRNA")))
head(Apul_siRNA_clusters)

# Now use this list of clusters ID'd as siRNAs to filter our sRNA count matrix
# keep only the sample counts and cluster names
Apul_counts_sRNA <- rownames_to_column(Apul_counts_sRNA, var = "Name")
Apul_counts_siRNA <- left_join(Apul_siRNA_clusters, Apul_counts_sRNA, by = c("Name" = "Name")) %>%
  select(-V3)

# convert the column of cluster names into the df row names
Apul_counts_sRNA <- Apul_counts_sRNA %>% column_to_rownames(var="Name")
Apul_counts_siRNA <- Apul_counts_siRNA %>% column_to_rownames(var="Name")

head(Apul_counts_siRNA)

write.table(Apul_counts_siRNA, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_siRNA_ShortStack_counts_formatted.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)
```

## Expression levels

Plot histograms of the expression levels in each sample

```{r siRNA-expression-level-histograms}
# Melt the count matrix into long format
Apul_counts_siRNA_melted <- melt(Apul_counts_siRNA, variable.name = "sample", value.name = "counts")

# Plot the expression level histograms for each sample
ggplot(Apul_counts_siRNA_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "siRNA Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()
```

## siRNA counts

First let's check the total number of siRNAs in each sample -- keep in mind this expression data has *not* been normalized yet, so there may be different totals for each sample
```{r siRNA-counts-plot}
# Calculate the total number of transcripts for each sample
total_siRNA <- colSums(Apul_counts_siRNA)

# Create a data frame for plotting
total_siRNA_df <- data.frame(sample = names(total_siRNA),
                                   totals = total_siRNA)

# Plot the total number of transcripts for each sample
ggplot(total_siRNA_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of siRNAs per Sample",
       x = "Sample",
       y = "Total siRNAs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```
All of the TP2s seem to have higher #s, but since this isn't normalized I don't want to put too much stock in that

Now let's check the number of unique siRNAs in each sample -- This should be pretty much the same across samples, even without normalization.

```{r total-unique-siRNA-plot}
# Calculate the number of unique transcripts (non-zero counts) for each sample
unique_siRNA <- colSums(Apul_counts_siRNA > 0)

# Create a data frame for plotting
unique_siRNA_df <- data.frame(sample = names(unique_siRNA),
                                    uniques = unique_siRNA)

# Plot the total number of unique transcripts for each sample
ggplot(unique_siRNA_df, aes(x = reorder(sample, uniques), y = uniques)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = uniques), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of Unique Expressed siRNAs per Sample",
       x = "Sample",
       y = "Unique siRNA") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```

## Heatmap

```{r siRNA-heatmap}
pheatmap(Apul_counts_siRNA,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = colorRampPalette(c("blue", "white", "red"))(50),
         fontsize_row = 8,
         fontsize_col = 8)
```

```{r siRNA-heatmap-rowscale}
pheatmap(Apul_counts_siRNA,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         scale = 'row',
         color = colorRampPalette(c("blue", "white", "red"))(50),
         fontsize_row = 8,
         fontsize_col = 8)
```


# ...........

# Normalized sRNA counts

## Normalize counts with DESeq2

### Metadata

DESeq2 requires a metadata data frame as input -- we'll use the coldata we've already formatted

```{r make-sRNA-metadata-dataframe}

head(coldata)
```

### DESeq object

## Verify rownames match
```{r check-rownames}
# Alphabetize rownames of coldata and colnames of Apul_counts_sRNA
coldata <- coldata[order(rownames(coldata)), ]
Apul_counts_sRNA <- Apul_counts_sRNA[, order(colnames(Apul_counts_sRNA))]

all(rownames(coldata) == colnames(Apul_counts_sRNA))
```

# Create DESeq2 data set
```{r create-deseq2-data-set, cache=TRUE}
dds_Apul_sRNA <- DESeqDataSetFromMatrix(countData = Apul_counts_sRNA,
                              colData = coldata,
                              design = ~ time.point + colony.id)
dds_Apul_sRNA
```
```{r}
dds_Apul_sRNA$time.point <- factor(dds_Apul_sRNA$time.point, levels = c("TP1","TP2", "TP3", "TP4"))

dds_Apul_sRNA <- DESeq(dds_Apul_sRNA)
```

## Pairwise results tables

```{r deseq2-pairwise-results-tables}
# Define the output directory path
output_dir <- "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/"

# Set desired false discovery rate threshold (i.e. adjusted p-value, padj)
fdr <- 0.05

# Set log2 fold change threshold (a value of '1' is equal to a fold change of '2')
log2fc <- 1

sRNA_tp1.v.tp2.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP1","TP2"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp1.v.tp3.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP1","TP3"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp1.v.tp4.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP1","TP4"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp2.v.tp3.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP2","TP3"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp2.v.tp4.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP2","TP4"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp3.v.tp4.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP3","TP4"), alpha = fdr, lfcThreshold = log2fc)

sRNA_tp2.v.tp4.results

summary(sRNA_tp2.v.tp4.results)

table(sRNA_tp2.v.tp4.results$padj < 0.05)
```

 Write DDS results tables to CSVs
```{r write-dds-results-csv}
# Create a named list of the data frames
results_list <- list(
  sRNA_tp1.v.tp2.results = sRNA_tp1.v.tp2.results,
  sRNA_tp1.v.tp3.results = sRNA_tp1.v.tp3.results,
  sRNA_tp1.v.tp4.results = sRNA_tp1.v.tp4.results,
  sRNA_tp2.v.tp3.results = sRNA_tp2.v.tp3.results,
  sRNA_tp2.v.tp4.results = sRNA_tp2.v.tp4.results,
  sRNA_tp3.v.tp4.results = sRNA_tp3.v.tp4.results
)

# Loop through the list and write each data frame to a CSV file in the specified directory
for (df_name in names(results_list)) {
  write.csv(results_list[[df_name]], file = paste0(output_dir, df_name, ".table.csv"), row.names = TRUE, quote = FALSE)
}
```

## Normalizations

It's worth noting here that I'm actually going to be doing two different types of transformation on the counts data, which serve different purposes. 

- First is **normalizing** the transcript counts, which adjusts for differences in library size or sequencing depth, but retains count-like properties. Normalized counts are most useful for things like visualizing expression levels and differential expression analysis.

- Second is **variance stabilizing** the counts data, which aims to make the variance of the transformed data approximately independent of the mean, reducing heteroscedasticity (the relationship between variance and mean) and "smoothing" out the variance at low counts. Notably, the transformed data is *no longer on the original count scale*. The transformation makes the variance roughly constant across the range of counts, which makes it easier to interpret patterns in the data visually. Variance stabilized data is most useful for exploratory data analysis, like PCA, clustering, and heatmaps, and is also the transformation we'll want to use before WGCNA.

```{r get-normalized-sRNA-counts, cache=TRUE}
# extract normalized counts
# (normalization is automatically performed by deseq2)
Apul_counts_sRNA_norm <- counts(dds_Apul_sRNA, normalized=TRUE) %>% data.frame()

write.table(Apul_counts_sRNA_norm, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_sRNA_normalized.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)


# variance stabilized data
vsd_Apul_sRNA <- varianceStabilizingTransformation(dds_Apul_sRNA, blind=TRUE)
wpn_vsd_Apul_sRNA <- getVarianceStabilizedData(dds_Apul_sRNA)
rv_wpn_Apul_sRNA <- rowVars(wpn_vsd_Apul_sRNA, useNames=TRUE)

Apul_counts_sRNA_vsd <- data.frame(wpn_vsd_Apul_sRNA)
write.table(Apul_counts_sRNA_vsd, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_sRNA_variancestabilized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)

q75_wpn_Apul_sRNA <- quantile(rowVars(wpn_vsd_Apul_sRNA, useNames=TRUE), .75)  # 75th quantile variability
Apul_counts_sRNA_vsd_q75 <- wpn_vsd_Apul_sRNA[ rv_wpn_Apul_sRNA > q75_wpn_Apul_sRNA, ] %>% data.frame # filter to retain only the most variable genes
write.table(Apul_counts_sRNA_vsd_q75, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_sRNA_variancestabilized_q75.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)

q95_wpn_Apul_sRNA <- quantile(rowVars(wpn_vsd_Apul_sRNA, useNames=TRUE), .95)  # 95th quantile variability
Apul_counts_sRNA_vsd_q95 <- wpn_vsd_Apul_sRNA[ rv_wpn_Apul_sRNA > q95_wpn_Apul_sRNA, ] %>% data.frame # filter to retain only the most variable genes
write.table(Apul_counts_sRNA_vsd_q95, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_sRNA_variancestabilized_q95.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)
```

## Plot normalized data

```{r plot-normalized-sRNA}
Apul_counts_sRNA_norm_long <- Apul_counts_sRNA_norm %>%
  mutate(
    Gene_id = row.names(Apul_counts_sRNA_norm)
  ) %>%
  pivot_longer(-Gene_id)

Apul_counts_sRNA_norm_long %>%
  ggplot(., aes(x = name, y = value)) +
  geom_violin() +
  geom_point() +
  theme_bw() +
  theme(
    axis.text.x = element_text( angle = 90)
  ) +
  ylim(0, NA) +
  labs(
    title = "Normalized Expression",
    x = "Sample",
    y = "Normalized counts"
  )
```


## Plot variance stabilized data

```{r plot-vsd-sRNA}
Apul_counts_sRNA_vsd_long <- Apul_counts_sRNA_vsd %>%
  mutate(
    Gene_id = row.names(Apul_counts_sRNA_vsd)
  ) %>%
  pivot_longer(-Gene_id)

Apul_counts_sRNA_vsd_long %>%
  ggplot(., aes(x = name, y = value)) +
  geom_violin() +
  geom_point() +
  theme_bw() +
  theme(
    axis.text.x = element_text( angle = 90)
  ) +
  ylim(0, NA) +
  labs(
    title = "Variance Stabilized Expression",
    x = "Sample",
    y = "Variance stabilized data"
  )
```

## Normalized expression levels

Plot histograms of the normalized expression levels in each sample

```{r norm-expression-level-histograms}
# Melt the count matrix into long format
Apul_counts_norm_melted <- melt(Apul_counts_sRNA_norm, variable.name = "sample", value.name = "counts")

# Plot the expression level histograms for each sample
ggplot(Apul_counts_norm_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "Gene Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()
```

## Normalized transcript counts

Check the total number of transcripts in each sample -- now that we've normalized the data these totals should be similar
```{r norm-transcript-counts-plot}
# Calculate the total number of transcripts for each sample
total_transcripts_norm <- colSums(Apul_counts_sRNA_norm)

# Create a data frame for plotting
total_transcripts_norm_df <- data.frame(sample = names(total_transcripts_norm),
                                   totals = total_transcripts_norm)

# Plot the total number of transcripts for each sample
ggplot(total_transcripts_norm_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) +
  labs(title = "Total Number of Transcripts per Sample",
       x = "Sample",
       y = "Total Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```

## PCA of variance stabilized data

```{r PCA}
plotPCA(vsd_Apul_sRNA, intgroup="time.point")

plotPCA(vsd_Apul_sRNA, intgroup="colony.id")
```
Samples are strongly clustering by colony. Interestingly time point doesn't appear to influence clustering.

## Sample clustering

```{r sample-clustering}
sample_dists <- dist(t(assay(vsd_Apul_sRNA)))
pheatmap(as.matrix(sample_dists), 
         clustering_distance_rows = "euclidean", 
         clustering_distance_cols = "euclidean", 
         main="Sample Clustering")
```
Samples are strongly clustering by colony.

## Heatmaps

Of most variable variance stabilized sRNA transcripts

```{r heatmpas}
# 75th quantile
heat_colors <- rev(brewer.pal(12, "RdYlBu"))
pheatmap(Apul_counts_sRNA_vsd_q75, 
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = heat_colors,
         scale="row")

# 95th quantile
pheatmap(Apul_counts_sRNA_vsd_q95, 
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = heat_colors,
         scale="row")
```

# Normalized miRNA counts

## Isolate normalized/vsd miRNA
```{r miRNA-normalized-miRNA}

Apul_counts_sRNA_norm$Name <- rownames(Apul_counts_sRNA_norm)
Apul_counts_sRNA_vsd$Name <- rownames(Apul_counts_sRNA_vsd)

Apul_counts_miRNA_namesdf <- data.frame(Name = rownames(Apul_counts_miRNA)) 

Apul_counts_miRNA_norm <- left_join(Apul_counts_miRNA_namesdf, Apul_counts_sRNA_norm, by = c("Name" = "Name")) %>%
  column_to_rownames(var = "Name")

colnames(Apul_counts_miRNA_norm) <- gsub("X", "", colnames(Apul_counts_miRNA_norm))
colnames(Apul_counts_miRNA_norm) <- gsub("\\.", "-", colnames(Apul_counts_miRNA_norm))

write.table(Apul_counts_miRNA_norm, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_miRNA_normalized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)

Apul_counts_miRNA_vsd <- left_join(Apul_counts_miRNA_namesdf, Apul_counts_sRNA_vsd, by = c("Name" = "Name")) %>%
  column_to_rownames(var = "Name")
write.table(Apul_counts_miRNA_vsd, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_miRNA_variancestabilized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)
```

## Normalized expression levels

Plot histograms of the normalized expression levels in each sample

```{r miRNA-norm-expression-level-histograms}
# Melt the count matrix into long format
Apul_counts_miRNA_norm_melted <- melt(Apul_counts_miRNA_norm, variable.name = "sample", value.name = "counts")

# Plot the expression level histograms for each sample
ggplot(Apul_counts_miRNA_norm_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "Gene Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()
```

## Normalized transcript counts

Check the total number of transcripts in each sample -- now that we've normalized the data these totals should be similar
```{r miRNA-norm-transcript-counts-plot}
# Calculate the total number of transcripts for each sample
total_transcripts_miRNA_norm <- colSums(Apul_counts_miRNA_norm)

# Create a data frame for plotting
total_transcripts_miRNA_norm_df <- data.frame(sample = names(total_transcripts_miRNA_norm),
                                   totals = total_transcripts_miRNA_norm)

# Plot the total number of transcripts for each sample
ggplot(total_transcripts_miRNA_norm_df, aes(x = reorder(sample, totals), totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) +
  labs(title = "Total Number of miRNA Transcripts per Sample",
       x = "Sample",
       y = "Total Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```
## PCA of variance stabilized miRNAs

Note that I had to convert the counts to a data frame (instead of a DESeqTransform object) to select only miRNAs, so I can't use the plotPCA() function.
```{r miRNA-clustering-vsd}
# Perform PCA
pca_res <- prcomp(t(as.matrix(Apul_counts_miRNA_vsd)))

# Calculate percent variance explained
percentVar <- round(100 * (pca_res$sdev^2 / sum(pca_res$sdev^2)))

## Color by colony ID ##
# Convert PCA results to data frame
pca_df <- as.data.frame(pca_res$x)
pca_df$colony.id <- dds_Apul_sRNA$colony.id

# Create PCA plot with percent variance
ggplot(pca_df, aes(x = PC1, y = PC2, color = colony.id)) +
  geom_point(size = 3) +
  labs(
    x = paste0("PC1: ", percentVar[1], "% variance"),
    y = paste0("PC2: ", percentVar[2], "% variance"),
    title = "PCA of VSD miRNA Counts"
  )

## Color by colony ID ##
# Convert PCA results to data frame
pca_df <- as.data.frame(pca_res$x)
pca_df$time.point <- dds_Apul_sRNA$time.point

# Create PCA plot with percent variance
ggplot(pca_df, aes(x = PC1, y = PC2, color = time.point)) +
  geom_point(size = 3) +
  labs(
    x = paste0("PC1: ", percentVar[1], "% variance"),
    y = paste0("PC2: ", percentVar[2], "% variance"),
    title = "PCA of VSD miRNA Counts"
  )

```
Interesting! When considering all sRNAs, they strongly clustered by colony and not time point. However, miRNAs alone show clear clustering by time point (though some influence of colony is still apparent).

## Sample clustering

```{r miRNA-sample-clustering}

# Calculate the Euclidean distance matrix
sample_dists <- dist(t(Apul_counts_miRNA_vsd))

# Create the heatmap
pheatmap(
  as.matrix(sample_dists), 
  clustering_distance_rows = "euclidean", 
  clustering_distance_cols = "euclidean", 
  main = "Sample Clustering"
)
```


## Heatmap

Of all miRNAs

```{r miRNA-heatmaps-vsd}
heat_colors <- rev(brewer.pal(12, "RdYlBu"))

pheatmap(as.matrix(Apul_counts_miRNA_vsd[apply(Apul_counts_miRNA_vsd, 1, var) > 0, ]), 
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = heat_colors,
         scale="row")
# 
# top_20_counts_all <- order(rowMeans(counts(dds,normalized=TRUE)),
#                 decreasing=TRUE)[1:200]
# 
# timepoint_annotation = colData(dds) %>% as.data.frame() %>% select(time.point)
# 
# 
# pheatmap(assay(vsd)[top_20_counts_all,], 
#          cluster_rows=FALSE, 
#          show_rownames=FALSE,
#          cluster_cols=TRUE, 
#          annotation_col = timepoint_annotation)

```


```{r}


# Perform clustering based on normalized expression
row_clustering <- hclust(dist(as.matrix(Apul_counts_miRNA_norm)))
col_clustering <- hclust(dist(t(as.matrix(Apul_counts_miRNA_norm))))

# Convert normalized counts to binary (presence/absence)
binary_counts <- Apul_counts_miRNA_norm > 0
binary_counts <- as.matrix(binary_counts) * 1  # Convert logical to numeric (1/0)

# Generate heatmap using predefined dendrograms
Heatmap(
  binary_counts,
  cluster_rows = as.dendrogram(row_clustering),  # Row clustering
  cluster_columns = as.dendrogram(col_clustering),  # Column clustering
  col = c("white", "black"),  # White for 0 (absence), black for 1 (presence)
  name = "Presence/Absence",  # Legend title
  show_row_names = TRUE,
  show_column_names = TRUE
)

```

```{r}
# Perform clustering based on presence/absence (binary data)
row_clustering <- hclust(dist(binary_counts))  # Use binary_counts for row clustering
col_clustering <- hclust(dist(t(binary_counts)))  # Use binary_counts for column clustering

# Generate heatmap using predefined dendrograms
Heatmap(
  binary_counts,
  cluster_rows = as.dendrogram(row_clustering),  # Row clustering based on presence/absence
  cluster_columns = as.dendrogram(col_clustering),  # Column clustering based on presence/absence
  col = c("lightgray", "#408EC6"),  # White for 0 (absence), black for 1 (presence)
  name = "Presence/Absence",  # Legend title
  show_row_names = TRUE,
  show_column_names = TRUE,
  row_names_gp = gpar(fontsize = 6),  # Adjust row names font size
  column_names_gp = gpar(fontsize = 8)  # Adjust column names font size
)

```


It looks like, of the 51 total miRNAs identified, most are consistently present in all A.pulchra colonies and time points. For 10-15 of the miRNAs, presence/absence is more variable accross colonies and timepoints.


```{r}
results_df <- as.data.frame(sRNA_tp2.v.tp4.results)
results_df$Name <- rownames(results_df)

# Assuming your metadata frame also has a 'Name' column
# Perform the left join to select only the genes listed in metadata
filtered_results <- dplyr::left_join(Apul_counts_miRNA_namesdf, results_df, by = "Name")

filter(filtered_results, padj < 0.4)
```


# Normalized siRNA counts

## Isolate normalized/vsd siRNA
```{r siRNA-normalized-siRNA}

Apul_counts_sRNA_norm$Name <- rownames(Apul_counts_sRNA_norm)
Apul_counts_sRNA_vsd$Name <- rownames(Apul_counts_sRNA_vsd)

Apul_counts_siRNA_namesdf <- data.frame(Name = rownames(Apul_counts_siRNA)) 

Apul_counts_siRNA_norm <- left_join(Apul_counts_siRNA_namesdf, Apul_counts_sRNA_norm, by = c("Name" = "Name")) %>%
  column_to_rownames(var = "Name")
write.table(Apul_counts_siRNA_norm, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_siRNA_normalized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)

Apul_counts_siRNA_vsd <- left_join(Apul_counts_siRNA_namesdf, Apul_counts_sRNA_vsd, by = c("Name" = "Name")) %>%
  column_to_rownames(var = "Name")
write.table(Apul_counts_siRNA_vsd, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_siRNA_variancestabilized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)
```

## Normalized expression levels

Plot histograms of the normalized expression levels in each sample

```{r siRNA-norm-expression-level-histograms}
# Melt the count matrix into long format
Apul_counts_siRNA_norm_melted <- melt(Apul_counts_siRNA_norm, variable.name = "sample", value.name = "counts")

# Plot the expression level histograms for each sample
ggplot(Apul_counts_siRNA_norm_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "Gene Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()
```

## Normalized transcript counts

Check the total number of transcripts in each sample -- now that we've normalized the data these totals should be similar
```{r siRNA-norm-transcript-counts-plot}
# Calculate the total number of transcripts for each sample
total_transcripts_siRNA_norm <- colSums(Apul_counts_siRNA_norm)

# Create a data frame for plotting
total_transcripts_siRNA_norm_df <- data.frame(sample = names(total_transcripts_siRNA_norm),
                                   totals = total_transcripts_siRNA_norm)

# Plot the total number of transcripts for each sample
ggplot(total_transcripts_siRNA_norm_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) +
  labs(title = "Total Number of siRNA Transcripts per Sample",
       x = "Sample",
       y = "Total Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
```

## Heatmap

Of all siRNAs

```{r siRNA-heatmaps-vsd}
heat_colors <- rev(brewer.pal(12, "RdYlBu"))
pheatmap(as.matrix(Apul_counts_siRNA_vsd[apply(Apul_counts_siRNA_vsd, 1, var) > 0, ]), 
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = heat_colors,
         scale="row")

```