---
title: "deseq2 with featurecounts"
author: "Megan Ewing"
date: "2024-05-13"
output: html_document
---

```{r}
# Load the DESeq2 library
# install.packages("BiocManager")
# BiocManager::install("DESeq2")
library(DESeq2)
library(dplyr)
```

### Getting Matrix from Feature Counts

```{r}

# Read the FeatureCounts output file into R
featureCounts <- read.csv("../output/STAR_count_data.csv", header=TRUE, row.names=1)
head(featureCounts)

#subset from feature counts of just the count data
countData <- featureCounts[, -(31:33)] 

```

Now, the STAR_count_data.csv, has not yet corrected for the mislabing of M-C-193 as M-T-193. The following chunks assigns that column to a new position (among the other control samples) and renames it appropriately.

```{r}

#confirm location
colnames(countData[16])

#we're moving it to first postion
countData <- countData[, c(16, 1:15, 17:ncol(countData))]
head(countData)

```

```{r}
metaData <- data.frame(
  sample = c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329", "M.C.334", "M.C.337", "M.C.339", "M.C.358", "M.C.360", "M.C.363", "M.C.373", "M.C.482","M.C.488", "M.T.18",  "M.T.20", "M.T.22", "M.T.235", "M.T.245", "M.T.29", "M.T.31", "M.T.32", "M.T.399", "M.T.43", "M.T.44", "M.T.500", "M.T.7", "M.T.83", "M.T.8"),
  treatment = c(rep("control", 15), rep("treatment", 15))
  )
```

### Running DESeq (initial, no pre-filter)

```{r}
# Create a DESeqDataSet object
dds <- DESeqDataSetFromMatrix(countData,
                              colData = metaData,
                              design = ~ treatment)

# Normalize the data
dds <- DESeq(dds)

# Perform differential expression analysis
res <- results(dds)

```

### Filter Parameters: Determining Filter Parameters Based of Count Summary Stats

```{r}
# Load necessary libraries
library(DESeq2)
library(ggplot2)
library(reshape2)

# Assume `dds` is your DESeqDataSet object

# Extract the counts matrix
counts_matrix <- counts(dds)

# Convert the counts matrix to a long format for ggplot2
counts_long <- melt(counts_matrix)

# Rename columns for clarity
colnames(counts_long) <- c("Gene", "Sample", "Count")

# removing 0 counts
counts_long <- counts_long[counts_long$Count > 0,]

# summary
summary(counts_long)

#
# BOXPLOT
#

# Rename columns for clarity
colnames(counts_long) <- c("Gene", "Sample", "Count")

# Plot a single box plot for all counts combined
ggplot(counts_long, aes(x = "", y = Count)) +
  geom_boxplot(fill = "blue", color = "black") +
  theme_minimal() +
  scale_y_log10() +
  labs(title = "Box Plot of Read Counts",
       x = "",
       y = "Read Count")

# Save the plot
ggsave("../output/boxplot_of_all_read_counts.png")


#
# HISTOGRAM
#

counts_long <- counts_long[counts_long$Count > 0 & counts_long$Count <= 100, ]

# Plot the histogram
ggplot(counts_long, aes(x = Count)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Histogram of Read Counts",
       x = "Read Count",
       y = "Frequency")

# Save the plot
ggsave("../output/0807-histogram_of_read_counts_1to100.png")


```

### Filtered: Running DESeq, Seperated Treatment and Control – INDIVUDALS THRESHOLD, NOT AVERAGE BASED

```{r}
# Assume `dds` is your DESeqDataSet object

# Pre-filtering step
# Calculate the counts per gene in the control and treatment samples
counts_control <- counts(dds)[, 1:15]
counts_treatment <- counts(dds)[, 16:30]

# Function to check if a gene has at least 8 samples with at least 10 reads
has_min_samples <- function(counts, min_samples = 8, min_reads = 27) {
  rowSums(counts >= min_reads) >= min_samples
}

# Filter genes that have at least 8 samples with at least 10 reads in either control or treatment
filter <- has_min_samples(counts_control) | has_min_samples(counts_treatment)
dds2 <- dds[filter, ]

# Normalize the data
dds2 <- DESeq(dds2)

# Perform differential expression analysis
res2 <- results(dds2)

# Save the results
write.table(res2, "../output/1020-STAR-deseqres_ToC_8ind27r.tab", row.names = TRUE, col.names = TRUE)

```

Getting full counts

```{r}

# Extract counts and normalize
counts_all <- counts(dds2, normalized = TRUE)

# Log-transform counts
log_counts_all <- log2(counts_all + 1)

# shorten col names
short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18",  "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29"  ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43","M.T.44" , "M.T.500", "M.T.7" ,  "M.T.83" , "M.T.8")

colnames(log_counts_all) <- short_col
head(log_counts_all)

write.csv(log_counts_all, "../output/1020-STAR-logcounts_all_8ind27r_ToC.csv")

```

### Filtered: data explore and filtering significant

rename input/output files as needed

```{r}
allcounts <- read.csv("../output/1020-STAR-deseqres_ToC_8ind27r.tab", sep="")
head(allcounts)

# Extract significant results
sigresults <- res2[which(res2$padj < 0.05),]

# Explore the results
head(sigresults)

write.table(sigresults,"../output/1020-STAR-DEGstats_ToC_8ind27r.tab", row.names = T, col.names = T)

# peeking at sigresults
sigres_table <- read.csv("../output/1020-STAR-DEGstats_ToC_8ind27r.tab", sep="")
head(sigres_table)

sigDEGname <- row.names(sigres_table)

# write.table(sigDEGname, "../output/0807-DEGnames_ToC_8ind27r.tab", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

```

### Filtered: Heatmap of Top 25 Sig. DEG

```{r}
library(pheatmap)

# order by padj
res_ordered <- res2[order(res2$padj), ]

# getting top 25
all_deg <- row.names(res_ordered)[1:25]

# Extract counts and normalize
counts <- counts(dds2, normalized = TRUE)
counts_deg <- counts[all_deg, ]

# Log-transform counts
log_counts_top <- log2(counts_deg + 1)

# shorten col names
short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18",  "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29"  ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43" ,"M.T.44" , "M.T.500", "M.T.7" ,  "M.T.83" , "M.T.8")

colnames(log_counts_top) <- short_col
head(log_counts_top)

write.csv(log_counts_top, "../output/log_counts_top25_DEGs_8ind23r_ToC.csv")


# Open a PNG device to save the plot
png("../output/heatmap_top25_DEGs_8ind23r_ToC_clust.png", width = 1000, height = 1000)

# Generate the heatmap
pheatmap(log_counts_top, 
         scale = "row", 
         cluster_cols = T,
         main = "Heatmap of Log2 Fold Change for Top 25 DEGs, Clustered")

# Close the PNG device
dev.off()
```

### Filtered: Heatmap of All Sig. DEG

```{r}

# library(pheatmap)

res_ordered <- res2[order(res2$padj), ]

# getting a all (replace 1:# with is total # of DEGs (can glance at stored value sigDEGname))
all_deg <- row.names(res_ordered)[1:48]

# Extract counts and normalize
counts <- counts(dds2, normalized = TRUE)
counts_deg <- counts[all_deg, ]

# Log-transform counts
log_counts_deg <- log2(counts_deg + 1)

# shorten col names
short_col <- c("M.C.193",  "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18",  "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29"  ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43"
,"M.T.44" , "M.T.500", "M.T.7" ,  "M.T.83" , "M.T.8")

colnames(log_counts_deg) <- short_col
head(log_counts_deg)

write.csv(log_counts_deg, "../output/1020-STAR-logcounts_allDEG_8ind27r_ToC.csv")

# Open a PNG device to save the plot
png("../output/1020-STAR-DEG_heatmap_8ind27r_ToC_clust.png", width = 500, height = 700)

# Generate the heatmap
pheatmap(log_counts_deg, 
         scale = "row", 
         cluster_cols = T,
         main = "Heatmap All DEGs (padj < 0.05) clustered")

# Close the PNG device
dev.off()

```

### Filtered: Heatmap with Full Gene Names

```{r}
# Assuming 'gene_loc' contains two columns: 
# "V1" (the existing row names in log_counts_deg) 
# and "V2" (the corresponding new gene names).

gene_loc <- read.csv("../output/gene_loc.csv", header=FALSE)

# Merge or match old names to gene names
row_names_current <- rownames(log_counts_deg)
# Create a named vector to map old names to gene names
gene_mapping <- setNames(gene_loc$V2, gene_loc$V1)

# Replace row names in log_counts_deg with gene names
new_gene_names <- gene_mapping[row_names_current]
rownames(log_counts_deg) <- new_gene_names

# Check if row names have been successfully replaced
head(log_counts_deg)

# Generate the heatmap with gene names as row labels
# Create the column annotation
annotation <- data.frame(
  Condition = factor(rep(c("ambient pH", "low pH"), each = 15))
)

# Set colors for the annotations (ambient pH -> lightpink, low pH -> lightblue)
annotation_colors <- list(
  Condition = c("ambient pH" = "green", "low pH" = "hotpink")
)

# Ensure row names of annotation match column names of the heatmap matrix
rownames(annotation) <- colnames(log_counts_deg)

# Open a PNG device to save the plot
png("../output/0930-DEG_heatmap_8ind23r_ToC_clust_genename2.png", width = 1400, height = 1500)

# Generate the heatmap
pheatmap(log_counts_deg, 
         scale = "row", 
         cluster_cols = TRUE,
         main = "Heatmap All DEGs (padj < 0.05), Clustered",
         annotation_col = annotation,  # Add the column annotations
         annotation_colors = annotation_colors,  # Add annotation colors
         legend = TRUE,
         border_color = "black",
         fontsize = 20)


# Close the PNG device
dev.off()


```

### Post Filter Recurrent Genes Heatmap

code below is using the 30n ToC as the base since it had the most genes

```{r}
# only selecting for LOC that popped up across all 4 pre-filters

# vector of genes that recurred 
occursmost <- c("LOC132713118", "LOC132722155", "LOC132722176", "LOC132723296", 
            "LOC132725056", "LOC132728236", "LOC132729677", "LOC132730403", 
            "LOC132731011", "LOC132732296", "LOC132734395", "LOC132740341", 
            "LOC132741452", "LOC132741637", "LOC132742068", "LOC132743875", 
            "LOC132745166", "LOC132745822", "LOC132746012", "LOC132746399", 
            "LOC132746437", "LOC132746792", "LOC132752133", "LOC132752300", 
            "LOC132758254", "LOC132758583", "LOC132758975", "LOC132759052", 
            "LOC132759372", "LOC132759828")

# dataframe of genes that recurred
# occursmost <- data.frame("gene" = occursmost)
```

```{r}
# Load the necessary library
library(pheatmap)

# Order the results by adjusted p-value
res_ordered <- res2[order(res2$padj), ]

# Get the top differentially expressed genes (all 84 genes)
all_deg <- row.names(res_ordered)[1:87]

# Extract counts and normalize
counts <- counts(dds2, normalized = TRUE)
counts_deg <- counts[all_deg, ]

# Log-transform the counts
log_counts_deg <- log2(counts_deg + 1)

# Shorten column names
short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18",  "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29"  ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43", "M.T.44" , "M.T.500", "M.T.7" ,  "M.T.83" , "M.T.8")
colnames(log_counts_deg) <- short_col

# Display the first few rows to check the transformation
head(log_counts_deg)

# Assume occursmost is a vector containing the gene names to include
# Example: occursmost <- c("Gene1", "Gene2", "Gene3")
# Filter log_counts_deg to include only the genes in occursmost
log_counts_deg_filtered <- log_counts_deg[rownames(log_counts_deg) %in% occursmost, ]

# Ensure all values are finite
log_counts_deg_filtered <- log_counts_deg_filtered[apply(log_counts_deg_filtered, 1, function(x) all(is.finite(x))), ]

# Save the filtered log-transformed counts to a CSV file
write.csv(log_counts_deg_filtered, "../output/0725-logcounts_filtered_occursmost.csv")

# Open a PNG device to save the heatmap
png("../output/0725-DEG_heatmap_occursmost.png", width = 1000, height = 1000)

# Generate the heatmap with filtered genes
pheatmap(log_counts_deg_filtered, 
         scale = "row", 
         cluster_cols = FALSE,
         main = "Heatmap Filtered DEGs (occursmost)")

# Close the PNG device
dev.off()


```

### Abandoned Chunks

```{r}


# COMMENTING OUT - ABANDONED CHUNK
#
#
#
#
# prepping DEG table for joining with blast results in next step by adding in the gene long ids
# 
# deg_noname <- read.csv("../output/0513-DEG.tab", sep = " ")
# colnames(deg_noname) <- c("LOC", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")
# 
# #getting the long gene id names that we will join with blast
# gene_id <- data.frame(rownames(featureCounts), featureCounts$Chr)
# 
# #merging those IDs back to our DEGs
# deg_names <- merge(x = deg_noname, y = gene_id, by.x = "LOC", by.y = "rownames.featureCounts.")
# 
# #writing the output to a file (overwriting previous one)
# write.table(deg_names,"../output/0513-DEG.tab", col.names = T)

```