---
title: Identification of differentially expressed transcripts in P.verrucosa exposed
  to enriched nutrient supply.
author: "Sam White"
date: "2/24/2023"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


# Use [Ballgown](https://github.com/alyssafrazee/ballgown) for identification of differentially expressed isoforms in _P.verrucosa_ exposed to elevated nutrient supplyu.

REQUIRES Linux-based system to run all chunks properly; some chunks will not work on Mac OS!

REQUIRES the following Bash programs:

- `wget`

- `tree`

- `md5sum`

REQUIRES the following R libraries:

- [`Ballgown`](https://github.com/alyssafrazee/ballgown) (Bioconductor)

- `tidyverse`

# Load `R` libraries
```{r load-libraries}
library("ballgown")
library("tidyverse")
library("ggplot2")
```


# Set user variables!
```{r set-variables}
# Set maximum pvalue for isoform expression cutoff
pvalue <- 0.05
# Set maximum qvalue (false discovery rate) for isoform expression cutoff
qvalue <- 0.05
```


# Download Ballgown input files.

Notebooks detailing their creation:

- [FastQ trimming](https://robertslab.github.io/sams-notebook/2023/02/15/FastQ-Trimming-and-QC-P.verrucosa-RNA-seq-Data-from-Danielle-Becker-in-Hollie-Putnam-Lab-Using-fastp-FastQC-and-MultiQC-on-Mox.html)

- [Genome indexing, and exon/splice sites with HISAT2](https://robertslab.github.io/sams-notebook/2023/01/31/Genome-Indexing-P.verrucosa-v1.0-Assembly-with-HiSat2-on-Mox.html)

- [Mapping and identificaion of isoforms with StingTie]()

```{bash download-ballgown-files}
# Download Ballgown input files and directory structure
wget \
--directory-prefix ../data/ballgown \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 2 \
--no-host-directories \
--no-parent \
--quiet \
--reject "input_fastqs_checksums.md5" \
--accept "*.ctab,*checksums.md5" https://gannet.fish.washington.edu/Atumefaciens/20230216-pver-stringtie-Pver_genome_assembly_v1.0-isoforms/
```

# Verify checksums

NOTE: Warnings are expected, as the checksums files have checksums for files that are not downloaded for this project.
```{bash checksum-verification}
cd ../data/ballgown

# Remove "top level" checksums file
# Prevents pattern matching downstream
rm checksums.md5

# Make a line
line="-----------------------------------------------------------------------------------------------"

# Set working directory
wd=$(pwd)

# Loop through directories and verify checksums
for directory in */
do
  cd "${directory}"
  # Get sample name; strips trailing slash from directory name
  sample="${directory%/}"
  
  echo ${line}
  echo "${sample}"
  echo ""
  
  # Confirm checksums; sorts for easier reading
  md5sum --check "${sample}"_checksums.md5 | sort -V
  echo ""
  echo "${line}"
  echo ""
  cd ${wd}
done
# Show downloaded directories/files
tree
```


# Read in _P.verrucosa_ genes BED file
```{r read-in-genes-BED}
genes_BED <- read.table(file = "../data/Pver_genome_assembly_v1.0-valid.genes.bed")

# Add BED column names for more clarity
colnames(genes_BED) <- c("chr", "start", "end", "name", "score", "strand")
head(genes_BED)
```

# Find Ballgown installation location
```{r}
data_directory <-  system.file('extdata', package='ballgown') # automatically finds ballgown's installation directory
# examine data_directory:
data_directory
```

# Create Ballgown object
```{r}
# Uses regular expression in samplePattern to find all pertinent folders
# Load all measurement data
bg_all <- ballgown(dataDir="../data/ballgown/", samplePattern='[CE]*', meas='all')
bg_all
bg_controls <- ballgown(dataDir="../data/ballgown/", samplePattern='C.*', meas='all')
bg_controls
bg_enriched <- ballgown(dataDir="../data/ballgown/", samplePattern='E.*', meas='all')
bg_enriched
```


# Download and filter metadata file

Filtered metadata will be used to create a phenotype dataframe needed for Ballgown differential expression analysis.

```{r create-dataframes-for-ballgown-pData}
# Read in metadata file
sample_metadata_full <- read.csv("../data/metadata.RNAseq.csv")

# View full metadata
head(sample_metadata_full)

# Organize metadata in preparation of creating pData data frame far Ballgown
# Sort by "sample_id" to ensure matches directory structure (required for Ballgown)
sample_metadata_full <- sample_metadata_full %>% 
  select(fragment.ID, treatment, block, sample_id) %>% 
  arrange(sample_id)
# View organized metadata
head(sample_metadata_full)
```

# Load ALL phenotype dataframe into Ballgown object
```{r}
# Load phenotype info into Ballgown
pData(bg_all) <- sample_metadata_full
# Examine phenotype data as it exists in Ballgown
phenotype_table <-  pData(bg_all)
head(phenotype_table)
```


### Look at ALL exon info
```{r}
# Exon info
structure(bg_all)$exon
```

### Look at ALL intron info
```{r}
# Intron info
structure(bg_all)$intron
```

###  Look at ALL transcript (isoform) info
```{r}
# Transcript info
structure(bg_all)$trans
```

# Load ALL transcript expression data
```{r load-all-transcript-expression-data}
# Expression data
whole_tx_table <-  texpr(bg_all, 'all')

# Rename gene_names listed as a "."
whole_tx_table <- whole_tx_table %>% mutate(gene_name = ifelse(gene_name == ".", t_name, gene_name))
head(whole_tx_table)

# FPKM values for all transcripts
# Converts output to data frame
transcript_fpkm <- as.data.frame(texpr(bg_all, 'FPKM'))

# Put rownames into column for further manipulation
transcript_fpkm <- rownames_to_column(transcript_fpkm, "t_id")
head(transcript_fpkm)
```

## Write all transcript data to files
```{r write-transcript-data-to-file}
# Write all transcript FPKM data to file
write.csv(transcript_fpkm,
          file = "../output/02-ballgown-differential_gene_expression/fpkm-all_transcripts.csv",
          quote = FALSE,
          row.names = FALSE)

# Write whole_tx_table to file
write.csv(whole_tx_table,
          file ="../output/02-ballgown-differential_gene_expression/whole_tx_table.csv",
          quote = FALSE,
          row.names = FALSE)
```

# Load ALL gene expression data
```{r load-gene-expression-data}
whole_gx_table <-  gexpr(bg_all)

# Convet to data frame
whole_gx_table <- as.data.frame(whole_gx_table)

# Convert rownames to a column
whole_gx_table <- rownames_to_column(whole_gx_table, "name")
head(whole_gx_table)
```

## Write ALL gene data to files
```{r write-gene-data-to-file}
# Write whole_gx_table to file
write.csv(whole_gx_table,
          file ="../output/02-ballgown-differential_gene_expression/whole_gx_table.csv",
          quote = FALSE,
          row.names = FALSE)
```

# Generate boxplots to compare FPKM across all samples
```{r boxplots-to-compare-FPKM-across-all-samples} 
# Load phenotype info into Ballgown
pData(bg_all) <- sample_metadata_full

# Pull all transcript expression values stored in FPKM measurement from ballgown object
fpkm <- texpr(bg_all, meas = "FPKM")

# Log transform data and add 1 to all values to artificially prevent log2(0).
fpkm_df <- as.data.frame(log2(fpkm+1))
head(fpkm_df)

# Rotate data frame
# Creates a "treatment" column by pulling last character from each sample name (C/E)
fpkm_df_pivot <- pivot_longer(
  fpkm_df, cols = starts_with("FPKM"), 
  names_to = "library") %>% 
  mutate(treatment = (str_sub(library, start = 6, end = 6)
                )
         )
head(fpkm_df_pivot)

# Sort data frame by treatment
fpkm_df_pivot_sorted <- fpkm_df_pivot %>% arrange(treatment)
head(fpkm_df_pivot_sorted)

# Set unique library names as vector
# Will be used to group boxplot by treatment
fpkm_libraries_sorted_unique <- (unique(fpkm_df_pivot_sorted$library))
head(fpkm_libraries_sorted_unique)

# Re-order data frame by treatment-sorted data
fpkm_df_pivot$library <- factor(fpkm_df_pivot$library, levels = fpkm_libraries_sorted_unique)

# Produce boxplots of FPKM for each library
# Grouped by treatment
ggplot(fpkm_df_pivot, aes(library, value, color = treatment)) +
  geom_boxplot() +
  ggtitle("Comparisons of transcript FPKM values across all libraries") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ylab("FPKM") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# Save boxplot as PDF
ggsave(filename = "../output/02-ballgown-differential_gene_expression/figures//fpkm_c-vs-e_boxplot.pdf")
```

# Identify differentially expressed transcripts (DETs) between treatments
Returns FoldChange, too (getFC = TRUE)

```{r DETs-controls-vs-enriched}
# Set string describing comparison
# Used for final printout in chunk
comparison <- "controls vs. enriched"

# Load phenotype info into Ballgown
pData(bg_all) <- sample_metadata_full

# Identify DETs
DET_treatment_stat_results = stattest(bg_all, feature='transcript', meas='FPKM', covariate="treatment", getFC = TRUE)
head(DET_treatment_stat_results)

# Filter based on p-value and q-value
DET_treatment_filtered_stat_results <- filter(DET_treatment_stat_results, pval <= pvalue & qval <= qvalue)
head(DET_treatment_filtered_stat_results)

# Merge with full table to get subset of just differentially expressed isoforms
merged_DET_treatment_filtered <- merge(x = DET_treatment_filtered_stat_results, y = whole_tx_table, by.x = "id", by.y = "t_id")
head(merged_DET_treatment_filtered)

# Convert to BED coordinate system, which is 0-based start
# and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY
merged_DET_treatment_filtered <- merged_DET_treatment_filtered %>% mutate(start = start - 1)
head(merged_DET_treatment_filtered)

# Filter for controls up-regulated transcripts (i.e. fold-change < 1)
# Convert to BED coordinate system, which is 0-based start
# and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY
DET_treatment_filtered_controls <- filter(merged_DET_treatment_filtered, fc < 1) %>% mutate(start = start - 1)
head(DET_treatment_filtered_controls)

# Filter for enriched up-regulated transcripts (i.e. fold-change > 1)
# Convert to BED coordinate system, which is 0-based start
# and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY
DET_treatment_filtered_enriched <- filter(merged_DET_treatment_filtered, fc > 1) %>% mutate(start = start - 1)
head(DET_treatment_filtered_enriched)

# Count number of DETs.
count_DET_treatment_filtered_stat_results <-  nrow(DET_treatment_filtered_stat_results)

# Print number of DETs
cat("Number of", comparison, "DET with p-values and q-values <= ", pvalue, ":", count_DET_treatment_filtered_stat_results, "\n")

# Count number of controls DETs.

count_DET_treatment_filtered_controls <-  nrow(DET_treatment_filtered_controls)

# Print number of controls DETs.
cat("Number of controls DET with p-values and q-values <= ", pvalue, ":", count_DET_treatment_filtered_controls, "\n")

# Count number of enriched DETs.
count_DET_treatment_filtered_enriched <-  nrow(DET_treatment_filtered_enriched)

# Print number of enriched DETs.
cat("Number of enriched DETs with p-values and q-values <= ", pvalue, ":", count_DET_treatment_filtered_enriched, "\n")
```

## Write controls vs enriched DET dataframes to files
```{r write-DETs-controls-vs-enriched-to-files}
write.csv(
  merged_DET_treatment_filtered, 
  file = "../output/02-ballgown-differential_gene_expression/DET_treatment_filtered_p-0.05_q-0.05.csv", 
  quote = FALSE, 
  row.names = FALSE)

# Creates a BED file of all DETs and inserts necessary columns to create properly formatted BED file. 
write.table(
  (
    merged_DET_treatment_filtered %>% 
    select(chr, start, end, t_name, strand) %>% 
      add_column(
        score = "0", # Inserts a "score" column and assigns a value of "0" to all rows.
        .before = "strand"
        )
    ), 
  file = "../output/02-ballgown-differential_gene_expression/DET_treatment_filtered_p-0.05_q-0.05.bed",
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

# Creates a BED file of only controls DETs and inserts necessary columns to create properly formatted BED file. 
write.table(
  (
    DET_treatment_filtered_controls %>% 
    select(chr, start, end, t_name, strand) %>% 
      add_column(
        score = "0", # Inserts a "score" column and assigns a value of "0" to all rows.
        .before = "strand"
        )
    ), 
  file = "../output/02-ballgown-differential_gene_expression/DET_treatment_controls_filtered_p-0.05_q-0.05.bed",
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

# Creates a BED file of only enriched DETs and inserts necessary columns to create properly formatted BED file. 
write.table(
  (
    DET_treatment_filtered_enriched %>% 
    select(chr, start, end, t_name, strand) %>% 
      add_column(
        score = "0", # Inserts a "score" column and assigns a value of "0" to all rows.
        .before = "strand"
        )
    ), 
  file = "../output/02-ballgown-differential_gene_expression/DET_treatment_enriched_filtered_p-0.05_q-0.05.bed",
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)
```

# Identify differentially expressed genes (DEGs) between treatment
Returns FoldChange, too (getFC = TRUE)
```{r DEGs-controls-vs.-enriched}
# Set string describing comparison
# Used for final printout in chunk
comparison <- "controls vs. enriched"

# Load phenotype info into Ballgown
pData(bg_all) <- sample_metadata_full

# Determine DEGs with ballgown
DEG_treatment_stat_results = stattest(bg_all, feature='gene', meas='FPKM', covariate="treatment", getFC = TRUE)
head(DEG_treatment_stat_results)

# Filter based on p-value and q-value
DEG_treatment_filtered_stat_results <- filter(DEG_treatment_stat_results, pval <= pvalue & qval <= qvalue)
head(DEG_treatment_filtered_stat_results)

# Merge with full table to get subset of just DEGs
# and their corresponding transcripts.
merged_DEG_treatment_filtered <- merge(x = DEG_treatment_filtered_stat_results, y = whole_tx_table, by.x = "id", by.y = "gene_id")
head(merged_DEG_treatment_filtered)

# Merge with BED file to get DEGs WITHOUT associated transcripts
merged_DEG_treatment_filtered_genes_only <- merge(x = DEG_treatment_filtered_stat_results, y = genes_BED, by.x = "id", by.y = "name")

head(merged_DEG_treatment_filtered_genes_only)

# Convert to BED coordinate system, which is 0-based start
# and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY
merged_DEG_treatment_filtered_ <- merged_DEG_treatment_filtered %>% mutate(start = start -1)
head(merged_DEG_treatment_filtered)

# Filter for controls up-regulated genes (i.e. fold-change < 1)
DEG_treatment_filtered_controls <- filter(merged_DEG_treatment_filtered_genes_only, fc < 1)
head(DEG_treatment_filtered_controls)

# Filter for enriched up-regulated genes (i.e. fold-change < 1)
DEG_treatment_filtered_enriched <- filter(merged_DEG_treatment_filtered_genes_only, fc > 1)
head(DEG_treatment_filtered_enriched)

# Count number of DEG.
count_merged_DEG_treatment_filtered_genes_only <-  nrow(merged_DEG_treatment_filtered_genes_only)

# Print number of DEG
cat("Number of ", comparison, " DEG with p-values and q-values <= ", pvalue, ":", count_merged_DEG_treatment_filtered_genes_only, "\n")

# Count number of controls DEG.
count_DEG_treatment_filtered_controls <-  nrow(DEG_treatment_filtered_controls)

# Print number of controls DEG.
cat("Number of controls DEG with p-values and q-values <= ", pvalue, ":", count_DEG_treatment_filtered_controls, "\n")

# Count number of enriched DEG.
count_DEG_treatment_filtered_enriched <-  nrow(DEG_treatment_filtered_enriched)

# Print number of enriched DEG.
cat("Number of enriched DEG with p-values and q-values <= ", pvalue, ":", count_DEG_treatment_filtered_enriched, "\n")
```

## Write DEGs enriched vs. controls dataframes to files
```{r write-DEGs-controls-vs-enriched-to-files}
# Write merged dataframe to CSV
write.csv(
  merged_DEG_treatment_filtered, 
  file = "../output/02-ballgown-differential_gene_expression/DEG_treatment_all_filtered_p-0.05_q-0.05.csv", 
  quote = FALSE, 
  row.names = FALSE)

# Creates a BED file of all DEG. 
write.table(
  (merged_DEG_treatment_filtered_genes_only %>% 
    select(chr, start, end, id, score, strand)
    ), 
  file = "../output/02-ballgown-differential_gene_expression/DEG_treatment_all_filtered_p-0.05_q-0.05.bed",
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

# Creates a BED file of only controls DEGs.
write.table(
  (
    DEG_treatment_filtered_controls %>% 
    select(chr, start, end, id, score, strand)
    ),
  file = "../output/02-ballgown-differential_gene_expression/DEG_treatment_controls_filtered_p-0.05_q-0.05.bed",
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

# Creates a BED file of only enriched DEGs. 
write.table(
  (
    DEG_treatment_filtered_enriched %>% 
    select(chr, start, end, id, score, strand)
    ),
  file = "../output/02-ballgown-differential_gene_expression/DEG_treatment_enriched_filtered_p-0.05_q-0.05.bed",
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)
```