---
title: Calculating transcript counts per gene per sample in P.verrucosa enriched to
  elevated nutrients using Ballgown
author: "Sam White"
date: "2/28/2023"
output: html_document
---

# Uses all transcripts expression table, generated using [Ballgown](https://github.com/alyssafrazee/ballgown), for calculating transcript counts per gene per sample in _P.verrucosa_ enriched to elevated nutrients.

REQUIRES the following R libraries:

- `tidyverse`

# Load `R` libraries

```{r}
library("tidyverse")
```

# Set variables

```{r set-variables}
# Vectors for subsetting samples by different groups
all <- c("C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27", "C28", "C29", "C30", "C31", "C32", "E1", "E10", "E11", "E12", "E13", "E14", "E15", "E16", "E2", "E3", "E4", "E5", "E6", "E7", "E8", "E9")

controls <- c("C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27", "C28", "C29", "C30", "C31", "C32")

enriched <- c("E1", "E10", "E11", "E12", "E13", "E14", "E15", "E16", "E2", "E3", "E4", "E5", "E6", "E7", "E8", "E9")


# Vector of comparisons
comparisons <- c("c_vs_e")

# Vector of count calculations column names
count_calcs <- c("sum_transcript_counts",
                 "median_transcript_counts",
                 "mean_transcript_counts",
                 "max_transcript_counts",
                 "min_transcript_counts",
                 "std_dev_transcript_counts",
                 "controls_sum_transcript_counts",
                 "controls_median_transcript_counts",
                 "controls_mean_transcript_counts",
                 "controls_max_transcript_counts",
                 "controls_min_transcript_counts",
                 "controls_std_dev_transcript_counts",
                 "enriched_sum_transcript_counts",
                 "enriched_median_transcript_counts",
                 "enriched_mean_transcript_counts",
                 "enriched_max_transcript_counts",
                 "enriched_min_transcript_counts",
                 "enriched_std_dev_transcript_counts"
                 )

# Initialize lists of data frames
list_transcript_counts_dfs <- list()
list_transcript_max_diffs_dfs <- list()
```

# Define functions
```{r define-function(s)}

#### Function for determining maximum transcripts per gene #####

# Function accepts two data frames (df1 and df2), and a string (comparison)
# Returns data frame containing:
# - gene_name
# - df1_max_transcript_counts
# - df2_max_transcript_counts
# - difference between max transcript counts

diff_max_transcripts <- function(df1, df2, comparison) {
  
  # Select columns from first data frame
  df1.max <- df1 %>% 
      select(gene_name, contains("max_transcript_counts"))
  
  # Get name of max transcript counts column
  df1.max.name <- df1.max %>%
    select(contains("max_transcript_counts")) %>% 
    colnames()
  
  # Select columns from second data frame
  df2.max <- df2 %>% 
      select(gene_name, contains("max_transcript_counts"))
  
   # Get name of max transcript counts column
  df2.max.name <- df2.max %>%
    select(contains("max_transcript_counts")) %>% 
    colnames()
  
  # Join the two max transcripts data frames on gene_name
  df1max.df2max.joined <- left_join(df1.max, df2.max, by = "gene_name")
  
  # Calculate difference between df1.max and df2.max
  # Filter for only samples with different max transcript counts.
  # The get() function is required to use the string of the corresponding variable.
  df1max.df2max.joined %>% 
    select(everything()) %>% 
    mutate(difference = (get(df1.max.name) - get(df2.max.name))) %>% 
    filter(difference != 0)

}

########################################################################
```

# Load all transcript expression data
```{r load-transcript-data}
# Expression data
whole_tx_table <-  read.csv("../output/02-ballgown-differential_gene_expression/whole_tx_table.csv")

head(whole_tx_table)

# Rename gene_names listed as a "." to the associated transcript name
whole_tx_table <- whole_tx_table %>% mutate(gene_name = ifelse(gene_name == ".", t_name, gene_name))

head(whole_tx_table)
```

# Count transcripts for each gene for each sample.
A transcript is counted if it has an FPKM value > 0.
```{r count-transcripts-per-gene-per-sample}

# Create table of transcript counts per gene per sample
transcript_counts <- whole_tx_table %>%
  select(starts_with(c("gene_name", "FPKM"))) %>%
  group_by(gene_name) %>%
  summarise((across(everything(), ~sum(. > 0))))

head(transcript_counts)

# Rename columns
names(transcript_counts) <- gsub(x = names(transcript_counts), pattern = "FPKM", replacement = "transcript_counts")

head(transcript_counts)
```

# Calculate max transcripts per gene across all samples
```{r max-transcripts-per-gene}
max_transcripts_per_gene <- transcript_counts %>% 
  rowwise() %>% 
  mutate(transcripts.max = max(across(contains(all, ignore.case = FALSE)))) %>% 
  select(ends_with(c("name", "max")))

head(max_transcripts_per_gene)
```

## Write max transcripts to tab-delimited file
```{r write-max-transcripts-to-file}
write.table(max_transcripts_per_gene,
          file ="../output/03-transcript-counts/transcripts-counts-max_per_gene.tab",
          quote = FALSE,
          row.names = FALSE,
          col.names = FALSE,
          sep = "\t")
```

# Perform transcript count calcs
```{r perform-transcript-count-calcs}
# Perform calculations
transcript_counts_per_gene_per_sample <- transcript_counts %>%
  rowwise() %>% 
  mutate(
    sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# Male stats
transcript_counts_per_gene_per_sample_males <- transcript_counts %>%
  rowwise() %>% 
  select("gene_name", ends_with('M')) %>% 
  mutate(
    male_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    male_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    male_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    male_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    male_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    male_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# Female stats
transcript_counts_per_gene_per_sample_females <- transcript_counts %>%
  rowwise() %>% 
  select("gene_name", ends_with('F')) %>% 
  mutate(
    female_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    female_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    female_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    female_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    female_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    female_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# Controls stats
transcript_counts_per_gene_per_sample_controls <- transcript_counts %>%
  rowwise() %>% 
  select("gene_name", ends_with(controls)) %>% 
  mutate(
    controls_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    controls_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    controls_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    controls_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    controls_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    controls_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# enriched stats
transcript_counts_per_gene_per_sample_enriched <- transcript_counts %>%
  rowwise() %>% 
  select("gene_name", ends_with(enriched)) %>% 
  mutate(
    enriched_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# Controls females stats
transcript_counts_per_gene_per_sample_controls_females <- transcript_counts %>%
  rowwise() %>% 
  select("gene_name", ends_with(controls_females)) %>% 
  mutate(
    control_females_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_females_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_females_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_females_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_females_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_females_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# enriched females stats
transcript_counts_per_gene_per_sample_enriched_females <- transcript_counts %>%
  rowwise() %>% 
  select("gene_name", ends_with(enriched_females)) %>% 
  mutate(
    enriched_females_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_females_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_females_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_females_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_females_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_females_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# Controls males stats
transcript_counts_per_gene_per_sample_controls_males <- transcript_counts %>%
  rowwise() %>% 
  select("gene_name", ends_with(controls_males)) %>% 
  mutate(
    control_males_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_males_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_males_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_males_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_males_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    control_males_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# enriched males stats
transcript_counts_per_gene_per_sample_enriched_males <- transcript_counts %>%
  rowwise() %>% 
  select("gene_name", ends_with(enriched_males)) %>% 
  mutate(
    enriched_males_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_males_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_males_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_males_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_males_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))),
    enriched_males_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs)))
  )

# Add data frames to list
# Wraps ls() with grep to allow for needed Perl regex (the  "^(?!list).*" aspect) because
# ls() doesn't support Perl regex
# Regex excludes any results beginning with the word "list"
list_transcript_counts_dfs <- mget(grep("^(?!list).*", ls(pattern = "transcript_counts_per_gene_per_sample"), value = TRUE, perl = TRUE))

head(transcript_counts)
```

## Write transcript per gene counts to files.
```{r write-transcript-per-gene-counts-to-files}

# Write data frames to CSVs in ../output/34-transcript-counts/ dir
# Uses names of data frames as names of output files.
sapply(names(list_transcript_counts_dfs),
       function(x) write.csv(list_transcript_counts_dfs[[x]],
                             file = file.path("../output/34-transcript-counts/", paste(x, "csv", sep=".")),
                             quote = FALSE,
                             row.names = FALSE)
       )
```

# Determine differences in max number of transcripts between comparisons
```{r calculate-difference-max-transcripts}

for (comparison in comparisons) {
  
  if (comparison == "f_vs_m") {
    diffs.max.transcripts_per_gene.females.vs.males <-
      diff_max_transcripts(transcript_counts_per_gene_per_sample_females,
                         transcript_counts_per_gene_per_sample_males,
                         comparison)
  } else if (comparison == "c_vs_e") {
    diffs.max.transcripts_per_gene.controls.vs.enriched <-
      diff_max_transcripts(transcript_counts_per_gene_per_sample_controls,
                         transcript_counts_per_gene_per_sample_enriched,
                         comparison)
  } else if (comparison == "c.f_vs_e.f") {
    diffs.max.transcripts_per_gene.controls_females.vs.enriched_females <-
      diff_max_transcripts(transcript_counts_per_gene_per_sample_controls_females,
                         transcript_counts_per_gene_per_sample_enriched_females,
                         comparison)
  } else if (comparison == "c.m_vs_e.m") {
    diffs.max.transcripts_per_gene.controls_males.vs.enriched_males <-
      diff_max_transcripts(transcript_counts_per_gene_per_sample_controls_males,
                         transcript_counts_per_gene_per_sample_enriched_males,
                         comparison)
  } else if (comparison == "c.f_vs_c.m") {
    diffs.max.transcripts_per_gene.controls_females.vs.controls_males <-
      diff_max_transcripts(transcript_counts_per_gene_per_sample_controls_females,
                         transcript_counts_per_gene_per_sample_controls_males,
                         comparison)
  } else if (comparison == "e.f_vs_e.m") {
    diffs.max.transcripts_per_gene.enriched_females.vs.enriched_males <-
      diff_max_transcripts(transcript_counts_per_gene_per_sample_enriched_females,
                         transcript_counts_per_gene_per_sample_enriched_males,
                         comparison)
  }
  
}

# Add data frames to list
# Wraps ls() with grep to allow for needed perl regex (the  "^(?!list).*" aspect) because
# ls() doesn't support perl regex
# Regex excludes any results beginning with the word "list"
list_transcript_max_diffs_dfs <- mget(grep("^(?!list).*",
                                           ls(pattern = "max.transcripts_per_gene"),
                                           value = TRUE,
                                           perl = TRUE)
                                      )

```

## Write max transcript diffs to files.
```{r write-transcript-per-gene-counts-to-files}

# Write data frames to CSVs in ../output/34-transcript-counts/ dir
# Uses names of data frames as names of output files.
sapply(names(list_transcript_max_diffs_dfs),
       function(x) write.csv(list_transcript_max_diffs_dfs[[x]],
                             file = file.path("../output/34-transcript-counts/", paste(x, "csv", sep=".")),
                             quote = FALSE,
                             row.names = FALSE)
       )
```