---
title: "Enrichment Result Compilation"
author: "Yaamini Venkataraman"
output: github_document
---

In this script, I'll create a summary table with all enrichment results from all analyses. This will aid the functional interpretation of results.

# Prepare R Markdown file

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = normalizePath("../output/98-enrichment-result-compilation/")) #Set root directory
```

```{r}
getwd()
```

# Install packages

```{r}
# if ("tidyverse" %in% rownames(installed.packages()) == 'FALSE') install.packages('tidyverse')
```

```{r}
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("topGO")
```

```{r}
require(tidyverse)
require(topGO)
```

# Obtain session information

```{r}
sessionInfo()
```

# Import and format annotation data

```{r}
mRNATrack <- read.delim("../../genome-features/C_virginica-3.0-mRNA.gff", header = FALSE) %>%
  separate(., V9, into = c("V10", "V11"), sep = "ID=rna-") %>%
  separate(., V11, into = c("V12", "V13"), sep = ";Parent=") %>%
  separate(., V13, into = c("V14", "V15"), sep = ";Dbxref=") %>%
  separate(., V15, into = c("V16", "V17"), sep = ";product=") %>%
  separate(., V17, into = c("V18", "V19"), sep = ";transcript_id=") %>%
  dplyr::select(., c("V12", "V14", "V18")) %>%
  dplyr::rename(., transcript = V12, gene = V14, product = V18) %>%
  mutate(., gene_name = gsub("gene-", "", gene)) %>%
  dplyr::select(., -gene) #Winnow down mRNA track
head(mRNATrack) #Confirm changes
```

```{r}
GOterms <- read.delim("../../genome-features/_blast-GO-unfolded.sorted", header = FALSE, col.names = c("GO", "transcript"))
head(GOterms)
```

```{r}
GOslim <- read.delim("../../genome-features/GO-GOslim.sorted", header = FALSE, col.names = c("GO", "GOterm", "GOslim", "process"))
head(GOslim)
```

```{r}
mRNAGOGOSlim <- mRNATrack %>%
  left_join(., GOterms, by = "transcript") %>%
  left_join(., GOslim, by = "GO") %>%
  filter(., process == "P") %>%
  unique(.) #Join mRNA track with GOterms such that all transcripts are retained even without GOterm matches (left_join). Join with GOslim information and filter for biological process terms only, since that was the only enrichment test conducted. Retain unique rows.
head(mRNAGOGOSlim)
```

# Import enrichment results

I want to combine enrichment results for all GO terms that were tested, not just significantly enriched terms. I will import the enrichment results then match them with genes from the relevant gene background.

## Females

### Change in maximum transcript

```{r}
allRes.maxTransFemPosDataBP <- read.csv("../75-max-transcript-enrichment/fem-maxTransPos-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "f_maxTransPos") %>%
  left_join(., 
            y = (read_delim("../75-max-transcript-enrichment/geneid2go-fem_maxTrans.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.maxTransFemPosDataBP)
```

```{r}
allRes.maxTransFemNegDataBP <- read.csv("../75-max-transcript-enrichment/fem-maxTransNeg-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "f_maxTransNeg")  %>%
  left_join(., 
            y = (read_delim("../75-max-transcript-enrichment/geneid2go-fem_maxTrans.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.maxTransFemNegDataBP)
```

### Predominant transcript shift

```{r}
allRes.predIsoFemdataBP <- read.csv("../42-predominant-isoform/fem-predIso-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "f_predIso") %>%
  left_join(., 
            y = (read_delim("../42-predominant-isoform/geneid2go-fem_predIso.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.predIsoFemdataBP)
```

### Alternatively spliced genes

```{r}
allRes.femAltSpliceDataBP <- read.csv("../78-asca-methdiff/fem-altSplice-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "f_altSplice") %>%
  left_join(., 
            y = (read_delim("../78-asca-methdiff/geneid2go-fem_altSplice.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.femAltSpliceDataBP)
```

### DML

```{r}
allRes.femDMLGeneListDataBP <- read.csv("../DML-characterization/fem-DML-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "f_DML") %>%
  left_join(., 
            y = (read_delim("../DML-characterization/geneid2go-fem_geneBackground.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.femDMLGeneListDataBP)
```

## Males

### Change in maximum transcript

```{r}
allRes.maxTransMalePosDataBP <- read.csv("../75-max-transcript-enrichment/male-maxTransPos-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "m_maxTransPos") %>%
  left_join(., 
            y = (read_delim("../75-max-transcript-enrichment/geneid2go-male_maxTrans.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.maxTransMalePosDataBP)
```

```{r}
allRes.maxTransMaleNegDataBP <- read.csv("../75-max-transcript-enrichment/male-maxTransNeg-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "m_maxTransNeg")  %>%
  left_join(., 
            y = (read_delim("../75-max-transcript-enrichment/geneid2go-male_maxTrans.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.maxTransMaleNegDataBP)
```

### Predominant transcript shift

```{r}
allRes.predIsoMaledataBP <- read.csv("../42-predominant-isoform/male-predIso-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "m_predIso") %>%
  left_join(., 
            y = (read_delim("../42-predominant-isoform/geneid2go-male_predIso.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.predIsoMaledataBP)
```

### Alternatively spliced genes

```{r}
allRes.maleAltSpliceDataBP <- read.csv("../78-asca-methdiff/male-altSplice-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "m_altSplice") %>%
  left_join(., 
            y = (read_delim("../78-asca-methdiff/geneid2go-male_altSplice.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.maleAltSpliceDataBP)
```

### DML

```{r}
allRes.maleDMLGeneListDataBP <- read.csv("../DML-characterization/male-DML-BP-FisherTestResults.csv") %>%
  dplyr::rename(., pvalue = "classic", GO = "GO.ID", GOterm = "Term") %>%
  dplyr::select(., -c(Annotated, Significant, Expected)) %>% 
  mutate(., category = "m_DML") %>%
  left_join(., 
            y = (read_delim("../DML-characterization/geneid2go-male_geneBackground.tab", delim = "\t", col_names = c("gene_name", "GO")) %>% separate_longer_delim(., cols = GO, delim = ",")), 
            by = "GO") %>%
  group_by(., GO) %>%
  unique(.) #Import enrichment results. Rename columns and retain columns of interest. Add a column characterizing enrichment result category. Join with gene background annotated with GOterms. For the gene background, import and separate GO terms into individual rows prior to joining. Group by GO term then retain unique rows
head(allRes.maleDMLGeneListDataBP)
```

# Compile enrichment results

## Females

```{r}
allRes.allFem <- rbind(allRes.maxTransFemPosDataBP, allRes.maxTransFemNegDataBP, 
                       allRes.predIsoFemdataBP,
                       allRes.femAltSpliceDataBP,
                       allRes.femDMLGeneListDataBP) %>%
  left_join(., mRNAGOGOSlim, by = c("gene_name", "GO")) %>%
  dplyr::select(., -GOterm.x) %>%
  dplyr::rename(., GOterm = "GOterm.y") %>%
  separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C transcript") %>%
  dplyr::select(., gene_name, GO, GOterm, GOslim, pvalue, category) %>%
  unique(.) %>%
  group_by(., gene_name) %>%
  pivot_wider(., names_from = "category", values_from = "pvalue") %>%
  arrange(., gene_name, GO) %>%
  mutate(., f_geneActivity = case_when(f_maxTransPos < 0.01 & f_maxTransNeg < 0.01 & f_predIso < 0.01 & f_altSplice < 0.01 ~ "Y",
                                       f_maxTransPos >= 0.01 | f_maxTransNeg >= 0.01 | f_predIso >= 0.01 | f_altSplice >= 0.01 ~ "N")) %>%
  mutate(., f_geneActivityNoSplice = case_when(f_maxTransPos < 0.01 & f_maxTransNeg < 0.01 & f_predIso < 0.01 ~ "Y",
                                               f_maxTransPos >= 0.01 | f_maxTransNeg >= 0.01 | f_predIso >= 0.01 ~ "N")) %>%
  mutate(., f_geneActivityNoSpliceDML = case_when(f_maxTransPos < 0.01 & f_maxTransNeg < 0.01 & f_predIso < 0.01 & f_DML < 0.01 ~ "Y",
                                                  f_maxTransPos >= 0.01 | f_maxTransNeg >= 0.01 | f_predIso >= 0.01 | f_DML >= 0.01 ~ "N")) %>%
  mutate(., f_allTests = case_when(f_maxTransPos < 0.01 & f_maxTransNeg < 0.01 & f_predIso < 0.01 & f_altSplice < 0.01 & f_DML < 0.01 ~ "Y",
                                   f_maxTransPos >= 0.01 | f_maxTransNeg >= 0.01 | f_predIso >= 0.01 | f_altSplice >= 0.01 | f_DML >= 0.01 ~ "N")) #Combine all separate enrichment results using rbind. Join with annotation information by gene and GOterm. Remove GOterm column from topGO and rename and keep column with GOterm annotations since these are not truncated. Isolate product information and select columns of interest. Retain only unique rows. Group by gene name and pivot p-value information such that each enrichment test has a separate column for p-values. Arrange by gene name, then GO ID. Create summary columns detailing if the GO term was 1) enriched across all gene activity tests (geneActivity), gene activity tests besides alt splicing (geneActivityNoSplice), gene activitiy + DML - splice (geneActivityNoSpliceDML), and all enrichment tests (allTests). NAs will be used if a value is missing for one of the columns.
head(allRes.allFem) #Confirm changes
```

```{r}
write_delim(allRes.allFem, "fem-all-enrichment-results.txt", delim = "\t", col_names = TRUE, quote = "none")
```

## Males

```{r}
allRes.allMale <- rbind(allRes.maxTransMalePosDataBP, allRes.maxTransMaleNegDataBP, 
                        allRes.predIsoMaledataBP,
                        allRes.maleAltSpliceDataBP,
                        allRes.maleDMLGeneListDataBP) %>%
  left_join(., mRNAGOGOSlim, by = c("gene_name", "GO")) %>%
  dplyr::select(., -GOterm.x) %>%
  dplyr::rename(., GOterm = "GOterm.y") %>%
  separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C transcript") %>%
  dplyr::select(., gene_name, GO, GOterm, GOslim, pvalue, category) %>%
  unique(.) %>%
  group_by(., gene_name) %>%
  pivot_wider(., names_from = "category", values_from = "pvalue") %>%
  arrange(., gene_name, GO) %>%
  mutate(., m_geneActivity = case_when(m_maxTransPos < 0.01 & m_maxTransNeg < 0.01 & m_predIso < 0.01 & m_altSplice < 0.01 ~ "Y",
                                       m_maxTransPos >= 0.01 | m_maxTransNeg >= 0.01 | m_predIso >= 0.01 | m_altSplice >= 0.01 ~ "N")) %>%
  mutate(., m_geneActivityNoSplice = case_when(m_maxTransPos < 0.01 & m_maxTransNeg < 0.01 & m_predIso < 0.01 ~ "Y",
                                               m_maxTransPos >= 0.01 | m_maxTransNeg >= 0.01 | m_predIso >= 0.01 ~ "N")) %>%
  mutate(., m_geneActivityNoSpliceDML = case_when(m_maxTransPos < 0.01 & m_maxTransNeg < 0.01 & m_predIso < 0.01 & m_DML < 0.01 ~ "Y",
                                                  m_maxTransPos >= 0.01 | m_maxTransNeg >= 0.01 | m_predIso >= 0.01 | m_DML >= 0.01 ~ "N")) %>%
  mutate(., m_allTests = case_when(m_maxTransPos < 0.01 & m_maxTransNeg < 0.01 & m_predIso < 0.01 & m_altSplice < 0.01 & m_DML < 0.01 ~ "Y",
                                   m_maxTransPos >= 0.01 | m_maxTransNeg >= 0.01 | m_predIso >= 0.01 | m_altSplice >= 0.01 | m_DML >= 0.01 ~ "N")) #Combine all separate enrichment results using rbind. Join with annotation information by gene and GOterm. Remove GOterm column from topGO and rename and keep column with GOterm annotations since these are not truncated. Isolate product information and select columns of interest. Retain only unique rows. Group by gene name and pivot p-value information such that each enrichment test has a separate column for p-values. Arrange by gene name, then GO ID. Create summary columns detailing if the GO term was 1) enriched across all gene activity tests (geneActivity), gene activity tests besides alt splicing (geneActivityNoSplice), gene activitiy + DML - splice (geneActivityNoSpliceDML), and all enrichment tests (allTests). NAs will be used if a value is missing for one of the columns.
head(allRes.allMale) #Confirm changes
```

```{r}
write_delim(allRes.allMale, "male-all-enrichment-results.txt", delim = "\t", col_names = TRUE, quote = "none")
```

## All data

```{r}
allRes.bothSexes <- full_join(allRes.allFem, allRes.allMale, by = c("gene_name", "GO", "GOterm", "GOslim")) %>%
  mutate(., both_maxTransPos = case_when(f_maxTransPos < 0.01 & m_maxTransPos < 0.01 ~ "Y",
                                          f_maxTransPos >= 0.01 | m_maxTransPos >= 0.01 ~ "N")) %>%
  mutate(., both_maxTransNeg = case_when(f_maxTransNeg < 0.01 & m_maxTransNeg < 0.01 ~ "Y",
                                          f_maxTransNeg >= 0.01 | m_maxTransNeg >= 0.01 ~ "N")) %>%
  mutate(., both_predIso = case_when(f_predIso < 0.01 & m_predIso < 0.01 ~ "Y",
                                          f_predIso >= 0.01 | m_predIso >= 0.01 ~ "N")) %>%
  mutate(., both_altSplice = case_when(f_altSplice < 0.01 & m_altSplice < 0.01 ~ "Y",
                                          f_altSplice >= 0.01 | m_altSplice >= 0.01 ~ "N")) %>%
  mutate(., both_DML = case_when(f_DML < 0.01 & m_DML < 0.01 ~ "Y",
                                          f_DML >= 0.01 | m_DML >= 0.01 ~ "N")) %>%
  mutate(., both_geneActivity = case_when(f_geneActivity == "Y" & m_geneActivity == "Y" ~ "Y",
                                          f_geneActivity == "N" | m_geneActivity == "N" ~ "N")) %>%
  mutate(., both_geneActivityNoSplice = case_when(f_geneActivityNoSplice == "Y" & m_geneActivityNoSplice == "Y" ~ "Y",
                                                  f_geneActivityNoSplice == "N" | m_geneActivityNoSplice == "N" ~ "N")) %>%
  mutate(., both_geneActivityNoSpliceDML = case_when(f_geneActivityNoSpliceDML == "Y" & m_geneActivityNoSpliceDML == "Y" ~ "Y",
                                                     f_geneActivityNoSpliceDML == "N" | m_geneActivityNoSpliceDML == "N" ~ "N")) %>%
  mutate(., both_allTests = case_when(f_allTests == "Y" & m_allTests == "Y" ~ "Y",
                                      f_allTests == "N" | m_allTests == "N" ~ "N")) #Full join both datasets (ensure entries in both dataframes are retained). Create a bunch of columns summarizing commonalities between the two datasets
head(allRes.bothSexes)
```

```{r}
write_delim(allRes.bothSexes, "all-enrichment-results.txt", delim = "\t", col_names = TRUE, quote = "none")
```