---
title: "Apul topGO expressed mRNAs"
author: "Jill Ashey"
date: "2025-06-09"
output: html_document
---

Gene ontology enrichment analysis of mRNAs expressed in Apul. 

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("topGO")
BiocManager::install("genefilter")
BiocManager::install("ViSEAGO")
library(topGO)
library(tidyverse)
library(genefilter)
library(ViSEAGO)
```

Before analysis, make gene2go file from annotation information. Once this is done, it does not have to be redone. This is a little trickier with Apul than with the other species because the Apul BLAST results recorded the genomic location instead of the gene name (see annotation file [here](https://raw.githubusercontent.com/urol-e5/deep-dive-expression/refs/heads/main/D-Apul/output/02-Apul-reference-annotation/Apulcra-genome-mRNA-IDmapping-2024_12_12.tab)). Before making the gene2go file, we need to match the genomic locations with the actual gene names from the [gff](https://github.com/urol-e5/deep-dive-expression/blob/main/D-Apul/data/Apulcra-genome-mRNA_only.gff).

Read in gff file. 
```{r}
gff <- read.delim("../data/Apulcra-genome-mRNA_only.gff", header = F)
colnames(gff) <- c("chrom", "source", "type", "start", "end", "score", "strand", "phase", "attr")
```

Read in annotation file.
```{r}
apul_annot <- read.delim("../output/02-Apul-reference-annotation/Apulcra-genome-mRNA-IDmapping-2024_12_12.tab")
apul_annot <- apul_annot %>%
  dplyr::rename(Location = V1, GO = Gene.Ontology.IDs)
```

Create Location column in gff. The annotation and gff start locations are off by one so I am going to adjust the gff start location by subtracting one. 
```{r}
gff <- gff %>% 
  mutate(Location = paste0(chrom, ":", start - 1, "-", end))
```

Merge the gff and annotation information by Location
```{r}
merged_df <- inner_join(gff, apul_annot, by = "Location")
```

Extract gene name from attr col and select only mRNA and GO information
```{r}
# Select gene names
merged_df$mRNA <- sub(".*Parent=([^;]+);.*", "\\1", merged_df$attr)

# Select mRNA and GO cols 
apul_go <- merged_df[, c("mRNA", "GO")]
apul_go <- unique(apul_go)
```

Not many annotations when duplicate rows removed...check with team on this.

Now write out gene2go file! 
```{r}
write_tsv(apul_go, file = "../output/29-Apul-mRNA-GO-enrichment/Apul_gene2go.tab")
```

Read in count matrix 
```{r}
counts <- read.csv("../output/07-Apul-Hisat/Apul-gene_count_matrix.csv")
dim(counts)

# Set row names
rownames(counts) <- counts[,1] #set first column that contains gene names as rownames
counts <- counts[,-1] # remove column w/ gene names 
```

There are 44371 genes from the alignment and assembly. 

Perform normalization step. Normalization does not necessarily matter for GO enrichment, but it does matter for pOverA filtering, which is done below. Normalize counts with simple RPM
```{r}
# Function to normalize counts (simple RPM normalization)
normalize_counts <- function(counts) {
  rpm <- t(t(counts) / colSums(counts)) * 1e6
  return(rpm)
}

# Normalize counts
counts_norm <- normalize_counts(counts)
counts_norm_df <- as.data.frame(counts_norm)
```

Remove any genes that were not expressed (ie expression of 0 across all samples)
```{r}
counts_norm_filt <-counts_norm_df %>%
  mutate(Total = rowSums(.[, 1:5]))%>%
  filter(!Total==0)%>%
  dplyr::select(!Total)
dim(counts_norm_filt)
```

33624 genes were expressed. 

Filter data with pOverA. There are 5 samples for this time point. Genes will pass filtering if they are present in 1/5 = 0.2 of samples and have minimum count of 5
```{r}
ffun<-filterfun(pOverA(0.2,5))  #set up filtering parameters
filt_outrm_poa <- genefilter((counts_norm_filt[,1:5]), ffun) #apply filter
sum(filt_outrm_poa) #count number of genes left

counts_norm_filt_poa <- counts_norm_filt[filt_outrm_poa,] #keep only rows that passed filter
```

16855 genes remained after normalization and pOverA filtering. 

Make a list of genes for input to topGO
```{r}
# Genes of interest 
clust_genes <- as.character(rownames(counts_norm_filt_poa))

# All genes 
all_genes <- as.character(rownames(counts))

# Apply 1 or 0 if gene is gene of interest 
GeneList <- factor(as.integer(all_genes %in% clust_genes))
names(GeneList) <- all_genes
```

Read in gene-to-go-mappings
```{r}
gene2go_topgo <- readMappings("../output/29-Apul-mRNA-GO-enrichment/Apul_gene2go.tab", IDsep=";")
```

Set function to select genes of interest (ie those that have pvalue < 0.05)
```{r}
topDiffGenes <- function(allScore) {
return(allScore < 0.05)}
```

### Biological Processes

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_BP <-new("topGOdata", ontology="BP", gene2GO=gene2go_topgo, allGenes=GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_BP_FE <- runTest(GO_BP, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_BP_En <- GenTable(GO_BP, Fisher = GO_BP_FE, orderBy = "Fisher", numChar = 51, topNodes = 100)
```

Filter by significant results
```{r}
GO_BP_En$Fisher<-as.numeric(GO_BP_En$Fisher)
GO_BP_En_sig<-GO_BP_En[GO_BP_En$Fisher<0.05,]
```

Merge `GO_BP_En_sig` with GO and gene info. Then merge with expressed genes
```{r}
# Read in GO terms 
apul_go <- read.delim("../output/29-Apul-mRNA-GO-enrichment/Apul_gene2go.tab", sep = "\t")

# Separate GO terms 
apul_go <- apul_go %>%
  separate_rows(GO, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
apul_go$GO <- trimws(apul_go$GO)
GO_BP_En_sig$GO.ID <- trimws(GO_BP_En_sig$GO.ID)

# Join the datasets based on GO term
GO_BP_En_sig_gene <- apul_go %>%
  left_join(GO_BP_En_sig, by = c("GO" = "GO.ID")) %>%
  na.omit()

# Join the datasets based on expressed genes 
GO_BP_En_sig_gene <- GO_BP_En_sig_gene %>%
  filter(mRNA %in% clust_genes)

# Add ontology column 
GO_BP_En_sig_gene$ontology <- "Biological Processes"
```

### Cellular Components

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_CC <-new("topGOdata", ontology="CC", gene2GO=gene2go_topgo, allGenes=GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_CC_FE <- runTest(GO_CC, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_CC_En <- GenTable(GO_CC, Fisher = GO_CC_FE, orderBy = "Fisher", numChar = 51, topNodes = 100)
```

Filter by significant results
```{r}
GO_CC_En$Fisher<-as.numeric(GO_CC_En$Fisher)
GO_CC_En_sig<-GO_CC_En[GO_CC_En$Fisher<0.05,]
```

Merge `GO_CC_En_sig` with GO and gene info. Then merge with expressed genes
```{r}
# Read in GO terms 
# apul_go <- read.delim("../output/29-Apul-mRNA-GO-enrichment/Apul_gene2go.tab", sep = "\t")

# Separate GO terms 
# apul_go <- apul_go %>%
#   separate_rows(GO, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
#apul_go$GO <- trimws(apul_go$GO)
GO_CC_En_sig$GO.ID <- trimws(GO_CC_En_sig$GO.ID)

# Join the datasets based on GO term
GO_CC_En_sig_gene <- apul_go %>%
  left_join(GO_CC_En_sig, by = c("GO" = "GO.ID")) %>%
  na.omit()

# Join the datasets based on expressed genes 
GO_CC_En_sig_gene <- GO_CC_En_sig_gene %>%
  filter(mRNA %in% clust_genes)

# Add ontology column 
GO_CC_En_sig_gene$ontology <- "Cellular Components"
```

### Molecular Functions

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_MF <-new("topGOdata", ontology="MF", gene2GO=gene2go_topgo, allGenes=GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_MF_FE <- runTest(GO_MF, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_MF_En <- GenTable(GO_MF, Fisher = GO_MF_FE, orderBy = "Fisher", numChar = 51, topNodes = 100)
```

Filter by significant results
```{r}
GO_MF_En$Fisher<-as.numeric(GO_MF_En$Fisher)
GO_MF_En_sig<-GO_MF_En[GO_MF_En$Fisher<0.05,]
```

Merge `GO_MF_En_sig` with GO and gene info. Then merge with expressed genes
```{r}
# Read in GO terms 
# apul_go <- read.delim("../output/29-Apul-mRNA-GO-enrichment/Apul_gene2go.tab", sep = "\t")

# Separate GO terms 
#apul_go <- apul_go %>%
#   separate_rows(GO, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
#apul_go$GO <- trimws(apul_go$GO)
GO_MF_En_sig$GO.ID <- trimws(GO_MF_En_sig$GO.ID)

# Join the datasets based on GO term
GO_MF_En_sig_gene <- apul_go %>%
  left_join(GO_MF_En_sig, by = c("GO" = "GO.ID")) %>%
  na.omit()

# Join the datasets based on expressed genes 
GO_MF_En_sig_gene <- GO_MF_En_sig_gene %>%
  filter(mRNA %in% clust_genes)

# Add ontology column 
GO_MF_En_sig_gene$ontology <- "Molecular Functions"
```

### Join ontologies 

Bind so there is a df that has significantly enriched GO terms for all ontologies 
```{r}
GO_En_sig_gene <- rbind(GO_BP_En_sig_gene, GO_CC_En_sig_gene, GO_MF_En_sig_gene)

# Calculate proportion of significant v annotated genes 
GO_En_sig_gene <- GO_En_sig_gene %>%
  mutate(sig.prop = Significant/Annotated) %>%
  na.omit()
length(unique(GO_En_sig_gene$Term))

# Save as csv 
write.csv(GO_En_sig_gene, "../output/29-Apul-mRNA-GO-enrichment/Apul-GO_en_sig_expressed_genes.csv")
```

Plot by number of genes that have enriched terms
```{r}
plot_data <- GO_En_sig_gene %>%
  filter(ontology != "Cellular Components") %>%
  group_by(Term, ontology) %>%
  summarise(gene_count = n_distinct(mRNA),
            Fisher = mean(Fisher), .groups = 'drop') %>%
  group_by(ontology) %>%
  slice_max(order_by = gene_count, n = 10)

ggplot(plot_data, aes(x = reorder(Term, gene_count), y = gene_count, fill = Fisher)) +
  geom_bar(stat = "identity") +
  #geom_text(aes(label = gene_count), vjust = -0.5, color = "black", size = 3) +
  coord_flip() +
  scale_fill_gradient(low = "blue", high = "red") +
  facet_grid(vars(ontology), scales = "free", space = "free_y") + 
  labs(x = "GO Term",
       y = "Number of Genes",
       fill = "Fisher Value") +
  theme_bw() +
  theme(
    axis.title = element_text(size = 36, face = "bold"),        # Axis title size
    axis.text = element_text(size = 34, colour = "black"),                        # Axis text size
    legend.title = element_text(size = 34, face = "bold"),      # Legend title size
    legend.text = element_text(size = 32),                      # Legend text size
    strip.text = element_text(size = 34, face = "bold"),        # Facet text size
    strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5),         # Facet background
    axis.line = element_line(size = 1, colour = "black"),       # Enhanced axis lines
    axis.ticks = element_line(size = 1),                        # Thicker axis ticks
     #panel.border = element_blank()                             # Remove panel border
    #panel.grid.major = element_blank(),                         # Remove major grid lines
    #panel.grid.minor = element_blank()                          # Remove minor grid lines
        panel.border = element_rect(color = "black", size = 1.2),   # Enhanced facet border lines
    panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets
    panel.spacing = unit(1, "lines"),                           # Increase space between facets
    strip.placement = "outside"    
  )

#ggsave(filename = "../output/29-Apul-mRNA-GO-enrichment/Apul-GO_en_sig_expressed_genes_top10.pdf", last_plot(), width = 30, height = 40)
#ggsave(filename = "../output/29-Apul-mRNA-GO-enrichment/Apul-GO_en_sig_expressed_genes_top10.png", last_plot(), width = 30, height = 40)
```