---
title: "GO Slim"
author: "Megan Ewing"
date: "2024-11-27"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

### Loading Packages

```{r}
# # Install Bioconductor packages
# BiocManager::install("GO.db", ask = FALSE)
# BiocManager::install("GSEABase", ask = FALSE)
# # 
# # Install CRAN packages
# install.packages("knitr")
# install.packages("tidyverse")


# Load all
library(GSEABase)
library(GO.db)
library(knitr)
library(tidyverse)
```

### Prepping GO_BP + Gene List

importing 'master list' with DEG list, counts, stats, etc.. to get GO biological process info

```{r}
# Load libraries
# library(dplyr)
# library(stringr)
# library(readr)

# Read your CSV file created during annotation
GOterms <- read_csv("../output/DEG_masterlist.csv")
head(GOterms)

```

```{r}
# Clean GO terms to keep only the GO IDs for biological process
GeneID_GOID <- GOterms %>%
  mutate(GO_IDs = str_extract_all(Gene.Ontology..biological.process., "GO:\\d+")) %>%  # extract all GO IDs
  mutate(GO_IDs = sapply(GO_IDs, function(x) paste(x, collapse = ";"))) %>%  # collapse into a single string
  dplyr::select(LOC, GO_IDs)

# previewiew the result
head(GeneID_GOID)

write_delim(GeneID_GOID, "../output/GeneID_GOID.tsv")
```

### Setting GO Variables

```{r set-variables, eval=TRUE}
# Column names corresponding to gene name/ID and GO IDs
GO.ID.column <- "GO_IDs"
gene.ID.column <- "LOC"

# Relative path or URL to input file
input.file <- "../output/GeneID_GOID.tsv"


##### Official GO info - no need to change #####
goslims_obo <- "goslim_generic.obo"
goslims_url <- "http://current.geneontology.org/ontology/subsets/goslim_generic.obo"
```

### Starting GO Retrival

Set GSEAbase location and download goslim_generic.obo

```{r download-generic-goslim-obo, eval=TRUE}
# Find GSEAbase installation location
gseabase_location <- find.package("GSEABase")

# Load path to GOslim OBO file
goslim_obo_destintation <- file.path(gseabase_location, "extdata", goslims_obo, fsep = "/")

# Download the GOslim OBO file
download.file(url = goslims_url, destfile = goslim_obo_destintation)

# Loads package files
gseabase_files <- system.file("extdata", goslims_obo, package="GSEABase")
```

Read in gene/GO file ( i did have to change this slightly from the tutorial)

```{r read-in-gene-file, eval=TRUE}
full.gene.df <- read_delim(file = input.file)

str(full.gene.df)
```

Remove rows with NA, remove whitespace in GO IDs column and keep just gene/GO IDs columns

```{r remove-NA-and-uniprotIDs, eval=TRUE}

# Clean whitespace, filter NA/empty rows, select columns, and split GO terms using column name variables
gene.GO.df <- full.gene.df %>%
  mutate(!!GO.ID.column := str_replace_all(.data[[GO.ID.column]], "\\s*;\\s*", ";")) %>% # Clean up spaces around ";"
  filter(!is.na(.data[[gene.ID.column]]) & !is.na(.data[[GO.ID.column]]) & .data[[GO.ID.column]] != "") %>% 
  dplyr::select(all_of(c(gene.ID.column, GO.ID.column)))


str(gene.GO.df)
```

This flattens the file so all of the GO IDs per gene are separated into one GO ID per gene per row.

```{r flatten-gene-and-GO-IDs, eval=TRUE}
flat.gene.GO.df <- gene.GO.df %>% separate_rows(!!sym(GO.ID.column), sep = ";")

str(flat.gene.GO.df)
```

Groups the genes by GO ID (i.e. lists all genes associated with each unique GO ID)

```{r group-by-GO, eval=TRUE}
grouped.gene.GO.df <- flat.gene.GO.df %>%
  group_by(!!sym(GO.ID.column)) %>%
  summarise(!!gene.ID.column := paste(.data[[gene.ID.column]], collapse = ","))

str(grouped.gene.GO.df)
```

Map GO IDs to GOslims

The mapping steps were derived from this bioconductor forum response

```{r vectorize-GOIDs, eval=TRUE}
# Vector of GO IDs
go_ids <- grouped.gene.GO.df[[GO.ID.column]]

str(go_ids)
```

Creates new OBO Collection object of just GOslims, based on provided GO IDs.

```{r extract-GOslims-from-OBO, eval=TRUE}

# Create GSEAbase GOCollection using `go_ids`
myCollection <- GOCollection(go_ids)

# Retrieve GOslims from GO OBO file set
slim <- getOBOCollection(gseabase_files)

str(slim)
```

Get Biological Process (BP) GOslims associated with provided GO IDs.

```{r retrieve-BP-GOslims, eval=TRUE}
# Retrieve Biological Process (BP) GOslims
slimdf <- goSlim(myCollection, slim, "BP", verbose)
str(slimdf)
```

Performs mapping of of GOIDs to GOslims

Returns:

GOslim IDs (as rownames) GOslim terms Counts of GO IDs matching to corresponding GOslim Percentage of GO IDs matching to corresponding GOslim GOIDs mapped to corresponding GOslim, in a semi-colon delimited format

```{r map-GO-to-GOslims, eval=TRUE}
# List of GOslims and all GO IDs from `go_ids`
gomap <- as.list(GOBPOFFSPRING[rownames(slimdf)])

# Maps `go_ids` to matching GOslims
mapped <- lapply(gomap, intersect, ids(myCollection))

# Append all mapped GO IDs to `slimdf`
# `sapply` needed to apply paste() to create semi-colon delimited values
slimdf$GO.IDs <- sapply(lapply(gomap, intersect, ids(myCollection)), paste, collapse=";")

# Remove "character(0) string from "GO.IDs" column
slimdf$GO.IDs[slimdf$GO.IDs == "character(0)"] <- ""

# Add self-matching GOIDs to "GO.IDs" column, if not present
for (go_id in go_ids) {
  # Check if the go_id is present in the row names
  if (go_id %in% rownames(slimdf)) {
    # Check if the go_id is not present in the GO.IDs column
    # Also removes white space "trimws()" and converts all to upper case to handle
    # any weird, "invisible" formatting issues.
    if (!go_id %in% trimws(toupper(strsplit(slimdf[go_id, "GO.IDs"], ";")[[1]]))) {
      # Append the go_id to the GO.IDs column with a semi-colon separator
      if (length(slimdf$GO.IDs) > 0 && nchar(slimdf$GO.IDs[nrow(slimdf)]) > 0) {
        slimdf[go_id, "GO.IDs"] <- paste0(slimdf[go_id, "GO.IDs"], "; ", go_id)
      } else {
        slimdf[go_id, "GO.IDs"] <- go_id
      }
    }
  }
}

str(slimdf)
```

"Flatten" file so each row is single GO ID with corresponding GOslim rownames_to_column needed to retain row name info

```{r flatten-GOslims-file, eval=TRUE}
# "Flatten" file so each row is single GO ID with corresponding GOslim
# rownames_to_column needed to retain row name info
slimdf_separated <- as.data.frame(slimdf %>%
  rownames_to_column('GOslim') %>%
  separate_rows(GO.IDs, sep = ";"))

# Group by unique GO ID
grouped_slimdf <- slimdf_separated %>%
  filter(!is.na(GO.IDs) & GO.IDs != "") %>%
  group_by(GO.IDs) %>%
  summarize(GOslim = paste(GOslim, collapse = ";"),
            Term = paste(Term, collapse = ";"))


str(grouped_slimdf)
```

Sorts GOslims by Count, in descending order and then selects just the Term and Count columns.

```{r sort-and-select-slimdf-counts, eval=TRUE}

slimdf.sorted <- slimdf %>% arrange(desc(Count))

slim.count.df <- slimdf.sorted %>% 
  dplyr::select(Term, Count)

str(slim.count.df)
```

save as file

```{r}

write.csv(slimdf.sorted, "../output/GOslims.csv")

```

```{r}
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)

# Step 1: Make sure GO terms are clean and split into vectors
slimdf_clean <- slimdf %>%
  mutate(GO_list = str_split(str_replace_all(GO.IDs, "\\s+", ""), ";"))

GeneID_GOID_clean <- GeneID_GOID %>%
  mutate(GO_list = str_split(str_replace_all(GO_IDs, "\\s+", ""), ";"))

# Step 2: Create a lookup table where each GO term maps to its LOC
GO_to_LOC <- GeneID_GOID_clean %>%
  unnest(GO_list) %>%
  dplyr::select(GO_term = GO_list, LOC)

# Step 3: For each row in slimdf, find matching LOCs
slimdf_with_LOC <- slimdf_clean %>%
  mutate(matching_LOCs = map(GO_list, function(go_terms) {
    matched <- GO_to_LOC %>%
      filter(GO_term %in% go_terms) %>%
      pull(LOC)
    unique(matched)  # Remove duplicates
  })) %>%
  mutate(matching_LOCs_str = sapply(matching_LOCs, function(x) paste(x, collapse = ";")))

# View result
head(slimdf_with_LOC)

```