---
title: "go-goslims"
author: "Sam White"
date: "9/15/2022"
output: html_document
---

```{r setup, include=FALSE}
# Show each chunk's source code in the rendered document by default
knitr::opts_chunk$set(echo = TRUE)
```

```{r load-libraries}
library(GSEABase)
library(tidyverse)
```

## Read in file
```{r read-in-file} 
# Load the tab-delimited GO annotation table (first line is the header);
# the literal text "NA" and empty fields are both read as NA.
# NOTE(review): this path starts at "./output/" while later chunks use
# "../data/" and "../output/" -- confirm the intended location.
goseq_file <- read.delim(
  "./output/t25-blast-GO-unfolded.tab",
  na.strings = c("NA", "")
)
```

### Remove NAs
```{r remove-nas}
# Keep only the rows that have a value in the Gene.Ontology.IDs column
goseq_file <- dplyr::filter(goseq_file, !is.na(Gene.Ontology.IDs))
```


```{r output-file-naming}
### Set output filename suffix
### (appended to every per-ontology GOslim output filename)
output_suffix <- "GOslims.csv"
```


## Vector of GOslim ontologies (Biological Process = BP, Cellular Component = CC, Molecular Function = MF)
```{r set-ontology-vector}
# GO ontology codes to process: BP = Biological Process,
# CC = Cellular Component, MF = Molecular Function
ontologies <- c("BP", "CC", "MF")
```


## Determine GOslims
```{r determine-goslims}
## Everything that does not depend on the ontology is built once, before the
## loop, so the OBO file is parsed a single time instead of once per ontology.

## Grab just the individual GO terms from the "Gene.Ontology.IDs" column
goterms <- as.character(goseq_file$Gene.Ontology.IDs)

## Store goterms as a GSEABase GO collection
myCollection <- GOCollection(goterms)

## Use generic GOslim file to create a GOslim collection
# goslim_generic.obo was downloaded from
# http://geneontology.org/docs/go-subset-guide/
# Original author's note: a copy was also placed in the extdata folder of the
# GSEABase R library, in addition to being read here; the author believed
# both were required (unverified).
slim <- getOBOCollection("../data/goslim_generic.obo")

## Function to parse out the GO terms assigned to each GOslim
## Courtesy Bioconductor Support: https://support.bioconductor.org/p/128407/
##
## df:         data frame returned by goSlim(); its row names are used as the
##             keys to look up in OFFSPRING
## collection: the GO collection whose IDs were mapped
## OFFSPRING:  GO.db offspring map for the ontology being processed
## Returns df with an added "go_terms" column holding, for each row, the
## ";"-separated IDs from `collection` found among that row's offspring.
mappedIds <- function(df, collection, OFFSPRING) {
  map <- as.list(OFFSPRING[rownames(df)])
  mapped <- lapply(map, intersect, ids(collection))
  df[["go_terms"]] <- vapply(
    unname(mapped), paste, character(1L),
    collapse = ";"
  )
  df
}

for (slim_ontology in ontologies) {
  ## Name of the GO.db offspring map for this ontology (e.g. "GOBPOFFSPRING")
  go_offspring <- paste0("GO", slim_ontology, "OFFSPRING")

  ## Map GO terms to GOslims within the current ontology
  slimsdf <- goSlim(myCollection, slim, slim_ontology)

  ## Append the GO terms that fall under each GOslim
  slimsdf <- mappedIds(
    slimsdf,
    myCollection,
    getFromNamespace(go_offspring, "GO.db")
  )

  ## Move the GOslim IDs out of the row names into a named first column
  slimsdf <- cbind(GOslim = rownames(slimsdf), slimsdf)
  rownames(slimsdf) <- NULL

  ## Build the output filename, e.g. "t25blast-goslims.BP.GOslims.csv"
  outfilename <- paste(
    "t25blast-goslims", slim_ontology, output_suffix,
    sep = "."
  )

  ## Set output file destination and name
  outfile_dest <- file.path("../output", outfilename)

  ## Write output file.
  ## Text fields are quoted (the write.csv default): GOslim term names can
  ## contain commas, which would otherwise split one field into several
  ## columns and corrupt the CSV.
  write.csv(slimsdf, file = outfile_dest, row.names = FALSE)
}
```