---
title: "go-goslims"
author: "Sam White"
date: "9/15/2022"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r load-libraries}
library(GSEABase)
library(tidyverse)
```

## Read in file

```{r read-in-file}
# Tab-delimited input; both literal "NA" and empty fields are read as NA.
# NOTE(review): input is read from ./output/ while the OBO file and the
# results below use ../data/ and ../output/ -- confirm both are intended.
goseq_file <- read.csv(
  "./output/t25-blast-GO-unfolded.tab",
  header = TRUE,
  sep = "\t",
  na.strings = c("NA", "")
)
```

### Remove NAs

```{r remove-nas}
# Drop rows that have no GO ID
goseq_file <- goseq_file %>%
  filter(!is.na(Gene.Ontology.IDs))
```

```{r output-file-naming}
# Set output filename suffix
output_suffix <- "GOslims.csv"
```

## Vector of GOslim ontologies (e.g. Biological Process = BP, Molecular Function = MF, Cellular Component = CC)

```{r set-ontology-vector}
ontologies <- c("BP", "CC", "MF")
```

## Determine GOslims

```{r determine-goslims}
# Everything up to the loop is independent of the ontology, so it is built
# once instead of once per ontology.

# Grab just the individual GO terms from the "Gene.Ontology.IDs" column
goterms <- as.character(goseq_file$Gene.Ontology.IDs)

# Use GSEA to map GO terms to GOslims
# Store goterms as GSEA object
myCollection <- GOCollection(goterms)

# Use generic GOslim file to create a GOslim collection.
# I downloaded goslim_generic.obo from
# http://geneontology.org/docs/go-subset-guide/
# and then moved it to the extdata folder of the GSEABase R library, in
# addition to using the command here - I think they're both required.
slim <- getOBOCollection("../data/goslim_generic.obo")

# Parse out the GO terms assigned to each GOslim.
# Courtesy Bioconductor Support: https://support.bioconductor.org/p/128407/
#
# df:         data frame returned by goSlim(); row names are GOslim IDs.
# collection: GOCollection holding the input GO IDs.
# OFFSPRING:  GO.db offspring map for the ontology being processed.
#
# Returns df with an added "go_terms" column: for each GOslim, the input GO
# IDs found among that GOslim's offspring, joined with ";".
mappedIds <- function(df, collection, OFFSPRING) {
  map <- as.list(OFFSPRING[rownames(df)])
  mapped <- lapply(map, intersect, ids(collection))
  df[["go_terms"]] <- vapply(
    unname(mapped),
    paste,
    character(1L),
    collapse = ";"
  )
  df
}

for (slim_ontology in ontologies) {
  # Name of the GO.db offspring map for this ontology (e.g. "GOBPOFFSPRING")
  go_offspring <- paste0("GO", slim_ontology, "OFFSPRING")

  # Map GO terms to GOslims for the current ontology
  slimsdf <- goSlim(myCollection, slim, slim_ontology)

  # The 'offspring' of each term in the ontology are given by the data in
  # GO.db; look the map up by name and attach the matching input GO IDs.
  slimsdf <- mappedIds(
    slimsdf,
    myCollection,
    getFromNamespace(go_offspring, "GO.db")
  )

  # Move the GOslim IDs from the row names into a named first column
  slimsdf <- cbind(GOslim = rownames(slimsdf), slimsdf)
  rownames(slimsdf) <- NULL

  # Paste elements together to form output filename,
  # e.g. "t25blast-goslims.BP.GOslims.csv"
  outfilename <- paste(
    "t25blast-goslims",
    slim_ontology,
    output_suffix,
    sep = "."
  )

  # Set output file destination and name
  outfile_dest <- file.path("../output/", outfilename)

  # Write output file. Character fields are quoted (write.csv default)
  # because GO term names can contain commas, which would otherwise split a
  # single Term value across several CSV columns.
  write.csv(slimsdf, file = outfile_dest, row.names = FALSE)
}
```