--- title: "GO and GOslim CDS Annotation" author: "Sam White" date: "2023-10-05" output: html_document --- Script to annotate _C.virginica_ coding sequences (CDS) with Gene Ontology (GO) and GOslim IDs/terms. CDSs were BLASTp'd against SwissProt database here: [`17-Swiss-Prot-Annotation.Rmd`](https://github.com/sr320/ceabigr/blob/main/code/17-Swiss-Prot-Annotation.Rmd) and the resulting SPIDs were used for gene annotation. Output files can be used for future mapping of gene subsets (e.g. differentially expressed genes, predominant isoforms, etc.) Script uses SwissProt IDs (SPIDs) as input to a Python script for retrieving data via the UniProt API. Resulting GO IDs are used to create a GeneSetCollection in GSEAbase to map to GOslims. Load packages ```{r load-packages} library(tidyverse) library(GSEABase) library(httr) ``` # Set variables ```{r set-variables} goslims <- "goslim_generic.obo" goslims_url <- "http://current.geneontology.org/ontology/subsets/goslim_generic.obo" ``` # Create functions ```{r create-functions} # Function for mapping GOIDs to GOslims mappedIds <- function(df, collection, OFFSPRING, goannotGSC) { # Split semi-colon delimited strings in df$ids df$ids_split <- strsplit(as.character(df$ids), ";") # Determine the maximum number of elements per row max_elements <- max(lengths(df$ids_split)) # Fill in shorter rows with NA values df$ids_split <- lapply(df$ids_split, function(x) { length(x) <- max_elements x }) # Combine the split strings into a matrix ids_mat <- do.call(rbind, df$ids_split) # Convert the matrix to a data frame ids_df <- as.data.frame(ids_mat, stringsAsFactors = FALSE) # Rename the columns of the data frame colnames(ids_df) <- paste0("ids_", 1:max_elements) # Combine the original data frame and the new data frame df <- cbind(df, ids_df) # Perform the matching operation mt <- match(toupper(trimws(df$ids_1)), toupper(trimws(names(goannotGSC)))) # Add a new column to the data frame for the "Genes" result df$Genes <- NA_character_ # Fill in the "Genes" column for rows that have a match df$Genes[!is.na(mt)] <- vapply(geneIds(goannotGSC)[mt[!is.na(mt)]], paste, collapse = ";", character(1L)) df <- df %>% dplyr::select(Count, Percent, Term, ids, Genes) # Return the modified data frame return(df) } ``` # Download `goslim_generic.obo` from Gene Ontology Consortium ## Set GSEAbase location and download `goslim_generic.obo` ```{r download-goslim-obo} gseabase_location <- find.package("GSEABase") goslim_obo_dest <- file.path(gseabase_location, "extdata", goslims, fsep = "/") download.file(url = goslims_url, destfile = goslim_obo_dest) fl <- system.file("extdata", goslims, package="GSEABase") ``` # Retrieve GO annotations from UniProt ## Inspect SPID file ```{bash inspect-spid-file} head ../output/17-Swiss-Prot-Annotation/Cvir_cds-uniprot_blastp-SPID.txt echo "" wc -l ../output/17-Swiss-Prot-Annotation/Cvir_cds-uniprot_blastp-SPID.txt #10375 echo "" echo "Confirming all entries are unique:" sort --unique ../output/17-Swiss-Prot-Annotation/Cvir_cds-uniprot_blastp-SPID.txt | wc -l #10375 ``` ## Run Python UniProt retrieval script ```{bash uniprot-retrieval} python3 uniprot-retrieval.py ../output/17-Swiss-Prot-Annotation/Cvir_cds-uniprot_blastp-SPID.txt ``` ## Inspect UniProt retreival ```{bash inspect-uniprot retrival} zcat uniprot-retrieval.tsv.gz | head -n 2 zcat uniprot-retrieval.tsv.gz | awk 'NR>1 {print $1}' | wc -l ``` ## Rename and move UniProt retrieval output file ```{bash rename-and-move-uniprot-file} mv uniprot-retrieval.tsv.gz ../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-uniprot-full.tsv.gz ``` ## Unzip UniProt retrieval file ```{bash gunzip-uniprot-retrieval-file} gunzip ../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-uniprot-full.tsv.gz ``` ## Create dataframe ```{r create-dataframe} uniprot_full_df <- read.delim("../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-uniprot-full.tsv") ``` ## Get SPIDs and GO IDs ```{r get-SPIDs-GOIDs} SPID_GO_df <- uniprot_full_df %>% dplyr::select(Entry, Gene.Ontology.IDs) str(SPID_GO_df) ``` # Join SPIDs and GO with genes ## Load gene SPID mapping file ```{r load-gene-SPID-mapping-file} gene_SPID_df <- read.delim("../output/17-Swiss-Prot-Annotation/Cvir_cds-geneID-SPID.tab", header = TRUE) str(gene_SPID_df) ``` ## Join SPIDs and GO with genes ```{r join-genes-with-GO} gene_GO_df <- left_join(gene_SPID_df, SPID_GO_df, by = c("SPID" = "Entry")) %>% dplyr::select(gene,Gene.Ontology.IDs) str(gene_GO_df) ``` ## Write gene GOs to file ```{r write-gene-GOs-to-file} write.table(gene_GO_df, file = "../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-geneID-GOID.tab", col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t") ``` # Retrieve GOslims ## Create formatted file for GSEAbase GeneSet construction ```{bash rm-spaces-from-GOs} # Remove "hidden" spaces in GOs, but preserve tabs, using sed awk -F"\t" '{print $2, "\t", $1}' \ ../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-geneID-GOID.tab \ | sed 's/ \+//g' \ > ../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-GOID-geneID.tab head ../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-GOID-geneID.tab ``` ## Create named list and GSEAbase GeneSet ### Flatten gene list Creates one row per GO ID ```{r flatten-gene-GO-listt} # Read in tab-delimited file my_data <- read.table("../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-GOID-geneID.tab", header = FALSE, sep = "\t" ) # Remove rows with no GO ID my_data_cleaned <- my_data[!(my_data$V1 == ""), ] # "Flatten" file so each row is single GO ID with corresponding gene my_data_cleaned_separated <- separate_rows(my_data_cleaned, V1, sep = ";") str(my_data_cleaned_separated) ``` ### Map Biological Process GOslims to GO IDs ```{r BP-GOslims-to-GOIDs} # Vector of GO IDs go_ids <- my_data_cleaned_separated$V1 # Create custom collection with our GO IDs myCollection <- GOCollection(go_ids) # Pull out GOslims from GOslim obo file slim <- getOBOCollection(fl) # Get Biological Process (BP) GOslims for our GO IDs slimdf <- goSlim(myCollection, slim, "BP", verbose) # Convert to list gomap <- as.list(GOBPOFFSPRING[rownames(slimdf)]) # Map our GOIDs to GOslims mapped <- lapply(gomap, intersect, ids(myCollection)) # sapply needed to apply paste() to create semi-colon delimited values slimdf$ids <- sapply(lapply(gomap, intersect, ids(myCollection)), paste, collapse=";") # Remove "charcter(0) string from "ids" column slimdf$ids[slimdf$ids == "character(0)"] <- "" # Add self-matching GOIDs to "ids" column, if not present updated_slimdf <- slimdf for (go_id in go_ids) { # Check if the go_id is present in the row names if (go_id %in% rownames(updated_slimdf)) { # Check if the go_id is not present in the ids column # Also removes white space "trimws()" and converts all to upper case to handle # any weird, "invisible" formatting issues. if (!go_id %in% trimws(toupper(strsplit(updated_slimdf[go_id, "ids"], ";")[[1]]))) { # Append the go_id to the ids column with a semi-colon separator if (length(updated_slimdf$ids) > 0 && nchar(updated_slimdf$ids[nrow(updated_slimdf)]) > 0) { updated_slimdf[go_id, "ids"] <- paste0(updated_slimdf[go_id, "ids"], "; ", go_id) } else { updated_slimdf[go_id, "ids"] <- go_id } } } } str(updated_slimdf) ``` ### Create GSEAbase GeneSetCollection ```{r gene-set-collection} # Group by unique GO ID grouped_df <- my_data_cleaned_separated %>% group_by(V1)%>% summarise(V2 = paste(V2, collapse = ",")) # convert the data frame to a named list my_list <- as.list(grouped_df$V2) names(my_list) <- grouped_df$V1 gsc <- GeneSetCollection(mapply(function(geneIds, GOID) { GeneSet(geneIds, collectionType=GOCollection(GOID), setName=GOID) }, my_list, names(my_list))) gsc ``` ### Map genes to GOslims ```{r map-genes-to-GOslims} mapped_df <- mappedIds(updated_slimdf, myCollection, GOBPOFFSPRING, gsc) # Provide column name for first column mapped_df <- cbind(GOslim.BP = rownames(mapped_df), mapped_df) rownames(mapped_df) <- NULL # Rename "ids" column mapped_df <- mapped_df %>% dplyr::rename(GO.IDs = ids) mapped_df str(mapped_df) ``` ### Flatten GOslims to have one row per gene Will make future joins to other data sets easier. Done in two steps for easier readability: 1. Flattened just by gene and retains all info. 2. Flattened by gene with GOslims combined per gene. Drops GO IDs. For the second version: We use n() to count the number of rows within each group. We select the first Term value within each group using Term[1]. .groups = 'drop' is used to remove the grouping information. This code should create a dataframe with one row per gene, including the corresponding GOslim.BP entries in a single, semi-colon-delimited field, the first Term value within each group, and the count of rows in each group. ```{r flatten-GOslims-by-gene} flattened_mapped_df <- mapped_df %>% dplyr::select(Genes, GOslim.BP, Term, GO.IDs) %>% separate_rows(Genes, sep = ",") goslims_per_gene_df <- flattened_mapped_df %>% distinct(Genes, GOslim.BP, Term) %>% group_by(Genes) %>% summarise(GOslim.BP = paste(unique(GOslim.BP), collapse = ";"), Term = paste(unique(Term), collapse = ";"), # Concatenate Terms with semi-colon .groups = 'drop') str(goslims_per_gene_df) ``` ## Write mapped GOslims to file ```{r write-mapped-GOslims-to-file} mapped_df %>% dplyr::select(GOslim.BP, Term, GO.IDs, Genes) %>% write.table(file ="../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-GOslim.BP_term_GOIDs_genes.tab", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t" ) ``` ## Write flattened genes to file ```{r write-flattened-genes-to-file} goslims_per_gene_df %>% write.table(file ="../output/17.1-GO-and-GOslim-CDS-Annotation/Cvir-CDS-GOslim.BP_per_gene.tab", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t" ) ```