--- title: "30.00-Peve-transcriptome-GOslims" author: "Sam White" date: "2025-06-20" output: github_document: toc: true number_sections: true bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bibliography: references.bib --- # BACKGROUND This notebook will perform annotation of expressed genes, as previously determined by [`06.2-Peve-Hisat.qmd`](https://github.com/urol-e5/deep-dive-expression/blob/7958cd3fdd10f4f80fb499d60e8befa55f3600a7/E-Peve/code/06-Peve-Hisat.qmd) (GitHub). Briefly, the notebook will perform the following tasks: 1. Extract all genes from the genome, as GFF and FastA. 2. Create a subset of only *expressed* genes, based on gene count matrix. 3. BLASTx expressed genes against SwissProt database. 4. Get gene ontology. 5. Map gene ontology to GOslims and get counts. ::: callout-important Expressed genes were defined as those genes having at least one count across all samples. ::: ## INPUTS - Gene count matrix - Genome FastA - Genome GFF ## OUTPUTS - Genes BED - Genes FastA - Expressed genes FastA - Expressed genes SwissProt IDs only file. - Expressed genes to SwissProt IDs mapping file. - Expressed genes to SwissProtIDs and GO mapping file. - Counts file of expressed genes GOslims. ## SOFTWARE - [DIAMOND BLAST](https://github.com/bbuchfink/diamond) [@buchfink2021] - [bedtools](https://github.com/arq5x/bedtools2) [@quinlan2010] - [samtools](https://github.com/samtools/samtools) [@danecek2021] - [Biostrings](https://bioconductor.org/packages/release/bioc/html/Biostrings.html) [@h.pagès2017] (Bioconductor R package) - [GO.db](https://www.bioconductor.org/packages/release/data/annotation/html/GO.db.html) [@carlson2017] (Bioconductor R package) - [GSEABase](https://www.bioconductor.org/packages/release/bioc/html/GSEABase.html) [@martinmorgan2017] (Bioconductor R package) ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(Biostrings) library(GSEABase) library(GO.db) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = TRUE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` # VARIABLES ```{r variables} # DIRECTORIES top_output_dir <- file.path("..", "output") output_dir <- file.path(top_output_dir, "30.00-Peve-transcriptome-GOslims") data_dir <- file.path("..", "data") # PROGRAMS blastdbs_dir <- file.path("..", "..", "M-multi-species", "data", "blastdbs") programs_dir <- file.path("", "home", "shared") bedtools_dir <- file.path(programs_dir, "bedtools-v2.30.0", "bin") blast_dir <- file.path(programs_dir, "ncbi-blast-2.15.0+", "bin") diamond <- file.path(programs_dir, "diamond-2.1.8") fastaFromBed <- file.path(bedtools_dir, "fastaFromBed") samtools_dir <- file.path(programs_dir, "samtools-1.12") samtools <- file.path(samtools_dir, "samtools") # FILES count_matrix <- file.path(top_output_dir, "06.2-Peve-Hisat/gene_count_matrix.csv") diamond_db <- "20250620-diamond" diamond_output <- file.path(output_dir, "Pevermanni-expressed-genes.blastx.outfmt6") genome_fasta <- file.path(data_dir, "Porites_evermanni_v1.fa") genes_fasta <- file.path(output_dir, "Pevermanni-genes.fasta") genes_fasta_index <- file.path(output_dir, "Pevermanni-genes.fasta.fai") genes_subset_fasta <- file.path(output_dir, "Pevermanni-subset-genes.fasta") genes_subset_fasta_index <- 
file.path(output_dir, "Pevermanni-subset-genes.fasta.fai") genes_bed <- file.path(output_dir, "Pevermanni-genes.bed") og_genome_gff <- file.path(data_dir, "Porites_evermanni_validated.gff3") # THREADS threads <- "40" ##### Official GO info - no need to change ##### goslims_obo <- "goslim_generic.obo" goslims_url <- "http://current.geneontology.org/ontology/subsets/goslim_generic.obo" # FORMATTING line <- "-----------------------------------------------" # Export these as environment variables for bash chunks. Sys.setenv( blastdbs_dir = blastdbs_dir, count_matrix = count_matrix, data_dir = data_dir, diamond = diamond, diamond_db = diamond_db, diamond_output = diamond_output, fastaFromBed = fastaFromBed, genes_bed = genes_bed, genes_fasta = genes_fasta, genes_fasta_index = genes_fasta_index, genes_subset_fasta = genes_subset_fasta, genes_subset_fasta_index = genes_subset_fasta_index, genome_fasta = genome_fasta, og_genome_gff = og_genome_gff, output_dir = output_dir, top_output_dir = top_output_dir, line = line, samtools = samtools, threads = threads ) ``` # Extract genes as FastA ## Extract genes from GFF ```{r extract-genes-from-gff} # Read GFF, skipping comment lines genome_gff <- readr::read_tsv( og_genome_gff, comment = "#", col_names = c("seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes") ) # Filter for gene features genes <- as.data.frame(genome_gff) %>% filter(type == "gene") %>% mutate( chrom = seqid, start = start - 1, # BED is 0-based end = end, gene_id = sub("ID=([^;]+);?.*", "\\1", attributes) ) %>% dplyr::select(chrom, start, end, gene_id) str(genes) ``` ### Write genes to BED ```{r write-genes-to-bed} # Write to BED write_tsv(genes, genes_bed, col_names = FALSE) ``` ## Create genes FastA ```{r create-genes-FastA, engine='bash'} "${fastaFromBed}" -fi "${genome_fasta}" -bed "${genes_bed}" -nameOnly > "${genes_fasta}" # Create FastA index "${samtools}" faidx "${genes_fasta}" head "${genes_fasta}" echo "" echo "" head "${genes_fasta_index}" ``` # Subset Expressed Genes Only those with at least *one* read in each sample ### Peek at count matrix ```{r check-count-matrix, engine='bash'} head "${count_matrix}" | column -t -s"," echo "" echo "" wc -l "${count_matrix}" ``` ## Import count matrix ```{r import-count-matrix} # Read the data into a data frame count_matrix_df <- read.csv(count_matrix, header = TRUE) str(count_matrix_df) ``` ## Only genes with at least one read per sample ```{r subset-count-matrix} # Filter rows where all values are greater than 0 filtered_count_matrix_df <- count_matrix_df[apply(count_matrix_df > 0, 1, all), ] str(filtered_count_matrix_df) ``` ## Subset genes FastA Only expressed genes ```{r subset-genes-FastA} # Get the row names (gene_ids) of the filtered data frame filtered_gene_ids <- filtered_count_matrix_df$gene_id fasta <- readDNAStringSet(genes_fasta) subset_fasta <- fasta[names(fasta) %in% filtered_gene_ids] ``` ```{r write-genes-subset-fasta} writeXStringSet(subset_fasta, genes_subset_fasta) ``` ### Peek at genes subset FastA ```{r check-genes-subset-fasta, engine='bash'} head "${genes_subset_fasta}" echo "" grep "^>" --count "${genes_subset_fasta}" ``` # BLASTx ## Download SwissProt ```{r download-sp, engine='bash'} cd "${blastdbs_dir}" curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz mv uniprot_sprot.fasta.gz 20250618-uniprot_sprot.fasta.gz gunzip --keep 20250618-uniprot_sprot.fasta.gz ``` ## Create BLASTdb ```{r create-BLASTdb, engine='bash'} cd 
"${blastdbs_dir}" "${diamond}" makedb \ --in 20250618-uniprot_sprot.fasta.gz \ --db "${diamond_db}" \ --quiet \ --threads "${threads}" ``` ## Run DIAMOND BLASTx ```{r diamond-blast, engine='bash'} "${diamond}" blastx \ --db "${blastdbs_dir}"/"${diamond_db}.dmnd" \ --query "${genes_subset_fasta}" \ --out "${diamond_output}" \ --outfmt 6 \ --sensitive \ --evalue 1e-10 \ --max-target-seqs 1 \ --block-size 15.0 \ --index-chunks 4 \ --threads "${threads}" \ 2> "${output_dir}"/diamond-blastx.log head "${diamond_output}" echo "" wc -l "${diamond_output}" ``` # GENE ONTOLOGY ## Get gene IDs and SwissProt IDs ```{r gene-and-SPIDs, engine='bash'} awk -F"|" '{print $1"\t"$2}' "${diamond_output}" \ | awk '{print $1"\t"$3}' \ > "${output_dir}"/gene-SPIDs.txt head "${output_dir}"/gene-SPIDs.txt echo "" echo "" wc -l "${output_dir}"/gene-SPIDs.txt ``` ## Get SwissProt IDs ```{r get-SPIDs, engine='bash'} awk -F"|" '{print $2}' "${diamond_output}" \ | sort --unique \ > "${output_dir}"/SPIDs.txt head "${output_dir}"/SPIDs.txt echo "" echo "" wc -l "${output_dir}"/SPIDs.txt ``` ## Retrieve UniProt records A difference in number of records could be due to retrieval from only "reviewed" records, while BLAST may have included both "reviewed" and "unreviewed." SwissProt records ```{r uniprot-records, engine='bash'} python3 \ ../../M-multi-species/code/uniprot-retrieval.py \ "${output_dir}"/SPIDs.txt \ "${output_dir}"/ gunzip "${output_dir}"/uniprot-retrieval.tsv.gz echo "" echo "" wc -l "${output_dir}"/uniprot-retrieval.tsv ``` ## Map GO to Genes ```{r merge-genes-GOIDs} # Read gene-SPIDs.txt gene_spids <- read.delim(file.path(output_dir, "gene-SPIDs.txt"), header = FALSE, stringsAsFactors = FALSE) colnames(gene_spids) <- c("GeneID", "Entry") # Read uniprot-retrieval.tsv uniprot <- read.delim( file.path(output_dir, "uniprot-retrieval.tsv"), header = TRUE, stringsAsFactors = FALSE, check.names = FALSE ) # Merge on Entry gene_SPID_GOID_merged <- merge(gene_spids, uniprot[, c("Entry", "Gene Ontology IDs")], by = "Entry", all.x = TRUE) # View result str(gene_SPID_GOID_merged) ``` ### Write merged to file ```{r write-merged-gene-SP-GO} # Optionally, write to file write.table(gene_SPID_GOID_merged, file = file.path(output_dir, "gene-SPIDs-GOIDs.tsv"), sep = "\t", row.names = FALSE, quote = FALSE) ``` ## Clean up merged ```{r remove-NA-and-uniprotIDs, eval=TRUE} full.gene.df <- as.data.frame(gene_SPID_GOID_merged) # Clean whitespace, filter NA/empty rows, select columns, and split GO terms using column name variables gene.GO.df <- full.gene.df %>% mutate(!!"Gene Ontology IDs" := str_replace_all(.data[["Gene Ontology IDs"]], "\\s*;\\s*", ";")) %>% # Clean up spaces around ";" filter(!is.na(.data[["GeneID"]]) & !is.na(.data[["Gene Ontology IDs"]]) & .data[["Gene Ontology IDs"]] != "") %>% dplyr::select(all_of(c("GeneID", "Gene Ontology IDs"))) str(gene.GO.df) ``` ## Flatten gene GOID file ```{r flatten-gene-and-GO-IDs, eval=TRUE} flat.gene.GO.df <- gene.GO.df %>% separate_rows(!!sym("Gene Ontology IDs"), sep = ";") str(flat.gene.GO.df) ``` ## Group by GOID ```{r group-by-GO, eval=TRUE} grouped.gene.GO.df <- flat.gene.GO.df %>% group_by(!!sym("Gene Ontology IDs")) %>% summarise(!!"GeneID" := paste(.data[["GeneID"]], collapse = ",")) str(grouped.gene.GO.df) ``` ## Vectorize GOIDs ```{r vectorize-GOIDs, eval=TRUE} # Vector of GO IDs go_ids <- grouped.gene.GO.df[["Gene Ontology IDs"]] str(go_ids) ``` ## Prepare GOslim OBO ```{r download-generic-goslim-obo, eval=TRUE} # Find GSEAbase installation location gseabase_location <- 
find.package("GSEABase") # Load path to GOslim OBO file goslim_obo_destintation <- file.path(gseabase_location, "extdata", goslims_obo, fsep = "/") # Download the GOslim OBO file download.file(url = goslims_url, destfile = goslim_obo_destintation) # Loads package files gseabase_files <- system.file("extdata", goslims_obo, package="GSEABase") ``` ## GOslims from OBO ```{r extract-GOslims-from-OBO, eval=TRUE} # Create GSEAbase GOCollection using `go_ids` myCollection <- GOCollection(go_ids) # Retrieve GOslims from GO OBO file set slim <- getOBOCollection(gseabase_files) str(slim) ``` ## Biological Process GOslims ```{r retrieve-BP-GOslims, eval=TRUE} # Retrieve Biological Process (BP) GOslims slimdf <- goSlim(myCollection, slim, "BP", verbose) str(slimdf) ``` ## Map GO to GOslims ```{r map-GO-to-GOslims, eval=TRUE} # List of GOslims and all GO IDs from `go_ids` gomap <- as.list(GOBPOFFSPRING[rownames(slimdf)]) # Maps `go_ids` to matching GOslims mapped <- lapply(gomap, intersect, ids(myCollection)) # Append all mapped GO IDs to `slimdf` # `sapply` needed to apply paste() to create semi-colon delimited values slimdf$GO.IDs <- sapply(lapply(gomap, intersect, ids(myCollection)), paste, collapse=";") # Remove "character(0) string from "GO.IDs" column slimdf$GO.IDs[slimdf$GO.IDs == "character(0)"] <- "" # Add self-matching GOIDs to "GO.IDs" column, if not present for (go_id in go_ids) { # Check if the go_id is present in the row names if (go_id %in% rownames(slimdf)) { # Check if the go_id is not present in the GO.IDs column # Also removes white space "trimws()" and converts all to upper case to handle # any weird, "invisible" formatting issues. if (!go_id %in% trimws(toupper(strsplit(slimdf[go_id, "GO.IDs"], ";")[[1]]))) { # Append the go_id to the GO.IDs column with a semi-colon separator if (length(slimdf$GO.IDs) > 0 && nchar(slimdf$GO.IDs[nrow(slimdf)]) > 0) { slimdf[go_id, "GO.IDs"] <- paste0(slimdf[go_id, "GO.IDs"], "; ", go_id) } else { slimdf[go_id, "GO.IDs"] <- go_id } } } } str(slimdf) ``` ## Flatten GOslims ```{r flatten-GOslims-file, eval=TRUE} # "Flatten" file so each row is single GO ID with corresponding GOslim # rownames_to_column needed to retain row name info slimdf_separated <- as.data.frame(slimdf %>% rownames_to_column('GOslim') %>% separate_rows(GO.IDs, sep = ";")) # Group by unique GO ID grouped_slimdf <- slimdf_separated %>% filter(!is.na(GO.IDs) & GO.IDs != "") %>% group_by(GO.IDs) %>% summarize(GOslim = paste(GOslim, collapse = ";"), Term = paste(Term, collapse = ";")) str(grouped_slimdf) ``` ## Counts of GOslims ```{r sort-and-select-slimdf-counts, eval=TRUE} slimdf.sorted <- slimdf %>% arrange(desc(Count)) slim.count.df <- slimdf.sorted %>% dplyr::select(Term, Count, Percent) str(slim.count.df) ``` ### Write GOslims to file Need to create a column name for GOslimIDs from data frame rownames. ```{r write-goslim-counts-to-file} # Create header vector header <- c("GOslimID", colnames(slim.count.df)) # Write header to file writeLines(paste(header, collapse = "\t"), file.path(output_dir, "GOslim-counts.tsv")) # Append data frame contents to existing file, which contains header info write.table( slim.count.df, file = file.path(output_dir, "GOslim-counts.tsv"), sep = "\t", row.names = TRUE, quote = FALSE, col.names = FALSE, append = TRUE ) ``` # REFERENCES