--- title: "08-cod-RNAseq-GO-annotation" author: "Kathleen Durkin" date: "2024-04-15" always_allow_html: true output: bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true github_document: toc: true toc_depth: 3 number_sections: true html_preview: true --- ```{r setup, include=FALSE} library(knitr) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = TRUE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` This code depends on the database (`G_macrocephalus_IDmapping_2024_04_17.tab`) of *G.macrocephalus* transcript IDs and associated GO terms generated in 03-transcriptome-annotation.Rmd, AND on lists of differentially expressed genes (DEGs) generated in 07-cod-RNAseq-DESeq2.Rmd. This code identifies overrepresented gene ontology (GO) terms in *G.macrocephalus* RNA sequencing data Inputs: - transcript ID/GO term database (`G_macrocephalus_rna_IDmapping_2024_04_17.tab`) - DEGs In 03-transcriptome-annotation.Rmd we used BLAST and Uniprot/SwissProt to obtain UniProt Accession numbers for each transcript in the *G. macrocephalus* transcriptome. We then used those Accession numbers to obtain GO terms for each transcript, functionally creating a database which matches *G.macrocephalus* transcripts with associated GO terms. In 07-cod-RNAseq-DESeq2.Rmd we identified sets of significantly differentially expressed genes (DEGs) between pairs of treatments (e.g., Liver tissue for 9C vs. 16C). Now we want to use the database to identify GO terms associated with our lists of DEGs, and to evaluate which GO terms are overrepresented. https://wikis.utexas.edu/display/bioiteam/GO+Enrichment+using+goseq#:~:text=goseq%20is%20an%20R%20package,in%20our%20differentially%20expressed%20genes. https://bioconductor.org/packages/devel/bioc/vignettes/goseq/inst/doc/goseq.pdf ```{r load-packages, eval=TRUE} # List of packages we want to install (run every time) library(tidyverse) library(dplyr) library(magrittr) library(knitr) library(ggvenn) ``` ```{r load-data} gmac_idmap <- read.csv("../output/03-transcriptome-annotation/G_macrocephalus_rna_IDmapping_2024_04_17.tab", sep='\t') %>% subset(select=-X) # remove superflous column containing rowIDs DEGs_L.9.0 <- read.csv("../output/07-cod-RNAseq-DESeq2/Gmac_DEGs_sig_L.9.0_norm.tab", sep='\t') %>% subset(select=-X) # remove superflous column containing rowIDs DEGs_L.9.5 <- read.csv("../output/07-cod-RNAseq-DESeq2/Gmac_DEGs_sig_L.9.5_norm.tab", sep='\t') %>% subset(select=-X) # remove superflous column containing rowIDs DEGs_L.9.16 <- read.csv("../output/07-cod-RNAseq-DESeq2/Gmac_DEGs_sig_L.9.16_norm.tab", sep='\t') %>% subset(select=-X) # remove superflous column containing rowIDs kallisto_counts_matrix <- read.csv("../output/06-cod-RNAseq-alignment/kallisto/kallisto.isoform.counts.matrix", sep='\t') kallisto_counts_matrix[,-1] <- round(kallisto_counts_matrix[,-1], digits = 0) # round counts to integers ``` # Annotate DEGs using GO terms ```{r join-DEG-GO-data} # All DEGs DEGs_GO_L.9.0 <- left_join(DEGs_L.9.0, gmac_idmap, by=c("gene" = "V1")) DEGs_GO_L.9.5 <- left_join(DEGs_L.9.5, gmac_idmap, by=c("gene" = "V1")) DEGs_GO_L.9.16 <- left_join(DEGs_L.9.16, gmac_idmap, by=c("gene" = "V1")) # Unique DEGs DEGs_GO_L.9.0_unique <- DEGs_GO_L.9.0 %>% distinct(gene, .keep_all = TRUE) DEGs_GO_L.9.5_unique <- DEGs_GO_L.9.5 %>% distinct(gene, .keep_all = TRUE) DEGs_GO_L.9.16_unique <- DEGs_GO_L.9.16 %>% distinct(gene, .keep_all = TRUE) # summarize counts print(paste("All DEGs:", nrow(DEGs_GO_L.9.0), "(9C v 0C),", nrow(DEGs_GO_L.9.5), "(9C v 5C),", nrow(DEGs_GO_L.9.16), "(9C v 16C)")) print(paste("Unique DEGs:", nrow(DEGs_GO_L.9.0_unique), "(9C v 0C),", nrow(DEGs_GO_L.9.5_unique), "(9C v 5C),", nrow(DEGs_GO_L.9.16_unique), "(9C v 16C)")) # Save write.table(DEGs_GO_L.9.0_unique, "../output/08-cod-RNAseq-GO-annotation/Gmac_DEGs_GO_L.9.0_unique.tab", sep = "\t", row.names = TRUE, col.names = NA) write.table(DEGs_GO_L.9.5_unique, "../output/08-cod-RNAseq-GO-annotation/Gmac_DEGs_GO_L.9.5_unique.tab", sep = "\t", row.names = TRUE, col.names = NA) write.table(DEGs_GO_L.9.16_unique, "../output/08-cod-RNAseq-GO-annotation/Gmac_DEGs_GO_L.9.16_unique.tab", sep = "\t", row.names = TRUE, col.names = NA) ``` ## Venn diagram of DEG counts accross treatments ```{r} # Make list deg_counts_unique <- list( "9C v 0C" = unique(DEGs_GO_L.9.0$gene), "9C v 5C" = unique(DEGs_GO_L.9.5$gene), "9C v 16C" = unique(DEGs_GO_L.9.16$gene) ) # Make venn diagrams ggvenn(deg_counts_unique, fill_color = c("blue", "yellow", "red")) ``` ## DEG bar plots ```{r} # Create a data frame with extracted columns combined_unique_DEG_counts <- data.frame( DEGs_count = c(length(unique(DEGs_GO_L.9.0$gene)), length(unique(DEGs_GO_L.9.5$gene)), length(unique(DEGs_GO_L.9.16$gene))), treatment = factor(c("9C v 0C", "9C v 5C", "9C v 16C"), levels=c("9C v 0C", "9C v 5C", "9C v 16C"))) # Assign colors custom_colors <- c("9C v 0C"="blue", "9C v 5C"="yellow", "9C v 16C"="red") # Create a bar plot ggplot(combined_unique_DEG_counts, aes(x = treatment, y = DEGs_count, fill = treatment)) + geom_col() + labs(title = "Bar Plot of Unique DEGs across Treatments", x = "DEGs", y = "Count") + scale_fill_manual(values = custom_colors) + geom_text(aes(label = DEGs_count), vjust = -0.5) + theme_minimal() ``` ## GO Enrichment First, isolate lists of Uniprot Accession numbers from each set off significant DEGs. ```{r get-uniprot-acession-nums-DEGs} uniprotAcessions_DEGs_L.9.0 <- na.omit(DEGs_GO_L.9.0_unique$V3) write.table(uniprotAcessions_DEGs_L.9.0, "../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.0.txt") uniprotAcessions_DEGs_L.9.5 <- na.omit(DEGs_GO_L.9.5_unique$V3) write.table(uniprotAcessions_DEGs_L.9.5, "../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.5.txt") uniprotAcessions_DEGs_L.9.16 <- na.omit(DEGs_GO_L.9.16_unique$V3) write.table(uniprotAcessions_DEGs_L.9.16, "../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.16.txt") ``` And reformat to get rid of superfluous characters ```{r format-acession-nums-DEGs, engine='bash'} # We need to 1) remove the first line, 2) get rid of the row numbers, and 3) remove extra " characters sed '1d' ../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.0.txt | \ awk '{print $2}' | \ tr -d '"' \ > ../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.0_formatted.txt sed '1d' ../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.5.txt | \ awk '{print $2}' | \ tr -d '"' \ > ../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.5_formatted.txt sed '1d' ../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.16.txt | \ awk '{print $2}' | \ tr -d '"' \ > ../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_DEGs_L.9.16_formatted.txt ``` We also want to get a list of Uniprot Accession numbers for our "background". This is a reference of all transcripts we would expect to find in our samples. We could just use IDs from our full reference transcriptome, but this may include transcripts we would not actually expect to find in our samples (e.g. the transcriptome may contain gonad-specific transcripts that would not appear in our liver tissue samples). To filter for only transcripts found in our samples, we can 1) filter our kallisto count matrix to retain only transcripts present in at least one sample, and then 2) join this filtered count matrix with our transcript/Uniprot ID database. ```{r combine-counts-and-IDmap} # Remove any transcripts that do not appear in any sample kallisto_counts_filtered <- kallisto_counts_matrix[rowSums(kallisto_counts_matrix[, -1] != 0) > 0, ] og_num <- nrow (kallisto_counts_matrix) filtered_num <- nrow(kallisto_counts_filtered) paste("Filtered kallisto counts matrix from ", og_num, "transcripts to ", filtered_num, "transcripts") # Join the filtered counts matrix and the transcript/Uniprot database counts_GO <- left_join(kallisto_counts_filtered, gmac_idmap, by=c("X" = "V1")) # Reorder columns to have gene name, GO annotation, and Uniprot ID next to each other counts_GO <- counts_GO[, c("Gene.Ontology..biological.process.", setdiff(names(counts_GO), "Gene.Ontology..biological.process."))] counts_GO <- counts_GO[, c("V3", setdiff(names(counts_GO), "V3"))] ``` ```{r get-uniprot-acession-nums-ref} uniprotAcessions_transcriptome_rna <- na.omit(counts_GO$V3) write.table(uniprotAcessions_transcriptome_rna, "../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_transcriptome_rna.txt") ``` ```{r format-acession-nums-ref, engine='bash'} # We need to 1) remove the first line, 2) get rid of the row numbers, and 3) remove extra " characters sed '1d' ../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_transcriptome_rna.txt | \ awk '{print $2}' | \ tr -d '"' \ > ../output/08-cod-RNAseq-GO-annotation/Gmac_uniprotAcessions_transcriptome_rna_formatted.txt ``` Now we can download these formatted lists of accession numbers and run them through DAVID to obtain lists of associated Uniprot keywords. Unfortunately I don't think this can be done from command line though, so I'll be using the online tool: https://david.ncifcrf.gov/tools.jsp I upload my three DEG lists (`Gmac_uniprotAcessions_DEGs_L.9.0_formatted.txt`, `Gmac_uniprotAcessions_DEGs_L.9.5_formatted.txt`, `Gmac_uniprotAcessions_DEGs_L.9.16_formatted.txt`) as "Gene Lists" and upload the filtered transcripts list (`Gmac_uniprotAcessions_transcriptome_rna_formatted.txt`) as the "Background", selecting "UNIPROT_ACCESSION" as the identifier for each. I analyzed each DEG list using the "Functional Annotation" tool, and dowloaded the Functional Annotation Table and Functional Annotation Chart for each (below) ```{r load-DAVID-files, engine='bash'} curl https://david.ncifcrf.gov/data/download/tr_EDDFE74F4DD11715194057713.txt > ../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAtable_L.9.0.tab curl https://david.ncifcrf.gov/data/download/chart_EDDFE74F4DD11715195127232.txt > ../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAchart_L.9.0.tab curl https://david.ncifcrf.gov/data/download/tr_EDDFE74F4DD11715194461132.txt > ../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAtable_L.9.5.tab curl https://david.ncifcrf.gov/data/download/chart_EDDFE74F4DD11715194590902.txt > ../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAchart_L.9.5.tab curl https://david.ncifcrf.gov/data/download/tr_EDDFE74F4DD11715193943129.txt > ../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAtable_L.9.16.tab curl https://david.ncifcrf.gov/data/download/chart_EDDFE74F4DD11715195175792.txt > ../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAchart_L.9.16.tab ``` ```{r load-David-files-in-R} DEGs_DAVID_FAtable_L.9.0 <- read.delim("../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAtable_L.9.0.tab", sep="\t") DEGs_DAVID_FAtable_L.9.5 <- read.delim("../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAtable_L.9.5.tab", sep="\t") DEGs_DAVID_FAtable_L.9.16 <- read.delim("../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAtable_L.9.16.tab", sep="\t") DEGs_DAVID_FAchart_L.9.0 <- read.delim("../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAchart_L.9.0.tab", sep="\t") DEGs_DAVID_FAchart_L.9.5 <- read.delim("../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAchart_L.9.5.tab", sep="\t") DEGs_DAVID_FAchart_L.9.16 <- read.delim("../output/08-cod-RNAseq-GO-annotation/Gmac_DAVID_FAchart_L.9.16.tab", sep="\t") ``` Now we can also make datasets that include differential expression stats (e.g., `padj`) AND high-level Uniprot keywords. ```{r join-DAVID-and-DEG} DEGs_DAVID_FAtable_GO_L.9.0 <- left_join(DEGs_GO_L.9.0_unique, DEGs_DAVID_FAtable_L.9.0, by=c("V3" = "ID")) DEGs_DAVID_FAtable_GO_L.9.5 <- left_join(DEGs_GO_L.9.5_unique, DEGs_DAVID_FAtable_L.9.5, by=c("V3" = "ID")) DEGs_DAVID_FAtable_GO_L.9.16 <- left_join(DEGs_GO_L.9.16_unique, DEGs_DAVID_FAtable_L.9.16, by=c("V3" = "ID")) ``` ### View top DEGs and annotations First we can look at the biological processes associated with our most significantly differentially expressed genes -- note that this is *NOT* making use of our enrichment analysis, it's just the GO terms associated with the top 25 DEGs ```{r view-top-DEG-GO} top_25_DEGs_L.9.0 <- head(na.omit(DEGs_DAVID_FAtable_GO_L.9.0[order(DEGs_DAVID_FAtable_GO_L.9.0$padj), ]), 25) top_25_DEGs_L.9.0$padj <- as.character(top_25_DEGs_L.9.0$padj) # prevents kable from auto-rounding our padj values kable(top_25_DEGs_L.9.0[, c("gene", "padj", "Gene.Ontology..biological.process.")], row.names = FALSE, caption = "Top 25 DEGs for 9C v 0C (based on adjusted p-value)") top_25_DEGs_L.9.5 <- head(na.omit(DEGs_DAVID_FAtable_GO_L.9.5[order(DEGs_DAVID_FAtable_GO_L.9.5$padj), ]), 25) top_25_DEGs_L.9.5$padj <- as.character(top_25_DEGs_L.9.5$padj) # prevents kable from auto-rounding our padj values kable(top_25_DEGs_L.9.5[, c("gene", "padj", "Gene.Ontology..biological.process.")], row.names = FALSE, caption = "Top 25 DEGs for 9C v 5C (based on adjusted p-value)") top_25_DEGs_L.9.16 <- head(na.omit(DEGs_DAVID_FAtable_GO_L.9.16[order(DEGs_DAVID_FAtable_GO_L.9.16$padj), ]), 25) top_25_DEGs_L.9.16$padj <- as.character(top_25_DEGs_L.9.16$padj) # prevents kable from auto-rounding our padj values kable(top_25_DEGs_L.9.16[, c("gene", "padj", "Gene.Ontology..biological.process.")], row.names = FALSE, caption = "Top 25 DEGs for 9C v 16C (based on adjusted p-value)") ``` ### View over-represented (enriched) processes #### GO terms First, let's grab just the GO terms for biological processes, and sort and filter by the Bonferroni-adjusted p-value ```{r select-enriched-GOBP} DEGs_DAVID_FAchart_L.9.0_enrichedGOBP <- DEGs_DAVID_FAchart_L.9.0 %>% filter(Category == "GOTERM_BP_DIRECT") %>% filter(Bonferroni < 0.05) %>% arrange(Bonferroni) DEGs_DAVID_FAchart_L.9.5_enrichedGOBP <- DEGs_DAVID_FAchart_L.9.5 %>% filter(Category == "GOTERM_BP_DIRECT") %>% filter(Bonferroni < 0.05) %>% arrange(Bonferroni) DEGs_DAVID_FAchart_L.9.16_enrichedGOBP <- DEGs_DAVID_FAchart_L.9.16 %>% filter(Category == "GOTERM_BP_DIRECT") %>% filter(Bonferroni < 0.05) %>% arrange(Bonferroni) ``` ```{r view-enriched-GOBP} DEGs_DAVID_FAchart_L.9.0_enrichedGOBP$Bonferroni <- as.character(DEGs_DAVID_FAchart_L.9.0_enrichedGOBP$Bonferroni) # prevents kable from auto-rounding our padj values kable(DEGs_DAVID_FAchart_L.9.0_enrichedGOBP[, c("Term", "Bonferroni")], row.names = FALSE, caption = "Enriched GO Biological Processes for 9C v 0C (based on adjusted p-value)") DEGs_DAVID_FAchart_L.9.5_enrichedGOBP$Bonferroni <- as.character(DEGs_DAVID_FAchart_L.9.5_enrichedGOBP$Bonferroni) # prevents kable from auto-rounding our padj values kable(DEGs_DAVID_FAchart_L.9.5_enrichedGOBP[, c("Term", "Bonferroni")], row.names = FALSE, caption = "Enriched GO Biological Processes for 9C v 5C (based on adjusted p-value)") DEGs_DAVID_FAchart_L.9.16_enrichedGOBP$Bonferroni <- as.character(DEGs_DAVID_FAchart_L.9.16_enrichedGOBP$Bonferroni) # prevents kable from auto-rounding our padj values kable(DEGs_DAVID_FAchart_L.9.16_enrichedGOBP[, c("Term", "Bonferroni")], row.names = FALSE, caption = "Enriched GO Biological Processes for 9C v 16C (based on adjusted p-value)") ``` #### Uniprot Keywords First, let's grab just the Uniprot keywords for biological processes, and sort and filter by the Bonferroni-adjusted p-value ```{r select-enriched-KWBP} DEGs_DAVID_FAchart_L.9.0_enrichedKWBP <- DEGs_DAVID_FAchart_L.9.0 %>% filter(Category == "UP_KW_BIOLOGICAL_PROCESS") %>% filter(Bonferroni < 0.05) %>% arrange(Bonferroni) DEGs_DAVID_FAchart_L.9.5_enrichedKWBP <- DEGs_DAVID_FAchart_L.9.5 %>% filter(Category == "UP_KW_BIOLOGICAL_PROCESS") %>% filter(Bonferroni < 0.05) %>% arrange(Bonferroni) DEGs_DAVID_FAchart_L.9.16_enrichedKWBP <- DEGs_DAVID_FAchart_L.9.16 %>% filter(Category == "UP_KW_BIOLOGICAL_PROCESS") %>% filter(Bonferroni < 0.05) %>% arrange(Bonferroni) ``` ```{r view-enriched-KWBP} DEGs_DAVID_FAchart_L.9.0_enrichedKWBP$Bonferroni <- as.character(DEGs_DAVID_FAchart_L.9.0_enrichedKWBP$Bonferroni) # prevents kable from auto-rounding our padj values kable(DEGs_DAVID_FAchart_L.9.0_enrichedKWBP[, c("Term", "Bonferroni")], row.names = FALSE, caption = "Enriched Biological Processes for 9C v 0C (based on adjusted p-value)") DEGs_DAVID_FAchart_L.9.5_enrichedKWBP$Bonferroni <- as.character(DEGs_DAVID_FAchart_L.9.5_enrichedKWBP$Bonferroni) # prevents kable from auto-rounding our padj values kable(DEGs_DAVID_FAchart_L.9.5_enrichedKWBP[, c("Term", "Bonferroni")], row.names = FALSE, caption = "Enriched Biological Processes for 9C v 5C (based on adjusted p-value)") DEGs_DAVID_FAchart_L.9.16_enrichedKWBP$Bonferroni <- as.character(DEGs_DAVID_FAchart_L.9.16_enrichedKWBP$Bonferroni) # prevents kable from auto-rounding our padj values kable(DEGs_DAVID_FAchart_L.9.16_enrichedKWBP[, c("Term", "Bonferroni")], row.names = FALSE, caption = "Enriched Biological Processes for 9C v 16C (based on adjusted p-value)") ```