merge DEG data with GO annotation data

# Load necessary libraries
library(readr)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(genefilter) #for pOverA filtering
## 
## Attaching package: 'genefilter'
## 
## The following object is masked from 'package:readr':
## 
##     spec
#install.packages("ComplexHeatmap")
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.16.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## 
## 
## Attaching package: 'ComplexHeatmap'
## 
## The following object is masked from 'package:genefilter':
## 
##     dist2
annot_GO_terms_host <- read_csv("../data/annot_GO.terms.host.csv") %>% select(-1)
## New names:
## Rows: 310402 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): gene_id, GO.ID dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
DEGs <- read_csv("../data/DEGSeq2.sig.results.host.csv") %>% select(-1)
## New names:
## Rows: 252 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): ...1, gene_id, direction dbl (6): baseMean, log2FoldChange, lfcSE, stat,
## pvalue, padj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Merge the data frames by 'gene_id'
merged_data <- left_join(DEGs, annot_GO_terms_host, by = "gene_id") %>% filter(GO.ID != "unknown")

write_csv(merged_data, "../data/merged_DEG_GO_data.csv")

load in count data

count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv")
## Rows: 27439 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): gene_id
## dbl (32): C17_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf, C18_R1....
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_genes <- count_mat$gene_id
#count_mat <- count_mat %>% select(-gene_id)
#rownames(count_mat) <- all_genes

# Clean up the column names
colnames(count_mat) <- gsub("_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf", "", colnames(count_mat))

dim(count_mat)
## [1] 27439    33

pOverA filtering

ffun<-filterfun(pOverA(0.25,10))  #set up filtering parameters
counts <- count_mat %>% select(-gene_id)
count_mat_poa <- genefilter((counts), ffun) #apply filter
sum(count_mat_poa) #count number of genes left
## [1] 21100
count_mat_poa <- count_mat[count_mat_poa,] #keep only rows that passed filter

count_mat <- count_mat_poa

The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the Pverr genome paper. They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df

Read in file with gene name iterations

names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>%
  select(Query, Gene)

count_mat <- count_mat %>% full_join(names, by = c("gene_id" = "Gene"))

count_mat <- count_mat %>% mutate(Query = str_remove(Query, "\\.t1$"))
expression_data_deg <- count_mat%>% filter(Query %in% unique(merged_data$gene_id)) %>% select(-c("gene_id","Query"))# Adjust the column selection if necessary

# Convert expression data to a matrix
expression_data_deg <- as.matrix(expression_data_deg)

# Calculate Z-scores for the expression matrix
z_score_matrix <- t(scale(t(expression_data_deg)))

# Replace NA/NaN with 0
z_score_matrix[is.nan(z_score_matrix)] = 0

heatmap

# Create the heatmap using ComplexHeatmap
Heatmap(z_score_matrix,
         na_col = "black",
         name = "Gene Count",         # Name for the heatmap color bar
         row_title = paste("DEGs"),               # Title for the row side
         column_title = "Samples",           # Title for the column side
         show_row_names = TRUE,              # Show gene names
         show_column_names = TRUE,           # Show sample names
         cluster_rows = TRUE,                # Cluster rows
         cluster_columns = TRUE,             # Cluster columns
         row_dend_reorder = TRUE,            # Reorder dendrogram based on clustering
         column_dend_reorder = TRUE, column_names_gp = gpar(fontsize = 6),
            # Reorder dendrogram based on clustering
         heatmap_legend_param = list(title = "Gene Count"))

# Create the heatmap using ComplexHeatmap
Heatmap(z_score_matrix,
         na_col = "black",
         name = "Gene Count",         # Name for the heatmap color bar
         row_title = paste("DEGs"),    # Title for the row side
         column_title = "Samples",           # Title for the column side
         show_row_names = TRUE,              # Show gene names
         show_column_names = TRUE,           # Show sample names
         cluster_rows = TRUE,                # Cluster rows
         cluster_columns = FALSE,             # Cluster columns
         row_dend_reorder = TRUE,            # Reorder dendrogram based on clustering
         column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6),
            # Reorder dendrogram based on clustering
         heatmap_legend_param = list(title = "Gene Count"))

go_enriched <- c("GO:0010884","GO:1904659","GO:0005536","GO:0005355","GO:0015149","GO:0015145","GO:0051119","GO:0015144","GO:0045721")
go_enriched_names <- c("positive regulation of lipid storage","glucose transmembrane transport","glucose binding","glucose transmembrane transporter activity","hexose transmembrane transporter activity","monosaccharide transmembrane transporter activity","sugar transmembrane transporter activity","carbohydrate transmembrane transporter activity","negative regulation of gluconeogenesis")

Go terms encriched by DEGs

# Loop over each GO term in go_enriched
for (i in seq_along(go_enriched)) {
  go_term <- go_enriched[i]
  print(go_enriched_names[i])
  
  # Filter the merged_data for the current GO term
  merged_go_interest <- merged_data %>% filter(GO.ID == go_term)
  
  # Filter count_mat for genes related to the current GO term
  expression_data_deg <- count_mat %>% 
    filter(Query %in% unique(merged_go_interest$gene_id)) # Adjust the column selection if necessary
  
  # Convert expression data to a matrix
  expression_data <- as.matrix(expression_data_deg %>% select(-c("gene_id", "Query")))
  
  # Calculate Z-scores for the expression matrix
  z_score_matrix <- t(scale(t(expression_data)))
  
  # Replace NA/NaN with 0
  z_score_matrix[is.nan(z_score_matrix)] <- 0
  
  # Set row names to gene IDs
  rownames(z_score_matrix) <- expression_data_deg$gene_id
  
  # Generate heatmaps for the current GO term
  print(Heatmap(z_score_matrix,
                na_col = "black",
                name = "Gene Count",         # Name for the heatmap color bar
                row_title = paste("DEGs for", go_enriched_names[i]),   # Title for the row side
                column_title = "Samples",    # Title for the column side
                show_row_names = TRUE,       # Show gene names
                show_column_names = TRUE,    # Show sample names
                cluster_rows = TRUE,         # Cluster rows
                cluster_columns = FALSE,     # Do not cluster columns
                row_dend_reorder = TRUE,     # Reorder dendrogram based on clustering
                column_dend_reorder = FALSE, 
                column_names_gp = gpar(fontsize = 6),
                heatmap_legend_param = list(title = "Gene Count")))
}
## [1] "positive regulation of lipid storage"

## [1] "glucose transmembrane transport"

## [1] "glucose binding"

## [1] "glucose transmembrane transporter activity"

## [1] "hexose transmembrane transporter activity"

## [1] "monosaccharide transmembrane transporter activity"

## [1] "sugar transmembrane transporter activity"

## [1] "carbohydrate transmembrane transporter activity"

## [1] "negative regulation of gluconeogenesis"