merge DEG data with GO annotation data
# Load necessary libraries
library(readr)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(genefilter) #for pOverA filtering
##
## Attaching package: 'genefilter'
##
## The following object is masked from 'package:readr':
##
## spec
#install.packages("ComplexHeatmap")
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.16.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
##
##
## Attaching package: 'ComplexHeatmap'
##
## The following object is masked from 'package:genefilter':
##
## dist2
annot_GO_terms_host <- read_csv("../data/annot_GO.terms.host.csv") %>% select(-1)
## New names:
## Rows: 310402 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): gene_id, GO.ID dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
DEGs <- read_csv("../data/DEGSeq2.sig.results.host.csv") %>% select(-1)
## New names:
## Rows: 252 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): ...1, gene_id, direction dbl (6): baseMean, log2FoldChange, lfcSE, stat,
## pvalue, padj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Merge the data frames by 'gene_id'
merged_data <- left_join(DEGs, annot_GO_terms_host, by = "gene_id") %>% filter(GO.ID != "unknown")
write_csv(merged_data, "../data/merged_DEG_GO_data.csv")
load in count data
count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv")
## Rows: 27439 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene_id
## dbl (32): C17_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf, C18_R1....
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_genes <- count_mat$gene_id
#count_mat <- count_mat %>% select(-gene_id)
#rownames(count_mat) <- all_genes
# Clean up the column names
colnames(count_mat) <- gsub("_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf", "", colnames(count_mat))
dim(count_mat)
## [1] 27439 33
ffun<-filterfun(pOverA(0.25,10)) #set up filtering parameters
counts <- count_mat %>% select(-gene_id)
count_mat_poa <- genefilter((counts), ffun) #apply filter
sum(count_mat_poa) #count number of genes left
## [1] 21100
count_mat_poa <- count_mat[count_mat_poa,] #keep only rows that passed filter
count_mat <- count_mat_poa
The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the Pverr genome paper. They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df
Read in file with gene name iterations
names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>%
select(Query, Gene)
count_mat <- count_mat %>% full_join(names, by = c("gene_id" = "Gene"))
count_mat <- count_mat %>% mutate(Query = str_remove(Query, "\\.t1$"))
expression_data_deg <- count_mat%>% filter(Query %in% unique(merged_data$gene_id)) %>% select(-c("gene_id","Query"))# Adjust the column selection if necessary
# Convert expression data to a matrix
expression_data_deg <- as.matrix(expression_data_deg)
# Calculate Z-scores for the expression matrix
z_score_matrix <- t(scale(t(expression_data_deg)))
# Replace NA/NaN with 0
z_score_matrix[is.nan(z_score_matrix)] = 0
# Create the heatmap using ComplexHeatmap
Heatmap(z_score_matrix,
na_col = "black",
name = "Gene Count", # Name for the heatmap color bar
row_title = paste("DEGs"), # Title for the row side
column_title = "Samples", # Title for the column side
show_row_names = TRUE, # Show gene names
show_column_names = TRUE, # Show sample names
cluster_rows = TRUE, # Cluster rows
cluster_columns = TRUE, # Cluster columns
row_dend_reorder = TRUE, # Reorder dendrogram based on clustering
column_dend_reorder = TRUE, column_names_gp = gpar(fontsize = 6),
# Reorder dendrogram based on clustering
heatmap_legend_param = list(title = "Gene Count"))
# Create the heatmap using ComplexHeatmap
Heatmap(z_score_matrix,
na_col = "black",
name = "Gene Count", # Name for the heatmap color bar
row_title = paste("DEGs"), # Title for the row side
column_title = "Samples", # Title for the column side
show_row_names = TRUE, # Show gene names
show_column_names = TRUE, # Show sample names
cluster_rows = TRUE, # Cluster rows
cluster_columns = FALSE, # Cluster columns
row_dend_reorder = TRUE, # Reorder dendrogram based on clustering
column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6),
# Reorder dendrogram based on clustering
heatmap_legend_param = list(title = "Gene Count"))
go_enriched <- c("GO:0010884","GO:1904659","GO:0005536","GO:0005355","GO:0015149","GO:0015145","GO:0051119","GO:0015144","GO:0045721")
go_enriched_names <- c("positive regulation of lipid storage","glucose transmembrane transport","glucose binding","glucose transmembrane transporter activity","hexose transmembrane transporter activity","monosaccharide transmembrane transporter activity","sugar transmembrane transporter activity","carbohydrate transmembrane transporter activity","negative regulation of gluconeogenesis")
# Loop over each GO term in go_enriched
for (i in seq_along(go_enriched)) {
go_term <- go_enriched[i]
print(go_enriched_names[i])
# Filter the merged_data for the current GO term
merged_go_interest <- merged_data %>% filter(GO.ID == go_term)
# Filter count_mat for genes related to the current GO term
expression_data_deg <- count_mat %>%
filter(Query %in% unique(merged_go_interest$gene_id)) # Adjust the column selection if necessary
# Convert expression data to a matrix
expression_data <- as.matrix(expression_data_deg %>% select(-c("gene_id", "Query")))
# Calculate Z-scores for the expression matrix
z_score_matrix <- t(scale(t(expression_data)))
# Replace NA/NaN with 0
z_score_matrix[is.nan(z_score_matrix)] <- 0
# Set row names to gene IDs
rownames(z_score_matrix) <- expression_data_deg$gene_id
# Generate heatmaps for the current GO term
print(Heatmap(z_score_matrix,
na_col = "black",
name = "Gene Count", # Name for the heatmap color bar
row_title = paste("DEGs for", go_enriched_names[i]), # Title for the row side
column_title = "Samples", # Title for the column side
show_row_names = TRUE, # Show gene names
show_column_names = TRUE, # Show sample names
cluster_rows = TRUE, # Cluster rows
cluster_columns = FALSE, # Do not cluster columns
row_dend_reorder = TRUE, # Reorder dendrogram based on clustering
column_dend_reorder = FALSE,
column_names_gp = gpar(fontsize = 6),
heatmap_legend_param = list(title = "Gene Count")))
}
## [1] "positive regulation of lipid storage"
## [1] "glucose transmembrane transport"
## [1] "glucose binding"
## [1] "glucose transmembrane transporter activity"
## [1] "hexose transmembrane transporter activity"
## [1] "monosaccharide transmembrane transporter activity"
## [1] "sugar transmembrane transporter activity"
## [1] "carbohydrate transmembrane transporter activity"
## [1] "negative regulation of gluconeogenesis"