---
title: "07 deg list"
output: html_document
date:
---
Merge DEG data with GO annotation data.
```{r}
# Load necessary libraries
library(readr)
library(readxl)
library(tidyverse)
library(genefilter) #for pOverA filtering
#install.packages("ComplexHeatmap")
library(ComplexHeatmap)
# Read the GO-term annotation table. The first column is dropped by position
# in every input file below (presumably a saved row index -- confirm).
annot_GO_terms_host <- read_csv("../data/annot_GO.terms.host.csv") %>%
  select(-1)

# Read the full annotation and trim the transcript suffix (".t" and everything
# after it) from gene_id before it is used as a join key.
full_annot <- read_csv("../data/pver_annot_full.csv") %>%
  select(-1) %>%
  mutate(gene_id = str_remove(gene_id, "\\.t.*$"))

# Significant differential-expression results, plus the upregulated subset.
DEGs <- read_csv("../data/DEGSeq2.sig.results.host.csv") %>%
  select(-1)
DEGs_UP <- filter(DEGs, direction == "Upregulated")

# DEGs joined to their GO terms. Rows whose GO.ID is "unknown" are removed;
# filter() also drops rows where GO.ID is NA, i.e. DEGs with no annotation row.
merged_data <- DEGs %>%
  left_join(annot_GO_terms_host, by = "gene_id") %>%
  filter(GO.ID != "unknown")
write_csv(merged_data, "../data/merged_DEG_GO_data.csv")

# All DEGs joined to the full annotation (unannotated DEGs are kept).
merged_data_full <- DEGs %>%
  left_join(full_annot, by = "gene_id")
write_csv(merged_data_full, "../data/merged_DEG_full_annot.csv")

# Upregulated DEGs joined to the full annotation (kept in memory only).
merged_data_full_UP <- DEGs_UP %>%
  left_join(full_annot, by = "gene_id")
```
```{r}
library(stringr)

# Split the "; "-separated SWISS_GO.Names annotation of every DEG into
# individual entries. Unannotated genes contribute NA entries.
split_SWISS_names <- unlist(str_split(merged_data_full$SWISS_GO.Names, "; "))

# Keep only entries tagged "P:" (presumably the biological-process category --
# confirm against the annotation source). The tag is anchored to the start of
# the entry: an unanchored "P:" also matches inside names from other
# categories, e.g. "F:ATP:ADP antiporter activity", and would overcount.
is_P_term <- str_detect(split_SWISS_names, "^\\s*P:")
split_SWISS_names_P <- split_SWISS_names[!is.na(is_P_term) & is_P_term]

# Total number of "P:" entries across all DEGs (a term shared by several genes
# is counted once per gene). Derived from the filtered vector so that it
# always agrees with the term list written below.
count_P <- length(split_SWISS_names_P)
# Print the result
count_P

# Number of distinct "P:" terms
unique_SWISS_names_P <- unique(split_SWISS_names_P)
length(unique_SWISS_names_P)

# Make sure the output folder exists so write.csv() cannot fail on a fresh clone.
dir.create("../output/07-degs", recursive = TRUE, showWarnings = FALSE)
write.csv(unique_SWISS_names_P, file = "../output/07-degs/unique_P_SWISS_terms.csv", row.names = FALSE)
```
```{r}
# Tally how often each "P:" term occurs and list the most frequent first.
# The result has one column named after the input vector and a Freq column.
summary_P_SWISS <- table(split_SWISS_names_P) %>%
  as.data.frame() %>%
  arrange(desc(Freq))
print(summary_P_SWISS)
write.csv(
  summary_P_SWISS,
  file = "../output/07-degs/counted_P_SWISS_terms.csv",
  row.names = FALSE
)
```
```{r}
library(stringr)

# Same tally as for all DEGs, restricted to the upregulated DEGs.
# NOTE: this reuses (and overwrites) split_SWISS_names, split_SWISS_names_P
# and unique_SWISS_names_P from the all-DEG chunk.
split_SWISS_names <- unlist(str_split(merged_data_full_UP$SWISS_GO.Names, "; "))

# Keep only entries tagged "P:" (presumably the biological-process category --
# confirm against the annotation source). The tag is anchored to the start of
# the entry: an unanchored "P:" also matches inside names from other
# categories, e.g. "F:ATP:ADP antiporter activity", and would overcount.
is_P_term <- str_detect(split_SWISS_names, "^\\s*P:")
split_SWISS_names_P <- split_SWISS_names[!is.na(is_P_term) & is_P_term]

# Total number of "P:" entries across the upregulated DEGs. Derived from the
# filtered vector so that it always agrees with the term list written below.
count_P_UP <- length(split_SWISS_names_P)
# Print the result
count_P_UP

# Number of distinct "P:" terms
unique_SWISS_names_P <- unique(split_SWISS_names_P)
length(unique_SWISS_names_P)

# Make sure the output folder exists so write.csv() cannot fail on a fresh clone.
dir.create("../output/07-degs", recursive = TRUE, showWarnings = FALSE)
write.csv(unique_SWISS_names_P, file = "../output/07-degs/unique_P_SWISS_terms_Upregulated.csv", row.names = FALSE)
```
```{r}
# Tally how often each "P:" term occurs and list the most frequent first.
# split_SWISS_names_P was last assigned from the upregulated DEGs, so this
# overwrites summary_P_SWISS with the upregulated-only counts.
summary_P_SWISS <- table(split_SWISS_names_P) %>%
  as.data.frame() %>%
  arrange(desc(Freq))
print(summary_P_SWISS)
write.csv(
  summary_P_SWISS,
  file = "../output/07-degs/counted_P_SWISS_terms_Upregulated.csv",
  row.names = FALSE
)
```
Load in count data.
```{r}
# Read the gene count matrix; gene identifiers are in the gene_id column.
count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv")
all_genes <- count_mat$gene_id
#count_mat <- count_mat %>% select(-gene_id)
#rownames(count_mat) <- all_genes

# Clean up the column names by stripping the pipeline file suffix.
# fixed = TRUE treats the suffix as a literal string; without it every "." in
# the pattern is a regex wildcard that matches any character.
colnames(count_mat) <- gsub(
  "_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf",
  "",
  colnames(count_mat),
  fixed = TRUE
)
dim(count_mat)
```
## pOverA filtering
```{r}
# Build the filter: pOverA(p, A) keeps a gene when at least a proportion p of
# its values exceed A -- here, more than 10 counts in at least 25% of samples.
ffun <- filterfun(pOverA(p = 0.25, A = 10))

# The filter is evaluated on the numeric count columns only.
counts <- select(count_mat, -gene_id)
keep_gene <- genefilter(counts, ffun) # one TRUE/FALSE per gene (row)
sum(keep_gene) # number of genes that pass the filter

# Keep only the rows that passed; count_mat is replaced by the filtered table.
count_mat_poa <- count_mat[keep_gene, ]
count_mat <- count_mat_poa
```
The gene names in the count matrix and the BLAST data frames do not match because of naming errors made by the authors of the [Pver genome paper](https://academic.oup.com/gbe/article/12/10/1911/5898631?login=false#supplementary-data). Their supplementary file lists both naming iterations, so it is used here to make the gene names consistent across data frames.
Read in file with gene name iterations
```{r}
# Lookup table between the two gene-naming schemes (columns Query and Gene);
# the first 4 rows of the spreadsheet are skipped.
names <- select(
  read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4),
  Query, Gene
)

# Attach the alternative name (Query) to each count row, then strip a trailing
# ".t1" so Query can be matched against the DEG gene IDs.
# NOTE(review): full_join() also adds a row for every Gene in the lookup table
# that is absent from the filtered count matrix; those rows have NA in all
# count columns. If only filtered genes are wanted, left_join() may be the
# intended join -- confirm before changing.
count_mat <- count_mat %>%
  full_join(names, by = c("gene_id" = "Gene")) %>%
  mutate(Query = str_remove(Query, "\\.t1$"))
```
```{r}
# Count rows for the GO-annotated DEGs, matched through the Query name.
deg_counts <- count_mat %>%
  filter(Query %in% unique(merged_data$gene_id))

# Convert the sample columns to a numeric matrix (identifier columns removed).
expression_data_deg <- deg_counts %>%
  select(-c("gene_id", "Query")) %>%
  as.matrix()

# Row-wise Z-scores: scale() standardises columns, so transpose before and after.
z_score_matrix <- t(scale(t(expression_data_deg)))

# Replace NA/NaN with 0. is.na() is TRUE for both NA and NaN, whereas is.nan()
# would leave plain NA values (e.g. rows with missing counts) in the matrix.
z_score_matrix[is.na(z_score_matrix)] <- 0

# Label rows with gene IDs so the heatmap can actually display gene names
# (same labelling as in the per-GO-term loop).
rownames(z_score_matrix) <- deg_counts$gene_id
```
### heatmap
```{r}
# Build the DEG heatmap with ComplexHeatmap.
#   mat             numeric matrix of row Z-scores (genes x samples)
#   cluster_samples TRUE to cluster and reorder the sample columns, FALSE to
#                   keep them in their original order
# Returns the Heatmap object; it is drawn when printed at top level.
draw_deg_heatmap <- function(mat, cluster_samples) {
  Heatmap(
    mat,
    na_col = "black",
    # Internal heatmap name, kept unchanged; the legend title is set below.
    name = "Gene Count",
    row_title = "DEGs",
    column_title = "Samples",
    show_row_names = TRUE,
    show_column_names = TRUE,
    cluster_rows = TRUE,
    cluster_columns = cluster_samples,
    row_dend_reorder = TRUE,
    column_dend_reorder = cluster_samples,
    column_names_gp = gpar(fontsize = 6),
    # The matrix holds row Z-scores, not raw counts, so label the legend as such.
    heatmap_legend_param = list(title = "Row Z-score")
  )
}

# Heatmap 1: genes and samples both clustered.
draw_deg_heatmap(z_score_matrix, cluster_samples = TRUE)

# Heatmap 2: genes clustered, samples kept in their original column order.
draw_deg_heatmap(z_score_matrix, cluster_samples = FALSE)
```
```{r}
# Enriched GO terms of interest. go_enriched (IDs) and go_enriched_names
# (labels) are parallel vectors: element i of one describes element i of the
# other, so keep both in the same order when editing.
go_enriched <- c(
  "GO:0010884",
  "GO:1904659",
  "GO:0005536",
  "GO:0005355",
  "GO:0015149",
  "GO:0015145",
  "GO:0051119",
  "GO:0015144",
  "GO:0045721"
)
go_enriched_names <- c(
  "positive regulation of lipid storage",
  "glucose transmembrane transport",
  "glucose binding",
  "glucose transmembrane transporter activity",
  "hexose transmembrane transporter activity",
  "monosaccharide transmembrane transporter activity",
  "sugar transmembrane transporter activity",
  "carbohydrate transmembrane transporter activity",
  "negative regulation of gluconeogenesis"
)
```
## GO terms enriched in DEGs
```{r}
# Draw one Z-score heatmap per enriched GO term, showing the DEGs annotated
# with that term. go_enriched and go_enriched_names are read in parallel.
for (i in seq_along(go_enriched)) {
  go_term <- go_enriched[i]
  print(go_enriched_names[i])

  # DEG annotation rows for the current GO term
  merged_go_interest <- merged_data %>% filter(GO.ID == go_term)

  # Count rows for those genes, matched through the Query name
  expression_data_deg <- count_mat %>%
    filter(Query %in% unique(merged_go_interest$gene_id))

  # Nothing to plot when no gene in the count matrix carries this term.
  if (nrow(expression_data_deg) == 0) {
    message("No DEGs found in the count matrix for ", go_term, "; skipping heatmap.")
    next
  }

  # Convert the sample columns to a numeric matrix (identifier columns removed)
  expression_data <- as.matrix(expression_data_deg %>% select(-c("gene_id", "Query")))

  # Row-wise Z-scores: scale() standardises columns, so transpose before and after
  z_score_matrix <- t(scale(t(expression_data)))

  # Replace NA/NaN with 0. is.na() is TRUE for both NA and NaN, whereas
  # is.nan() would leave plain NA values in the matrix.
  z_score_matrix[is.na(z_score_matrix)] <- 0

  # Set row names to gene IDs
  rownames(z_score_matrix) <- expression_data_deg$gene_id

  # A single gene cannot be clustered, so only request row clustering when
  # there are at least two rows.
  can_cluster_rows <- nrow(z_score_matrix) > 1

  # Heatmap objects are not auto-drawn inside a loop, hence the explicit print().
  print(Heatmap(
    z_score_matrix,
    na_col = "black",
    # Internal heatmap name, kept unchanged; the legend title is set below.
    name = "Gene Count",
    row_title = paste("DEGs for", go_enriched_names[i]),
    column_title = "Samples",
    show_row_names = TRUE,
    show_column_names = TRUE,
    cluster_rows = can_cluster_rows,
    cluster_columns = FALSE, # keep samples in their original order
    row_dend_reorder = can_cluster_rows,
    column_dend_reorder = FALSE,
    column_names_gp = gpar(fontsize = 6),
    # The matrix holds row Z-scores, not raw counts, so label the legend as such.
    heatmap_legend_param = list(title = "Row Z-score")
  ))
}
```