--- title: "07 deg list" output: html_document date: --- merge DEG data with GO annotation data ```{r} # Load necessary libraries library(readr) library(readxl) library(tidyverse) library(genefilter) #for pOverA filtering #install.packages("ComplexHeatmap") library(ComplexHeatmap) annot_GO_terms_host <- read_csv("../data/annot_GO.terms.host.csv") %>% select(-1) full_annot <- read_csv("../data/pver_annot_full.csv") %>% select(-1) %>% mutate(gene_id = str_remove(gene_id, "\\.t.*$")) DEGs <- read_csv("../data/DEGSeq2.sig.results.host.csv") %>% select(-1) DEGs_UP <- DEGs %>% filter(direction == "Upregulated") # Merge the data frames by 'gene_id' merged_data <- left_join(DEGs, annot_GO_terms_host, by = "gene_id") %>% filter(GO.ID != "unknown") write_csv(merged_data, "../data/merged_DEG_GO_data.csv") # merge full annotation merged_data_full <- left_join(DEGs, full_annot, by = "gene_id") write_csv(merged_data_full, "../data/merged_DEG_full_annot.csv") # merge full annotation merged_data_full_UP <- left_join(DEGs_UP, full_annot, by = "gene_id") ``` ```{r} library(stringr) # Assuming your data is in a dataframe called df and the column is named 'your_column' count_P <- sum(str_count(merged_data_full$SWISS_GO.Names[!is.na(merged_data_full$SWISS_GO.Names)], "P:")) # Print the result count_P split_SWISS_names <- unlist(str_split(merged_data_full$SWISS_GO.Names, "; ")) split_SWISS_names_P <- split_SWISS_names[str_detect(split_SWISS_names, "P:")] split_SWISS_names_P <- split_SWISS_names_P[!is.na(split_SWISS_names_P)] unique_SWISS_names_P <- unique(split_SWISS_names_P) length(unique_SWISS_names_P) write.csv(unique_SWISS_names_P, file = "../output/07-degs/unique_P_SWISS_terms.csv", row.names = FALSE) ``` ```{r} summary_P_SWISS <- as.data.frame(table(split_SWISS_names_P)) %>% arrange(desc(Freq)) print(summary_P_SWISS) write.csv(summary_P_SWISS, file = "../output/07-degs/counted_P_SWISS_terms.csv", row.names = FALSE) ``` ```{r} library(stringr) # Assuming your data is in a dataframe called df and the column is named 'your_column' count_P_UP <- sum(str_count(merged_data_full_UP$SWISS_GO.Names[!is.na(merged_data_full_UP$SWISS_GO.Names)], "P:")) # Print the result count_P_UP split_SWISS_names <- unlist(str_split(merged_data_full_UP$SWISS_GO.Names, "; ")) split_SWISS_names_P <- split_SWISS_names[str_detect(split_SWISS_names, "P:")] split_SWISS_names_P <- split_SWISS_names_P[!is.na(split_SWISS_names_P)] unique_SWISS_names_P <- unique(split_SWISS_names_P) length(unique_SWISS_names_P) write.csv(unique_SWISS_names_P, file = "../output/07-degs/unique_P_SWISS_terms_Upregulated.csv", row.names = FALSE) ``` ```{r} summary_P_SWISS <- as.data.frame(table(split_SWISS_names_P)) %>% arrange(desc(Freq)) print(summary_P_SWISS) write.csv(summary_P_SWISS, file = "../output/07-degs/counted_P_SWISS_terms_Upregulated.csv", row.names = FALSE) ``` load in count data ```{r} count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv") all_genes <- count_mat$gene_id #count_mat <- count_mat %>% select(-gene_id) #rownames(count_mat) <- all_genes # Clean up the column names colnames(count_mat) <- gsub("_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf", "", colnames(count_mat)) dim(count_mat) ``` ## pOverA filtering ```{r} ffun<-filterfun(pOverA(0.25,10)) #set up filtering parameters counts <- count_mat %>% select(-gene_id) count_mat_poa <- genefilter((counts), ffun) #apply filter sum(count_mat_poa) #count number of genes left count_mat_poa <- count_mat[count_mat_poa,] #keep only rows that passed filter count_mat <- count_mat_poa ``` The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the [Pverr genome paper](https://academic.oup.com/gbe/article/12/10/1911/5898631?login=false#supplementary-data). They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df Read in file with gene name iterations ```{r} names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>% select(Query, Gene) count_mat <- count_mat %>% full_join(names, by = c("gene_id" = "Gene")) count_mat <- count_mat %>% mutate(Query = str_remove(Query, "\\.t1$")) ``` ```{r} expression_data_deg <- count_mat%>% filter(Query %in% unique(merged_data$gene_id)) %>% select(-c("gene_id","Query"))# Adjust the column selection if necessary # Convert expression data to a matrix expression_data_deg <- as.matrix(expression_data_deg) # Calculate Z-scores for the expression matrix z_score_matrix <- t(scale(t(expression_data_deg))) # Replace NA/NaN with 0 z_score_matrix[is.nan(z_score_matrix)] = 0 ``` ### heatmap ```{r} # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("DEGs"), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = TRUE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = TRUE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("DEGs"), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = FALSE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) ``` ```{r} go_enriched <- c("GO:0010884","GO:1904659","GO:0005536","GO:0005355","GO:0015149","GO:0015145","GO:0051119","GO:0015144","GO:0045721") go_enriched_names <- c("positive regulation of lipid storage","glucose transmembrane transport","glucose binding","glucose transmembrane transporter activity","hexose transmembrane transporter activity","monosaccharide transmembrane transporter activity","sugar transmembrane transporter activity","carbohydrate transmembrane transporter activity","negative regulation of gluconeogenesis") ``` ## Go terms encriched by DEGs s ```{r} # Loop over each GO term in go_enriched for (i in seq_along(go_enriched)) { go_term <- go_enriched[i] print(go_enriched_names[i]) # Filter the merged_data for the current GO term merged_go_interest <- merged_data %>% filter(GO.ID == go_term) # Filter count_mat for genes related to the current GO term expression_data_deg <- count_mat %>% filter(Query %in% unique(merged_go_interest$gene_id)) # Adjust the column selection if necessary # Convert expression data to a matrix expression_data <- as.matrix(expression_data_deg %>% select(-c("gene_id", "Query"))) # Calculate Z-scores for the expression matrix z_score_matrix <- t(scale(t(expression_data))) # Replace NA/NaN with 0 z_score_matrix[is.nan(z_score_matrix)] <- 0 # Set row names to gene IDs rownames(z_score_matrix) <- expression_data_deg$gene_id # Generate heatmaps for the current GO term print(Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("DEGs for", go_enriched_names[i]), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = FALSE, # Do not cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6), heatmap_legend_param = list(title = "Gene Count"))) } ```