--- title: "Closest gene comparisons across species" author: "Jill Ashey" date: "2024-09-11" output: html_document --- This code will examine the overlap (if any) between the GO enrichment of the closest gene across species. See the following rmds for TopGO analysis by species: XXXXXXXX ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(Biostrings) # Biostrings_2.62.0 require(ggplot2) # ggplot2_3.3.6 library(ggpubr) library(ggseqlogo) # install.packages("ggseqlogo") library(reshape2) # reshape2_1.4.4 library(data.table) library(gghighlight) #install.packages("gghighlight") library(plotrix) library(dplyr) library(tidyr) library(tibble) library(tidyverse) library(ggvenn) library(cowplot) library(gridExtra) library(ComplexUpset) ``` ## piRNA Compare terms identified in piRNA GO enrichment analysis across species Read in data from all species ```{r} apul_go_pi <- read.csv("../../D-Apul/output/19-Apul-bedtools-closest/Apul_GO_en_sig_gene_piRNA.csv") apul_go_pi$Species <- "Acropora pulchra" peve_go_pi <- read.csv("../../E-Peve/output/19-Peve-bedtools-closest/Peve_GO_en_sig_gene_piRNA.csv") peve_go_pi$Species <- "Porites evermanni" pmea_go_pi <- read.csv("../../F-Pmea/output/19-Pmea-bedtools-closest/Pmea_GO_en_sig_gene_piRNA.csv") pmea_go_pi$Species <- "Pocillopora tuahiniensis" all_go_pi <- rbind(apul_go_pi, pmea_go_pi, peve_go_pi) ``` Subset by ontology ```{r} # Apul apul_go_pi_bp <- apul_go_pi %>% dplyr::filter(ontology == "Biological Processes") apul_go_pi_cc <- apul_go_pi %>% dplyr::filter(ontology == "Cellular Components") apul_go_pi_mf <- apul_go_pi %>% dplyr::filter(ontology == "Molecular Functions") # Peve peve_go_pi_bp <- peve_go_pi %>% dplyr::filter(ontology == "Biological Processes") peve_go_pi_cc <- peve_go_pi %>% dplyr::filter(ontology == "Cellular Components") peve_go_pi_mf <- peve_go_pi %>% dplyr::filter(ontology == "Molecular Functions") # Pmea pmea_go_pi_bp <- pmea_go_pi %>% dplyr::filter(ontology == "Biological Processes") pmea_go_pi_cc <- pmea_go_pi %>% dplyr::filter(ontology == "Cellular Components") pmea_go_pi_mf <- pmea_go_pi %>% dplyr::filter(ontology == "Molecular Functions") ``` Look at overlaps with venn diagrams ```{r} # Biological Processes GOs_BP_wide_pi <- list(APUL=apul_go_pi_bp$GeneOntologyIDs, PTUH=pmea_go_pi_bp$GeneOntologyIDs,PEVE=peve_go_pi_bp$GeneOntologyIDs) bp_venn_pi <- ggvenn(GOs_BP_wide_pi, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); bp_venn_pi # Cellular Components GOs_CC_wide_pi <- list(APUL=apul_go_pi_cc$GeneOntologyIDs, PTUH=pmea_go_pi_cc$GeneOntologyIDs,PEVE=peve_go_pi_cc$GeneOntologyIDs) cc_venn_pi <- ggvenn(GOs_CC_wide_pi, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); cc_venn_pi # Molecular Functions GOs_MF_wide_pi <- list(APUL=apul_go_pi_mf$GeneOntologyIDs, PTUH=pmea_go_pi_mf$GeneOntologyIDs,PEVE=peve_go_pi_mf$GeneOntologyIDs) mf_venn_pi <- ggvenn(GOs_MF_wide_pi, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); mf_venn_pi ``` Assess shared GO terms between species ```{r} # Count the number of unique species for each GeneOntologyID shared_go_pi <- all_go_pi %>% group_by(GeneOntologyIDs) %>% summarise(unique_species = n_distinct(Species)) %>% filter(unique_species > 1) %>% dplyr::select(GeneOntologyIDs) # Create a new dataframe with only the shared GeneOntologyIDs shared_go_pi_df <- all_go_pi %>% filter(GeneOntologyIDs %in% shared_go_pi$GeneOntologyIDs) # Save as csv write.csv(shared_go_pi_df, "../output/19-bedtools-closest-gene/piRNA_shared_go_by_species.csv") # Plot GO_plot_pi <- ggplot(shared_go_pi_df, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) + geom_point(size = 25, color = "black", stroke = 2) + # Black border around shapes scale_shape_manual(values = c(21, 22, 23)) + # Shapes that can be filled scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) + # Custom colors for each species scale_size(range = c(2, 12)) + xlab('') + ylab('log(Enrichment score)') + facet_grid(vars(ontology), scales = "free", space = "free_y") + theme_bw() + theme( axis.title = element_text(size = 36, face = "bold"), # Axis title size axis.text = element_text(size = 34, colour = "black"), # Axis text size legend.title = element_text(size = 34, face = "bold"), # Legend title size legend.text = element_text(size = 32), # Legend text size strip.text = element_text(size = 34, face = "bold"), # Facet text size strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5), # Facet background axis.line = element_line(size = 1, colour = "black"), # Enhanced axis lines axis.ticks = element_line(size = 1), # Thicker axis ticks #panel.border = element_blank() # Remove panel border #panel.grid.major = element_blank(), # Remove major grid lines #panel.grid.minor = element_blank() # Remove minor grid lines panel.border = element_rect(color = "black", size = 1.2), # Enhanced facet border lines panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets panel.spacing = unit(1, "lines"), # Increase space between facets strip.placement = "outside" ) + coord_flip(); GO_plot_pi ggsave(filename = "../figures/piRNA_shared_GOs_plot.png", GO_plot_pi, width = 25, height = 25) ggsave(filename = "../figures/piRNA_shared_GOs_plot.pdf", GO_plot_pi, width = 25, height = 25) ``` I like this plot a lot! Plot only BP ```{r} shared_go_pi_df_bp <- shared_go_pi_df %>% dplyr::filter(ontology == "Biological Processes") # Plot GO_BP_plot_pi <- ggplot(shared_go_pi_df_bp, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) + geom_point(size = 25, color = "black", stroke = 2) + # Black border around shapes scale_shape_manual(values = c(21, 22, 23)) + # Shapes that can be filled scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) + # Custom colors for each species #scale_size(range = c(2, 12)) + xlab('') + ylab('log(Enrichment score)') + #facet_grid(vars(ontology), scales = "free", space = "free_y") + coord_flip() + theme_bw() + theme( axis.title = element_text(size = 51, face = "bold"), # Axis title size axis.text = element_text(size = 49, colour = "black"), # Axis text size axis.text.x = element_text(hjust = 1), # Rotate and adjust label position legend.title = element_text(size = 44, face = "bold"), # Legend title size legend.text = element_text(size = 42, face = "italic"), # Legend text size strip.text = element_text(size = 49, face = "bold"), # Facet text size strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5), # Facet background axis.line = element_line(size = 1, colour = "black"), # Enhanced axis lines axis.ticks = element_line(size = 1), # Thicker axis ticks #panel.border = element_blank() # Remove panel border #panel.grid.major = element_blank(), # Remove major grid lines #panel.grid.minor = element_blank() # Remove minor grid lines panel.border = element_rect(color = "black", size = 1.2), # Enhanced facet border lines panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets panel.spacing = unit(1, "lines"), # Increase space between facets strip.placement = "outside", plot.margin = margin(t = 10, r = 10, b = 20, l = 10) # Add space below the plot ); GO_BP_plot_pi ggsave(filename = "../figures/piRNA_shared_GO_BP_plot.png", GO_BP_plot_pi, width = 40, height = 25) ggsave(filename = "../figures/piRNA_shared_GO_BP_plot.pdf", GO_BP_plot_pi, width = 40, height = 25) ``` ## miRNA Compare terms identified in miRNA GO enrichment analysis across species Read in data from all species ```{r} apul_go_mi <- read.csv("../../D-Apul/output/19-bedtools-closest/Apul_GO_en_sig_gene_miRNA.csv") apul_go_mi$Species <- "Acropora pulchra" peve_go_mi <- read.csv("../../E-Peve/output/19-bedtools-closest/Peve_GO_en_sig_gene_miRNA.csv") peve_go_mi$Species <- "Porites evermanni" pmea_go_mi <- read.csv("../../F-Pmea/output/19-bedtools-closest/Pmea_GO_en_sig_gene_miRNA.csv") pmea_go_mi$Species <- "Pocillopora tuahiniensis" all_go_mi <- rbind(apul_go_mi, pmea_go_mi, peve_go_mi) ``` Subset by ontology ```{r} # Apul apul_go_mi_bp <- apul_go_mi %>% dplyr::filter(ontology == "Biological Processes") apul_go_mi_cc <- apul_go_mi %>% dplyr::filter(ontology == "Cellular Components") apul_go_mi_mf <- apul_go_mi %>% dplyr::filter(ontology == "Molecular Functions") # Peve peve_go_mi_bp <- peve_go_mi %>% dplyr::filter(ontology == "Biological Processes") peve_go_mi_cc <- peve_go_mi %>% dplyr::filter(ontology == "Cellular Components") peve_go_mi_mf <- peve_go_mi %>% dplyr::filter(ontology == "Molecular Functions") # Pmea pmea_go_mi_bp <- pmea_go_mi %>% dplyr::filter(ontology == "Biological Processes") pmea_go_mi_cc <- pmea_go_mi %>% dplyr::filter(ontology == "Cellular Components") pmea_go_mi_mf <- pmea_go_mi %>% dplyr::filter(ontology == "Molecular Functions") ``` Look at overlaps with venn diagrams ```{r} # Biological Processes GOs_BP_wide_mi <- list(APUL=apul_go_mi_bp$GeneOntologyIDs, PTUH=pmea_go_mi_bp$GeneOntologyIDs,PEVE=peve_go_mi_bp$GeneOntologyIDs) bp_venn_mi <- ggvenn(GOs_BP_wide_mi, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); bp_venn_mi # Cellular Components GOs_CC_wide_mi <- list(APUL=apul_go_mi_cc$GeneOntologyIDs, PTUH=pmea_go_mi_cc$GeneOntologyIDs,PEVE=peve_go_mi_cc$GeneOntologyIDs) cc_venn_mi <- ggvenn(GOs_CC_wide_mi, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); cc_venn_mi # Molecular Functions GOs_MF_wide_mi <- list(APUL=apul_go_mi_mf$GeneOntologyIDs, PTUH=pmea_go_mi_mf$GeneOntologyIDs,PEVE=peve_go_mi_mf$GeneOntologyIDs) mf_venn_mi <- ggvenn(GOs_MF_wide_mi, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); mf_venn_mi ``` Assess shared GO terms between species ```{r} # Count the number of unique species for each GeneOntologyID shared_go_mi <- all_go_mi %>% group_by(GeneOntologyIDs) %>% # Group by both GeneOntologyIDs and ontology summarise(unique_species = n_distinct(Species), .groups = 'keep') %>% # Count distinct species for each GO term and ontology filter(unique_species > 1) # Create a new dataframe with only the shared GeneOntologyIDs shared_go_mi_df <- all_go_mi %>% filter(GeneOntologyIDs %in% shared_go_mi$GeneOntologyIDs) # Identify terms associated with both CC and MF, and only keep CC shared_go_mi_df <- shared_go_mi_df %>% group_by(GeneOntologyIDs) %>% mutate(cc_present = any(ontology == "Cellular Components")) %>% filter(!(ontology == "Molecular Functions" & cc_present)) %>% ungroup() %>% dplyr::select(-cc_present) # Plot GO_plot_mi <- ggplot(shared_go_mi_df, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) + geom_point(size = 25, color = "black", stroke = 2) + # Black border around shapes scale_shape_manual(values = c(21, 22, 23)) + # Shapes that can be filled scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) + # Custom colors for each species scale_size(range = c(2, 12)) + xlab('') + ylab('log(Enrichment score)') + facet_grid(vars(ontology), scales = "free", space = "free_y") + theme_bw() + theme( axis.title = element_text(size = 36, face = "bold"), # Axis title size axis.text = element_text(size = 34, colour = "black", angle = 75), # Axis text size legend.title = element_text(size = 34, face = "bold"), # Legend title size legend.text = element_text(size = 32), # Legend text size strip.text = element_text(size = 34, face = "bold"), # Facet text size strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5), # Facet background axis.line = element_line(size = 1, colour = "black"), # Enhanced axis lines axis.ticks = element_line(size = 1), # Thicker axis ticks #panel.border = element_blank() # Remove panel border #panel.grid.major = element_blank(), # Remove major grid lines #panel.grid.minor = element_blank() # Remove minor grid lines panel.border = element_rect(color = "black", size = 1.2), # Enhanced facet border lines panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets panel.spacing = unit(1, "lines"), # Increase space between facets strip.placement = "outside" ); GO_plot_mi ggsave(filename = "../figures/miRNA_shared_GOs_plot.png", GO_plot_mi, width = 25, height = 30) ggsave(filename = "../figures/miRNA_shared_GOs_plot.pdf", GO_plot_mi, width = 25, height = 30) ``` Plot only BP ```{r} shared_go_mi_df_bp <- shared_go_mi_df %>% dplyr::filter(ontology == "Biological Processes") # Plot GO_BP_plot_mi <- ggplot(shared_go_mi_df_bp, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) + geom_point(size = 30, color = "black", stroke = 2) + # Black border around shapes scale_shape_manual(values = c(21, 22, 23)) + # Shapes that can be filled scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) + # Custom colors for each species #scale_size(range = c(2, 12)) + xlab('') + ylab('log(Enrichment score)') + #facet_grid(vars(ontology), scales = "free", space = "free_y") + theme_bw() + theme( axis.title = element_text(size = 51, face = "bold"), # Axis title size axis.text = element_text(size = 49, colour = "black"), # Axis text size axis.text.x = element_text(angle = 75, hjust = 1), # Rotate and adjust label position legend.title = element_text(size = 44, face = "bold"), # Legend title size legend.text = element_text(size = 42, face = "italic"), # Legend text size strip.text = element_text(size = 49, face = "bold"), # Facet text size strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5), # Facet background axis.line = element_line(size = 1, colour = "black"), # Enhanced axis lines axis.ticks = element_line(size = 1), # Thicker axis ticks #panel.border = element_blank() # Remove panel border #panel.grid.major = element_blank(), # Remove major grid lines #panel.grid.minor = element_blank() # Remove minor grid lines panel.border = element_rect(color = "black", size = 1.2), # Enhanced facet border lines panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets panel.spacing = unit(1, "lines"), # Increase space between facets strip.placement = "outside", plot.margin = margin(t = 10, r = 10, b = 20, l = 10) # Add space below the plot ); GO_BP_plot_mi ggsave(filename = "../figures/miRNA_shared_GO_BP_plot.png", GO_BP_plot_mi, width = 25, height = 35) ggsave(filename = "../figures/miRNA_shared_GO_BP_plot.pdf", GO_BP_plot_mi, width = 25, height = 35) ``` ## lncRNAs Compare terms identified in piRNA GO enrichment analysis across species Read in data from all species ```{r} apul_go_lnc <- read.csv("../../D-Apul/output/19-Apul-bedtools-closest/Apul_GO_en_sig_gene_lncRNA.csv") apul_go_lnc$Species <- "Acropora pulchra" peve_go_lnc <- read.csv("../../E-Peve/output/19-Peve-bedtools-closest/Peve_GO_en_sig_gene_lncRNA.csv") peve_go_lnc$Species <- "Porites evermanni" pmea_go_lnc <- read.csv("../../F-Pmea/output/19-Pmea-bedtools-closest/Pmea_GO_en_sig_gene_lncRNA.csv") pmea_go_lnc$Species <- "Pocillopora tuahiniensis" all_go_lnc <- rbind(apul_go_lnc, pmea_go_lnc, peve_go_lnc) ``` Subset by ontology ```{r} # Apul apul_go_lnc_bp <- apul_go_lnc %>% dplyr::filter(ontology == "Biological Processes") apul_go_lnc_cc <- apul_go_lnc %>% dplyr::filter(ontology == "Cellular Components") apul_go_lnc_mf <- apul_go_lnc %>% dplyr::filter(ontology == "Molecular Functions") # Peve peve_go_lnc_bp <- peve_go_lnc %>% dplyr::filter(ontology == "Biological Processes") peve_go_lnc_cc <- peve_go_lnc %>% dplyr::filter(ontology == "Cellular Components") peve_go_lnc_mf <- peve_go_lnc %>% dplyr::filter(ontology == "Molecular Functions") # Pmea pmea_go_lnc_bp <- pmea_go_lnc %>% dplyr::filter(ontology == "Biological Processes") pmea_go_lnc_cc <- pmea_go_lnc %>% dplyr::filter(ontology == "Cellular Components") pmea_go_lnc_mf <- pmea_go_lnc %>% dplyr::filter(ontology == "Molecular Functions") ``` Look at overlaps with venn diagrams ```{r} # Biological Processes GOs_BP_wide_lnc <- list(APUL=apul_go_lnc_bp$GeneOntologyIDs, PTUH=pmea_go_lnc_bp$GeneOntologyIDs,PEVE=peve_go_lnc_bp$GeneOntologyIDs) bp_venn_lnc <- ggvenn(GOs_BP_wide_lnc, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); bp_venn_lnc # Cellular Components GOs_CC_wide_lnc <- list(APUL=apul_go_lnc_cc$GeneOntologyIDs, PTUH=pmea_go_lnc_cc$GeneOntologyIDs,PEVE=peve_go_lnc_cc$GeneOntologyIDs) cc_venn_lnc <- ggvenn(GOs_CC_wide_lnc, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); cc_venn_lnc # Molecular Functions GOs_MF_wide_lnc <- list(APUL=apul_go_lnc_mf$GeneOntologyIDs, PTUH=pmea_go_lnc_mf$GeneOntologyIDs,PEVE=peve_go_lnc_mf$GeneOntologyIDs) mf_venn_lnc <- ggvenn(GOs_MF_wide_lnc, fill_color = c("#408EC6", "#7A2048", "#1E2761"), stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); mf_venn_lnc ``` Assess shared GO terms between species ```{r} # Count the number of unique species for each GeneOntologyID shared_go_lnc <- all_go_lnc %>% group_by(GeneOntologyIDs) %>% summarise(unique_species = n_distinct(Species)) %>% filter(unique_species > 1) %>% dplyr::select(GeneOntologyIDs) # Create a new dataframe with only the shared GeneOntologyIDs shared_go_lnc_df <- all_go_lnc %>% filter(GeneOntologyIDs %in% shared_go_lnc$GeneOntologyIDs) # Save as csv write.csv(shared_go_lnc_df, "../output/19-bedtools-closest-gene/lncRNA_shared_go_by_species.csv") # Plot GO_plot_lnc <- ggplot(shared_go_lnc_df, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) + geom_point(size = 17, color = "black", stroke = 2) + # Black border around shapes scale_shape_manual(values = c(21, 22, 23)) + # Shapes that can be filled scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) + # Custom colors for each species scale_size(range = c(2, 12)) + xlab('') + ylab('log(Enrichment score)') + facet_grid(vars(ontology), scales = "free", space = "free_y") + theme_bw() + theme( axis.title = element_text(size = 36, face = "bold"), # Axis title size axis.text = element_text(size = 34, colour = "black"), # Axis text size legend.title = element_text(size = 34, face = "bold"), # Legend title size legend.text = element_text(size = 32), # Legend text size strip.text = element_text(size = 34, face = "bold"), # Facet text size strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5), # Facet background axis.line = element_line(size = 1, colour = "black"), # Enhanced axis lines axis.ticks = element_line(size = 1), # Thicker axis ticks #panel.border = element_blank() # Remove panel border #panel.grid.major = element_blank(), # Remove major grid lines #panel.grid.minor = element_blank() # Remove minor grid lines panel.border = element_rect(color = "black", size = 1.2), # Enhanced facet border lines panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets panel.spacing = unit(1, "lines"), # Increase space between facets strip.placement = "outside" ) + coord_flip(); GO_plot_lnc ggsave(filename = "../figures/lncRNA_shared_GOs_plot.png", GO_plot_lnc, width = 35, height = 49) ggsave(filename = "../figures/lncRNA_shared_GOs_plot.pdf", GO_plot_lnc, width = 35, height = 49) ``` Plot only BP ```{r} shared_go_lnc_df_bp <- shared_go_lnc_df %>% dplyr::filter(ontology == "Biological Processes") # Plot GO_BP_plot_lnc <- ggplot(shared_go_lnc_df_bp, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) + geom_point(size = 25, color = "black", stroke = 2) + # Black border around shapes scale_shape_manual(values = c(21, 22, 23)) + # Shapes that can be filled scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) + # Custom colors for each species #scale_size(range = c(2, 12)) + xlab('') + ylab('log(Enrichment score)') + #facet_grid(vars(ontology), scales = "free", space = "free_y") + coord_flip() + theme_bw() + theme( axis.title = element_text(size = 51, face = "bold"), # Axis title size axis.text = element_text(size = 49, colour = "black"), # Axis text size axis.text.x = element_text(hjust = 1), # Rotate and adjust label position legend.title = element_text(size = 44, face = "bold"), # Legend title size legend.text = element_text(size = 42, face = "italic"), # Legend text size strip.text = element_text(size = 49, face = "bold"), # Facet text size strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5), # Facet background axis.line = element_line(size = 1, colour = "black"), # Enhanced axis lines axis.ticks = element_line(size = 1), # Thicker axis ticks #panel.border = element_blank() # Remove panel border #panel.grid.major = element_blank(), # Remove major grid lines #panel.grid.minor = element_blank() # Remove minor grid lines panel.border = element_rect(color = "black", size = 1.2), # Enhanced facet border lines panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets panel.spacing = unit(1, "lines"), # Increase space between facets strip.placement = "outside", plot.margin = margin(t = 10, r = 10, b = 20, l = 200) # Add space below the plot ); GO_BP_plot_lnc ggsave(filename = "../figures/lncRNA_shared_GO_BP_plot.png", GO_BP_plot_lnc, width = 40, height = 30) ggsave(filename = "../figures/lncRNA_shared_GO_BP_plot.pdf", GO_BP_plot_lnc, width = 40, height = 30) ``` ## All ncRNAs Filter out extra information from `shared_go_pi_df`, `shared_go_mi_df`, and `shared_go_lnc_df` ```{r} # Look at dimensions and column names for each df dim(shared_go_pi_df) dim(shared_go_mi_df) dim(shared_go_lnc_df) colnames(shared_go_pi_df) colnames(shared_go_mi_df) colnames(shared_go_lnc_df) # Select certain columns filt_shared_go_pi_df <- shared_go_pi_df %>% dplyr::select(c(gene_name, GeneOntologyIDs, Term, Annotated, Significant, Expected, Fisher, ontology, pi_chrom, pi_start, pi_end, gf_chrom, gf_type, gf_start, gf_end, overlap, prop.sig.genes, Species)) %>% rename(ncrna_chrom = pi_chrom, ncrna_start = pi_start, ncrna_end = pi_end) %>% mutate(ncrna = "piRNA") filt_shared_go_mi_df <- shared_go_mi_df %>% dplyr::select(c(gene_name, GeneOntologyIDs, Term, Annotated, Significant, Expected, Fisher, ontology, m_chrom, m_start, m_end, gf_chrom, gf_type, gf_start, gf_end, overlap, prop.sig.genes, Species)) %>% rename(ncrna_chrom = m_chrom, ncrna_start = m_start, ncrna_end = m_end) %>% mutate(ncrna = "miRNA") filt_shared_go_lnc_df <- shared_go_lnc_df %>% dplyr::select(c(gene_name, GeneOntologyIDs, Term, Annotated, Significant, Expected, Fisher, ontology, l_chrom, l_start, l_end, gf_chrom, gf_type, gf_start, gf_end, overlap, prop.sig.genes, Species)) %>% rename(ncrna_chrom = l_chrom, ncrna_start = l_start, ncrna_end = l_end) %>% mutate(ncrna = "lncRNA") # Bind dfs together all_shared <- rbind(filt_shared_go_pi_df, filt_shared_go_mi_df, filt_shared_go_lnc_df) # Save as csv write.csv(all_shared, "../output/19-bedtools-closest-gene/ncRNAs_shared_go_by_species.csv") ``` Plot ```{r} GO_plot_all <- ggplot(all_shared, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) + geom_point(size = 17, color = "black", stroke = 2) + # Black border around shapes scale_shape_manual(values = c(21, 22, 23)) + # Shapes that can be filled scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) + # Custom colors for each species scale_size(range = c(2, 12)) + xlab('') + ylab('log(Enrichment score)') + facet_grid(vars(ontology), vars(ncrna), scales = "free", space = "free_y") + theme_bw() + theme( axis.title = element_text(size = 36, face = "bold"), # Axis title size axis.text = element_text(size = 34, colour = "black"), # Axis text size legend.title = element_text(size = 34, face = "bold"), # Legend title size legend.text = element_text(size = 32), # Legend text size strip.text = element_text(size = 34, face = "bold"), # Facet text size strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5), # Facet background axis.line = element_line(size = 1, colour = "black"), # Enhanced axis lines axis.ticks = element_line(size = 1), # Thicker axis ticks #panel.border = element_blank() # Remove panel border #panel.grid.major = element_blank(), # Remove major grid lines #panel.grid.minor = element_blank() # Remove minor grid lines panel.border = element_rect(color = "black", size = 1.2), # Enhanced facet border lines panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets panel.spacing = unit(1, "lines"), # Increase space between facets strip.placement = "outside" ) + coord_flip(); GO_plot_all ggsave(filename = "../figures/all_ncRNAs_shared_GOs_plot.png", GO_plot_all, width = 35, height = 49) ggsave(filename = "../figures/all_ncRNAs_shared_GOs_plot.pdf", GO_plot_all, width = 35, height = 49) ``` ## Plot lengths for each species I am going to make length plots for each species for lncRNAs, miRNAs and piRNAs so that the format is the same for everything. Also going to make heatmap-esque diagrams for shared sequences for lncRNAs and miRNAs here so that the format is the same and I can join them in one figure. ### miRNAs Kathleen wrote code for the miRNA [length comparison](https://github.com/urol-e5/deep-dive/blob/main/DEF-cross-species/code/10-shortRNA-ShortStack-comparison.Rmd). I'm going to modify her code Read in length data for miRNAs ```{r} Apul_lengths_mi <- read.table("../data/10-shortRNA-ShortStack-comparison/Apul_ShortStack_mature_lengths.txt", header = FALSE, col.names = "length") Peve_lengths_mi <- read.table("../data/10-shortRNA-ShortStack-comparison/Peve_ShortStack_mature_lengths.txt", header = FALSE, col.names = "length") Pmea_lengths_mi <- read.table("../data/10-shortRNA-ShortStack-comparison/Pmea_ShortStack_mature_lengths.txt", header = FALSE, col.names = "length") ``` Combine dfs ```{r} # Add a new column to each data frame to label the source file Apul_lengths_mi <- Apul_lengths_mi %>% mutate(Species = 'A. pulchra') Peve_lengths_mi <- Peve_lengths_mi %>% mutate(Species = 'P. evermanni') Pmea_lengths_mi <- Pmea_lengths_mi %>% mutate(Species = 'P. tuahiniensis') # Combine the data frames into one all_lengths_mi <- rbind(Apul_lengths_mi, Peve_lengths_mi, Pmea_lengths_mi) ``` Count miRNA lengths by species ```{r} count_by_group_mi <- all_lengths_mi %>% dplyr::count(Species, length) count_by_group_mi ``` Plot miRNA lengths of all species ```{r} # Set our color scheme for plotting -- options for both the abbreviated labels or the full, correct species names species_colors_nolabel <- c("#408EC6", "#7A2048", "#1E2761") length_plot_mi <- ggplot(count_by_group_mi, aes(x = length, y = n, fill = Species)) + geom_bar(position = "dodge", stat = "identity", color = "black", size = 2, width = 0.9) + scale_fill_manual(values = species_colors_nolabel) + scale_y_continuous(expand = c(0, 0)) + labs(x = "", y = "Read count") + theme_minimal() + theme( axis.title = element_text(size = 85, face = "bold", margin = margin(t = 100, r = 100, b = 100, l = 150)), axis.text.y = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 10, l = 10)), axis.text.x = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 30, l = 10)), # Increased bottom margin axis.line = element_line(size = 1.5, color = "black"), axis.ticks = element_line(size = 1.5, color = "black"), plot.margin = unit(c(1, 1, 1, 1), "cm"), legend.position = "none" ); length_plot_mi ggsave(filename = "../figures/miRNA_lengths.png", length_plot_mi, width = 34, height = 24) ggsave(filename = "../figures/miRNA_lengths.pdf", length_plot_mi, width = 34, height = 24) ``` ### piRNAs Javi wrote code for the piRNA [length comparison](https://github.com/urol-e5/deep-dive/blob/main/DEF-cross-species/code/12-piRNA-comparizon.Rmd). I'm going to modify his code. Read in length data for piRNAs ```{r} APUL_L = list.files(path = "../../D-Apul/output/18-Apul-piRNA/0_piRNA_pipeline_proTRAC/piRNA_length_data", pattern=".txt", recursive = T, full.names = T) PMEA_L = list.files(path = "../../F-Pmea/output/18-Pmea-piRNA/0_piRNA_pipeline_proTRAC/piRNA_length_data", pattern=".txt", recursive = T, full.names = T) PEVE_L = list.files(path = "../../E-Peve/output/18-Peve-piRNA/0_piRNA_pipeline_proTRAC/piRNA_length_data", pattern=".txt", recursive = T, full.names = T) L = c(APUL_L,PMEA_L,PEVE_L) names <- L %>% substring(.,75) %>% gsub("_piRNA_length.txt", "", .) DFs = lapply(L, function(x) { data <- read.table(paste0(x), comment.char = "") colnames(data) <- c("freq", "length") return(data) }) ``` Count piRNA lengths by species ```{r} combo = Reduce(function(...) merge(..., by = "length", all=T), DFs) colnames(combo) <- c("length", names) combo <- combo %>% mutate(ACR = rowSums(combo[,2:6])) %>% mutate(POC = rowSums(combo[,7:11])) %>% mutate(POR = rowSums(combo[,12:14])) %>% select(length,ACR,POC,POR) %>% pivot_longer(c("ACR", "POC", "POR"), values_to = "sums", names_to = "group") %>% mutate(Species = case_when( group == "ACR" ~ "A. pulchra", group == "POR" ~ "P. evermanni", group == "POC" ~ "P. tuahiniensis")) ``` Plot piRNA lengths of all species ```{r} # Set our color scheme for plotting -- options for both the abbreviated labels or the full, correct species names species_colors_nolabel <- c("#408EC6", "#7A2048", "#1E2761") length_plot_pi <- ggplot(combo, aes(x = length, y = sums, fill = Species)) + geom_bar(position = "dodge", stat = "identity", color = "black", # Bar outline color size = 2, # Thicker bar outlines width = 0.9) + scale_fill_manual(values = species_colors_nolabel) + scale_y_continuous(expand = c(0, 0)) + # Bars sit on x-axis scale_x_continuous(breaks = unique(combo$length)) + # Ensure every "length" value appears on the x-axis labs(x = "Length (bp)", y = "") + theme_minimal() + theme( axis.title = element_text(size = 85, face = "bold", margin = margin(t = 100, r = 100, b = 100, l = 150)), axis.text.y = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 10, l = 10)), axis.text.x = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 30, l = 10)), # Increased bottom margin axis.line = element_line(size = 1.5, color = "black"), # Darker, thicker axis lines axis.ticks = element_line(size = 1.5, color = "black"), # Thicker axis ticks plot.margin = unit(c(1, 1, 1, 1), "cm"), # Margin around plot (top, right, bottom, left) legend.title = element_text(size = 40, face = "bold"), # Increase legend title size legend.text = element_text(size = 40), legend.position = "none" ) + guides(fill = guide_legend(override.aes = list(size = 20))); # Increase the size of legend squares length_plot_pi ggsave(filename = "../figures/piRNA_lengths.png", length_plot_pi, width = 34, height = 20) ggsave(filename = "../figures/piRNA_lengths.pdf", length_plot_pi, width = 34, height = 20) ``` ### lncRNAs Calculate length from bed files ```{r} apul_lnc_bed <- read.delim("../../D-Apul/output/05.33-lncRNA-discovery/Apul_lncRNA.bed", header = F) %>% rename("lncRNA" = "V1", "start" = "V2", "end" = "V3") %>% mutate(length = end-start) %>% mutate(Species = "Acropora pulchra") mean(apul_lnc_bed$length) median(apul_lnc_bed$length) max(apul_lnc_bed$length) min(apul_lnc_bed$length) peve_lnc_bed <- read.delim("../../E-Peve/output/Peve_lncRNA.bed", header = F) %>% rename("lncRNA" = "V1", "start" = "V2", "end" = "V3") %>% mutate(length = end-start) %>% mutate(Species = "Porites evermanni") mean(peve_lnc_bed$length) median(peve_lnc_bed$length) max(peve_lnc_bed$length) min(peve_lnc_bed$length) pmea_lnc_bed <- read.delim("../../F-Pmea/output/02-lncRNA-discovery/Pmea_lncRNA.bed", header = F) %>% rename("lncRNA" = "V1", "start" = "V2", "end" = "V3") %>% mutate(length = end-start) %>% mutate(Species = "Pocillopora tuahiniensis") mean(pmea_lnc_bed$length) median(pmea_lnc_bed$length) max(pmea_lnc_bed$length) min(pmea_lnc_bed$length) # Combine the data frames into one all_lengths_lnc <- rbind(apul_lnc_bed, peve_lnc_bed, pmea_lnc_bed) ``` Define custom bins so that the lncRNAs are grouped into 200-300 bp, 300-400 bp, etc ```{r} all_lengths_lnc <- all_lengths_lnc %>% mutate(length_bin = cut( length, breaks = c(200, 300, 400, 500, 600, 700, 800, 900, 1000, Inf), # Define custom breaks labels = c("200-300", "300-400", "400-500", "500-600", "600-700", "700-800", "800-900", "900-1000", ">1000"), # Custom labels for each bin include.lowest = TRUE, # Include the lowest value in the first interval right = FALSE # Make the intervals left-closed (e.g., 200-300 includes 200, but not 300) )) ``` Calculate frequency of lncRNAs in each bin by species ```{r} length_freq <- all_lengths_lnc %>% group_by(Species, length_bin) %>% summarize(frequency = n(), .groups = 'drop') ``` Plot lncRNA lengths for all species ```{r} # Set our color scheme for plotting -- options for both the abbreviated labels or the full, correct species names species_colors_nolabel <- c("#408EC6", "#7A2048", "#1E2761") length_plot_lnc <- ggplot(length_freq, aes(x = length_bin, y = frequency, fill = Species)) + geom_bar(position = "dodge", stat = "identity", color = "black", # Bar outline color size = 2, # Thicker bar outlines width = 0.9) + scale_fill_manual(values = species_colors_nolabel) + # Use scale_x_discrete instead of scale_x_continuous scale_x_discrete(name = "") + # Ensure every "length_bin" value appears on the x-axis scale_y_continuous(expand = c(0, 0)) + # Remove padding on y-axis so bars sit on x-axis labs(y = "") + theme_minimal() + theme( axis.title = element_text(size = 85, face = "bold", margin = margin(t = 100, r = 100, b = 100, l = 150)), axis.text.y = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 10, l = 10)), axis.text.x = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 30, l = 10), angle = 45, hjust = 1), # Increased bottom margin axis.line = element_line(size = 1.5, color = "black"), # Darker, thicker axis lines axis.ticks = element_line(size = 1.5, color = "black"), # Thicker axis ticks plot.margin = unit(c(1, 1, 1, 1), "cm"), # Margin around plot (top, right, bottom, left) legend.title = element_text(size = 70, face = "bold"), # Increase legend title size legend.text = element_text(size = 70), legend.position = "top" ) + guides(fill = guide_legend(override.aes = list(size = 20))); # Increase the size of legend squares length_plot_lnc ggsave(filename = "../figures/lncRNA_lengths.png", length_plot_lnc, width = 34, height = 20) ggsave(filename = "../figures/lncRNA_lengths.pdf", length_plot_lnc, width = 34, height = 20) ``` Plot RNA lengths w/ legend ```{r} plot <- plot_grid(length_plot_lnc, length_plot_mi, length_plot_pi, labels = c("A", "B", "C"), label_size = 80, nrow = 3, ncol = 1) ggsave(filename = "../figures/ncRNA_lengths.png", plot, width = 65, height = 50, limitsize = F) ggsave(filename = "../figures/ncRNA_lengths.pdf", plot, width = 65, height = 50, limitsize = F) ```