---
title: "Closest gene comparisons across species"
author: "Jill Ashey"
date: "2024-09-11"
output: html_document
---

This code will examine the overlap (if any) between the GO enrichment of the closest gene across species. See the following rmds for TopGO analysis by species: XXXXXXXX

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

library(Biostrings) # Biostrings_2.62.0 
require(ggplot2)    # ggplot2_3.3.6
library(ggpubr)
library(ggseqlogo)  # install.packages("ggseqlogo")
library(reshape2)   # reshape2_1.4.4
library(data.table)
library(gghighlight) #install.packages("gghighlight")
library(plotrix)
library(dplyr)
library(tidyr)
library(tibble)
library(tidyverse)
library(ggvenn)
library(cowplot)
library(gridExtra)
library(ComplexUpset)
```

## piRNA

Compare terms identified in piRNA GO enrichment analysis across species

Read in data from all species
```{r}
apul_go_pi <- read.csv("../../D-Apul/output/19-Apul-bedtools-closest/Apul_GO_en_sig_gene_piRNA.csv")
apul_go_pi$Species <- "Acropora pulchra"
peve_go_pi <- read.csv("../../E-Peve/output/19-Peve-bedtools-closest/Peve_GO_en_sig_gene_piRNA.csv")
peve_go_pi$Species <- "Porites evermanni"
pmea_go_pi <- read.csv("../../F-Pmea/output/19-Pmea-bedtools-closest/Pmea_GO_en_sig_gene_piRNA.csv")
pmea_go_pi$Species <- "Pocillopora tuahiniensis"

all_go_pi <- rbind(apul_go_pi, pmea_go_pi, peve_go_pi)
```

Subset by ontology 
```{r}
# Apul
apul_go_pi_bp <- apul_go_pi %>%
  dplyr::filter(ontology == "Biological Processes")
apul_go_pi_cc <- apul_go_pi %>%
  dplyr::filter(ontology == "Cellular Components")
apul_go_pi_mf <- apul_go_pi %>%
  dplyr::filter(ontology == "Molecular Functions")

# Peve
peve_go_pi_bp <- peve_go_pi %>%
  dplyr::filter(ontology == "Biological Processes")
peve_go_pi_cc <- peve_go_pi %>%
  dplyr::filter(ontology == "Cellular Components")
peve_go_pi_mf <- peve_go_pi %>%
  dplyr::filter(ontology == "Molecular Functions")

# Pmea
pmea_go_pi_bp <- pmea_go_pi %>%
  dplyr::filter(ontology == "Biological Processes")
pmea_go_pi_cc <- pmea_go_pi %>%
  dplyr::filter(ontology == "Cellular Components")
pmea_go_pi_mf <- pmea_go_pi %>%
  dplyr::filter(ontology == "Molecular Functions")
```

Look at overlaps with venn diagrams 
```{r}
# Biological Processes
GOs_BP_wide_pi <- list(APUL=apul_go_pi_bp$GeneOntologyIDs, PTUH=pmea_go_pi_bp$GeneOntologyIDs,PEVE=peve_go_pi_bp$GeneOntologyIDs)

bp_venn_pi <- ggvenn(GOs_BP_wide_pi, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); bp_venn_pi

# Cellular Components
GOs_CC_wide_pi <- list(APUL=apul_go_pi_cc$GeneOntologyIDs, PTUH=pmea_go_pi_cc$GeneOntologyIDs,PEVE=peve_go_pi_cc$GeneOntologyIDs)

cc_venn_pi <- ggvenn(GOs_CC_wide_pi, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); cc_venn_pi

# Molecular Functions
GOs_MF_wide_pi <- list(APUL=apul_go_pi_mf$GeneOntologyIDs, PTUH=pmea_go_pi_mf$GeneOntologyIDs,PEVE=peve_go_pi_mf$GeneOntologyIDs)

mf_venn_pi <- ggvenn(GOs_MF_wide_pi, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); mf_venn_pi
```

Assess shared GO terms between species 
```{r}
# Count the number of unique species for each GeneOntologyID
shared_go_pi <- all_go_pi %>%
  group_by(GeneOntologyIDs) %>%
  summarise(unique_species = n_distinct(Species)) %>%
  filter(unique_species > 1) %>%
  dplyr::select(GeneOntologyIDs)

# Create a new dataframe with only the shared GeneOntologyIDs
shared_go_pi_df <- all_go_pi %>%
  filter(GeneOntologyIDs %in% shared_go_pi$GeneOntologyIDs)

# Save as csv 
write.csv(shared_go_pi_df, "../output/19-bedtools-closest-gene/piRNA_shared_go_by_species.csv")

# Plot 
GO_plot_pi <- ggplot(shared_go_pi_df, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) +
  geom_point(size = 25, color = "black", stroke = 2) +  # Black border around shapes
  scale_shape_manual(values = c(21, 22, 23)) +  # Shapes that can be filled
  scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) +  # Custom colors for each species
  scale_size(range = c(2, 12)) + 
  xlab('') + 
  ylab('log(Enrichment score)') +
  facet_grid(vars(ontology), scales = "free", space = "free_y") + 
  theme_bw() +
  theme(
    axis.title = element_text(size = 36, face = "bold"),        # Axis title size
    axis.text = element_text(size = 34, colour = "black"),                        # Axis text size
    legend.title = element_text(size = 34, face = "bold"),      # Legend title size
    legend.text = element_text(size = 32),                      # Legend text size
    strip.text = element_text(size = 34, face = "bold"),        # Facet text size
    strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5),         # Facet background
    axis.line = element_line(size = 1, colour = "black"),       # Enhanced axis lines
    axis.ticks = element_line(size = 1),                        # Thicker axis ticks
     #panel.border = element_blank()                             # Remove panel border
    #panel.grid.major = element_blank(),                         # Remove major grid lines
    #panel.grid.minor = element_blank()                          # Remove minor grid lines
        panel.border = element_rect(color = "black", size = 1.2),   # Enhanced facet border lines
    panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets
    panel.spacing = unit(1, "lines"),                           # Increase space between facets
    strip.placement = "outside"    
  ) +
  coord_flip(); GO_plot_pi

ggsave(filename = "../figures/piRNA_shared_GOs_plot.png", GO_plot_pi, width = 25, height = 25)
ggsave(filename = "../figures/piRNA_shared_GOs_plot.pdf", GO_plot_pi, width = 25, height = 25)
```

I like this plot a lot!

Plot only BP 
```{r}
shared_go_pi_df_bp <- shared_go_pi_df %>%
  dplyr::filter(ontology == "Biological Processes")
  
# Plot 
GO_BP_plot_pi <- ggplot(shared_go_pi_df_bp, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) +
  geom_point(size = 25, color = "black", stroke = 2) +  # Black border around shapes
  scale_shape_manual(values = c(21, 22, 23)) +  # Shapes that can be filled
  scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) +  # Custom colors for each species
  #scale_size(range = c(2, 12)) + 
  xlab('') + 
  ylab('log(Enrichment score)') +
  #facet_grid(vars(ontology), scales = "free", space = "free_y") + 
  coord_flip() +
  theme_bw() +
  theme(
    axis.title = element_text(size = 51, face = "bold"),        # Axis title size
    axis.text = element_text(size = 49, colour = "black"),                        # Axis text size
    axis.text.x = element_text(hjust = 1),  # Rotate and adjust label position
    legend.title = element_text(size = 44, face = "bold"),      # Legend title size
    legend.text = element_text(size = 42, face = "italic"),                      # Legend text size
    strip.text = element_text(size = 49, face = "bold"),        # Facet text size
    strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5),         # Facet background
    axis.line = element_line(size = 1, colour = "black"),       # Enhanced axis lines
    axis.ticks = element_line(size = 1),                        # Thicker axis ticks
     #panel.border = element_blank()                             # Remove panel border
    #panel.grid.major = element_blank(),                         # Remove major grid lines
    #panel.grid.minor = element_blank()                          # Remove minor grid lines
        panel.border = element_rect(color = "black", size = 1.2),   # Enhanced facet border lines
    panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets
    panel.spacing = unit(1, "lines"),                           # Increase space between facets
    strip.placement = "outside",
    plot.margin = margin(t = 10, r = 10, b = 20, l = 10)        # Add space below the plot
  ); GO_BP_plot_pi

ggsave(filename = "../figures/piRNA_shared_GO_BP_plot.png", GO_BP_plot_pi, width = 40, height = 25)
ggsave(filename = "../figures/piRNA_shared_GO_BP_plot.pdf", GO_BP_plot_pi, width = 40, height = 25)
```


## miRNA

Compare terms identified in miRNA GO enrichment analysis across species

Read in data from all species
```{r}
apul_go_mi <- read.csv("../../D-Apul/output/19-bedtools-closest/Apul_GO_en_sig_gene_miRNA.csv")
apul_go_mi$Species <- "Acropora pulchra"
peve_go_mi <- read.csv("../../E-Peve/output/19-bedtools-closest/Peve_GO_en_sig_gene_miRNA.csv")
peve_go_mi$Species <- "Porites evermanni"
pmea_go_mi <- read.csv("../../F-Pmea/output/19-bedtools-closest/Pmea_GO_en_sig_gene_miRNA.csv")
pmea_go_mi$Species <- "Pocillopora tuahiniensis"

all_go_mi <- rbind(apul_go_mi, pmea_go_mi, peve_go_mi)
```

Subset by ontology 
```{r}
# Apul
apul_go_mi_bp <- apul_go_mi %>%
  dplyr::filter(ontology == "Biological Processes")
apul_go_mi_cc <- apul_go_mi %>%
  dplyr::filter(ontology == "Cellular Components")
apul_go_mi_mf <- apul_go_mi %>%
  dplyr::filter(ontology == "Molecular Functions")

# Peve
peve_go_mi_bp <- peve_go_mi %>%
  dplyr::filter(ontology == "Biological Processes")
peve_go_mi_cc <- peve_go_mi %>%
  dplyr::filter(ontology == "Cellular Components")
peve_go_mi_mf <- peve_go_mi %>%
  dplyr::filter(ontology == "Molecular Functions")

# Pmea
pmea_go_mi_bp <- pmea_go_mi %>%
  dplyr::filter(ontology == "Biological Processes")
pmea_go_mi_cc <- pmea_go_mi %>%
  dplyr::filter(ontology == "Cellular Components")
pmea_go_mi_mf <- pmea_go_mi %>%
  dplyr::filter(ontology == "Molecular Functions")
```

Look at overlaps with venn diagrams 
```{r}
# Biological Processes
GOs_BP_wide_mi <- list(APUL=apul_go_mi_bp$GeneOntologyIDs, PTUH=pmea_go_mi_bp$GeneOntologyIDs,PEVE=peve_go_mi_bp$GeneOntologyIDs)

bp_venn_mi <- ggvenn(GOs_BP_wide_mi, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); bp_venn_mi

# Cellular Components
GOs_CC_wide_mi <- list(APUL=apul_go_mi_cc$GeneOntologyIDs, PTUH=pmea_go_mi_cc$GeneOntologyIDs,PEVE=peve_go_mi_cc$GeneOntologyIDs)

cc_venn_mi <- ggvenn(GOs_CC_wide_mi, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); cc_venn_mi

# Molecular Functions
GOs_MF_wide_mi <- list(APUL=apul_go_mi_mf$GeneOntologyIDs, PTUH=pmea_go_mi_mf$GeneOntologyIDs,PEVE=peve_go_mi_mf$GeneOntologyIDs)

mf_venn_mi <- ggvenn(GOs_MF_wide_mi, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); mf_venn_mi
```

Assess shared GO terms between species 
```{r}
# Count the number of unique species for each GeneOntologyID
shared_go_mi <- all_go_mi %>%
  group_by(GeneOntologyIDs) %>%  # Group by both GeneOntologyIDs and ontology
  summarise(unique_species = n_distinct(Species), .groups = 'keep') %>%  # Count distinct species for each GO term and ontology
  filter(unique_species > 1)

# Create a new dataframe with only the shared GeneOntologyIDs
shared_go_mi_df <- all_go_mi %>%
  filter(GeneOntologyIDs %in% shared_go_mi$GeneOntologyIDs)

# Identify terms associated with both CC and MF, and only keep CC
shared_go_mi_df <- shared_go_mi_df %>%
  group_by(GeneOntologyIDs) %>%
  mutate(cc_present = any(ontology == "Cellular Components")) %>%
  filter(!(ontology == "Molecular Functions" & cc_present)) %>%
  ungroup() %>%
  dplyr::select(-cc_present)

# Plot 
GO_plot_mi <- ggplot(shared_go_mi_df, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) +
  geom_point(size = 25, color = "black", stroke = 2) +  # Black border around shapes
  scale_shape_manual(values = c(21, 22, 23)) +  # Shapes that can be filled
  scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) +  # Custom colors for each species
  scale_size(range = c(2, 12)) + 
  xlab('') + 
  ylab('log(Enrichment score)') +
  facet_grid(vars(ontology), scales = "free", space = "free_y") + 
  theme_bw() +
  theme(
    axis.title = element_text(size = 36, face = "bold"),        # Axis title size
    axis.text = element_text(size = 34, colour = "black", angle = 75),                        # Axis text size
    legend.title = element_text(size = 34, face = "bold"),      # Legend title size
    legend.text = element_text(size = 32),                      # Legend text size
    strip.text = element_text(size = 34, face = "bold"),        # Facet text size
    strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5),         # Facet background
    axis.line = element_line(size = 1, colour = "black"),       # Enhanced axis lines
    axis.ticks = element_line(size = 1),                        # Thicker axis ticks
     #panel.border = element_blank()                             # Remove panel border
    #panel.grid.major = element_blank(),                         # Remove major grid lines
    #panel.grid.minor = element_blank()                          # Remove minor grid lines
        panel.border = element_rect(color = "black", size = 1.2),   # Enhanced facet border lines
    panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets
    panel.spacing = unit(1, "lines"),                           # Increase space between facets
    strip.placement = "outside"    
  ); GO_plot_mi

ggsave(filename = "../figures/miRNA_shared_GOs_plot.png", GO_plot_mi, width = 25, height = 30)
ggsave(filename = "../figures/miRNA_shared_GOs_plot.pdf", GO_plot_mi, width = 25, height = 30)
```

Plot only BP 
```{r}
shared_go_mi_df_bp <- shared_go_mi_df %>%
  dplyr::filter(ontology == "Biological Processes")
  
# Plot 
GO_BP_plot_mi <- ggplot(shared_go_mi_df_bp, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) +
  geom_point(size = 30, color = "black", stroke = 2) +  # Black border around shapes
  scale_shape_manual(values = c(21, 22, 23)) +  # Shapes that can be filled
  scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) +  # Custom colors for each species
  #scale_size(range = c(2, 12)) + 
  xlab('') + 
  ylab('log(Enrichment score)') +
  #facet_grid(vars(ontology), scales = "free", space = "free_y") + 
  theme_bw() +
  theme(
    axis.title = element_text(size = 51, face = "bold"),        # Axis title size
    axis.text = element_text(size = 49, colour = "black"),                        # Axis text size
    axis.text.x = element_text(angle = 75, hjust = 1),  # Rotate and adjust label position
    legend.title = element_text(size = 44, face = "bold"),      # Legend title size
    legend.text = element_text(size = 42, face = "italic"),                      # Legend text size
    strip.text = element_text(size = 49, face = "bold"),        # Facet text size
    strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5),         # Facet background
    axis.line = element_line(size = 1, colour = "black"),       # Enhanced axis lines
    axis.ticks = element_line(size = 1),                        # Thicker axis ticks
     #panel.border = element_blank()                             # Remove panel border
    #panel.grid.major = element_blank(),                         # Remove major grid lines
    #panel.grid.minor = element_blank()                          # Remove minor grid lines
        panel.border = element_rect(color = "black", size = 1.2),   # Enhanced facet border lines
    panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets
    panel.spacing = unit(1, "lines"),                           # Increase space between facets
    strip.placement = "outside",
    plot.margin = margin(t = 10, r = 10, b = 20, l = 10)        # Add space below the plot
  ); GO_BP_plot_mi

ggsave(filename = "../figures/miRNA_shared_GO_BP_plot.png", GO_BP_plot_mi, width = 25, height = 35)
ggsave(filename = "../figures/miRNA_shared_GO_BP_plot.pdf", GO_BP_plot_mi, width = 25, height = 35)
```

## lncRNAs 

Compare terms identified in piRNA GO enrichment analysis across species

Read in data from all species
```{r}
apul_go_lnc <- read.csv("../../D-Apul/output/19-Apul-bedtools-closest/Apul_GO_en_sig_gene_lncRNA.csv")
apul_go_lnc$Species <- "Acropora pulchra"
peve_go_lnc <- read.csv("../../E-Peve/output/19-Peve-bedtools-closest/Peve_GO_en_sig_gene_lncRNA.csv")
peve_go_lnc$Species <- "Porites evermanni"
pmea_go_lnc <- read.csv("../../F-Pmea/output/19-Pmea-bedtools-closest/Pmea_GO_en_sig_gene_lncRNA.csv")
pmea_go_lnc$Species <- "Pocillopora tuahiniensis"

all_go_lnc <- rbind(apul_go_lnc, pmea_go_lnc, peve_go_lnc)
```

Subset by ontology 
```{r}
# Apul
apul_go_lnc_bp <- apul_go_lnc %>%
  dplyr::filter(ontology == "Biological Processes")
apul_go_lnc_cc <- apul_go_lnc %>%
  dplyr::filter(ontology == "Cellular Components")
apul_go_lnc_mf <- apul_go_lnc %>%
  dplyr::filter(ontology == "Molecular Functions")

# Peve
peve_go_lnc_bp <- peve_go_lnc %>%
  dplyr::filter(ontology == "Biological Processes")
peve_go_lnc_cc <- peve_go_lnc %>%
  dplyr::filter(ontology == "Cellular Components")
peve_go_lnc_mf <- peve_go_lnc %>%
  dplyr::filter(ontology == "Molecular Functions")

# Pmea
pmea_go_lnc_bp <- pmea_go_lnc %>%
  dplyr::filter(ontology == "Biological Processes")
pmea_go_lnc_cc <- pmea_go_lnc %>%
  dplyr::filter(ontology == "Cellular Components")
pmea_go_lnc_mf <- pmea_go_lnc %>%
  dplyr::filter(ontology == "Molecular Functions")
```

Look at overlaps with venn diagrams 
```{r}
# Biological Processes
GOs_BP_wide_lnc <- list(APUL=apul_go_lnc_bp$GeneOntologyIDs, PTUH=pmea_go_lnc_bp$GeneOntologyIDs,PEVE=peve_go_lnc_bp$GeneOntologyIDs)

bp_venn_lnc <- ggvenn(GOs_BP_wide_lnc, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); bp_venn_lnc

# Cellular Components
GOs_CC_wide_lnc <- list(APUL=apul_go_lnc_cc$GeneOntologyIDs, PTUH=pmea_go_lnc_cc$GeneOntologyIDs,PEVE=peve_go_lnc_cc$GeneOntologyIDs)

cc_venn_lnc <- ggvenn(GOs_CC_wide_lnc, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); cc_venn_lnc

# Molecular Functions
GOs_MF_wide_lnc <- list(APUL=apul_go_lnc_mf$GeneOntologyIDs, PTUH=pmea_go_lnc_mf$GeneOntologyIDs,PEVE=peve_go_lnc_mf$GeneOntologyIDs)

mf_venn_lnc <- ggvenn(GOs_MF_wide_lnc, fill_color = c("#408EC6", "#7A2048", "#1E2761"),
  stroke_size = 0.5, set_name_size = 0, show_percentage = F, stroke_color = F); mf_venn_lnc
```

Assess shared GO terms between species 
```{r}
# Count the number of unique species for each GeneOntologyID
shared_go_lnc <- all_go_lnc %>%
  group_by(GeneOntologyIDs) %>%
  summarise(unique_species = n_distinct(Species)) %>%
  filter(unique_species > 1) %>%
  dplyr::select(GeneOntologyIDs)

# Create a new dataframe with only the shared GeneOntologyIDs
shared_go_lnc_df <- all_go_lnc %>%
  filter(GeneOntologyIDs %in% shared_go_lnc$GeneOntologyIDs)

# Save as csv 
write.csv(shared_go_lnc_df, "../output/19-bedtools-closest-gene/lncRNA_shared_go_by_species.csv")

# Plot 
GO_plot_lnc <- ggplot(shared_go_lnc_df, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) +
  geom_point(size = 17, color = "black", stroke = 2) +  # Black border around shapes
  scale_shape_manual(values = c(21, 22, 23)) +  # Shapes that can be filled
  scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) +  # Custom colors for each species
  scale_size(range = c(2, 12)) + 
  xlab('') + 
  ylab('log(Enrichment score)') +
  facet_grid(vars(ontology), scales = "free", space = "free_y") + 
  theme_bw() +
  theme(
    axis.title = element_text(size = 36, face = "bold"),        # Axis title size
    axis.text = element_text(size = 34, colour = "black"),                        # Axis text size
    legend.title = element_text(size = 34, face = "bold"),      # Legend title size
    legend.text = element_text(size = 32),                      # Legend text size
    strip.text = element_text(size = 34, face = "bold"),        # Facet text size
    strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5),         # Facet background
    axis.line = element_line(size = 1, colour = "black"),       # Enhanced axis lines
    axis.ticks = element_line(size = 1),                        # Thicker axis ticks
     #panel.border = element_blank()                             # Remove panel border
    #panel.grid.major = element_blank(),                         # Remove major grid lines
    #panel.grid.minor = element_blank()                          # Remove minor grid lines
    panel.border = element_rect(color = "black", size = 1.2),   # Enhanced facet border lines
    panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets
    panel.spacing = unit(1, "lines"),                           # Increase space between facets
    strip.placement = "outside"    
  ) +
  coord_flip(); GO_plot_lnc

ggsave(filename = "../figures/lncRNA_shared_GOs_plot.png", GO_plot_lnc, width = 35, height = 49)
ggsave(filename = "../figures/lncRNA_shared_GOs_plot.pdf", GO_plot_lnc, width = 35, height = 49)
```

Plot only BP
```{r}
shared_go_lnc_df_bp <- shared_go_lnc_df %>%
  dplyr::filter(ontology == "Biological Processes")
 
# Plot 
GO_BP_plot_lnc <- ggplot(shared_go_lnc_df_bp, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) +
  geom_point(size = 25, color = "black", stroke = 2) +  # Black border around shapes
  scale_shape_manual(values = c(21, 22, 23)) +  # Shapes that can be filled
  scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) +  # Custom colors for each species
  #scale_size(range = c(2, 12)) + 
  xlab('') + 
  ylab('log(Enrichment score)') +
  #facet_grid(vars(ontology), scales = "free", space = "free_y") + 
  coord_flip() +
  theme_bw() +
  theme(
    axis.title = element_text(size = 51, face = "bold"),        # Axis title size
    axis.text = element_text(size = 49, colour = "black"),                        # Axis text size
    axis.text.x = element_text(hjust = 1),  # Rotate and adjust label position
    legend.title = element_text(size = 44, face = "bold"),      # Legend title size
    legend.text = element_text(size = 42, face = "italic"),                      # Legend text size
    strip.text = element_text(size = 49, face = "bold"),        # Facet text size
    strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5),         # Facet background
    axis.line = element_line(size = 1, colour = "black"),       # Enhanced axis lines
    axis.ticks = element_line(size = 1),                        # Thicker axis ticks
     #panel.border = element_blank()                             # Remove panel border
    #panel.grid.major = element_blank(),                         # Remove major grid lines
    #panel.grid.minor = element_blank()                          # Remove minor grid lines
        panel.border = element_rect(color = "black", size = 1.2),   # Enhanced facet border lines
    panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets
    panel.spacing = unit(1, "lines"),                           # Increase space between facets
    strip.placement = "outside",
    plot.margin = margin(t = 10, r = 10, b = 20, l = 200)        # Add space below the plot
  ); GO_BP_plot_lnc

ggsave(filename = "../figures/lncRNA_shared_GO_BP_plot.png", GO_BP_plot_lnc, width = 40, height = 30)
ggsave(filename = "../figures/lncRNA_shared_GO_BP_plot.pdf", GO_BP_plot_lnc, width = 40, height = 30)
```

## All ncRNAs 

Filter out extra information from `shared_go_pi_df`, `shared_go_mi_df`, and `shared_go_lnc_df`
```{r}
# Look at dimensions and column names for each df 
dim(shared_go_pi_df)
dim(shared_go_mi_df)
dim(shared_go_lnc_df)
colnames(shared_go_pi_df)
colnames(shared_go_mi_df)
colnames(shared_go_lnc_df)

# Select certain columns 
filt_shared_go_pi_df <- shared_go_pi_df %>%
  dplyr::select(c(gene_name, GeneOntologyIDs, Term, Annotated, Significant, Expected, Fisher, ontology, pi_chrom, pi_start, pi_end, gf_chrom, gf_type, gf_start, gf_end, overlap, prop.sig.genes, Species)) %>%
  rename(ncrna_chrom = pi_chrom, ncrna_start = pi_start, ncrna_end = pi_end) %>%
  mutate(ncrna = "piRNA")

filt_shared_go_mi_df <- shared_go_mi_df %>%
  dplyr::select(c(gene_name, GeneOntologyIDs, Term, Annotated, Significant, Expected, Fisher, ontology, m_chrom, m_start, m_end, gf_chrom, gf_type, gf_start, gf_end, overlap, prop.sig.genes, Species)) %>%
  rename(ncrna_chrom = m_chrom, ncrna_start = m_start, ncrna_end = m_end) %>%
  mutate(ncrna = "miRNA")

filt_shared_go_lnc_df <- shared_go_lnc_df %>%
  dplyr::select(c(gene_name, GeneOntologyIDs, Term, Annotated, Significant, Expected, Fisher, ontology, l_chrom, l_start, l_end, gf_chrom, gf_type, gf_start, gf_end, overlap, prop.sig.genes, Species)) %>%
  rename(ncrna_chrom = l_chrom, ncrna_start = l_start, ncrna_end = l_end) %>%
  mutate(ncrna = "lncRNA")

# Bind dfs together 
all_shared <- rbind(filt_shared_go_pi_df, filt_shared_go_mi_df, filt_shared_go_lnc_df)

# Save as csv 
write.csv(all_shared, "../output/19-bedtools-closest-gene/ncRNAs_shared_go_by_species.csv")
```

Plot 
```{r}
GO_plot_all <- ggplot(all_shared, aes(x = Term, y = -log10(Fisher), shape = Species, fill = Species)) +
  geom_point(size = 17, color = "black", stroke = 2) +  # Black border around shapes
  scale_shape_manual(values = c(21, 22, 23)) +  # Shapes that can be filled
  scale_fill_manual(values = c("#408EC6", "#7A2048", "#1E2761")) +  # Custom colors for each species
  scale_size(range = c(2, 12)) + 
  xlab('') + 
  ylab('log(Enrichment score)') +
  facet_grid(vars(ontology), vars(ncrna), scales = "free", space = "free_y") + 
  theme_bw() +
  theme(
    axis.title = element_text(size = 36, face = "bold"),        # Axis title size
    axis.text = element_text(size = 34, colour = "black"),                        # Axis text size
    legend.title = element_text(size = 34, face = "bold"),      # Legend title size
    legend.text = element_text(size = 32),                      # Legend text size
    strip.text = element_text(size = 34, face = "bold"),        # Facet text size
    strip.background = element_rect(fill = "lightgray", color = "black", size = 1.5),         # Facet background
    axis.line = element_line(size = 1, colour = "black"),       # Enhanced axis lines
    axis.ticks = element_line(size = 1),                        # Thicker axis ticks
     #panel.border = element_blank()                             # Remove panel border
    #panel.grid.major = element_blank(),                         # Remove major grid lines
    #panel.grid.minor = element_blank()                          # Remove minor grid lines
        panel.border = element_rect(color = "black", size = 1.2),   # Enhanced facet border lines
    panel.grid.major = element_line(size = 0.5, color = "gray"), # Grid lines inside the facets
    panel.spacing = unit(1, "lines"),                           # Increase space between facets
    strip.placement = "outside"
  ) +
  coord_flip(); GO_plot_all

ggsave(filename = "../figures/all_ncRNAs_shared_GOs_plot.png", GO_plot_all, width = 35, height = 49)
ggsave(filename = "../figures/all_ncRNAs_shared_GOs_plot.pdf", GO_plot_all, width = 35, height = 49)
```

## Plot lengths for each species 

I am going to make length plots for each species for lncRNAs, miRNAs and piRNAs so that the format is the same for everything. Also going to make heatmap-esque diagrams for shared sequences for lncRNAs and miRNAs here so that the format is the same and I can join them in one figure. 


### miRNAs 

Kathleen wrote code for the miRNA [length comparison](https://github.com/urol-e5/deep-dive/blob/main/DEF-cross-species/code/10-shortRNA-ShortStack-comparison.Rmd). I'm going to modify her code 

Read in length data for miRNAs
```{r}
Apul_lengths_mi <- read.table("../data/10-shortRNA-ShortStack-comparison/Apul_ShortStack_mature_lengths.txt", header = FALSE, col.names = "length")
Peve_lengths_mi <- read.table("../data/10-shortRNA-ShortStack-comparison/Peve_ShortStack_mature_lengths.txt", header = FALSE, col.names = "length")
Pmea_lengths_mi <- read.table("../data/10-shortRNA-ShortStack-comparison/Pmea_ShortStack_mature_lengths.txt", header = FALSE, col.names = "length")
```

Combine dfs
```{r}
# Add a new column to each data frame to label the source file
Apul_lengths_mi <- Apul_lengths_mi %>% mutate(Species = 'A. pulchra')
Peve_lengths_mi <- Peve_lengths_mi %>% mutate(Species = 'P. evermanni')
Pmea_lengths_mi <- Pmea_lengths_mi %>% mutate(Species = 'P. tuahiniensis')

# Combine the data frames into one
all_lengths_mi <- rbind(Apul_lengths_mi, Peve_lengths_mi, Pmea_lengths_mi)
```

Count miRNA lengths by species
```{r}
count_by_group_mi <- all_lengths_mi %>%
  dplyr::count(Species, length)
count_by_group_mi
```

Plot miRNA lengths of all species
```{r}
# Set our color scheme for plotting -- options for both the abbreviated labels or the full, correct species names
species_colors_nolabel <- c("#408EC6", "#7A2048", "#1E2761")

length_plot_mi <- ggplot(count_by_group_mi, aes(x = length, y = n, fill = Species)) + 
  geom_bar(position = "dodge",
           stat = "identity",
           color = "black",  
           size = 2,         
           width = 0.9) + 
  scale_fill_manual(values = species_colors_nolabel) + 
  scale_y_continuous(expand = c(0, 0)) +   
  labs(x = "", y = "Read count") +
  theme_minimal() + 
  theme(
    axis.title = element_text(size = 85, face = "bold", margin = margin(t = 100, r = 100, b = 100, l = 150)),
    axis.text.y = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 10, l = 10)),
    axis.text.x = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 30, l = 10)),  # Increased bottom margin
    axis.line = element_line(size = 1.5, color = "black"),
    axis.ticks = element_line(size = 1.5, color = "black"),
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    legend.position = "none"
  ); length_plot_mi


ggsave(filename = "../figures/miRNA_lengths.png", length_plot_mi, width = 34, height = 24)
ggsave(filename = "../figures/miRNA_lengths.pdf", length_plot_mi, width = 34, height = 24)
```

### piRNAs

Javi wrote code for the piRNA [length comparison](https://github.com/urol-e5/deep-dive/blob/main/DEF-cross-species/code/12-piRNA-comparizon.Rmd). I'm going to modify his code.

Read in length data for piRNAs 
```{r}
APUL_L = list.files(path = "../../D-Apul/output/18-Apul-piRNA/0_piRNA_pipeline_proTRAC/piRNA_length_data", pattern=".txt", recursive = T, full.names = T)

PMEA_L = list.files(path = "../../F-Pmea/output/18-Pmea-piRNA/0_piRNA_pipeline_proTRAC/piRNA_length_data", pattern=".txt", recursive = T, full.names = T)

PEVE_L = list.files(path = "../../E-Peve/output/18-Peve-piRNA/0_piRNA_pipeline_proTRAC/piRNA_length_data", pattern=".txt", recursive = T, full.names = T)

L = c(APUL_L,PMEA_L,PEVE_L)
names <- L %>%
  substring(.,75) %>%
  gsub("_piRNA_length.txt", "", .)

DFs = lapply(L, function(x) {
  data <- read.table(paste0(x), comment.char = "")
  colnames(data) <- c("freq", "length")
  return(data)
})
```

Count piRNA lengths by species  
```{r}
combo = Reduce(function(...) merge(..., by = "length", all=T), DFs)
colnames(combo) <- c("length", names)
combo <- combo %>%
  mutate(ACR = rowSums(combo[,2:6])) %>%
  mutate(POC = rowSums(combo[,7:11])) %>%
  mutate(POR = rowSums(combo[,12:14])) %>%
  select(length,ACR,POC,POR) %>%
  pivot_longer(c("ACR", "POC", "POR"), values_to = "sums", names_to = "group") %>%
  mutate(Species = case_when(
    group == "ACR" ~ "A. pulchra", 
    group == "POR" ~ "P. evermanni",
    group == "POC" ~ "P. tuahiniensis"))
```

Plot piRNA lengths of all species
```{r}
# Set our color scheme for plotting -- options for both the abbreviated labels or the full, correct species names
species_colors_nolabel <- c("#408EC6", "#7A2048", "#1E2761")

length_plot_pi <- ggplot(combo, aes(x = length, y = sums, fill = Species)) + 
  geom_bar(position = "dodge",
           stat = "identity",
           color = "black",  # Bar outline color
           size = 2,         # Thicker bar outlines
           width = 0.9) + 
  scale_fill_manual(values = species_colors_nolabel) + 
  scale_y_continuous(expand = c(0, 0)) +   # Bars sit on x-axis
  scale_x_continuous(breaks = unique(combo$length)) +  # Ensure every "length" value appears on the x-axis
  labs(x = "Length (bp)", y = "") +
  theme_minimal() + 
  theme(
    axis.title = element_text(size = 85, face = "bold", margin = margin(t = 100, r = 100, b = 100, l = 150)),
    axis.text.y = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 10, l = 10)),
    axis.text.x = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 30, l = 10)),  # Increased bottom margin
    axis.line = element_line(size = 1.5, color = "black"),  # Darker, thicker axis lines
    axis.ticks = element_line(size = 1.5, color = "black"), # Thicker axis ticks
    plot.margin = unit(c(1, 1, 1, 1), "cm"),                # Margin around plot (top, right, bottom, left)
    legend.title = element_text(size = 40, face = "bold"),  # Increase legend title size
    legend.text = element_text(size = 40),
    legend.position = "none"                              
  ) +
  guides(fill = guide_legend(override.aes = list(size = 20)));  # Increase the size of legend squares
length_plot_pi

ggsave(filename = "../figures/piRNA_lengths.png", length_plot_pi, width = 34, height = 20)
ggsave(filename = "../figures/piRNA_lengths.pdf", length_plot_pi, width = 34, height = 20)
```

### lncRNAs 

Calculate length from bed files
```{r}
apul_lnc_bed <- read.delim("../../D-Apul/output/05.33-lncRNA-discovery/Apul_lncRNA.bed", header = F) %>%
  rename("lncRNA" = "V1", "start" = "V2", "end" = "V3") %>%
  mutate(length = end-start) %>%
  mutate(Species = "Acropora pulchra")
mean(apul_lnc_bed$length)
median(apul_lnc_bed$length)
max(apul_lnc_bed$length)
min(apul_lnc_bed$length)

peve_lnc_bed <- read.delim("../../E-Peve/output/Peve_lncRNA.bed", header = F) %>%
  rename("lncRNA" = "V1", "start" = "V2", "end" = "V3") %>%
  mutate(length = end-start) %>%
  mutate(Species = "Porites evermanni")
mean(peve_lnc_bed$length)
median(peve_lnc_bed$length)
max(peve_lnc_bed$length)
min(peve_lnc_bed$length)

pmea_lnc_bed <- read.delim("../../F-Pmea/output/02-lncRNA-discovery/Pmea_lncRNA.bed", header = F) %>%
  rename("lncRNA" = "V1", "start" = "V2", "end" = "V3") %>%
  mutate(length = end-start) %>%
  mutate(Species = "Pocillopora tuahiniensis")
mean(pmea_lnc_bed$length)
median(pmea_lnc_bed$length)
max(pmea_lnc_bed$length)
min(pmea_lnc_bed$length)

# Combine the data frames into one
all_lengths_lnc <- rbind(apul_lnc_bed, peve_lnc_bed, pmea_lnc_bed)
```

Define custom bins so that the lncRNAs are grouped into 200-300 bp, 300-400 bp, etc
```{r}
all_lengths_lnc <- all_lengths_lnc %>%
  mutate(length_bin = cut(
    length,
    breaks = c(200, 300, 400, 500, 600, 700, 800, 900, 1000, Inf),  # Define custom breaks
    labels = c("200-300", "300-400", "400-500", "500-600", "600-700", "700-800", "800-900", "900-1000", ">1000"),  # Custom labels for each bin
    include.lowest = TRUE,  # Include the lowest value in the first interval
    right = FALSE           # Make the intervals left-closed (e.g., 200-300 includes 200, but not 300)
  ))
```

Calculate frequency of lncRNAs in each bin by species
```{r}
length_freq <- all_lengths_lnc %>%
  group_by(Species, length_bin) %>%
  summarize(frequency = n(), .groups = 'drop')
```

Plot lncRNA lengths for all species 
```{r}
# Set our color scheme for plotting -- options for both the abbreviated labels or the full, correct species names
species_colors_nolabel <- c("#408EC6", "#7A2048", "#1E2761")

length_plot_lnc <- ggplot(length_freq, aes(x = length_bin, y = frequency, fill = Species)) +
  geom_bar(position = "dodge",
           stat = "identity",
           color = "black",  # Bar outline color
           size = 2,         # Thicker bar outlines
           width = 0.9) + 
  scale_fill_manual(values = species_colors_nolabel) + 
  # Use scale_x_discrete instead of scale_x_continuous
  scale_x_discrete(name = "") +  # Ensure every "length_bin" value appears on the x-axis
  scale_y_continuous(expand = c(0, 0)) +  # Remove padding on y-axis so bars sit on x-axis
  labs(y = "") +
  theme_minimal() + 
  theme(
    axis.title = element_text(size = 85, face = "bold", margin = margin(t = 100, r = 100, b = 100, l = 150)),
    axis.text.y = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 10, l = 10)),
    axis.text.x = element_text(size = 80, colour = "black", margin = margin(t = 10, r = 10, b = 30, l = 10), angle = 45, hjust = 1),  # Increased bottom margin
    axis.line = element_line(size = 1.5, color = "black"),  # Darker, thicker axis lines
    axis.ticks = element_line(size = 1.5, color = "black"), # Thicker axis ticks
    plot.margin = unit(c(1, 1, 1, 1), "cm"),                # Margin around plot (top, right, bottom, left)
    legend.title = element_text(size = 70, face = "bold"),  # Increase legend title size
    legend.text = element_text(size = 70),
    legend.position = "top"                              
  ) +
  guides(fill = guide_legend(override.aes = list(size = 20)));  # Increase the size of legend squares

length_plot_lnc

ggsave(filename = "../figures/lncRNA_lengths.png", length_plot_lnc, width = 34, height = 20)
ggsave(filename = "../figures/lncRNA_lengths.pdf", length_plot_lnc, width = 34, height = 20)
```

Plot RNA lengths w/ legend 
```{r}
plot <- plot_grid(length_plot_lnc, length_plot_mi, length_plot_pi, labels = c("A", "B", "C"), label_size = 80, nrow = 3, ncol = 1)

ggsave(filename = "../figures/ncRNA_lengths.png", plot, width = 65, height = 50, limitsize = F)
ggsave(filename = "../figures/ncRNA_lengths.pdf", plot, width = 65, height = 50, limitsize = F)
```