---
title: "Closest genomic features to ncRNAs - Peve"
author: "Jill Ashey"
date: "2024-08-24"
output: html_document
---

## Closest genomic features to ncRNAs in Porites evermanni 

This code will investigate the proximity of ncRNAs to mRNAs in Porites evermanni. 

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

# Load packages
library(tidyverse)
library(topGO)
```

See [bash code](https://github.com/JillAshey/JillAshey_Putnam_Lab_Notebook/blob/master/_posts/2024-08-19-e5-deepdive-ncRNA-closest-features.md) for `bedtools closest` analysis. 

Before analysis, make gene2go file from the annotation information
```{r}
GO_PEVE <- read.delim("../../data/Peve-rna-GO.tsv", sep = "\t", header = T)  %>%
  select(query, GeneOntologyIDs) %>%
  mutate(gene_name = gsub("Parent=","", query)) %>%
  select(gene_name, GeneOntologyIDs)
write_tsv(GO_PEVE, file = "../../data/Peve_gene2go.tab")
```

## piRNA 

Load Peve piRNA closest file into R. This file represents the genomic features that are closest/are overlapping with piRNA clusters. 
```{r}
pi_bed <- read.delim("../output/19-bedtools-closest/Peve_piRNA_output.bed", header = F)
head(pi_bed)
```

Keep specific columns and rename them 
```{r}
pi_bed_filt <- pi_bed %>%
  dplyr::select(V1, V2, V3, V4, V6, V7, V8, V12) %>%
  dplyr::rename("pi_chrom" = "V1", "pi_start" = "V2", "pi_end" = "V3", "gf_chrom" = "V4", "gf_type" = "V6", "gf_start" = "V7", "gf_end" = "V8", "gf_attr" = "V12")
```

Pull out gene name information from the `gf_attr` column and subset by mRNA
```{r}
pi_bed_filt <- pi_bed_filt %>%
  mutate(gene_name = str_extract(gf_attr, "ID=Peve_[0-9]+")) %>%
  mutate(gene_name = str_replace(gene_name, "ID=", "")) %>% # Remove "ID=" prefix
  dplyr::filter(gf_type == "mRNA")
```

Calculate the overlap and distance based on the l_start, l_end, gf_start, and gf_end columns. Overlap occurs if the start of one feature is within the bounds of another. If there is no overlaps, the distance can be calculated as the gap between the end of one feature and the start of another. 
```{r}
pi_bed_filt$overlap <- with(pi_bed_filt, pmin(pi_end, gf_end) - pmax(pi_start, gf_start))

sum(pi_bed_filt$overlap < 0)
sum(pi_bed_filt$overlap > 0)
```

Calculate average/percentages of distance and overlap. 
```{r}
summary_stats <- pi_bed_filt %>%
  summarize(
    avg_overlap = mean(overlap[overlap > 0], na.rm = TRUE),
    avg_distance = mean(overlap[overlap < 0], na.rm = TRUE),
    percent_overlap = sum(overlap > 0) / n() * 100,
    percent_distance_lt_1kb = sum(overlap < 0 & overlap > -1000) / n() * 100,
    percent_distance = sum(overlap < 0) / n() * 100
  )

# Display the summary statistics
print(summary_stats)
```

Results: 
avg_overlap avg_distance percent_overlap percent_distance_lt_1kb percent_distance
1416.043      -4545.6        84.66258                4.907975         15.33742

Read in gene2go information 
```{r}
PEVE_gene2go<-read.delim("../../data/Peve_gene2go.tab", sep="\t")
```

Make list of genes for input to TopGO
```{r}
peve_clust_genes <- as.character(unique(pi_bed_filt$gene_name))
peve_all_genes <- as.character(PEVE_gene2go$gene_name)
Peve_GeneList <- factor(as.integer(peve_all_genes %in% peve_clust_genes))
names(Peve_GeneList) <- peve_all_genes
```

The following code will perform GO enrichment using the weighted Fisher's exact test to assess whether specific GO terms are overrepresented in the genes closest to the piRNA clusters. 

Read in gene-to-go-mappings
```{r}
PEVE_gene2go_topgo<-readMappings("../../data/Peve_gene2go.tab", IDsep=";", sep="\t")
```

Set function to select genes of interest (ie those that have pvalue < 0.05)
```{r}
topDiffGenes <- function(allScore) {
return(allScore < 0.05)}
```

### Biological Processes

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_BP_pi <-new("topGOdata", ontology="BP", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_BP_FE_pi <- runTest(GO_PEVE_BP_pi, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_BP_En_pi <- GenTable(GO_PEVE_BP_pi, Fisher = GO_PEVE_BP_FE_pi, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_BP_En_pi$Fisher<-as.numeric(GO_PEVE_BP_En_pi$Fisher)
GO_PEVE_BP_En_sig_pi<-GO_PEVE_BP_En[GO_PEVE_BP_En_pi$Fisher<0.05,]
```

Merge `GO_PEVE_BP_En_sig_pi` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_BP_En_sig_pi$GO.ID <- trimws(GO_PEVE_BP_En_sig_pi$GO.ID)

# Join the datasets based on GO term
GO_PEVE_BP_En_sig_gene_pi <- PEVE_gene2go %>%
  left_join(GO_PEVE_BP_En_sig_pi, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_BP_En_sig_gene_pi$ontology <- "Biological Processes"
```

### Cellular Components 

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_CC_pi <-new("topGOdata", ontology="CC", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_CC_FE_pi <- runTest(GO_PEVE_CC_pi, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_CC_En_pi <- GenTable(GO_PEVE_CC_pi, Fisher = GO_PEVE_CC_FE_pi, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_CC_En_pi$Fisher<-as.numeric(GO_PEVE_CC_En_pi$Fisher)
GO_PEVE_CC_En_sig_pi<-GO_PEVE_CC_En_pi[GO_PEVE_CC_En_pi$Fisher<0.05,]
```

Merge `GO_PEVE_CC_En_sig_pi` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_CC_En_sig_pi$GO.ID <- trimws(GO_PEVE_CC_En_sig_pi$GO.ID)

# Join the datasets based on GO term
GO_PEVE_CC_En_sig_gene_pi <- PEVE_gene2go %>%
  left_join(GO_PEVE_CC_En_sig_pi, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_CC_En_sig_gene_pi$ontology <- "Cellular Components"
```

### Molecular Functions 

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_MF_pi <-new("topGOdata", ontology="MF", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_MF_FE_pi <- runTest(GO_PEVE_MF_pi, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_MF_En_pi <- GenTable(GO_PEVE_MF_pi, Fisher = GO_PEVE_MF_FE_pi, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_MF_En_pi$Fisher<-as.numeric(GO_PEVE_MF_En_pi$Fisher)
GO_PEVE_MF_En_sig_pi<-GO_PEVE_MF_En_pi[GO_PEVE_MF_En_pi$Fisher<0.05,]
```

Merge `GO_PEVE_MF_En_sig` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_MF_En_sig_pi$GO.ID <- trimws(GO_PEVE_MF_En_sig_pi$GO.ID)

# Join the datasets based on GO term
GO_PEVE_MF_En_sig_gene_pi <- PEVE_gene2go %>%
  left_join(GO_PEVE_MF_En_sig_pi, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_MF_En_sig_gene_pi$ontology <- "Molecular Functions"
```

### Join ontologies for piRNAs

Bind `GO_PEVE_BP_En_sig_gene`, `GO_PEVE_CC_En_sig_gene`, and `GO_PEVE_MF_En_sig_gene` to make a df that has significantly enriched GO terms for all ontologies 
```{r}
GO_PEVE_En_sig_gene_pi <- rbind(GO_PEVE_BP_En_sig_gene_pi, GO_PEVE_CC_En_sig_gene_pi, GO_PEVE_MF_En_sig_gene_pi)
```

Merge `pi_bed_filt` with `GO_PEVE_En_sig_gene` and calculate proportion of significant genes
```{r}
merged_pirna <- GO_PEVE_En_sig_gene_pi %>%
  left_join(pi_bed_filt, by = "gene_name") %>%
  na.omit() %>%
  mutate(prop.sig.genes = Significant/Annotated)

# Write to csv 
write.csv(merged_pirna, "../output/19-bedtools-closest/Peve_GO_en_sig_gene_piRNA.csv")
```

Plot 
```{r}
GO_plot_pi<-ggplot(merged_pirna, aes(x = Term, y = Fisher, size = prop.sig.genes, fill = Fisher)) +
  #expand_limits(y = 1.5) +
  #ylim(1, 7.25) +
  # Add horizontal lines with a single aesthetic value
  #geom_hline(yintercept = -log10(0.01), linetype = "longdash", colour = "black", linewidth = .6) +
  #geom_hline(yintercept = -log10(0.001), linetype = "solid", colour = "black", linewidth = .6) +
  geom_point(shape = 21) + 
  scale_size(range = c(2, 12)) + 
  scale_fill_continuous(low = "#1AD3D1FF", high = "#4686FBFF") +
  xlab('') + 
  ylab('Enrichment score') +
  #labs(caption = 'Cut-off lines at p=0.01 and p=0.001') +
  theme_bw() +
  facet_grid(vars(ontology), scales = "free", space = "free_y") +
  coord_flip(); GO_plot_pi
```

## miRNA 

Load Peve miRNA closest file into R. This file represents the genomic features that are closest/are overlapping with piRNA clusters. 
```{r}
mi_bed <- read.delim("../output/19-bedtools-closest/miRNA_filtered_Peve_output.bed", header = F)
head(mi_bed)
```

Keep specific columns and rename them. I'm going to label columns that relate to miRNAs with an "m" and columns that relate to genomic features with a "gf".
```{r}
mi_bed_filt <- mi_bed %>%
  dplyr::select(V1, V2, V3, V4, V5, V7, V9, V10, V11, V12, V13, V14, V16, V18) %>%
  dplyr::rename("m_chrom" = "V1", "m_source" = "V2", "m_type" = "V3", "m_start" = "V4", "m_end" = "V5", "m_strand" = "V7", "m_attr" = "V9", "gf_chrom" = "V10", "gf_source" = "V11", "gf_type" = "V12", "gf_start" = "V13", "gf_end" = "V14", "gf_strand" = "V16", "gf_attr" = "V18")
```

Pull out gene name information from the `gf_attr` column and subset by mRNA and mature miRNA. 
```{r}
mi_bed_filt <- mi_bed_filt %>%
  mutate(gene_name = str_extract(gf_attr, "ID=Peve_[0-9]+")) %>%
  mutate(gene_name = str_replace(gene_name, "ID=", "")) %>% # Remove "ID=" prefix
  dplyr::filter(gf_type == "mRNA") %>%
  dplyr::filter(m_type == "mature_miRNA")
```

Calculate the overlap and distance based on the m_start, m_end, gf_start, and gf_end columns. Overlap occurs if the start of one feature is within the bounds of another. If there is no overlaps, the distance can be calculated as the gap between the end of one feature and the start of another. 
```{r}
mi_bed_filt$overlap <- with(mi_bed_filt, pmin(m_end, gf_end) - pmax(m_start, gf_start))

sum(mi_bed_filt$overlap < 0)
sum(mi_bed_filt$overlap > 0)
```

Calculate average/percentages of distance and overlap. 
```{r}
# Perform calculations grouped by m_type
summary_stats <- mi_bed_filt %>%
  #group_by(m_type) %>%
  summarize(
    avg_overlap = mean(overlap[overlap > 0], na.rm = TRUE),
    avg_distance = mean(overlap[overlap < 0], na.rm = TRUE),
    percent_overlap = sum(overlap > 0) / n() * 100,
    percent_distance_lt_1kb = sum(overlap < 0 & overlap > -1000) / n() * 100,
    percent_distance = sum(overlap < 0) / n() * 100
  )

# Display the summary statistics
print(summary_stats)
```

Results:
avg_overlap avg_distance percent_overlap percent_distance_lt_1kb percent_distance
    20.93939    -2480.154        71.73913                6.521739         28.26087

Read in gene2go information 
```{r}
PEVE_gene2go<-read.delim("../../data/Peve_gene2go.tab", sep="\t")
```

Make list of genes for input to TopGO
```{r}
peve_clust_genes <- as.character(unique(mi_bed_filt$gene_name))
peve_all_genes <- as.character(PEVE_gene2go$gene_name)
Peve_GeneList <- factor(as.integer(peve_all_genes %in% peve_clust_genes))
names(Peve_GeneList) <- peve_all_genes
```

The following code will perform GO enrichment using the weighted Fisher's exact test to assess whether specific GO terms are overrepresented in the genes closest to the piRNA clusters. 

Read in gene-to-go-mappings
```{r}
PEVE_gene2go_topgo<-readMappings("../../data/Peve_gene2go.tab", IDsep=";", sep="\t")
```

Set function to select genes of interest (ie those that have pvalue < 0.05)
```{r}
topDiffGenes <- function(allScore) {
return(allScore < 0.05)}
```

### Biological Processes

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_BP_mi <-new("topGOdata", ontology="BP", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_BP_FE_mi <- runTest(GO_PEVE_BP_mi, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_BP_En_mi <- GenTable(GO_PEVE_BP_mi, Fisher = GO_PEVE_BP_FE_mi, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_BP_En_mi$Fisher<-as.numeric(GO_PEVE_BP_En_mi$Fisher)
GO_PEVE_BP_En_sig_mi<-GO_PEVE_BP_En_mi[GO_PEVE_BP_En_mi$Fisher<0.05,]
```

Merge `GO_PEVE_BP_En_sig_mi` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_BP_En_sig_mi$GO.ID <- trimws(GO_PEVE_BP_En_sig_mi$GO.ID)

# Join the datasets based on GO term
GO_PEVE_BP_En_sig_gene_mi <- PEVE_gene2go %>%
  left_join(GO_PEVE_BP_En_sig_mi, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_BP_En_sig_gene_mi$ontology <- "Biological Processes"
```

### Cellular Components 

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_CC_mi <-new("topGOdata", ontology="CC", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_CC_FE_mi <- runTest(GO_PEVE_CC_mi, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_CC_En_mi <- GenTable(GO_PEVE_CC_mi, Fisher = GO_PEVE_CC_FE_mi, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_CC_En_mi$Fisher<-as.numeric(GO_PEVE_CC_En_mi$Fisher)
GO_PEVE_CC_En_sig_mi<-GO_PEVE_CC_En_mi[GO_PEVE_CC_En_mi$Fisher<0.05,]
```

Merge `GO_PEVE_CC_En_sig_mi` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_CC_En_sig_mi$GO.ID <- trimws(GO_PEVE_CC_En_sig_mi$GO.ID)

# Join the datasets based on GO term
GO_PEVE_CC_En_sig_gene_mi <- PEVE_gene2go %>%
  left_join(GO_PEVE_CC_En_sig_mi, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_CC_En_sig_gene_mi$ontology <- "Cellular Components"
```

### Molecular Functions 

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_MF_mi <-new("topGOdata", ontology="MF", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_MF_FE_mi <- runTest(GO_PEVE_MF_mi, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_MF_En_mi <- GenTable(GO_PEVE_MF_mi, Fisher = GO_PEVE_MF_FE_mi, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_MF_En_mi$Fisher<-as.numeric(GO_PEVE_MF_En_mi$Fisher)
GO_PEVE_MF_En_sig_mi<-GO_PEVE_MF_En_mi[GO_PEVE_MF_En_mi$Fisher<0.05,]
```

Merge `GO_PEVE_MF_En_sig_mi` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_MF_En_sig_mi$GO.ID <- trimws(GO_PEVE_MF_En_sig_mi$GO.ID)

# Join the datasets based on GO term
GO_PEVE_MF_En_sig_gene_mi <- PEVE_gene2go %>%
  left_join(GO_PEVE_MF_En_sig_mi, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_MF_En_sig_gene_mi$ontology <- "Molecular Functions"
```

### Join ontologies for piRNAs

Bind `GO_PEVE_BP_En_sig_gene_mi`, `GO_PEVE_CC_En_sig_gene_mi`, and `GO_PEVE_MF_En_sig_gene_mi` to make a df that has significantly enriched GO terms for all ontologies 
```{r}
GO_PEVE_En_sig_gene_mi <- rbind(GO_PEVE_BP_En_sig_gene_mi, GO_PEVE_CC_En_sig_gene_mi, GO_PEVE_MF_En_sig_gene_mi)
```

Merge `mi_bed_filt` with `GO_PEVE_En_sig_gene_mi` and calculate proportion of significant genes
```{r}
merged_mirna <- GO_PEVE_En_sig_gene_mi %>%
  left_join(mi_bed_filt, by = "gene_name") %>%
  na.omit() %>%
  mutate(prop.sig.genes = Significant/Annotated)

# Write to csv 
write.csv(merged_mirna, "../output/19-bedtools-closest/Peve_GO_en_sig_gene_miRNA.csv")
```

Plot 
```{r}
GO_plot_mi<-ggplot(merged_mirna, aes(x = Term, y = -log10(Fisher), size = prop.sig.genes, fill = -log10(Fisher))) +
  #expand_limits(y = 1.5) +
  #ylim(1, 7.25) +
  # Add horizontal lines with a single aesthetic value
  #geom_hline(yintercept = -log10(0.01), linetype = "longdash", colour = "black", linewidth = .6) +
  #geom_hline(yintercept = -log10(0.001), linetype = "solid", colour = "black", linewidth = .6) +
  geom_point(shape = 21) + 
  scale_size(range = c(2, 12)) + 
  scale_fill_continuous(low = "#1AD3D1FF", high = "#4686FBFF") +
  xlab('') + 
  ylab('Enrichment score') +
  #labs(caption = 'Cut-off lines at p=0.01 and p=0.001') +
  theme_bw() +
  facet_grid(vars(ontology), scales = "free", space = "free_y") +
  coord_flip(); GO_plot_mi
```

## lncRNAs

Load Peve lncRNA closest file into R. This file represents the genomic features that are closest/are overlapping with lncRNA clusters. 
```{r}
lnc_bed <- read.delim("../output/19-Peve-bedtools-closest/Peve_lncRNA_output.bed", header = F)
head(lnc_bed)
```

Keep specific columns and rename them 
```{r}
lnc_bed_filt <- lnc_bed %>%
  dplyr::select(V1, V2, V3, V4, V6, V7, V8, V12) %>%
  dplyr::rename("l_chrom" = "V1", "l_start" = "V2", "l_end" = "V3", "gf_chrom" = "V4", "gf_type" = "V6", "gf_start" = "V7", "gf_end" = "V8", "gf_attr" = "V12")
```

Pull out gene name information from the `gf_attr` column and subset by mRNA
```{r}
lnc_bed_filt <- lnc_bed_filt %>%
  mutate(gene_name = str_extract(gf_attr, "ID=Peve_[0-9]+")) %>%
  mutate(gene_name = str_replace(gene_name, "ID=", "")) %>% # Remove "ID=" prefix
  dplyr::filter(gf_type == "mRNA")
```

Calculate the overlap and distance based on the l_start, l_end, gf_start, and gf_end columns. Overlap occurs if the start of one feature is within the bounds of another. If there is no overlaps, the distance can be calculated as the gap between the end of one feature and the start of another. 
```{r}
lnc_bed_filt$overlap <- with(lnc_bed_filt, pmin(l_end, gf_end) - pmax(l_start, gf_start))

sum(lnc_bed_filt$overlap < 0)
sum(lnc_bed_filt$overlap > 0)
```

Calculate average/percentages of distance and overlap. 
```{r}
summary_stats <- lnc_bed_filt %>%
  summarize(
    avg_overlap = mean(overlap[overlap > 0], na.rm = TRUE),
    median_overlap = median(overlap[overlap > 0], na.rm = TRUE),
    avg_distance = mean(overlap[overlap < 0], na.rm = TRUE),
    median_distance = median(overlap[overlap < 0], na.rm = TRUE),
    percent_overlap = sum(overlap > 0) / n() * 100,
    percent_distance_lt_1kb = sum(overlap < 0 & overlap > -1000) / n() * 100,
    percent_distance = sum(overlap < 0) / n() * 100
  )

# Display the summary statistics
print(summary_stats)
```

Results: 
avg_overlap avg_distance percent_overlap percent_distance_lt_1kb percent_distance
  3751.919    -6298.915        2.032885                17.18984         97.96712

Read in gene2go information 
```{r}
PEVE_gene2go<-read.delim("../../data/Peve_gene2go.tab", sep="\t")
```

Make list of genes for input to TopGO
```{r}
peve_clust_genes <- as.character(unique(lnc_bed_filt$gene_name))
peve_all_genes <- as.character(PEVE_gene2go$gene_name)
Peve_GeneList <- factor(as.integer(peve_all_genes %in% peve_clust_genes))
names(Peve_GeneList) <- peve_all_genes
```

The following code will perform GO enrichment using the weighted Fisher's exact test to assess whether specific GO terms are overrepresented in the genes closest to the piRNA clusters. 

Read in gene-to-go-mappings
```{r}
PEVE_gene2go_topgo<-readMappings("../../data/Peve_gene2go.tab", IDsep=";", sep="\t")
```

Set function to select genes of interest (ie those that have pvalue < 0.05)
```{r}
topDiffGenes <- function(allScore) {
return(allScore < 0.05)}
```

### Biological Processes

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_BP_lnc <-new("topGOdata", ontology="BP", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_BP_FE_lnc <- runTest(GO_PEVE_BP_lnc, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_BP_En_lnc <- GenTable(GO_PEVE_BP_lnc, Fisher = GO_PEVE_BP_FE_lnc, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_BP_En_lnc$Fisher<-as.numeric(GO_PEVE_BP_En_lnc$Fisher)
GO_PEVE_BP_En_sig_lnc<-GO_PEVE_BP_En_lnc[GO_PEVE_BP_En_lnc$Fisher<0.05,]
```

Merge `GO_PEVE_BP_En_sig_lnc` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_BP_En_sig_lnc$GO.ID <- trimws(GO_PEVE_BP_En_sig_lnc$GO.ID)

# Join the datasets based on GO term
GO_PEVE_BP_En_sig_gene_lnc <- PEVE_gene2go %>%
  left_join(GO_PEVE_BP_En_sig_lnc, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_BP_En_sig_gene_lnc$ontology <- "Biological Processes"
``` 

### Cellular Components 

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_CC_lnc <-new("topGOdata", ontology="CC", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_CC_FE_lnc <- runTest(GO_PEVE_CC_lnc, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_CC_En_lnc <- GenTable(GO_PEVE_CC_lnc, Fisher = GO_PEVE_CC_FE_lnc, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_CC_En_lnc$Fisher<-as.numeric(GO_PEVE_CC_En_lnc$Fisher)
GO_PEVE_CC_En_sig_lnc<-GO_PEVE_CC_En_lnc[GO_PEVE_CC_En_lnc$Fisher<0.05,]
```

Merge `GO_PEVE_CC_En_sig_lnc` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_CC_En_sig_lnc$GO.ID <- trimws(GO_PEVE_CC_En_sig_lnc$GO.ID)

# Join the datasets based on GO term
GO_PEVE_CC_En_sig_gene_lnc <- PEVE_gene2go %>%
  left_join(GO_PEVE_CC_En_sig_lnc, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_CC_En_sig_gene_lnc$ontology <- "Cellular Components"
``` 

### Molecular Functions 

Create `topGOdata` object, which is required for topGO analysis
```{r}
GO_PEVE_MF_lnc <-new("topGOdata", ontology="MF", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes)
```

Run GO enrichment test 
```{r}
GO_PEVE_MF_FE_lnc <- runTest(GO_PEVE_MF_lnc, algorithm="weight01", statistic="fisher")
```

Generate results table 
```{r}
GO_PEVE_MF_En_lnc <- GenTable(GO_PEVE_MF_lnc, Fisher = GO_PEVE_MF_FE_lnc, orderBy = "Fisher",  topNodes = 100, numChar = 51)
```
Only taking the top 100 GO terms 

Filter by significant results
```{r}
GO_PEVE_MF_En_lnc$Fisher<-as.numeric(GO_PEVE_MF_En_lnc$Fisher)
GO_PEVE_MF_En_sig_lnc<-GO_PEVE_MF_En_lnc[GO_PEVE_MF_En_lnc$Fisher<0.05,]
```

Merge `GO_PEVE_MF_En_sig_lnc` with GO and gene info. 
```{r}
# Separate GO terms 
PEVE_gene2go <- PEVE_gene2go %>%
  separate_rows(GeneOntologyIDs, sep = ";")

# Ensure GO terms in both datasets are formatted similarly (trim whitespaces)
PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs)
GO_PEVE_MF_En_sig_lnc$GO.ID <- trimws(GO_PEVE_MF_En_sig_lnc$GO.ID)

# Join the datasets based on GO term
GO_PEVE_MF_En_sig_gene_lnc <- PEVE_gene2go %>%
  left_join(GO_PEVE_MF_En_sig_lnc, by = c("GeneOntologyIDs" = "GO.ID")) %>%
  na.omit()

# Add ontology column 
GO_PEVE_MF_En_sig_gene_lnc$ontology <- "Molecular Functions"
``` 

### Join ontologies for lncRNAs

Bind `GO_PEVE_BP_En_sig_gene_lnc`, `GO_PEVE_CC_En_sig_gene_lnc`, and `GO_PEVE_MF_En_sig_gene_lnc` to make a df that has significantly enriched GO terms for all ontologies 
```{r}
GO_PEVE_En_sig_gene_lnc <- rbind(GO_PEVE_BP_En_sig_gene_lnc, GO_PEVE_CC_En_sig_gene_lnc, GO_PEVE_MF_En_sig_gene_lnc)
```

Merge `lnc_bed_filt` with `GO_APUL_En_sig_gene_lnc` and calculate proportion of significant genes
```{r}
merged_lncrna <- GO_PEVE_En_sig_gene_lnc %>%
  left_join(lnc_bed_filt, by = "gene_name") %>%
  na.omit() %>%
  mutate(prop.sig.genes = Significant/Annotated)

# Write to csv 
write.csv(merged_lncrna, "../output/19-bedtools-closest/Peve_GO_en_sig_gene_lncRNA.csv")
```

Plot 
```{r}
GO_plot_lnc<-ggplot(merged_lncrna, aes(x = Term, y = -log(Fisher), size = prop.sig.genes, fill = -log(Fisher))) +
  #expand_limits(y = 1.5) +
  #ylim(1, 7.25) +
  # Add horizontal lines with a single aesthetic value
  #geom_hline(yintercept = -log10(0.01), linetype = "longdash", colour = "black", linewidth = .6) +
  #geom_hline(yintercept = -log10(0.001), linetype = "solid", colour = "black", linewidth = .6) +
  geom_point(shape = 21) + 
  scale_size(range = c(2, 12)) + 
  scale_fill_continuous(low = "#1AD3D1FF", high = "#4686FBFF") +
  xlab('') + 
  ylab('Enrichment score') +
  #labs(caption = 'Cut-off lines at p=0.01 and p=0.001') +
  theme_bw() +
  facet_grid(vars(ontology), scales = "free", space = "free_y") +
  coord_flip(); GO_plot_lnc
```