--- title: "Closest genomic features to ncRNAs - Peve" author: "Jill Ashey" date: "2024-08-24" output: html_document --- ## Closest genomic features to ncRNAs in Porites evermanni This code will investigate the proximity of ncRNAs to mRNAs in Porites evermanni. ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) # Load packages library(tidyverse) library(topGO) ``` See [bash code](https://github.com/JillAshey/JillAshey_Putnam_Lab_Notebook/blob/master/_posts/2024-08-19-e5-deepdive-ncRNA-closest-features.md) for `bedtools closest` analysis. Before analysis, make gene2go file from the annotation information ```{r} GO_PEVE <- read.delim("../../data/Peve-rna-GO.tsv", sep = "\t", header = T) %>% select(query, GeneOntologyIDs) %>% mutate(gene_name = gsub("Parent=","", query)) %>% select(gene_name, GeneOntologyIDs) write_tsv(GO_PEVE, file = "../../data/Peve_gene2go.tab") ``` ## piRNA Load Peve piRNA closest file into R. This file represents the genomic features that are closest/are overlapping with piRNA clusters. ```{r} pi_bed <- read.delim("../output/19-bedtools-closest/Peve_piRNA_output.bed", header = F) head(pi_bed) ``` Keep specific columns and rename them ```{r} pi_bed_filt <- pi_bed %>% dplyr::select(V1, V2, V3, V4, V6, V7, V8, V12) %>% dplyr::rename("pi_chrom" = "V1", "pi_start" = "V2", "pi_end" = "V3", "gf_chrom" = "V4", "gf_type" = "V6", "gf_start" = "V7", "gf_end" = "V8", "gf_attr" = "V12") ``` Pull out gene name information from the `gf_attr` column and subset by mRNA ```{r} pi_bed_filt <- pi_bed_filt %>% mutate(gene_name = str_extract(gf_attr, "ID=Peve_[0-9]+")) %>% mutate(gene_name = str_replace(gene_name, "ID=", "")) %>% # Remove "ID=" prefix dplyr::filter(gf_type == "mRNA") ``` Calculate the overlap and distance based on the l_start, l_end, gf_start, and gf_end columns. Overlap occurs if the start of one feature is within the bounds of another. If there is no overlaps, the distance can be calculated as the gap between the end of one feature and the start of another. ```{r} pi_bed_filt$overlap <- with(pi_bed_filt, pmin(pi_end, gf_end) - pmax(pi_start, gf_start)) sum(pi_bed_filt$overlap < 0) sum(pi_bed_filt$overlap > 0) ``` Calculate average/percentages of distance and overlap. ```{r} summary_stats <- pi_bed_filt %>% summarize( avg_overlap = mean(overlap[overlap > 0], na.rm = TRUE), avg_distance = mean(overlap[overlap < 0], na.rm = TRUE), percent_overlap = sum(overlap > 0) / n() * 100, percent_distance_lt_1kb = sum(overlap < 0 & overlap > -1000) / n() * 100, percent_distance = sum(overlap < 0) / n() * 100 ) # Display the summary statistics print(summary_stats) ``` Results: avg_overlap avg_distance percent_overlap percent_distance_lt_1kb percent_distance 1416.043 -4545.6 84.66258 4.907975 15.33742 Read in gene2go information ```{r} PEVE_gene2go<-read.delim("../../data/Peve_gene2go.tab", sep="\t") ``` Make list of genes for input to TopGO ```{r} peve_clust_genes <- as.character(unique(pi_bed_filt$gene_name)) peve_all_genes <- as.character(PEVE_gene2go$gene_name) Peve_GeneList <- factor(as.integer(peve_all_genes %in% peve_clust_genes)) names(Peve_GeneList) <- peve_all_genes ``` The following code will perform GO enrichment using the weighted Fisher's exact test to assess whether specific GO terms are overrepresented in the genes closest to the piRNA clusters. Read in gene-to-go-mappings ```{r} PEVE_gene2go_topgo<-readMappings("../../data/Peve_gene2go.tab", IDsep=";", sep="\t") ``` Set function to select genes of interest (ie those that have pvalue < 0.05) ```{r} topDiffGenes <- function(allScore) { return(allScore < 0.05)} ``` ### Biological Processes Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_BP_pi <-new("topGOdata", ontology="BP", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_BP_FE_pi <- runTest(GO_PEVE_BP_pi, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_BP_En_pi <- GenTable(GO_PEVE_BP_pi, Fisher = GO_PEVE_BP_FE_pi, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_BP_En_pi$Fisher<-as.numeric(GO_PEVE_BP_En_pi$Fisher) GO_PEVE_BP_En_sig_pi<-GO_PEVE_BP_En[GO_PEVE_BP_En_pi$Fisher<0.05,] ``` Merge `GO_PEVE_BP_En_sig_pi` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_BP_En_sig_pi$GO.ID <- trimws(GO_PEVE_BP_En_sig_pi$GO.ID) # Join the datasets based on GO term GO_PEVE_BP_En_sig_gene_pi <- PEVE_gene2go %>% left_join(GO_PEVE_BP_En_sig_pi, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_BP_En_sig_gene_pi$ontology <- "Biological Processes" ``` ### Cellular Components Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_CC_pi <-new("topGOdata", ontology="CC", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_CC_FE_pi <- runTest(GO_PEVE_CC_pi, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_CC_En_pi <- GenTable(GO_PEVE_CC_pi, Fisher = GO_PEVE_CC_FE_pi, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_CC_En_pi$Fisher<-as.numeric(GO_PEVE_CC_En_pi$Fisher) GO_PEVE_CC_En_sig_pi<-GO_PEVE_CC_En_pi[GO_PEVE_CC_En_pi$Fisher<0.05,] ``` Merge `GO_PEVE_CC_En_sig_pi` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_CC_En_sig_pi$GO.ID <- trimws(GO_PEVE_CC_En_sig_pi$GO.ID) # Join the datasets based on GO term GO_PEVE_CC_En_sig_gene_pi <- PEVE_gene2go %>% left_join(GO_PEVE_CC_En_sig_pi, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_CC_En_sig_gene_pi$ontology <- "Cellular Components" ``` ### Molecular Functions Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_MF_pi <-new("topGOdata", ontology="MF", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_MF_FE_pi <- runTest(GO_PEVE_MF_pi, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_MF_En_pi <- GenTable(GO_PEVE_MF_pi, Fisher = GO_PEVE_MF_FE_pi, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_MF_En_pi$Fisher<-as.numeric(GO_PEVE_MF_En_pi$Fisher) GO_PEVE_MF_En_sig_pi<-GO_PEVE_MF_En_pi[GO_PEVE_MF_En_pi$Fisher<0.05,] ``` Merge `GO_PEVE_MF_En_sig` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_MF_En_sig_pi$GO.ID <- trimws(GO_PEVE_MF_En_sig_pi$GO.ID) # Join the datasets based on GO term GO_PEVE_MF_En_sig_gene_pi <- PEVE_gene2go %>% left_join(GO_PEVE_MF_En_sig_pi, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_MF_En_sig_gene_pi$ontology <- "Molecular Functions" ``` ### Join ontologies for piRNAs Bind `GO_PEVE_BP_En_sig_gene`, `GO_PEVE_CC_En_sig_gene`, and `GO_PEVE_MF_En_sig_gene` to make a df that has significantly enriched GO terms for all ontologies ```{r} GO_PEVE_En_sig_gene_pi <- rbind(GO_PEVE_BP_En_sig_gene_pi, GO_PEVE_CC_En_sig_gene_pi, GO_PEVE_MF_En_sig_gene_pi) ``` Merge `pi_bed_filt` with `GO_PEVE_En_sig_gene` and calculate proportion of significant genes ```{r} merged_pirna <- GO_PEVE_En_sig_gene_pi %>% left_join(pi_bed_filt, by = "gene_name") %>% na.omit() %>% mutate(prop.sig.genes = Significant/Annotated) # Write to csv write.csv(merged_pirna, "../output/19-bedtools-closest/Peve_GO_en_sig_gene_piRNA.csv") ``` Plot ```{r} GO_plot_pi<-ggplot(merged_pirna, aes(x = Term, y = Fisher, size = prop.sig.genes, fill = Fisher)) + #expand_limits(y = 1.5) + #ylim(1, 7.25) + # Add horizontal lines with a single aesthetic value #geom_hline(yintercept = -log10(0.01), linetype = "longdash", colour = "black", linewidth = .6) + #geom_hline(yintercept = -log10(0.001), linetype = "solid", colour = "black", linewidth = .6) + geom_point(shape = 21) + scale_size(range = c(2, 12)) + scale_fill_continuous(low = "#1AD3D1FF", high = "#4686FBFF") + xlab('') + ylab('Enrichment score') + #labs(caption = 'Cut-off lines at p=0.01 and p=0.001') + theme_bw() + facet_grid(vars(ontology), scales = "free", space = "free_y") + coord_flip(); GO_plot_pi ``` ## miRNA Load Peve miRNA closest file into R. This file represents the genomic features that are closest/are overlapping with piRNA clusters. ```{r} mi_bed <- read.delim("../output/19-bedtools-closest/miRNA_filtered_Peve_output.bed", header = F) head(mi_bed) ``` Keep specific columns and rename them. I'm going to label columns that relate to miRNAs with an "m" and columns that relate to genomic features with a "gf". ```{r} mi_bed_filt <- mi_bed %>% dplyr::select(V1, V2, V3, V4, V5, V7, V9, V10, V11, V12, V13, V14, V16, V18) %>% dplyr::rename("m_chrom" = "V1", "m_source" = "V2", "m_type" = "V3", "m_start" = "V4", "m_end" = "V5", "m_strand" = "V7", "m_attr" = "V9", "gf_chrom" = "V10", "gf_source" = "V11", "gf_type" = "V12", "gf_start" = "V13", "gf_end" = "V14", "gf_strand" = "V16", "gf_attr" = "V18") ``` Pull out gene name information from the `gf_attr` column and subset by mRNA and mature miRNA. ```{r} mi_bed_filt <- mi_bed_filt %>% mutate(gene_name = str_extract(gf_attr, "ID=Peve_[0-9]+")) %>% mutate(gene_name = str_replace(gene_name, "ID=", "")) %>% # Remove "ID=" prefix dplyr::filter(gf_type == "mRNA") %>% dplyr::filter(m_type == "mature_miRNA") ``` Calculate the overlap and distance based on the m_start, m_end, gf_start, and gf_end columns. Overlap occurs if the start of one feature is within the bounds of another. If there is no overlaps, the distance can be calculated as the gap between the end of one feature and the start of another. ```{r} mi_bed_filt$overlap <- with(mi_bed_filt, pmin(m_end, gf_end) - pmax(m_start, gf_start)) sum(mi_bed_filt$overlap < 0) sum(mi_bed_filt$overlap > 0) ``` Calculate average/percentages of distance and overlap. ```{r} # Perform calculations grouped by m_type summary_stats <- mi_bed_filt %>% #group_by(m_type) %>% summarize( avg_overlap = mean(overlap[overlap > 0], na.rm = TRUE), avg_distance = mean(overlap[overlap < 0], na.rm = TRUE), percent_overlap = sum(overlap > 0) / n() * 100, percent_distance_lt_1kb = sum(overlap < 0 & overlap > -1000) / n() * 100, percent_distance = sum(overlap < 0) / n() * 100 ) # Display the summary statistics print(summary_stats) ``` Results: avg_overlap avg_distance percent_overlap percent_distance_lt_1kb percent_distance 20.93939 -2480.154 71.73913 6.521739 28.26087 Read in gene2go information ```{r} PEVE_gene2go<-read.delim("../../data/Peve_gene2go.tab", sep="\t") ``` Make list of genes for input to TopGO ```{r} peve_clust_genes <- as.character(unique(mi_bed_filt$gene_name)) peve_all_genes <- as.character(PEVE_gene2go$gene_name) Peve_GeneList <- factor(as.integer(peve_all_genes %in% peve_clust_genes)) names(Peve_GeneList) <- peve_all_genes ``` The following code will perform GO enrichment using the weighted Fisher's exact test to assess whether specific GO terms are overrepresented in the genes closest to the piRNA clusters. Read in gene-to-go-mappings ```{r} PEVE_gene2go_topgo<-readMappings("../../data/Peve_gene2go.tab", IDsep=";", sep="\t") ``` Set function to select genes of interest (ie those that have pvalue < 0.05) ```{r} topDiffGenes <- function(allScore) { return(allScore < 0.05)} ``` ### Biological Processes Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_BP_mi <-new("topGOdata", ontology="BP", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_BP_FE_mi <- runTest(GO_PEVE_BP_mi, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_BP_En_mi <- GenTable(GO_PEVE_BP_mi, Fisher = GO_PEVE_BP_FE_mi, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_BP_En_mi$Fisher<-as.numeric(GO_PEVE_BP_En_mi$Fisher) GO_PEVE_BP_En_sig_mi<-GO_PEVE_BP_En_mi[GO_PEVE_BP_En_mi$Fisher<0.05,] ``` Merge `GO_PEVE_BP_En_sig_mi` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_BP_En_sig_mi$GO.ID <- trimws(GO_PEVE_BP_En_sig_mi$GO.ID) # Join the datasets based on GO term GO_PEVE_BP_En_sig_gene_mi <- PEVE_gene2go %>% left_join(GO_PEVE_BP_En_sig_mi, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_BP_En_sig_gene_mi$ontology <- "Biological Processes" ``` ### Cellular Components Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_CC_mi <-new("topGOdata", ontology="CC", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_CC_FE_mi <- runTest(GO_PEVE_CC_mi, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_CC_En_mi <- GenTable(GO_PEVE_CC_mi, Fisher = GO_PEVE_CC_FE_mi, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_CC_En_mi$Fisher<-as.numeric(GO_PEVE_CC_En_mi$Fisher) GO_PEVE_CC_En_sig_mi<-GO_PEVE_CC_En_mi[GO_PEVE_CC_En_mi$Fisher<0.05,] ``` Merge `GO_PEVE_CC_En_sig_mi` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_CC_En_sig_mi$GO.ID <- trimws(GO_PEVE_CC_En_sig_mi$GO.ID) # Join the datasets based on GO term GO_PEVE_CC_En_sig_gene_mi <- PEVE_gene2go %>% left_join(GO_PEVE_CC_En_sig_mi, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_CC_En_sig_gene_mi$ontology <- "Cellular Components" ``` ### Molecular Functions Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_MF_mi <-new("topGOdata", ontology="MF", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_MF_FE_mi <- runTest(GO_PEVE_MF_mi, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_MF_En_mi <- GenTable(GO_PEVE_MF_mi, Fisher = GO_PEVE_MF_FE_mi, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_MF_En_mi$Fisher<-as.numeric(GO_PEVE_MF_En_mi$Fisher) GO_PEVE_MF_En_sig_mi<-GO_PEVE_MF_En_mi[GO_PEVE_MF_En_mi$Fisher<0.05,] ``` Merge `GO_PEVE_MF_En_sig_mi` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_MF_En_sig_mi$GO.ID <- trimws(GO_PEVE_MF_En_sig_mi$GO.ID) # Join the datasets based on GO term GO_PEVE_MF_En_sig_gene_mi <- PEVE_gene2go %>% left_join(GO_PEVE_MF_En_sig_mi, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_MF_En_sig_gene_mi$ontology <- "Molecular Functions" ``` ### Join ontologies for piRNAs Bind `GO_PEVE_BP_En_sig_gene_mi`, `GO_PEVE_CC_En_sig_gene_mi`, and `GO_PEVE_MF_En_sig_gene_mi` to make a df that has significantly enriched GO terms for all ontologies ```{r} GO_PEVE_En_sig_gene_mi <- rbind(GO_PEVE_BP_En_sig_gene_mi, GO_PEVE_CC_En_sig_gene_mi, GO_PEVE_MF_En_sig_gene_mi) ``` Merge `mi_bed_filt` with `GO_PEVE_En_sig_gene_mi` and calculate proportion of significant genes ```{r} merged_mirna <- GO_PEVE_En_sig_gene_mi %>% left_join(mi_bed_filt, by = "gene_name") %>% na.omit() %>% mutate(prop.sig.genes = Significant/Annotated) # Write to csv write.csv(merged_mirna, "../output/19-bedtools-closest/Peve_GO_en_sig_gene_miRNA.csv") ``` Plot ```{r} GO_plot_mi<-ggplot(merged_mirna, aes(x = Term, y = -log10(Fisher), size = prop.sig.genes, fill = -log10(Fisher))) + #expand_limits(y = 1.5) + #ylim(1, 7.25) + # Add horizontal lines with a single aesthetic value #geom_hline(yintercept = -log10(0.01), linetype = "longdash", colour = "black", linewidth = .6) + #geom_hline(yintercept = -log10(0.001), linetype = "solid", colour = "black", linewidth = .6) + geom_point(shape = 21) + scale_size(range = c(2, 12)) + scale_fill_continuous(low = "#1AD3D1FF", high = "#4686FBFF") + xlab('') + ylab('Enrichment score') + #labs(caption = 'Cut-off lines at p=0.01 and p=0.001') + theme_bw() + facet_grid(vars(ontology), scales = "free", space = "free_y") + coord_flip(); GO_plot_mi ``` ## lncRNAs Load Peve lncRNA closest file into R. This file represents the genomic features that are closest/are overlapping with lncRNA clusters. ```{r} lnc_bed <- read.delim("../output/19-Peve-bedtools-closest/Peve_lncRNA_output.bed", header = F) head(lnc_bed) ``` Keep specific columns and rename them ```{r} lnc_bed_filt <- lnc_bed %>% dplyr::select(V1, V2, V3, V4, V6, V7, V8, V12) %>% dplyr::rename("l_chrom" = "V1", "l_start" = "V2", "l_end" = "V3", "gf_chrom" = "V4", "gf_type" = "V6", "gf_start" = "V7", "gf_end" = "V8", "gf_attr" = "V12") ``` Pull out gene name information from the `gf_attr` column and subset by mRNA ```{r} lnc_bed_filt <- lnc_bed_filt %>% mutate(gene_name = str_extract(gf_attr, "ID=Peve_[0-9]+")) %>% mutate(gene_name = str_replace(gene_name, "ID=", "")) %>% # Remove "ID=" prefix dplyr::filter(gf_type == "mRNA") ``` Calculate the overlap and distance based on the l_start, l_end, gf_start, and gf_end columns. Overlap occurs if the start of one feature is within the bounds of another. If there is no overlaps, the distance can be calculated as the gap between the end of one feature and the start of another. ```{r} lnc_bed_filt$overlap <- with(lnc_bed_filt, pmin(l_end, gf_end) - pmax(l_start, gf_start)) sum(lnc_bed_filt$overlap < 0) sum(lnc_bed_filt$overlap > 0) ``` Calculate average/percentages of distance and overlap. ```{r} summary_stats <- lnc_bed_filt %>% summarize( avg_overlap = mean(overlap[overlap > 0], na.rm = TRUE), median_overlap = median(overlap[overlap > 0], na.rm = TRUE), avg_distance = mean(overlap[overlap < 0], na.rm = TRUE), median_distance = median(overlap[overlap < 0], na.rm = TRUE), percent_overlap = sum(overlap > 0) / n() * 100, percent_distance_lt_1kb = sum(overlap < 0 & overlap > -1000) / n() * 100, percent_distance = sum(overlap < 0) / n() * 100 ) # Display the summary statistics print(summary_stats) ``` Results: avg_overlap avg_distance percent_overlap percent_distance_lt_1kb percent_distance 3751.919 -6298.915 2.032885 17.18984 97.96712 Read in gene2go information ```{r} PEVE_gene2go<-read.delim("../../data/Peve_gene2go.tab", sep="\t") ``` Make list of genes for input to TopGO ```{r} peve_clust_genes <- as.character(unique(lnc_bed_filt$gene_name)) peve_all_genes <- as.character(PEVE_gene2go$gene_name) Peve_GeneList <- factor(as.integer(peve_all_genes %in% peve_clust_genes)) names(Peve_GeneList) <- peve_all_genes ``` The following code will perform GO enrichment using the weighted Fisher's exact test to assess whether specific GO terms are overrepresented in the genes closest to the piRNA clusters. Read in gene-to-go-mappings ```{r} PEVE_gene2go_topgo<-readMappings("../../data/Peve_gene2go.tab", IDsep=";", sep="\t") ``` Set function to select genes of interest (ie those that have pvalue < 0.05) ```{r} topDiffGenes <- function(allScore) { return(allScore < 0.05)} ``` ### Biological Processes Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_BP_lnc <-new("topGOdata", ontology="BP", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_BP_FE_lnc <- runTest(GO_PEVE_BP_lnc, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_BP_En_lnc <- GenTable(GO_PEVE_BP_lnc, Fisher = GO_PEVE_BP_FE_lnc, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_BP_En_lnc$Fisher<-as.numeric(GO_PEVE_BP_En_lnc$Fisher) GO_PEVE_BP_En_sig_lnc<-GO_PEVE_BP_En_lnc[GO_PEVE_BP_En_lnc$Fisher<0.05,] ``` Merge `GO_PEVE_BP_En_sig_lnc` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_BP_En_sig_lnc$GO.ID <- trimws(GO_PEVE_BP_En_sig_lnc$GO.ID) # Join the datasets based on GO term GO_PEVE_BP_En_sig_gene_lnc <- PEVE_gene2go %>% left_join(GO_PEVE_BP_En_sig_lnc, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_BP_En_sig_gene_lnc$ontology <- "Biological Processes" ``` ### Cellular Components Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_CC_lnc <-new("topGOdata", ontology="CC", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_CC_FE_lnc <- runTest(GO_PEVE_CC_lnc, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_CC_En_lnc <- GenTable(GO_PEVE_CC_lnc, Fisher = GO_PEVE_CC_FE_lnc, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_CC_En_lnc$Fisher<-as.numeric(GO_PEVE_CC_En_lnc$Fisher) GO_PEVE_CC_En_sig_lnc<-GO_PEVE_CC_En_lnc[GO_PEVE_CC_En_lnc$Fisher<0.05,] ``` Merge `GO_PEVE_CC_En_sig_lnc` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_CC_En_sig_lnc$GO.ID <- trimws(GO_PEVE_CC_En_sig_lnc$GO.ID) # Join the datasets based on GO term GO_PEVE_CC_En_sig_gene_lnc <- PEVE_gene2go %>% left_join(GO_PEVE_CC_En_sig_lnc, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_CC_En_sig_gene_lnc$ontology <- "Cellular Components" ``` ### Molecular Functions Create `topGOdata` object, which is required for topGO analysis ```{r} GO_PEVE_MF_lnc <-new("topGOdata", ontology="MF", gene2GO=PEVE_gene2go_topgo, allGenes=Peve_GeneList, annot = annFUN.gene2GO, geneSel=topDiffGenes) ``` Run GO enrichment test ```{r} GO_PEVE_MF_FE_lnc <- runTest(GO_PEVE_MF_lnc, algorithm="weight01", statistic="fisher") ``` Generate results table ```{r} GO_PEVE_MF_En_lnc <- GenTable(GO_PEVE_MF_lnc, Fisher = GO_PEVE_MF_FE_lnc, orderBy = "Fisher", topNodes = 100, numChar = 51) ``` Only taking the top 100 GO terms Filter by significant results ```{r} GO_PEVE_MF_En_lnc$Fisher<-as.numeric(GO_PEVE_MF_En_lnc$Fisher) GO_PEVE_MF_En_sig_lnc<-GO_PEVE_MF_En_lnc[GO_PEVE_MF_En_lnc$Fisher<0.05,] ``` Merge `GO_PEVE_MF_En_sig_lnc` with GO and gene info. ```{r} # Separate GO terms PEVE_gene2go <- PEVE_gene2go %>% separate_rows(GeneOntologyIDs, sep = ";") # Ensure GO terms in both datasets are formatted similarly (trim whitespaces) PEVE_gene2go$GeneOntologyIDs <- trimws(PEVE_gene2go$GeneOntologyIDs) GO_PEVE_MF_En_sig_lnc$GO.ID <- trimws(GO_PEVE_MF_En_sig_lnc$GO.ID) # Join the datasets based on GO term GO_PEVE_MF_En_sig_gene_lnc <- PEVE_gene2go %>% left_join(GO_PEVE_MF_En_sig_lnc, by = c("GeneOntologyIDs" = "GO.ID")) %>% na.omit() # Add ontology column GO_PEVE_MF_En_sig_gene_lnc$ontology <- "Molecular Functions" ``` ### Join ontologies for lncRNAs Bind `GO_PEVE_BP_En_sig_gene_lnc`, `GO_PEVE_CC_En_sig_gene_lnc`, and `GO_PEVE_MF_En_sig_gene_lnc` to make a df that has significantly enriched GO terms for all ontologies ```{r} GO_PEVE_En_sig_gene_lnc <- rbind(GO_PEVE_BP_En_sig_gene_lnc, GO_PEVE_CC_En_sig_gene_lnc, GO_PEVE_MF_En_sig_gene_lnc) ``` Merge `lnc_bed_filt` with `GO_APUL_En_sig_gene_lnc` and calculate proportion of significant genes ```{r} merged_lncrna <- GO_PEVE_En_sig_gene_lnc %>% left_join(lnc_bed_filt, by = "gene_name") %>% na.omit() %>% mutate(prop.sig.genes = Significant/Annotated) # Write to csv write.csv(merged_lncrna, "../output/19-bedtools-closest/Peve_GO_en_sig_gene_lncRNA.csv") ``` Plot ```{r} GO_plot_lnc<-ggplot(merged_lncrna, aes(x = Term, y = -log(Fisher), size = prop.sig.genes, fill = -log(Fisher))) + #expand_limits(y = 1.5) + #ylim(1, 7.25) + # Add horizontal lines with a single aesthetic value #geom_hline(yintercept = -log10(0.01), linetype = "longdash", colour = "black", linewidth = .6) + #geom_hline(yintercept = -log10(0.001), linetype = "solid", colour = "black", linewidth = .6) + geom_point(shape = 21) + scale_size(range = c(2, 12)) + scale_fill_continuous(low = "#1AD3D1FF", high = "#4686FBFF") + xlab('') + ylab('Enrichment score') + #labs(caption = 'Cut-off lines at p=0.01 and p=0.001') + theme_bw() + facet_grid(vars(ontology), scales = "free", space = "free_y") + coord_flip(); GO_plot_lnc ```