---
title: "Functional Enrichment"
author: "Yaamini Venkataraman"
output: html_document
---

In this script, I'll create summary tables for genomic location information, run statistcal tests, and visualize the distribution of DML in the genome.

# Prepare R Markdown file

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = "/Users/yaaminivenkataraman/Documents/project-oyster-oa/analyses/Haws_09-functional-enrichment/") #Set root directory
```

# Install packages

```{r}
require(BiocManager)
BiocManager::install("topGO")
BiocManager::install("simplifyEnrichment") #Needs newest R Version (4.1.0)
```

```{r}
require(BiocManager)
require(topGO)
require(simplifyEnrichment)
```

```{r}
require(tidyverse)
```

# Obtain session information

```{r}
sessionInfo()
```

# Describe GO terms associated with DML

Before launching into enrichment, I want to create lists of GOterms associated with each DML list for descriptive purposes.

## ploidy-DML

### Remove overlaps with C/T SNPs

```{r}
ploidyDMLGOterms <- read.table("../Haws_08-GOterm-annotation/DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot", header = FALSE, sep = "\t", col.names = c("transcript", "geneID", "chr", "start", "end", "GOID", "GOterm", "GOSlim", "GOcat")) #Import list of GO term annotations for ploidy DML
ploidyDMLGOterms$chrStartEnd <- paste(ploidyDMLGOterms$chr, "-", ploidyDMLGOterms$start, "-", ploidyDMLGOterms$end, sep = "") #Create a column with chr, start, and end information separated by "-"
head(ploidyDMLGOterms)
```

```{r}
uniqueSNPPloidy <- read.table("../Haws_07-DML-characterization/DML-ploidy-DSS-unique-CT-SNPs.bed", header = FALSE, sep = "\t", col.names = c("chr", "start", "end")) #Import overlaps between unique C/T SNPs and all DML produced with bedtools
uniqueSNPPloidy$chrStartEnd <- paste(uniqueSNPPloidy$chr, "-", uniqueSNPPloidy$start, "-", uniqueSNPPloidy$end, sep = "") #Create a column with chr, start, and end information separated by "-"
head(uniqueSNPPloidy) #Confirm import
```

```{r}
ploidyDMLGOterms <- left_join(x = ploidyDMLGOterms, y = uniqueSNPPloidy, by = "chrStartEnd") #Add all columns from uniqueSNPPloidy to ploidyDMLGOterms
ploidyDMLGOtermsFiltered <- filter(ploidyDMLGOterms, is.na(chr.y) == "TRUE")  %>%
  select(!c(chr.y, start.y, end.y)) #Keep all rows where chr.y has an "NA", since this indicates there was no matching entry in the DML-SNP overlap dataframe. Then drop chr.y, start.y, and end.y
length(ploidyDMLGOtermsFiltered[,1]) #7246 rows without SNP overlaps
colnames(ploidyDMLGOtermsFiltered) <- c("transcript", "geneID", "chr", "start", "end", "GOID", "GOterm", "GOSlim", "GOcat", "chrStartEnd") #Rename columns for consistency
head(ploidyDMLGOtermsFiltered) #Confirm formatting
```

```{r}
length(unique(ploidyDMLGOtermsFiltered$geneID)) #84 annotated genes
length(unique(ploidyDMLGOtermsFiltered$chrStartEnd)) #103 DML in annotated genes
```

### Summarize GOterms

```{r}
ploidyDMLGOtermsFilteredCounts <- as.data.frame(table(ploidyDMLGOtermsFiltered$GOID)) #Create table with GOID and counts
colnames(ploidyDMLGOtermsFilteredCounts) <- c("GOID", "Freq") #Add column names
range(ploidyDMLGOtermsFilteredCounts$Freq) #Get range of GOID frequency
nrow(ploidyDMLGOtermsFilteredCounts) #Number of unique GOIDs associated with ploidy-DML
```

```{r}
ploidyDMLGOtermsBP <- ploidyDMLGOtermsFiltered %>% filter(., ploidyDMLGOtermsFiltered$GOcat == "P") #Filter biological process terms
ploidyDMLGOtermCountsBP <- as.data.frame(table(ploidyDMLGOtermsBP$GOID)) #Count GOIDs
colnames(ploidyDMLGOtermCountsBP) <- c("GOID", "Freq") #Add column names
ploidyDMLGOtermCountsBP <- ploidyDMLGOtermCountsBP %>% filter(ploidyDMLGOtermCountsBP$Freq != 0)
range(ploidyDMLGOtermCountsBP$Freq) #Get range of GOID frequency
nrow(ploidyDMLGOtermCountsBP) #Number of unique GOIDs associated with ploidy-DML
```

```{r}
write.csv(ploidyDMLGOtermCountsBP, "ploidy-BP-GOterms.csv", quote = FALSE, row.names = FALSE)
```

## pH-DML

### Remove overlaps with C/T SNPs

```{r}
pHDMLGOterms <- read.table("../Haws_08-GOterm-annotation/DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot", header = FALSE, sep = "\t", col.names = c("transcript", "geneID", "chr", "start", "end", "GOID", "GOterm", "GOSlim", "GOcat")) #Import list of GO term annotations for pH DML
pHDMLGOterms$chrStartEnd <- paste(pHDMLGOterms$chr, "-", pHDMLGOterms$start, "-", pHDMLGOterms$end, sep = "") #Create a column with chr, start, and end information separated by "-"
head(pHDMLGOterms)
```

```{r}
uniqueSNPpH <- read.table("../Haws_07-DML-characterization/DML-pH-DSS-unique-CT-SNPs.bed", header = FALSE, sep = "\t", col.names = c("chr", "start", "end")) #Import overlaps between unique C/T SNPs and all DML produced with bedtools
uniqueSNPpH$chrStartEnd <- paste(uniqueSNPpH$chr, "-", uniqueSNPpH$start, "-", uniqueSNPpH$end, sep = "") #Create a column with chr, start, and end information separated by "-"
head(uniqueSNPpH) #Confirm import
```

```{r}
pHDMLGOterms <- left_join(x = pHDMLGOterms, y = uniqueSNPpH, by = "chrStartEnd") #Add all columns from uniqueSNPpH to pHDMLGOterms
pHDMLGOtermsFiltered <- filter(pHDMLGOterms, is.na(chr.y) == "TRUE")  %>%
  select(!c(chr.y, start.y, end.y)) #Keep all rows where chr.y has an "NA", since this indicates there was no matching entry in the DML-SNP overlap dataframe. Then drop chr.y, start.y, and end.y
length(pHDMLGOtermsFiltered[,1]) #4426 rows without SNP overlaps
colnames(pHDMLGOtermsFiltered) <- c("transcript", "geneID", "chr", "start", "end", "GOID", "GOterm", "GOSlim", "GOcat", "chrStartEnd") #Rename columns for consistency
head(pHDMLGOtermsFiltered) #Confirm formatting
```

```{r}
length(unique(pHDMLGOtermsFiltered$geneID)) #69 annotated genes
length(unique(pHDMLGOtermsFiltered$chrStartEnd)) #87 DML in annotated genes
```

### Summarize GOterms

```{r}
pHDMLGOtermsFilteredCounts <- as.data.frame(table(pHDMLGOtermsFiltered$GOID)) #Create table with GOID and counts
colnames(pHDMLGOtermsFilteredCounts) <- c("GOID", "Freq") #Add column names
range(pHDMLGOtermsFilteredCounts$Freq) #Get range of GOID frequency
nrow(pHDMLGOtermsFilteredCounts) #Number of unique GOIDs associated with pH-DML
```

```{r}
pHDMLGOtermsBP <- pHDMLGOtermsFiltered %>% filter(., pHDMLGOtermsFiltered$GOcat == "P") #Filter biological process terms
pHDMLGOtermCountsBP <- as.data.frame(table(pHDMLGOtermsBP$GOID)) #Count GOIDs
colnames(pHDMLGOtermCountsBP) <- c("GOID", "Freq") #Add column names
pHDMLGOtermCountsBP <- pHDMLGOtermCountsBP %>% filter(pHDMLGOtermCountsBP$Freq != 0)
range(pHDMLGOtermCountsBP$Freq) #Get range of GOID frequency
nrow(pHDMLGOtermCountsBP) #Number of unique GOIDs associated with pH-DML
```

```{r}
write.csv(pHDMLGOtermCountsBP, "pH-BP-GOterms.csv", quote = FALSE, row.names = FALSE)
```

```{r}
pHDMLGOtermsMF <- pHDMLGOtermsFiltered %>% filter(., pHDMLGOtermsFiltered$GOcat == "F") #Filter biological process terms
pHDMLGOtermCountsMF <- as.data.frame(table(pHDMLGOtermsMF$GOID)) #Count GOIDs
colnames(pHDMLGOtermCountsMF) <- c("GOID", "Freq") #Add column names
pHDMLGOtermCountsMF <- pHDMLGOtermCountsMF %>% filter(pHDMLGOtermCountsMF$Freq != 0)
range(pHDMLGOtermCountsMF$Freq) #Get range of GOID frequency
nrow(pHDMLGOtermCountsMF) #Number of unique GOIDs associated with pH-DML
```

```{r}
write.csv(pHDMLGOtermCountsMF, "pH-MF-GOterms.csv", quote = FALSE, row.names = FALSE)
```

## ploidypH-DML

### Remove overlaps with C/T SNPs

```{r}
ploidypHDMLGOterms <- read.table("../Haws_08-GOterm-annotation/DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot", header = FALSE, sep = "\t", col.names = c("transcript", "geneID", "chr", "start", "end", "GOID", "GOterm", "GOSlim", "GOcat")) #Import list of GO term annotations for ploidypH DML
ploidypHDMLGOterms$chrStartEnd <- paste(ploidypHDMLGOterms$chr, "-", ploidypHDMLGOterms$start, "-", ploidypHDMLGOterms$end, sep = "") #Create a column with chr, start, and end information separated by "-"
head(ploidypHDMLGOterms)
```

```{r}
uniqueSNPploidypH <- read.table("../Haws_07-DML-characterization/DML-ploidypH-DSS-unique-CT-SNPs.bed", header = FALSE, sep = "\t", col.names = c("chr", "start", "end")) #Import overlaps between unique C/T SNPs and all DML produced with bedtools
uniqueSNPploidypH$chrStartEnd <- paste(uniqueSNPploidypH$chr, "-", uniqueSNPploidypH$start, "-", uniqueSNPploidypH$end, sep = "") #Create a column with chr, start, and end information separated by "-"
head(uniqueSNPploidypH) #Confirm import
```

```{r}
ploidypHDMLGOterms <- left_join(x = ploidypHDMLGOterms, y = uniqueSNPploidypH, by = "chrStartEnd") #Add all columns from uniqueSNPploidypH to ploidypHDMLGOterms
ploidypHDMLGOtermsFiltered <- filter(ploidypHDMLGOterms, is.na(chr.y) == "TRUE")  %>%
  select(!c(chr.y, start.y, end.y)) #Keep all rows where chr.y has an "NA", since this indicates there was no matching entry in the DML-SNP overlap dataframe. Then drop chr.y, start.y, and end.y
length(ploidypHDMLGOtermsFiltered[,1]) #1126 rows without SNP overlaps
colnames(ploidypHDMLGOtermsFiltered) <- c("transcript", "geneID", "chr", "start", "end", "GOID", "GOterm", "GOSlim", "GOcat", "chrStartEnd") #Rename columns for consistency
head(ploidypHDMLGOtermsFiltered) #Confirm formatting
```

```{r}
length(unique(ploidypHDMLGOtermsFiltered$geneID)) #19 annotated genes
length(unique(ploidypHDMLGOtermsFiltered$chrStartEnd)) #17 DML in annotated genes
```

### Summarize GOterms

```{r}
ploidypHDMLGOtermsFilteredCounts <- as.data.frame(table(ploidypHDMLGOtermsFiltered$GOID)) #Create table with GOID and counts
colnames(ploidypHDMLGOtermsFilteredCounts) <- c("GOID", "Freq") #Add column names
range(ploidypHDMLGOtermsFilteredCounts$Freq) #Get range of GOID frequency
nrow(ploidypHDMLGOtermsFilteredCounts) #Number of unique GOIDs associated with ploidypH-DML
```

```{r}
ploidypHDMLGOtermsBP <- ploidypHDMLGOtermsFiltered %>% filter(., ploidypHDMLGOtermsFiltered$GOcat == "P") #Filter biological process terms
ploidypHDMLGOtermCountsBP <- as.data.frame(table(ploidypHDMLGOtermsBP$GOID)) #Count GOIDs
colnames(ploidypHDMLGOtermCountsBP) <- c("GOID", "Freq") #Add column names
ploidypHDMLGOtermCountsBP <- ploidypHDMLGOtermCountsBP %>% filter(ploidypHDMLGOtermCountsBP$Freq != 0)
range(ploidypHDMLGOtermCountsBP$Freq) #Get range of GOID frequency
nrow(ploidypHDMLGOtermCountsBP) #Number of unique GOIDs associated with ploidypH-DML
```

```{r}
write.csv(ploidypHDMLGOtermCountsBP, "ploidypH-BP-GOterms.csv", quote = FALSE, row.names = FALSE)
```

# Gene enrichment with `topGO`

Following protocol from [Chandra Rajan et al. 2021](https://onlinelibrary.wiley.com/doi/full/10.1111/gcb.15675#support-information-section). Script found [here](https://onlinelibrary-wiley-com.offcampus.lib.washington.edu/action/downloadSupplement?doi=10.1111%2Fgcb.15675&file=gcb15675-sup-0002-FileS2.txt).

## Load data for gene enrichment

```{r}
geneID2GO <- readMappings(file = "../Haws_08-GOterm-annotation/geneid2go-union1x.tab") #Loading the GO annotations and GeneIDs. Each line has one transcript ID and all associated GOterms
str(head(geneID2GO)) #Confirm file structure
```

```{r}
geneNames <- names(geneID2GO) #Extract names to use as gene universe
head(geneNames)
```

## ploidy-DML

I'm using the list of genes with DML as my list of interest. 

```{r}
ploidyGenes <- ploidyDMLGOtermsFiltered$transcript #Extract transcript ID
ploidyGeneList <- factor(as.integer(geneNames %in% ploidyGenes))
names(ploidyGeneList) <- ploidyGenes
str(ploidyGeneList)
```

### Biological processes

```{r}
ploidyGOdataBP <- new("topGOdata", ontology = "BP", allGenes = ploidyGeneList,
                      annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
ploidyGOdataBP #Get summary of object
```

```{r}
test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
resultFisher.ploidyBP <- getSigGroups(ploidyGOdataBP, test.stat)
resultFisher.ploidyBP
```

```{r}
pvalFis.ploidyBP <- score(resultFisher.ploidyBP) #Extract p-values
head(pvalFis.ploidyBP)
hist(pvalFis.ploidyBP, 50, xlab = "p-values") #Plot histogram of p-values
```

```{r}
allRes.ploidyBP <- GenTable(ploidyGOdataBP, classic = resultFisher.ploidyBP, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.ploidyBP)) #Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes)
head(allRes.ploidyBP)
```

```{r}
write.csv(allRes.ploidyBP, "ploidy-BP-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

#### Match enriched GOterms with general annotation information

```{r}
sigRes.ploidyBP <- allRes.ploidyBP[1:3,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value
colnames(sigRes.ploidyBP) <- c("GOID", "p.value") #Change column names
head(sigRes.ploidyBP)
```

```{r}
sigRes.ploidyBPAnnot <- merge(sigRes.ploidyBP, ploidyDMLGOtermsBP, by = "GOID") #Additional annotations
sigRes.ploidyBPAnnot <- unique(sigRes.ploidyBPAnnot[,-11]) #Drop GOcat column and retain only unique lines 
length(unique(sigRes.ploidyBPAnnot$geneID)) #1 unique genes
length(unique(sigRes.ploidyBPAnnot$transcript)) #3 unique transcripts
head(sigRes.ploidyBPAnnot) #Confirm formatting
```

```{r}
write.csv(sigRes.ploidyBPAnnot, "ploidy-BP-EnrichedGO-DML-withTranscript.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

```{r}
sigRes.ploidyBPAnnotnoT <- unique(sigRes.ploidyBPAnnot[,-3]) #Drop transcript column and keep only unique rows
head(sigRes.ploidyBPAnnotnoT) #Confirm formatting
```

```{r}
write.csv(sigRes.ploidyBPAnnotnoT, "ploidy-BP-EnrichedGO-DML.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

### Cellular components

```{r}
ploidyGOdataCC <- new("topGOdata", ontology = "CC", allGenes = ploidyGeneList,
                      annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
ploidyGOdataCC #Get summary of object
```

```{r}
resultFisher.ploidyCC <- getSigGroups(ploidyGOdataCC, test.stat)
resultFisher.ploidyCC
```

### Molecular function

```{r}
ploidyGOdataMF <- new("topGOdata", ontology = "MF", allGenes = ploidyGeneList,
                      annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
ploidyGOdataMF #Get summary of object
```

```{r}
resultFisher.ploidyMF <- getSigGroups(ploidyGOdataMF, test.stat) #Extract significant GOterms using Fisher exact test
resultFisher.ploidyMF #Get results summary
```

```{r}
write.csv(allRes.ploidyMF, "ploidy-MF-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

## pH-DML

```{r}
pHGenes <- pHDMLGOtermsFiltered$transcript #Extract transcript ID
pHGeneList <- factor(as.integer(geneNames %in% pHGenes))
names(pHGeneList) <- pHGenes
str(pHGeneList)
```

### Biological processes

```{r}
pHGOdataBP <- new("topGOdata", ontology = "BP", allGenes = pHGeneList,
                  annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
pHGOdataBP #Get summary of object
```

```{r}
test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
resultFisher.pHBP <- getSigGroups(pHGOdataBP, test.stat)
resultFisher.pHBP
```

```{r}
pvalFis.pHBP <- score(resultFisher.pHBP) #Extract p-values
head(pvalFis.pHBP)
hist(pvalFis.pHBP, 50, xlab = "p-values") #Plot histogram of p-values
```

```{r}
allRes.pHBP <- GenTable(pHGOdataBP, classic = resultFisher.pHBP, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.pHBP)) #Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes)
head(allRes.pHBP)
```

```{r}
write.csv(allRes.pHBP, "pH-BP-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

#### Match enriched GOterms with general annotation information

```{r}
sigRes.pHBP <- allRes.pHBP[1:10,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value
colnames(sigRes.pHBP) <- c("GOID", "p.value") #Change column names
head(sigRes.pHBP)
```

```{r}
sigRes.pHBPAnnot <- merge(sigRes.pHBP, pHDMLGOtermsBP, by = "GOID") #Additional annotations
sigRes.pHBPAnnot <- unique(sigRes.pHBPAnnot[,-11]) #Drop GOcat column and retain only unique lines 
length(unique(sigRes.pHBPAnnot$geneID)) #8 unique genes
length(unique(sigRes.pHBPAnnot$transcript)) #29 unique transcripts
head(sigRes.pHBPAnnot) #Confirm formatting
```

```{r}
write.csv(sigRes.pHBPAnnot, "pH-BP-EnrichedGO-DML-withTranscript.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

```{r}
sigRes.pHBPAnnotnoT <- unique(sigRes.pHBPAnnot[,-3]) #Drop transcript column and keep only unique rows
head(sigRes.pHBPAnnotnoT) #Confirm formatting
```

```{r}
write.csv(sigRes.pHBPAnnotnoT, "pH-BP-EnrichedGO-DML.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

### Cellular components

```{r}
pHGOdataCC <- new("topGOdata", ontology = "CC", allGenes = pHGeneList,
                  annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
pHGOdataCC #Get summary of object
```

```{r}
resultFisher.pHCC <- getSigGroups(pHGOdataCC, test.stat)
resultFisher.pHCC
```

### Molecular function

```{r}
pHGOdataMF <- new("topGOdata", ontology = "MF", allGenes = pHGeneList,
                  annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
pHGOdataMF #Get summary of object
```

```{r}
resultFisher.pHMF <- getSigGroups(pHGOdataMF, test.stat) #Extract significant GOterms using Fisher exact test
resultFisher.pHMF #Get results summary
```

```{r}
pvalFis.pHMF <- score(resultFisher.pHMF) #Extract p-values
head(pvalFis.pHMF)
hist(pvalFis.pHMF, 50, xlab = "p-values") #Plot histogram of p-values
```

```{r}
allRes.pHMF <- GenTable(pHGOdataMF, classic = resultFisher.pHMF, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.pHMF)) #Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes)
head(allRes.pHMF)
```

```{r}
write.csv(allRes.pHMF, "pH-MF-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

#### Match enriched GOterms with general annotation information

```{r}
sigRes.pHMF <- allRes.pHMF[1:8,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value
colnames(sigRes.pHMF) <- c("GOID", "p.value") #Change column names
head(sigRes.pHMF)
```

```{r}
sigRes.pHMFAnnot <- merge(sigRes.pHMF, pHDMLGOtermsMF, by = "GOID") #Additional annotations
sigRes.pHMFAnnot <- unique(sigRes.pHMFAnnot[,-11]) #Drop GOcat column and retain only unique lines 
length(unique(sigRes.pHMFAnnot$geneID)) #13 unique genes
length(unique(sigRes.pHMFAnnot$transcript)) #59 unique transcripts
head(sigRes.pHMFAnnot) #Confirm formatting
```

```{r}
write.csv(sigRes.pHMFAnnot, "pH-MF-EnrichedGO-DML-withTranscript.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

```{r}
sigRes.pHMFAnnotnoT <- unique(sigRes.pHMFAnnot[,-3]) #Drop transcript column and keep only unique rows
head(sigRes.pHMFAnnotnoT) #Confirm formatting
```

```{r}
write.csv(sigRes.pHMFAnnotnoT, "pH-MF-EnrichedGO-DML.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

## ploidypH-DML

```{r}
ploidypHGenes <- ploidypHDMLGOtermsFiltered$transcript #Extract transcript ID
ploidypHGeneList <- factor(as.integer(geneNames %in% ploidypHGenes))
names(ploidypHGeneList) <- ploidypHGenes
str(ploidypHGeneList)
```

### Biological processes

```{r}
ploidypHGOdataBP <- new("topGOdata", ontology = "BP", allGenes = ploidypHGeneList,
                        annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
ploidypHGOdataBP #Get summary of object
```

```{r}
test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
resultFisher.ploidypHBP <- getSigGroups(ploidypHGOdataBP, test.stat)
resultFisher.ploidypHBP
```

### Cellular components

```{r}
ploidypHGOdataCC <- new("topGOdata", ontology = "CC", allGenes = ploidypHGeneList,
                      annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
ploidypHGOdataCC #Get summary of object
```

```{r}
resultFisher.ploidypHCC <- getSigGroups(ploidypHGOdataCC, test.stat)
resultFisher.ploidypHCC
```

### Molecular function

```{r}
ploidypHGOdataMF <- new("topGOdata", ontology = "MF", allGenes = ploidypHGeneList,
                      annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
ploidypHGOdataMF #Get summary of object
```

```{r}
resultFisher.ploidypHMF <- getSigGroups(ploidypHGOdataMF, test.stat) #Extract significant GOterms using Fisher exact test
resultFisher.ploidypHMF #Get results summary
```

```{r}
save.image("../../project-oyster-oa.RData") #Save R Data
```

# Figures

I will use `simplifyEnrichment` to create heatmaps that cluster GO terms by semantic similarity.

```{bash}
mkdir figures
```

## ploidy-DML

### Biological processes

```{r}
matPloidyBP <- GO_similarity(go_id = sigRes.ploidyBP$GOID, ont = "BP") #Calculate the semantic similarity matrix using the Rel method (default)
```

```{r}
#pdf("figures/simplifyEnrichment-ploidy-BP.pdf", width = 11, height = 8.5) #Save figure

simplifyGO(matPloidyBP,
           column_title = "", col = rev(plotColors), fontsize_range = c(10,40),
           word_cloud_grob_param = list(col = "black", max_width = 25)) #Plot GOterms based on semantic similarity. Do not include a column title. Set colors to be plot colors, and set fontsize to range from 10 to 40. Pass arguments to word_cloud_grob_param to dictate the colors of the words and maximum width

#dev.off()
```

```{r}
dfPloidyBP <- simplifyGO(matPloidyBP, plot = FALSE) #Cluster GOterms by semantic similarity
head(dfPloidyBP) #Dataframe with GOIDs, GOterms, and cluster
sort(table(dfPloidyBP$cluster)) #Obtain size of clusters
```

## pH-DML

### Biological processes

```{r}
matpHBP <- GO_similarity(go_id = sigRes.pHBP$GOID, ont = "BP") #Calculate the semantic similarity matrix using the Rel method (default)
```

```{r}
#pdf("figures/simplifyEnrichment-pH-BP.pdf", width = 11, height = 8.5) #Save figure

simplifyGO(matpHBP,
           column_title = "", col = rev(plotColors), fontsize_range = c(10,40),
           word_cloud_grob_param = list(col = "black", max_width = 25)) #Plot GOterms based on semantic similarity. Do not include a column title. Set colors to be plot colors, and set fontsize to range from 10 to 40. Pass arguments to word_cloud_grob_param to dictate the colors of the words and maximum width

#dev.off()
```

```{r}
dfpHBP <- simplifyGO(matpHBP, plot = FALSE) #Cluster GOterms by semantic similarity
head(dfpHBP) #Dataframe with GOIDs, GOterms, and cluster
sort(table(dfpHBP$cluster)) #Obtain size of clusters
```

### Molecular function

```{r}
matpHMF <- GO_similarity(go_id = sigRes.pHMF$GOID, ont = "MF") #Calculate the semantic similarity matrix using the Rel method (default)
```

```{r}
#pdf("figures/simplifyEnrichment-pH-MF.pdf", width = 11, height = 8.5) #Save figure

simplifyGO(matpHMF,
           column_title = "", col = rev(plotColors), fontsize_range = c(10,40),
           word_cloud_grob_param = list(col = "black", max_width = 25)) #Plot GOterms based on semantic similarity. Do not include a column title. Set colors to be plot colors, and set fontsize to range from 10 to 40. Pass arguments to word_cloud_grob_param to dictate the colors of the words and maximum width

#dev.off()
```

```{r}
dfpHMF <- simplifyGO(matpHMF, plot = FALSE) #Cluster GOterms by semantic similarity
head(dfpHMF) #Dataframe with GOIDs, GOterms, and cluster
sort(table(dfpHMF$cluster)) #Obtain size of clusters
```

```{r}
save.image("../../project-oyster-oa.RData") #Save R Data
```