---
title: "Genomic Location of DML"
author: "Yaamini Venkataraman"
output: github_document
---

In this script, I'll create summary tables for genomic location information, run statistcal tests, and visualize the distribution of DML in the genome. I'll also perform an enrichment test for DML in gene bodies.

# Prepare R Markdown file

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = normalizePath( "../output/DML-characterization/")) #Set root directory
```

```{r}
getwd()
```

# Install packages

```{r}
# if ("broom" %in% rownames(installed.packages()) == 'FALSE') install.packages('broom')
# if ("RColorBrewer" %in% rownames(installed.packages()) == 'FALSE') install.packages('RColorBrewer')
# if ("tidyverse" %in% rownames(installed.packages()) == 'FALSE') install.packages('tidyverse')
```

```{r}
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("topGO")
```

```{r}
require(broom)
require(RColorBrewer)
require(tidyverse)
require(topGO)
```

# Obtain session information

```{r}
sessionInfo()
```

# Establish color scheme

```{r}
plotColors <- RColorBrewer::brewer.pal(8, "Greys")
```

# Genomic location of DML

I will look at genomic locations of each sex-specific DML list, then create a plot for visualization.

## Female samples

### Import file counts

```{r}
cpgFeatureOverlapsFem <- read.table("female_dml-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("femaleDML", "filename")) #Import line counts
cpgFeatureOverlapsFem <- cpgFeatureOverlapsFem[-c(2),] #Drop extra lines
cpgFeatureOverlapsFem <- cpgFeatureOverlapsFem[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic
head(cpgFeatureOverlapsFem, n = 9) #Confirm import
```

```{r}
femCpGFeatureOverlaps <- read.csv("../methylation-landscape/fem-CpG-feature-overlaps.csv", row.names = 1) #Import female methylation landscape
head(femCpGFeatureOverlaps) #Confirm formatting
```

```{r}
cpgFeatureOverlapsFem$Meth <- femCpGFeatureOverlaps$allCpGs #Add 10x CpG information
rownames(cpgFeatureOverlapsFem) <- row.names(femCpGFeatureOverlaps) #Add rowname information
cpgFeatureOverlapsFem <- cpgFeatureOverlapsFem[,-2] #Drop filename column
head(cpgFeatureOverlapsFem)
```

### Contingency test

```{r}
cpgLocationStatTestFem <- data.frame(t(cpgFeatureOverlapsFem)) #Transpose for statistical testing
head(cpgLocationStatTestFem) #Confirm formatting
```

```{r}
CpGLocationMethDMLFem <- data.frame() #Create empty dataframe to store chi-squared results
for(i in 1:ncol(cpgLocationStatTestFem)) { #For each genome feature
  MethFeature <- cpgLocationStatTestFem[2,i] #Variable for # genome feature overlaps for 10x CpGs
  FemFeature <- cpgLocationStatTestFem[1,i] #Variable for # genome feature overlaps for female DML
  MethNotFeature <- sum(cpgLocationStatTestFem[2,-i]) #Variable for # other CpG types for 10x CpGs
  FemNotFeature <- sum(cpgLocationStatTestFem[1,-i]) #Variable for # other CpG types for female DML
  ct <- matrix(c(MethFeature, FemFeature, MethNotFeature, FemNotFeature), ncol = 2) #Create contingency table
  colnames(ct) <- c(as.character(colnames(cpgLocationStatTestFem[i])), paste0("Not", colnames(cpgLocationStatTestFem[i]))) #Assign column names: type, not type
  rownames(ct) <- c(as.character(row.names(cpgLocationStatTestFem)[c(1,2)])) #Assign row names: 10x CpGs, female DML
  print(ct) #Confirm table is correct
  ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output
  ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestFem)[i]) #Add CpG type to results
  CpGLocationMethDMLFem <- rbind(CpGLocationMethDMLFem, ctResults) #Add test statistics to master table
}
```

```{r}
head(CpGLocationMethDMLFem)
```

```{r}
CpGLocationMethDMLFem$p.adj <- p.adjust(CpGLocationMethDMLFem$p.value, method = "fdr") #Correct p-value using FDR
range(CpGLocationMethDMLFem$p.adj) #Look at range of p-values
head(CpGLocationMethDMLFem) #Confirm changes
```

```{r}
write.csv(CpGLocationMethDMLFem, "CpG-location-statResults-Fem.txt", quote = FALSE, row.names = TRUE) #Save statistical output
```

## Male samples

### Import file counts

```{r}
cpgFeatureOverlapsMale <- read.table("male_dml-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("maleDML", "filename")) #Import line counts
cpgFeatureOverlapsMale <- cpgFeatureOverlapsMale[-c(2),] #Drop extra lines
cpgFeatureOverlapsMale <- cpgFeatureOverlapsMale[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic
head(cpgFeatureOverlapsMale, n = 9) #Confirm import
```

```{r}
maleCpGFeatureOverlaps <- read.csv("../methylation-landscape/male-CpG-feature-overlaps.csv", row.names = 1) #Import female methylation landscape
head(maleCpGFeatureOverlaps) #Confirm formatting
```

```{r}
cpgFeatureOverlapsMale$Meth <- maleCpGFeatureOverlaps$allCpGs #Add 10x CpG information
rownames(cpgFeatureOverlapsMale) <- row.names(maleCpGFeatureOverlaps) #Add rowname information
cpgFeatureOverlapsMale <- cpgFeatureOverlapsMale[,-2] #Drop filename column
head(cpgFeatureOverlapsMale)
```

### Contingency test

```{r}
cpgLocationStatTestMale <- data.frame(t(cpgFeatureOverlapsMale)) #Transpose for statistical testing
head(cpgLocationStatTestMale) #Confirm formatting
```

```{r}
CpGLocationMethDMLMale <- data.frame() #Create empty dataframe to store chi-squared results
for(i in 1:ncol(cpgLocationStatTestMale)) { #For each genome feature
  MethFeature <- cpgLocationStatTestMale[2,i] #Variable for # genome feature overlaps for 10x CpGs
  MaleFeature <- cpgLocationStatTestMale[1,i] #Variable for # genome feature overlaps for male DML
  MethNotFeature <- sum(cpgLocationStatTestMale[2,-i]) #Variable for # other CpG types for 10x CpGs
  MaleNotFeature <- sum(cpgLocationStatTestMale[1,-i]) #Variable for # other CpG types for male DML
  ct <- matrix(c(MethFeature, MaleFeature, MethNotFeature, MaleNotFeature), ncol = 2) #Create contingency table
  colnames(ct) <- c(as.character(colnames(cpgLocationStatTestMale[i])), paste0("Not", colnames(cpgLocationStatTestMale[i]))) #Assign column names: type, not type
  rownames(ct) <- c(as.character(row.names(cpgLocationStatTestMale)[c(1,2)])) #Assign row names: 10x CpGs, male DML
  print(ct) #Confirm table is correct
  ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output
  ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestMale)[i]) #Add CpG type to results
  CpGLocationMethDMLMale <- rbind(CpGLocationMethDMLMale, ctResults) #Add test statistics to master table
}
```

```{r}
head(CpGLocationMethDMLMale)
```

```{r}
CpGLocationMethDMLMale$p.adj <- p.adjust(CpGLocationMethDMLMale$p.value, method = "fdr") #Correct p-value using FDR
range(CpGLocationMethDMLMale$p.adj) #Look at range of p-values
head(CpGLocationMethDMLMale) #Confirm changes
```

```{r}
write.csv(CpGLocationMethDMLMale, "CpG-location-statResults-Male.txt", quote = FALSE, row.names = TRUE) #Save statistical output
```

## Create stacked barplot

```{r}
cpgFeatureOverlapsPercentsFem <- cpgFeatureOverlapsFem #Duplicate dataframe
for (i in 1:length(cpgFeatureOverlapsFem)) {
  cpgFeatureOverlapsPercentsFem[,i] <- (cpgFeatureOverlapsPercentsFem[,i] / (sum(cpgFeatureOverlapsPercentsFem[,i]))) * 100
} #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information
head(cpgFeatureOverlapsPercentsFem) #Check calculations
```

```{r}
cpgFeatureOverlapsPercentsMale <- cpgFeatureOverlapsMale #Duplicate dataframe
for (i in 1:length(cpgFeatureOverlapsMale)) {
  cpgFeatureOverlapsPercentsMale[,i] <- (cpgFeatureOverlapsPercentsMale[,i] / (sum(cpgFeatureOverlapsPercentsMale[,i]))) * 100
} #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information
head(cpgFeatureOverlapsPercentsMale) #Check calculations
```

```{r}
cpgFeatureOverlapsPercentsBind <- cbind(cpgFeatureOverlapsPercentsFem, cpgFeatureOverlapsPercentsMale) #Combine dataframes
head(cpgFeatureOverlapsPercentsBind)
```

```{bash}
mkdir figures
```

```{r}
pdf("figures/DML-feature-overlaps.pdf", width = 11, height = 8.5)

par(mar = c(3,5,1,1), oma = c(0, 1, 1, 11), mfrow = c(2, 1)) #Change figure boundaries

#Females

barplot(t(t(cpgFeatureOverlapsPercentsBind[,c(2,1)])), 
        col= rev(plotColors), 
        axes = FALSE,
        names.arg = c("10x CpGs", "Female DML"), cex.names = 1.5,
        ylim = c(0, 110)) #Create base plot. Everything should be white. Do not plot axes. Include sequencing type as labels and set size. Set y-axis specs. 

axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex.axis = 1.2) #Add y-axis
mtext(side = 2, "% CpGs", line = 3, cex = 1.5) #Add y-axis label
mtext("A. Females", x = 0, y = 105, cex = 1.5, adj = 0) #Add panel title

#Males

barplot(t(t(cpgFeatureOverlapsPercentsBind[,c(4,3)])), 
        col= rev(plotColors), 
        axes = FALSE, 
        names.arg = c("10x CpGs", "Male DML"), cex.names = 1.5,
        ylim = c(0, 110)) #Create base plot. Everything should be white. Do not plot axes. Include sequencing type as labels and set size. Set y-axis specs. 

axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex.axis = 1.2) #Add y-axis
mtext(side = 2, "% CpGs", line = 3, cex = 1.5) #Add y-axis label
mtext("B. Males", x = 0, y = 105, cex = 1.5, adj = 0) #Add panel title

#Legend

par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot
legend(x = 0.57, y = 0.87, 
       xpd = TRUE,
       legend = rev(c("Exon UTR", 
                      "CDS", 
                      "Introns", 
                      "Upstream Flank", 
                      "Downstream Flank", 
                      "lncRNA", 
                      "TE", 
                      "Intergenic")),
       pch = 22, 
       col = "black", 
       pt.bg = plotColors,
       bty = "n",
       cex = 1.5, 
       x.intersp = 0.7, xjust = 0) #Place a legend in the top right of the figure with no box
text("Genome Feature", x = 0.785, y = 0.879, cex = 1.5) #Add legend title that is aligned with legend

dev.off()
```

# Enrichment test

I will use `topGO` to identify any overrepresented biological processes in genes that contain DML.

## Annotation data

```{r}
mRNATrack <- read.delim("../../genome-features/C_virginica-3.0-mRNA.gff", header = FALSE) %>%
  separate(., V9, into = c("V10", "V11"), sep = "ID=rna-") %>%
  separate(., V11, into = c("V12", "V13"), sep = ";Parent=") %>%
  separate(., V13, into = c("V14", "V15"), sep = ";Dbxref=") %>%
  separate(., V15, into = c("V16", "V17"), sep = ";product=") %>%
  separate(., V17, into = c("V18", "V19"), sep = ";transcript_id=") %>%
  dplyr::select(., c("V12", "V14", "V18")) %>%
  dplyr::rename(., transcript = V12, gene = V14, product = V18) %>% 
  mutate(., gene_name  = gsub(x = gene, pattern = "gene-", replacement = "")) %>%
  dplyr::select(., -gene) #Winnow down mRNA track. Rename gene column and remove extra characters
head(mRNATrack) #Confirm changes
```

```{r}
GOterms <- read.delim("../../genome-features/_blast-GO-unfolded.sorted", header = FALSE, col.names = c("GO", "transcript"))
head(GOterms)
```

```{r}
GOslim <- read.delim("../../genome-features/GO-GOslim.sorted", header = FALSE, col.names = c("GO", "GOterm", "GOslim", "process"))
head(GOslim)
```

## Females

### Import data

```{r}
femDMLGenes <- read.delim("female_dml-Gene-wb.bed", sep = "\t", col.names = c("chr", "DML.start", "DML.stop", "f_DML", "meth.diff", "gene.chr", "Gnomon", "gene", "gene.start", "gene.stop", "V11", "strand", "V13", "V14")) %>%
  dplyr::select(., chr, DML.start, DML.stop, f_DML, meth.diff, V14) %>%
  separate(., col = V14, into = c("gene_name", "misc"), sep = ";Dbxref=GeneID:") %>%
  dplyr::select(., -misc) %>%
  mutate(., gene_name = gsub(x = gene_name, pattern = "ID=gene-", replacement = "")) #Import overlaps between DML and genes and rename columns during import. Select columns of interest. Separate annotations into gene name and misc information, remove misc column, and remove additional characters from gene name.
head(femDMLGenes)
```

```{r}
femGeneBackground <- read.delim("../methylation-landscape/fem-union-averages_10x-wb.bed", sep = "\t", col.names = c("chr", "start", "stop", "meth", "chr", "RefSeq", "gene", "gene.start", "gene.end", "V10", "strand", "V12", "V13")) %>%
  separate(., col = V13, into = c("gene_name", "misc"), sep = ";Dbxref=") %>%
  mutate(., gene_name = gsub(., x = gene_name, pattern = "ID=gene-", replacement = "")) %>%
  dplyr::select(., chr, start, stop, meth, gene_name) %>%
  unique(.) #Import gene background (all genes with data for 10x CpGs). Separate gene name information from annotaton column and retain columns of interest. Keep only unique rows
head(femGeneBackground) #Confirm formatting
```

### Format enrichment input

```{r}
femGeneBackground %>%
  dplyr::select(., chr, gene_name) %>%
  unique(.) %>%
  left_join(., mRNATrack, by = "gene_name") %>%
  left_join(., GOterms, by = "transcript") %>%
  dplyr::select(., gene_name, GO) %>%
  unique(.) %>%
  group_by(., gene_name) %>%
  filter(., is.na(GO) == FALSE) %>%
  summarize(., GO = paste(GO, collapse = ",")) %>%
  write.table(., "geneid2go-fem_geneBackground.tab", quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE) #Select chromosome and gene name information from background and join with transcripts, then GOterms. Keep only gene name and GO annotations, and retain unique entries. Group by gene name then summarize GO column such that all GOterms for one gene are in the same row separated by a comma. Save as a tab file
```

```{r}
geneID2GO <- readMappings(file = "geneid2go-fem_geneBackground.tab") #Loading the GO annotations and GeneIDs. Each line has one transcript ID and all associated GOterms
str(head(geneID2GO)) #Confirm file structure
length(geneID2GO) #32472 genes with annotations for all genes with 10x CpGs
```

```{r}
geneNames <- names(geneID2GO) #Extract names to use as a gene universe
head(geneNames)
```

```{r}
femDMLGenesUnique <- femDMLGenes %>%
  dplyr::select(., gene_name) %>%
  unique(.) #Select the gene ID column and retain unique entries. Save as a new object
femDMLGenesUnique <- femDMLGenesUnique$gene_name #Save column as a vector (will not work otherwise!)
```

```{r}
femDMLGeneList <- factor(as.integer(geneNames %in% femDMLGenesUnique))  #Create a factor vector to indicate genes that increased transcripts as significant (1) and didn't as not significant (0)
names(femDMLGeneList) <- geneNames
str(femDMLGeneList)
```

### Run enrichment test

```{r}
femDMLGeneListDataBP <- new("topGOdata", ontology = "BP", allGenes = femDMLGeneList,
                            annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
femDMLGeneListDataBP #Get summary of object
```

```{r}
test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
resultFisher.femDMLGeneListDataBP <- getSigGroups(femDMLGeneListDataBP, test.stat)
resultFisher.femDMLGeneListDataBP
```

```{r}
pvalFis.femDMLGeneListDataBP <- score(resultFisher.femDMLGeneListDataBP) #Extract p-values
head(pvalFis.femDMLGeneListDataBP)
hist(pvalFis.femDMLGeneListDataBP, 50, xlab = "p-values") #Plot histogram of p-values
```

```{r}
allRes.femDMLGeneListDataBP <- GenTable(femDMLGeneListDataBP, classic = resultFisher.femDMLGeneListDataBP, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.femDMLGeneListDataBP))
#Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes)
head(allRes.femDMLGeneListDataBP)
```

```{r}
write.csv(allRes.femDMLGeneListDataBP, "fem-DML-BP-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

### Match enriched GOterms with general annotation information

```{r}
sigRes.femDMLGeneListDataBP <- allRes.femDMLGeneListDataBP[1:85,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value
colnames(sigRes.femDMLGeneListDataBP) <- c("GO", "p.value") #Change column names
head(sigRes.femDMLGeneListDataBP)
```

```{r}
femDMLGenes %>%
  dplyr::select(., chr, gene_name, meth.diff) %>%
  unique(.) %>%
  left_join(., mRNATrack, by = "gene_name") %>%
  left_join(., GOterms, by = "transcript") %>%
  unique(.) %>% 
  left_join(sigRes.femDMLGeneListDataBP, ., by = "GO") %>%
  dplyr::select(gene_name, meth.diff, GO, p.value, product) %>%
  separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>%
  dplyr::select(., -transcriptVar) %>%
  filter(., is.na(gene_name) == FALSE) %>%
  unique(.) %>%
  write.csv("fem-DML-BP-EnrichedGO-geneAnnot.csv", quote = FALSE, row.names = FALSE) #Annotate DML list then join with significantly enriched BPGO. Separate product information and retain unique rows with gene names. Save as a csv
```

## Males

### Import data

```{r}
maleDMLGenes <- read.delim("male_dml-Gene-wb.bed", sep = "\t", col.names = c("chr", "DML.start", "DML.stop", "f_DML", "meth.diff", "gene.chr", "Gnomon", "gene", "gene.start", "gene.stop", "V11", "strand", "V13", "V14")) %>%
  dplyr::select(., chr, DML.start, DML.stop, f_DML, meth.diff, V14) %>%
  separate(., col = V14, into = c("gene_name", "misc"), sep = ";Dbxref=GeneID:") %>%
  dplyr::select(., -misc) %>%
  mutate(., gene_name = gsub(x = gene_name, pattern = "ID=gene-", replacement = "")) #Import overlaps between DML and genes and rename columns during import. Select columns of interest. Separate annotations into gene name and misc information, remove misc column, and remove additional characters from gene name.
head(maleDMLGenes)
```

```{r}
maleGeneBackground <- read.delim("../methylation-landscape/male-union-averages_10x-wb.bed", sep = "\t", col.names = c("chr", "start", "stop", "meth", "chr", "RefSeq", "gene", "gene.start", "gene.end", "V10", "strand", "V12", "V13")) %>%
  separate(., col = V13, into = c("gene_name", "misc"), sep = ";Dbxref=") %>%
  mutate(., gene_name = gsub(., x = gene_name, pattern = "ID=gene-", replacement = "")) %>%
  dplyr::select(., chr, start, stop, meth, gene_name) %>%
  unique(.) #Import gene background (all genes with data for 10x CpGs). Separate gene name information from annotaton column and retain columns of interest. Keep only unique rows
head(maleGeneBackground) #Confirm formatting
```

### Format enrichment input

```{r}
maleGeneBackground %>%
  dplyr::select(., chr, gene_name) %>%
  unique(.) %>%
  left_join(., mRNATrack, by = "gene_name") %>%
  left_join(., GOterms, by = "transcript") %>%
  dplyr::select(., gene_name, GO) %>%
  unique(.) %>%
  group_by(., gene_name) %>%
  filter(., is.na(GO) == FALSE) %>%
  summarize(., GO = paste(GO, collapse = ",")) %>%
  write.table(., "geneid2go-male_geneBackground.tab", quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE) #Select chromosome and gene name information from background and join with transcripts, then GOterms. Keep only gene name and GO annotations, and retain unique entries. Group by gene name then summarize GO column such that all GOterms for one gene are in the same row separated by a comma. Save as a tab file
```

```{r}
geneID2GO <- readMappings(file = "geneid2go-male_geneBackground.tab") #Loading the GO annotations and GeneIDs. Each line has one transcript ID and all associated GOterms
str(head(geneID2GO)) #Confirm file structure
length(geneID2GO) #32411 genes with annotations for all genes with 10x CpGs
```

```{r}
geneNames <- names(geneID2GO) #Extract names to use as a gene universe
head(geneNames)
```

```{r}
maleDMLGenesUnique <- maleDMLGenes %>%
  dplyr::select(., gene_name) %>%
  unique(.) #Select the gene ID column and retain unique entries. Save as a new object
maleDMLGenesUnique <- maleDMLGenesUnique$gene_name #Save column as a vector (will not work otherwise!)
```

```{r}
maleDMLGeneList <- factor(as.integer(geneNames %in% maleDMLGenesUnique))  #Create a factor vector to indicate genes that increased transcripts as significant (1) and didn't as not significant (0)
names(maleDMLGeneList) <- geneNames
str(maleDMLGeneList)
```

### Run enrichment test

```{r}
maleDMLGeneListDataBP <- new("topGOdata", ontology = "BP", allGenes = maleDMLGeneList,
                            annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object
maleDMLGeneListDataBP #Get summary of object
```

```{r}
test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
resultFisher.maleDMLGeneListDataBP <- getSigGroups(maleDMLGeneListDataBP, test.stat)
resultFisher.maleDMLGeneListDataBP
```

```{r}
pvalFis.maleDMLGeneListDataBP <- score(resultFisher.maleDMLGeneListDataBP) #Extract p-values
head(pvalFis.maleDMLGeneListDataBP)
hist(pvalFis.maleDMLGeneListDataBP, 50, xlab = "p-values") #Plot histogram of p-values
```

```{r}
allRes.maleDMLGeneListDataBP <- GenTable(maleDMLGeneListDataBP, classic = resultFisher.maleDMLGeneListDataBP, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.maleDMLGeneListDataBP))
#Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes)
head(allRes.maleDMLGeneListDataBP)
```

```{r}
write.csv(allRes.maleDMLGeneListDataBP, "male-DML-BP-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe
```

### Match enriched GOterms with general annotation information

```{r}
sigRes.maleDMLGeneListDataBP <- allRes.maleDMLGeneListDataBP[1:359,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value
colnames(sigRes.maleDMLGeneListDataBP) <- c("GO", "p.value") #Change column names
head(sigRes.maleDMLGeneListDataBP)
```

```{r}
maleDMLGenes %>%
  dplyr::select(., chr, gene_name, meth.diff) %>%
  unique(.) %>%
  left_join(., mRNATrack, by = "gene_name") %>%
  left_join(., GOterms, by = "transcript") %>%
  unique(.) %>% 
  left_join(sigRes.maleDMLGeneListDataBP, ., by = "GO") %>%
  dplyr::select(gene_name, meth.diff, GO, p.value, product) %>%
  separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>%
  dplyr::select(., -transcriptVar) %>%
  filter(., is.na(gene_name) == FALSE) %>%
  unique(.) %>%
  write.csv("male-DML-BP-EnrichedGO-geneAnnot.csv", quote = FALSE, row.names = FALSE) #Annotate DML list then join with significantly enriched BPGO. Separate product information and retain unique rows with gene names. Save as a csv
```