---
title: "Genomic Location of DML"
author: "Yaamini Venkataraman"
output: github_document
---

In this script, I'll create summary tables for genomic location information, run statistcal tests, and visualize the distribution of DML in the genome.

# Prepare R Markdown file

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = "/Users/yaaminivenkataraman/Documents/project-oyster-oa/analyses/Haws_07-DML-characterization/") #Set root directory
```

```{r}
getwd()
```

# Install packages

```{r}
#install.packages("tidyverse")
#install.packages("RColorBrewer")
#install.packages("broom")

require(tidyverse)
require(RColorBrewer)
require(broom)
```

# Obtain session information

```{r}
sessionInfo()
```

# Establish color scheme

```{r}
plotColors <- brewer.pal(8, "Purples") #Create a color palette for the barplots. Use 5 purple shades from RColorBrewer.
```

# Import data

```{r}
attach("../../project-oyster-oa.Rdata") #Attach R Data if not already loaded
cpgFeatureOverlaps <- cpgFeatureOverlaps
detach("file:../../project-oyster-oa.Rdata")
```

I will look at genomic locations of each condition-specific DML list, then create a plot for visualization.

# ploidy-DML

## Import file counts

```{r}
cpgFeatureOverlapsploidy <- read.table("DML-ploidy-25-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("ploidyDML", "filename")) %>%
  slice(-c(2, 3)) #Import line counts and drop gene overlap information
cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic
cpgFeatureOverlapsploidy #Confirm import
```
```{r}
cpgFeatureOverlapsploidy$allCpGs <- cpgFeatureOverlaps$allCpGs #Add all 5x CpG information
rownames(cpgFeatureOverlapsploidy) <- row.names(cpgFeatureOverlaps) #Add rowname information
cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[,-2] #Drop filename column
cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[,c(2,1)] #Reorder columns
head(cpgFeatureOverlapsploidy)
```

## Contingency test

```{r}
cpgLocationStatTestploidy <- data.frame(t(cpgFeatureOverlapsploidy)) #Transpose for statistical testing
head(cpgLocationStatTestploidy) #Confirm formatting
```

```{r}
CpGLocationallDMLploidy <- data.frame() #Create empty dataframe to store chi-squared results
for(i in 1:ncol(cpgLocationStatTestploidy)) { #For each genome feature
  allFeature <- cpgLocationStatTestploidy[1,i] #Variable for # genome feature overlaps for all CpGs
  ploidyFeature <- cpgLocationStatTestploidy[2,i] #Variable for # genome feature overlaps for ploidy DML
  allNotFeature <- sum(cpgLocationStatTestploidy[1,-i]) #Variable for # other CpG types for all CpGs
  ploidyNotFeature <- sum(cpgLocationStatTestploidy[2,-i]) #Variable for # other CpG types for ploidy DML
  ct <- matrix(c(allFeature, ploidyFeature, allNotFeature, ploidyNotFeature), ncol = 2) #Create contingency table
  colnames(ct) <- c(as.character(colnames(cpgLocationStatTestploidy[i])), paste0("Not", colnames(cpgLocationStatTestploidy[i]))) #Assign column names: type, not type
  rownames(ct) <- c(as.character(row.names(cpgLocationStatTestploidy)[c(1,2)])) #Assign row names: all CpGs, ploidy DML
  print(ct) #Confirm table is correct
  ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output
  ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestploidy)[i]) #Add CpG type to results
  CpGLocationallDMLploidy <- rbind(CpGLocationallDMLploidy, ctResults) #Add test statistics to master table
}
```

```{r}
head(CpGLocationallDMLploidy)
```

```{r}
CpGLocationallDMLploidy$p.adj <- p.adjust(CpGLocationallDMLploidy$p.value, method = "fdr") #Correct p-value using FDR
range(CpGLocationallDMLploidy$p.adj) #Look at range of p-values
head(CpGLocationallDMLploidy) #Confirm changes
```

```{r}
write.csv(CpGLocationallDMLploidy, "CpG-location-statResults-ploidy.txt", quote = FALSE, row.names = TRUE) #Save statistical output
```

# pH-DML

## Import file counts

```{r}
cpgFeatureOverlapspH <- read.table("DML-pH-25-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("pHDML", "filename")) %>%
  slice(-c(2, 3))  #Import line counts
cpgFeatureOverlapspH <- cpgFeatureOverlapspH[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic
head(cpgFeatureOverlapspH) #Confirm import
```

```{r}
cpgFeatureOverlapspH$allCpGs <- cpgFeatureOverlaps$allCpGs #Add methylated CpG information
rownames(cpgFeatureOverlapspH) <- row.names(cpgFeatureOverlaps) #Add rowname information
cpgFeatureOverlapspH <- cpgFeatureOverlapspH[,-2] #Drop filename column
cpgFeatureOverlapspH <- cpgFeatureOverlapspH[,c(2,1)] #Reorder columns
head(cpgFeatureOverlapspH)
```

## Contingency test

```{r}
cpgLocationStatTestpH <- data.frame(t(cpgFeatureOverlapspH)) #Transpose for statistical testing
head(cpgLocationStatTestpH) #Confirm formatting
```

```{r}
CpGLocationallDMLpH <- data.frame() #Create empty dataframe to store chi-squared results

for(i in 1:ncol(cpgLocationStatTestpH)) { #For each genome feature
  allFeature <- cpgLocationStatTestpH[1,i] #Variable for # genome feature overlaps for all CpGs
  pHFeature <- cpgLocationStatTestpH[2,i] #Variable for # genome feature overlaps for pH DML
  allNotFeature <- sum(cpgLocationStatTestpH[1,-i]) #Variable for # other CpG types for all CpGs
  pHNotFeature <- sum(cpgLocationStatTestpH[2,-i]) #Variable for # other CpG types for pH DML
  ct <- matrix(c(allFeature, pHFeature, allNotFeature, pHNotFeature), ncol = 2) #Create contingency table
  colnames(ct) <- c(as.character(colnames(cpgLocationStatTestpH[i])), paste0("Not", colnames(cpgLocationStatTestpH[i]))) #Assign column names: type, not type
  rownames(ct) <- c(as.character(row.names(cpgLocationStatTestpH)[c(1,2)])) #Assign row names: all CpGs, pH DML
  print(ct) #Confirm table is correct
  ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output
  ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestpH)[i]) #Add CpG type to results
  CpGLocationallDMLpH <- rbind(CpGLocationallDMLpH, ctResults) #Add test statistics to master table
}
```

```{r}
head(CpGLocationallDMLpH)
```

```{r}
CpGLocationallDMLpH$p.adj <- p.adjust(CpGLocationallDMLpH$p.value, method = "fdr") #Correct p-value using FDR
range(CpGLocationallDMLpH$p.adj) #Look at range of p-values
head(CpGLocationallDMLpH) #Confirm changes
```

```{r}
write.csv(CpGLocationallDMLpH, "CpG-location-statResults-pH.txt", quote = FALSE, row.names = TRUE) #Save statistical output
```

# Create stacked barplot

```{r}
cpgFeatureOverlapsPercentsploidy <- cpgFeatureOverlapsploidy #Duplicate dataframe
for (i in 1:length(cpgFeatureOverlapsploidy)) {
  cpgFeatureOverlapsPercentsploidy[,i] <- (cpgFeatureOverlapsPercentsploidy[,i] / (sum(cpgFeatureOverlapsPercentsploidy[,i]))) * 100
} #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information
head(cpgFeatureOverlapsPercentsploidy) #Check calculations
```

```{r}
cpgFeatureOverlapsPercentspH <- cpgFeatureOverlapspH #Duplicate dataframe
for (i in 1:length(cpgFeatureOverlapspH)) {
  cpgFeatureOverlapsPercentspH[,i] <- (cpgFeatureOverlapsPercentspH[,i] / (sum(cpgFeatureOverlapsPercentspH[,i]))) * 100
} #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information
head(cpgFeatureOverlapsPercentspH) #Check calculations
```

```{r}
cpgFeatureOverlapsPercentsBind <- cbind(cpgFeatureOverlapsPercentsploidy, 
                                        cpgFeatureOverlapsPercentspH) #Combine dataframes
cpgFeatureOverlapsPercentsBind <- cpgFeatureOverlapsPercentsBind[,c(1,2,4)] #Reorganize and drop columns
head(cpgFeatureOverlapsPercentsBind)
```

```{r}
#pdf("figures/DML-feature-overlaps.pdf", width = 11, height = 8.5)

par(mar = c(1,5,0,1), oma = c(3, 1, 1, 11)) #Change figure boundaries

barplot(t(t(cpgFeatureOverlapsPercentsBind[,c(1:3)])), 
        col= rev(plotColors), 
        axes = FALSE, 
        names.arg = c("All 5x CpGs", "Ploidy", "pH"), cex.names = 1.5,
        ylim = c(0, 110)) #Create base plot. Everything should be white. Do not plot axes. Include sequencing type as labels and set size. Set y-axis specs. 

axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex.axis = 1.2) #Add y-axis
mtext(side = 2, "% CpGs", line = 3, cex = 1.5) #Add y-axis label

par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot
legend(x = 0.57, y = 0.87, 
       xpd = TRUE,
       legend = c("Exon UTR", 
                  "CDS", 
                  "Introns", 
                  "Upstream Flank", 
                  "Downstream Flank", 
                  "lncRNA", 
                  "TE", 
                  "Intergenic"),
       pch = 22, 
        col = "black", 
        pt.bg = rev(plotColors),
       bty = "n",
       cex = 1.5, 
       x.intersp = 0.7, xjust = 0) #Place a legend in the top right of the figure with no box
text("Genome Feature", x = 0.785, y = 0.879, cex = 1.5) #Add legend title that is aligned with legend

#dev.off()
```

```{r}
save.image("../../project-oyster-oa.RData") #Save R data
```

# Genes with multiple DML

## ploidy-DML

```{r}
genePloidyOverlaps <- read.table("DML-ploidy-25-Cov5-Gene-wb.bed", header = FALSE, sep = "\t") #Import table of gene-DML overlaps
head(genePloidyOverlaps) #Confirm import
```

```{r}
geneIDPloidy <- read.table("geneID-ploidy-DML-overlap.tab") #Import column with gene ID information already separated
nrow(geneIDPloidy) == nrow(genePloidyOverlaps) #Check that both dataframes have the same number of rows
head(geneIDPloidy) #Confim import
```

```{r}
genePloidyOverlaps <- cbind(genePloidyOverlaps[,-c(4:6,9:12)], geneIDPloidy) #Add gene ID column to overlap information
colnames(genePloidyOverlaps) <- c("chr", "DMLstart", "DMLend",
                                  "geneStart", "geneEnd", "geneID") #Add column names
head(genePloidyOverlaps) #Confirm formatting
```

```{r}
genePloidyDMLCounts <- as.data.frame(table(genePloidyOverlaps$geneID)) #Create a dataframe with the number of DML/gene
colnames(genePloidyDMLCounts) <- c("geneID", "numDML") #Add column names
head(genePloidyDMLCounts) #Confirm formatting
```

```{r}
range(genePloidyDMLCounts$numDML) #Range of number DML/gene: 1
#Genes only have one ploidy-DML
```

```{r}
write.csv(genePloidyDMLCounts, "Number-of-ploidy-DML-per-Gene.csv", quote = FALSE, row.names = FALSE) #Save file with number of DML per gene
```

## pH-DML

```{r}
genepHOverlaps <- read.table("DML-pH-25-Cov5-Gene-wb.bed", header = FALSE, sep = "\t") #Import table of gene-DML overlaps
head(genepHOverlaps) #Confirm import
```

```{r}
geneIDpH <- read.table("geneID-pH-DML-overlap.tab") #Import column with gene ID information already separated
nrow(geneIDpH) == nrow(genepHOverlaps) #Check that both dataframes have the same number of rows
head(geneIDpH) #Confim import
```

```{r}
genepHOverlaps <- cbind(genepHOverlaps[,-c(4:6,9:12)], geneIDpH) #Add gene ID column to overlap information
colnames(genepHOverlaps) <- c("chr", "DMLstart", "DMLend",
                              "geneStart", "geneEnd", "geneID") #Add column names
head(genepHOverlaps) #Confirm formatting
```

```{r}
genepHDMLCounts <- as.data.frame(table(genepHOverlaps$geneID)) #Create a dataframe with the number of DML/gene
colnames(genepHDMLCounts) <- c("geneID", "numDML") #Add column names
head(genepHDMLCounts) #Confirm formatting
```

```{r}
range(genepHDMLCounts$numDML) #Range of number DML/gene: 1-3
hist(genepHDMLCounts$numDML) #Plot the frequency of multiple DML in genes. Most genes have 1 DML
```

```{r}
write.csv(genepHDMLCounts, "Number-of-pH-DML-per-Gene.csv", quote = FALSE, row.names = FALSE) #Save file with number of DML per gene
```