--- title: "Genomic Location of DML" author: "Yaamini Venkataraman" output: github_document --- In this script, I'll create summary tables for genomic location information, run statistcal tests, and visualize the distribution of DML in the genome. # Prepare R Markdown file ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) knitr::opts_knit$set(root.dir = "/Users/yaaminivenkataraman/Documents/project-oyster-oa/analyses/Haws_07-DML-characterization/") #Set root directory ``` ```{r} getwd() ``` # Install packages ```{r} #install.packages("tidyverse") #install.packages("RColorBrewer") #install.packages("broom") require(tidyverse) require(RColorBrewer) require(broom) ``` # Obtain session information ```{r} sessionInfo() ``` # Establish color scheme ```{r} plotColors <- brewer.pal(8, "Purples") #Create a color palette for the barplots. Use 5 purple shades from RColorBrewer. ``` # Import data ```{r} attach("../../project-oyster-oa.Rdata") #Attach R Data if not already loaded cpgFeatureOverlaps <- cpgFeatureOverlaps detach("file:../../project-oyster-oa.Rdata") ``` I will look at genomic locations of each condition-specific DML list, then create a plot for visualization. # ploidy-DML ## Import file counts ```{r} cpgFeatureOverlapsploidy <- read.table("DML-ploidy-25-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("ploidyDML", "filename")) %>% slice(-c(2, 3)) #Import line counts and drop gene overlap information cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic cpgFeatureOverlapsploidy #Confirm import ``` ```{r} cpgFeatureOverlapsploidy$allCpGs <- cpgFeatureOverlaps$allCpGs #Add all 5x CpG information rownames(cpgFeatureOverlapsploidy) <- row.names(cpgFeatureOverlaps) #Add rowname information cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[,-2] #Drop filename column cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[,c(2,1)] #Reorder columns head(cpgFeatureOverlapsploidy) ``` ## Contingency test ```{r} cpgLocationStatTestploidy <- data.frame(t(cpgFeatureOverlapsploidy)) #Transpose for statistical testing head(cpgLocationStatTestploidy) #Confirm formatting ``` ```{r} CpGLocationallDMLploidy <- data.frame() #Create empty dataframe to store chi-squared results for(i in 1:ncol(cpgLocationStatTestploidy)) { #For each genome feature allFeature <- cpgLocationStatTestploidy[1,i] #Variable for # genome feature overlaps for all CpGs ploidyFeature <- cpgLocationStatTestploidy[2,i] #Variable for # genome feature overlaps for ploidy DML allNotFeature <- sum(cpgLocationStatTestploidy[1,-i]) #Variable for # other CpG types for all CpGs ploidyNotFeature <- sum(cpgLocationStatTestploidy[2,-i]) #Variable for # other CpG types for ploidy DML ct <- matrix(c(allFeature, ploidyFeature, allNotFeature, ploidyNotFeature), ncol = 2) #Create contingency table colnames(ct) <- c(as.character(colnames(cpgLocationStatTestploidy[i])), paste0("Not", colnames(cpgLocationStatTestploidy[i]))) #Assign column names: type, not type rownames(ct) <- c(as.character(row.names(cpgLocationStatTestploidy)[c(1,2)])) #Assign row names: all CpGs, ploidy DML print(ct) #Confirm table is correct ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestploidy)[i]) #Add CpG type to results CpGLocationallDMLploidy <- rbind(CpGLocationallDMLploidy, ctResults) #Add test statistics to master table } ``` ```{r} head(CpGLocationallDMLploidy) ``` ```{r} CpGLocationallDMLploidy$p.adj <- p.adjust(CpGLocationallDMLploidy$p.value, method = "fdr") #Correct p-value using FDR range(CpGLocationallDMLploidy$p.adj) #Look at range of p-values head(CpGLocationallDMLploidy) #Confirm changes ``` ```{r} write.csv(CpGLocationallDMLploidy, "CpG-location-statResults-ploidy.txt", quote = FALSE, row.names = TRUE) #Save statistical output ``` # pH-DML ## Import file counts ```{r} cpgFeatureOverlapspH <- read.table("DML-pH-25-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("pHDML", "filename")) %>% slice(-c(2, 3)) #Import line counts cpgFeatureOverlapspH <- cpgFeatureOverlapspH[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic head(cpgFeatureOverlapspH) #Confirm import ``` ```{r} cpgFeatureOverlapspH$allCpGs <- cpgFeatureOverlaps$allCpGs #Add methylated CpG information rownames(cpgFeatureOverlapspH) <- row.names(cpgFeatureOverlaps) #Add rowname information cpgFeatureOverlapspH <- cpgFeatureOverlapspH[,-2] #Drop filename column cpgFeatureOverlapspH <- cpgFeatureOverlapspH[,c(2,1)] #Reorder columns head(cpgFeatureOverlapspH) ``` ## Contingency test ```{r} cpgLocationStatTestpH <- data.frame(t(cpgFeatureOverlapspH)) #Transpose for statistical testing head(cpgLocationStatTestpH) #Confirm formatting ``` ```{r} CpGLocationallDMLpH <- data.frame() #Create empty dataframe to store chi-squared results for(i in 1:ncol(cpgLocationStatTestpH)) { #For each genome feature allFeature <- cpgLocationStatTestpH[1,i] #Variable for # genome feature overlaps for all CpGs pHFeature <- cpgLocationStatTestpH[2,i] #Variable for # genome feature overlaps for pH DML allNotFeature <- sum(cpgLocationStatTestpH[1,-i]) #Variable for # other CpG types for all CpGs pHNotFeature <- sum(cpgLocationStatTestpH[2,-i]) #Variable for # other CpG types for pH DML ct <- matrix(c(allFeature, pHFeature, allNotFeature, pHNotFeature), ncol = 2) #Create contingency table colnames(ct) <- c(as.character(colnames(cpgLocationStatTestpH[i])), paste0("Not", colnames(cpgLocationStatTestpH[i]))) #Assign column names: type, not type rownames(ct) <- c(as.character(row.names(cpgLocationStatTestpH)[c(1,2)])) #Assign row names: all CpGs, pH DML print(ct) #Confirm table is correct ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestpH)[i]) #Add CpG type to results CpGLocationallDMLpH <- rbind(CpGLocationallDMLpH, ctResults) #Add test statistics to master table } ``` ```{r} head(CpGLocationallDMLpH) ``` ```{r} CpGLocationallDMLpH$p.adj <- p.adjust(CpGLocationallDMLpH$p.value, method = "fdr") #Correct p-value using FDR range(CpGLocationallDMLpH$p.adj) #Look at range of p-values head(CpGLocationallDMLpH) #Confirm changes ``` ```{r} write.csv(CpGLocationallDMLpH, "CpG-location-statResults-pH.txt", quote = FALSE, row.names = TRUE) #Save statistical output ``` # Create stacked barplot ```{r} cpgFeatureOverlapsPercentsploidy <- cpgFeatureOverlapsploidy #Duplicate dataframe for (i in 1:length(cpgFeatureOverlapsploidy)) { cpgFeatureOverlapsPercentsploidy[,i] <- (cpgFeatureOverlapsPercentsploidy[,i] / (sum(cpgFeatureOverlapsPercentsploidy[,i]))) * 100 } #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information head(cpgFeatureOverlapsPercentsploidy) #Check calculations ``` ```{r} cpgFeatureOverlapsPercentspH <- cpgFeatureOverlapspH #Duplicate dataframe for (i in 1:length(cpgFeatureOverlapspH)) { cpgFeatureOverlapsPercentspH[,i] <- (cpgFeatureOverlapsPercentspH[,i] / (sum(cpgFeatureOverlapsPercentspH[,i]))) * 100 } #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information head(cpgFeatureOverlapsPercentspH) #Check calculations ``` ```{r} cpgFeatureOverlapsPercentsBind <- cbind(cpgFeatureOverlapsPercentsploidy, cpgFeatureOverlapsPercentspH) #Combine dataframes cpgFeatureOverlapsPercentsBind <- cpgFeatureOverlapsPercentsBind[,c(1,2,4)] #Reorganize and drop columns head(cpgFeatureOverlapsPercentsBind) ``` ```{r} #pdf("figures/DML-feature-overlaps.pdf", width = 11, height = 8.5) par(mar = c(1,5,0,1), oma = c(3, 1, 1, 11)) #Change figure boundaries barplot(t(t(cpgFeatureOverlapsPercentsBind[,c(1:3)])), col= rev(plotColors), axes = FALSE, names.arg = c("All 5x CpGs", "Ploidy", "pH"), cex.names = 1.5, ylim = c(0, 110)) #Create base plot. Everything should be white. Do not plot axes. Include sequencing type as labels and set size. Set y-axis specs. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex.axis = 1.2) #Add y-axis mtext(side = 2, "% CpGs", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend(x = 0.57, y = 0.87, xpd = TRUE, legend = c("Exon UTR", "CDS", "Introns", "Upstream Flank", "Downstream Flank", "lncRNA", "TE", "Intergenic"), pch = 22, col = "black", pt.bg = rev(plotColors), bty = "n", cex = 1.5, x.intersp = 0.7, xjust = 0) #Place a legend in the top right of the figure with no box text("Genome Feature", x = 0.785, y = 0.879, cex = 1.5) #Add legend title that is aligned with legend #dev.off() ``` ```{r} save.image("../../project-oyster-oa.RData") #Save R data ``` # Genes with multiple DML ## ploidy-DML ```{r} genePloidyOverlaps <- read.table("DML-ploidy-25-Cov5-Gene-wb.bed", header = FALSE, sep = "\t") #Import table of gene-DML overlaps head(genePloidyOverlaps) #Confirm import ``` ```{r} geneIDPloidy <- read.table("geneID-ploidy-DML-overlap.tab") #Import column with gene ID information already separated nrow(geneIDPloidy) == nrow(genePloidyOverlaps) #Check that both dataframes have the same number of rows head(geneIDPloidy) #Confim import ``` ```{r} genePloidyOverlaps <- cbind(genePloidyOverlaps[,-c(4:6,9:12)], geneIDPloidy) #Add gene ID column to overlap information colnames(genePloidyOverlaps) <- c("chr", "DMLstart", "DMLend", "geneStart", "geneEnd", "geneID") #Add column names head(genePloidyOverlaps) #Confirm formatting ``` ```{r} genePloidyDMLCounts <- as.data.frame(table(genePloidyOverlaps$geneID)) #Create a dataframe with the number of DML/gene colnames(genePloidyDMLCounts) <- c("geneID", "numDML") #Add column names head(genePloidyDMLCounts) #Confirm formatting ``` ```{r} range(genePloidyDMLCounts$numDML) #Range of number DML/gene: 1 #Genes only have one ploidy-DML ``` ```{r} write.csv(genePloidyDMLCounts, "Number-of-ploidy-DML-per-Gene.csv", quote = FALSE, row.names = FALSE) #Save file with number of DML per gene ``` ## pH-DML ```{r} genepHOverlaps <- read.table("DML-pH-25-Cov5-Gene-wb.bed", header = FALSE, sep = "\t") #Import table of gene-DML overlaps head(genepHOverlaps) #Confirm import ``` ```{r} geneIDpH <- read.table("geneID-pH-DML-overlap.tab") #Import column with gene ID information already separated nrow(geneIDpH) == nrow(genepHOverlaps) #Check that both dataframes have the same number of rows head(geneIDpH) #Confim import ``` ```{r} genepHOverlaps <- cbind(genepHOverlaps[,-c(4:6,9:12)], geneIDpH) #Add gene ID column to overlap information colnames(genepHOverlaps) <- c("chr", "DMLstart", "DMLend", "geneStart", "geneEnd", "geneID") #Add column names head(genepHOverlaps) #Confirm formatting ``` ```{r} genepHDMLCounts <- as.data.frame(table(genepHOverlaps$geneID)) #Create a dataframe with the number of DML/gene colnames(genepHDMLCounts) <- c("geneID", "numDML") #Add column names head(genepHDMLCounts) #Confirm formatting ``` ```{r} range(genepHDMLCounts$numDML) #Range of number DML/gene: 1-3 hist(genepHDMLCounts$numDML) #Plot the frequency of multiple DML in genes. Most genes have 1 DML ``` ```{r} write.csv(genepHDMLCounts, "Number-of-pH-DML-per-Gene.csv", quote = FALSE, row.names = FALSE) #Save file with number of DML per gene ```