--- title: "Genomic Location of DML" author: "Yaamini Venkataraman" output: github_document --- In this script, I'll create summary tables for genomic location information, run statistcal tests, and visualize the distribution of DML in the genome. # Prepare R Markdown file ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) knitr::opts_knit$set(root.dir = "/Users/yaaminivenkataraman/Documents/project-oyster-oa/analyses/Haws_07-DML-characterization/") #Set root directory ``` # Install packages ```{r} #Packages for visualization #install.packages("dichromat") #install.packages("broom") require(dichromat) require(broom) ``` # Obtain session information ```{r} sessionInfo() ``` # Establish color scheme ```{r} plotColors <- c("#B54E00", "#E38100", "#FFA801", "#FFCB5E", "#A9DDFF", "#21ADFA", "#0A72AB", "#004B70") #Create a color palette for the barplots. Use 5 purple shades from RColorBrewer. ``` # Import data ```{r} #save.image("../../project-oyster-oa.Rdata") #load("../../project-oyster-oa.Rdata") #Load R Data if not already loaded ``` I will look at genomic locations of each sex-specific DML list, then create a plot for visualization. # ploidy-DML ## Import file counts ```{r} cpgFeatureOverlapsploidy <- read.table("DML-ploidy-DSS-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("ploidyDML", "filename")) #Import line counts cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[-c(2, 3, 10),] #Drop extra lines cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic head(cpgFeatureOverlapsploidy) #Confirm import ``` ```{r} cpgFeatureOverlapsploidy$allCpGs <- cpgFeatureOverlaps$allCpGs #Add all 5x CpG information rownames(cpgFeatureOverlapsploidy) <- row.names(cpgFeatureOverlaps) #Add rowname information cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[,-2] #Drop filename column cpgFeatureOverlapsploidy <- cpgFeatureOverlapsploidy[,c(2,1)] #Reorder columns head(cpgFeatureOverlapsploidy) ``` ## Contingency test ```{r} cpgLocationStatTestploidy <- data.frame(t(cpgFeatureOverlapsploidy)) #Transpose for statistical testing head(cpgLocationStatTestploidy) #Confirm formatting ``` ```{r} CpGLocationallDMLploidy <- data.frame() #Create empty dataframe to store chi-squared results for(i in 1:ncol(cpgLocationStatTestploidy)) { #For each genome feature allFeature <- cpgLocationStatTestploidy[1,i] #Variable for # genome feature overlaps for all CpGs ploidyFeature <- cpgLocationStatTestploidy[2,i] #Variable for # genome feature overlaps for ploidy DML allNotFeature <- sum(cpgLocationStatTestploidy[1,-i]) #Variable for # other CpG types for all CpGs ploidyNotFeature <- sum(cpgLocationStatTestploidy[2,-i]) #Variable for # other CpG types for ploidy DML ct <- matrix(c(allFeature, ploidyFeature, allNotFeature, ploidyNotFeature), ncol = 2) #Create contingency table colnames(ct) <- c(as.character(colnames(cpgLocationStatTestploidy[i])), paste0("Not", colnames(cpgLocationStatTestploidy[i]))) #Assign column names: type, not type rownames(ct) <- c(as.character(row.names(cpgLocationStatTestploidy)[c(1,2)])) #Assign row names: all CpGs, ploidy DML print(ct) #Confirm table is correct ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestploidy)[i]) #Add CpG type to results CpGLocationallDMLploidy <- rbind(CpGLocationallDMLploidy, ctResults) #Add test statistics to master table } ``` ```{r} head(CpGLocationallDMLploidy) ``` ```{r} CpGLocationallDMLploidy$p.adj <- p.adjust(CpGLocationallDMLploidy$p.value, method = "fdr") #Correct p-value using FDR range(CpGLocationallDMLploidy$p.adj) #Look at range of p-values head(CpGLocationallDMLploidy) #Confirm changes ``` ```{r} write.csv(CpGLocationallDMLploidy, "CpG-location-statResults-ploidy.txt", quote = FALSE, row.names = TRUE) #Save statistical output ``` # pH-DML ## Import file counts ```{r} cpgFeatureOverlapspH <- read.table("DML-pH-DSS-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("pHDML", "filename")) #Import line counts cpgFeatureOverlapspH <- cpgFeatureOverlapspH[-c(2, 3, 10),] #Drop extra lines cpgFeatureOverlapspH <- cpgFeatureOverlapspH[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic head(cpgFeatureOverlapspH) #Confirm import ``` ```{r} cpgFeatureOverlapspH$allCpGs <- cpgFeatureOverlaps$allCpGs #Add methylated CpG information rownames(cpgFeatureOverlapspH) <- row.names(cpgFeatureOverlaps) #Add rowname information cpgFeatureOverlapspH <- cpgFeatureOverlapspH[,-2] #Drop filename column cpgFeatureOverlapspH <- cpgFeatureOverlapspH[,c(2,1)] #Reorder columns head(cpgFeatureOverlapspH) ``` ## Contingency test ```{r} cpgLocationStatTestpH <- data.frame(t(cpgFeatureOverlapspH)) #Transpose for statistical testing head(cpgLocationStatTestpH) #Confirm formatting ``` ```{r} CpGLocationallDMLpH <- data.frame() #Create empty dataframe to store chi-squared results for(i in 1:ncol(cpgLocationStatTestpH)) { #For each genome feature allFeature <- cpgLocationStatTestpH[1,i] #Variable for # genome feature overlaps for all CpGs pHFeature <- cpgLocationStatTestpH[2,i] #Variable for # genome feature overlaps for pH DML allNotFeature <- sum(cpgLocationStatTestpH[1,-i]) #Variable for # other CpG types for all CpGs pHNotFeature <- sum(cpgLocationStatTestpH[2,-i]) #Variable for # other CpG types for pH DML ct <- matrix(c(allFeature, pHFeature, allNotFeature, pHNotFeature), ncol = 2) #Create contingency table colnames(ct) <- c(as.character(colnames(cpgLocationStatTestpH[i])), paste0("Not", colnames(cpgLocationStatTestpH[i]))) #Assign column names: type, not type rownames(ct) <- c(as.character(row.names(cpgLocationStatTestpH)[c(1,2)])) #Assign row names: all CpGs, pH DML print(ct) #Confirm table is correct ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestpH)[i]) #Add CpG type to results CpGLocationallDMLpH <- rbind(CpGLocationallDMLpH, ctResults) #Add test statistics to master table } ``` ```{r} head(CpGLocationallDMLpH) ``` ```{r} CpGLocationallDMLpH$p.adj <- p.adjust(CpGLocationallDMLpH$p.value, method = "fdr") #Correct p-value using FDR range(CpGLocationallDMLpH$p.adj) #Look at range of p-values head(CpGLocationallDMLpH) #Confirm changes ``` ```{r} write.csv(CpGLocationallDMLpH, "CpG-location-statResults-pH.txt", quote = FALSE, row.names = TRUE) #Save statistical output ``` # ploidy:pH-DML ## Import file counts ```{r} cpgFeatureOverlapsploidypH <- read.table("DML-ploidypH-DSS-Overlap-counts.txt", header = FALSE, sep = "\t", col.names = c("ploidypHDML", "filename")) #Import line counts cpgFeatureOverlapsploidypH <- cpgFeatureOverlapsploidypH[-c(2, 3, 10),] #Drop extra lines cpgFeatureOverlapsploidypH <- cpgFeatureOverlapsploidypH[c(4, 1, 6, 8, 3, 7, 2, 5),] #Reorganize rows to match order: exon UTR, CDS, intron, upstream, downstream, lncRNA, TE, intergenic head(cpgFeatureOverlapsploidypH) #Confirm import ``` ```{r} cpgFeatureOverlapsploidypH$allCpGs <- cpgFeatureOverlaps$allCpGs #Add methylated CpG information rownames(cpgFeatureOverlapsploidypH) <- row.names(cpgFeatureOverlaps) #Add rowname information cpgFeatureOverlapsploidypH <- cpgFeatureOverlapsploidypH[,-2] #Drop filename column cpgFeatureOverlapsploidypH <- cpgFeatureOverlapsploidypH[,c(2,1)] #Reorder columns head(cpgFeatureOverlapsploidypH) ``` ## Contingency test ```{r} cpgLocationStatTestploidypH <- data.frame(t(cpgFeatureOverlapsploidypH)) #Transpose for statistical testing head(cpgLocationStatTestploidypH) #Confirm formatting ``` ```{r} CpGLocationallDMLploidypH <- data.frame() #Create empty dataframe to store chi-squared results for(i in 1:ncol(cpgLocationStatTestploidypH)) { #For each genome feature allFeature <- cpgLocationStatTestploidypH[1,i] #Variable for # genome feature overlaps for all CpGs ploidypHFeature <- cpgLocationStatTestploidypH[2,i] #Variable for # genome feature overlaps for ploidypH DML allNotFeature <- sum(cpgLocationStatTestploidypH[1,-i]) #Variable for # other CpG types for all CpGs ploidypHNotFeature <- sum(cpgLocationStatTestploidypH[2,-i]) #Variable for # other CpG types for ploidypH DML ct <- matrix(c(allFeature, ploidypHFeature, allNotFeature, ploidypHNotFeature), ncol = 2) #Create contingency table colnames(ct) <- c(as.character(colnames(cpgLocationStatTestploidypH[i])), paste0("Not", colnames(cpgLocationStatTestploidypH[i]))) #Assign column names: type, not type rownames(ct) <- c(as.character(row.names(cpgLocationStatTestploidypH)[c(1,2)])) #Assign row names: all CpGs, ploidypH DML print(ct) #Confirm table is correct ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTestploidypH)[i]) #Add CpG type to results CpGLocationallDMLploidypH <- rbind(CpGLocationallDMLploidypH, ctResults) #Add test statistics to master table } ``` ```{r} head(CpGLocationallDMLploidypH) ``` ```{r} CpGLocationallDMLploidypH$p.adj <- p.adjust(CpGLocationallDMLploidypH$p.value, method = "fdr") #Correct p-value using FDR range(CpGLocationallDMLploidypH$p.adj) #Look at range of p-values head(CpGLocationallDMLploidypH) #Confirm changes ``` ```{r} write.csv(CpGLocationallDMLploidypH, "CpG-location-statResults-ploidypH.txt", quote = FALSE, row.names = TRUE) #Save statistical output ``` # Create stacked barplot ```{r} cpgFeatureOverlapsPercentsploidy <- cpgFeatureOverlapsploidy #Duplicate dataframe for (i in 1:length(cpgFeatureOverlapsploidy)) { cpgFeatureOverlapsPercentsploidy[,i] <- (cpgFeatureOverlapsPercentsploidy[,i] / (sum(cpgFeatureOverlapsPercentsploidy[,i]))) * 100 } #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information head(cpgFeatureOverlapsPercentsploidy) #Check calculations ``` ```{r} cpgFeatureOverlapsPercentspH <- cpgFeatureOverlapspH #Duplicate dataframe for (i in 1:length(cpgFeatureOverlapspH)) { cpgFeatureOverlapsPercentspH[,i] <- (cpgFeatureOverlapsPercentspH[,i] / (sum(cpgFeatureOverlapsPercentspH[,i]))) * 100 } #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information head(cpgFeatureOverlapsPercentspH) #Check calculations ``` ```{r} cpgFeatureOverlapsPercentsploidypH <- cpgFeatureOverlapsploidypH #Duplicate dataframe for (i in 1:length(cpgFeatureOverlapsploidypH)) { cpgFeatureOverlapsPercentsploidypH[,i] <- (cpgFeatureOverlapsPercentsploidypH[,i] / (sum(cpgFeatureOverlapsPercentsploidypH[,i]))) * 100 } #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information head(cpgFeatureOverlapsPercentsploidypH) #Check calculations ``` ```{r} cpgFeatureOverlapsPercentsBind <- cbind(cpgFeatureOverlapsPercentsploidy, cpgFeatureOverlapsPercentspH, cpgFeatureOverlapsPercentsploidypH) #Combine dataframes cpgFeatureOverlapsPercentsBind <- cpgFeatureOverlapsPercentsBind[,c(1,2,4,6)] #Reorganize and drop columns head(cpgFeatureOverlapsPercentsBind) ``` ```{r} #pdf("figures/DML-feature-overlaps.pdf", width = 11, height = 8.5) par(mar = c(1,5,0,1), oma = c(3, 1, 1, 11)) #Change figure boundaries barplot(t(t(cpgFeatureOverlapsPercentsBind[,c(1:4)])), col= rev(plotColors), axes = FALSE, names.arg = c("All 5x CpGs", "Ploidy", "pH", "Interaction"), cex.names = 1.5, ylim = c(0, 110)) #Create base plot. Everything should be white. Do not plot axes. Include sequencing type as labels and set size. Set y-axis specs. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex.axis = 1.2) #Add y-axis mtext(side = 2, "% CpGs", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend(x = 0.57, y = 0.87, xpd = TRUE, legend = c("Exon UTR", "CDS", "Introns", "Upstream Flank", "Downstream Flank", "lncRNA", "TE", "Intergenic"), pch = 22, col = "black", pt.bg = rev(plotColors), bty = "n", cex = 1.5, x.intersp = 0.7, xjust = 0) #Place a legend in the top right of the figure with no box text("Genome Feature", x = 0.785, y = 0.879, cex = 1.5) #Add legend title that is aligned with legend #dev.off() ``` ```{r} save.image("../../project-oyster-oa.RData") #Save R data ``` # Genes with multiple DML ## ploidy-DML ```{r} genePloidyOverlaps <- read.table("DML-ploidy-DSS-Gene-wb.bed", header = FALSE, sep = "\t") #Import table of gene-DML overlaps head(genePloidyOverlaps) #Confirm import ``` ```{r} geneIDPloidy <- read.table("geneID-ploidy-DML-overlap.tab") #Import column with gene ID information already separated nrow(geneIDPloidy) == nrow(genePloidyOverlaps) #Check that both dataframes have the same number of rows head(geneIDPloidy) #Confim import ``` ```{r} genePloidyOverlaps <- cbind(genePloidyOverlaps[,-c(4:6,9:12)], geneIDPloidy) #Add gene ID column to overlap information colnames(genePloidyOverlaps) <- c("chr", "DMLstart", "DMLend", "geneStart", "geneEnd", "geneID") #Add column names head(genePloidyOverlaps) #Confirm formatting ``` ```{r} genePloidyDMLCounts <- as.data.frame(table(genePloidyOverlaps$geneID)) #Create a dataframe with the number of DML/gene colnames(genePloidyDMLCounts) <- c("geneID", "numDML") #Add column names head(genePloidyDMLCounts) #Confirm formatting ``` ```{r} range(genePloidyDMLCounts$numDML) #Range of number DML/gene: 1-8 hist(genePloidyDMLCounts$numDML) #Plot the frequency of multiple DML in genes. Most genes have 1 DML ``` ```{r} write.csv(genePloidyDMLCounts, "Number-of-ploidy-DML-per-Gene.csv", quote = FALSE, row.names = FALSE) #Save file with number of DML per gene ``` ## pH-DML ```{r} genepHOverlaps <- read.table("DML-pH-DSS-Gene-wb.bed", header = FALSE, sep = "\t") #Import table of gene-DML overlaps head(genepHOverlaps) #Confirm import ``` ```{r} geneIDpH <- read.table("geneID-pH-DML-overlap.tab") #Import column with gene ID information already separated nrow(geneIDpH) == nrow(genepHOverlaps) #Check that both dataframes have the same number of rows head(geneIDpH) #Confim import ``` ```{r} genepHOverlaps <- cbind(genepHOverlaps[,-c(4:6,9:12)], geneIDpH) #Add gene ID column to overlap information colnames(genepHOverlaps) <- c("chr", "DMLstart", "DMLend", "geneStart", "geneEnd", "geneID") #Add column names head(genepHOverlaps) #Confirm formatting ``` ```{r} genepHDMLCounts <- as.data.frame(table(genepHOverlaps$geneID)) #Create a dataframe with the number of DML/gene colnames(genepHDMLCounts) <- c("geneID", "numDML") #Add column names head(genepHDMLCounts) #Confirm formatting ``` ```{r} range(genepHDMLCounts$numDML) #Range of number DML/gene: 1-8 hist(genepHDMLCounts$numDML) #Plot the frequency of multiple DML in genes. Most genes have 1 DML ``` ```{r} write.csv(genepHDMLCounts, "Number-of-pH-DML-per-Gene.csv", quote = FALSE, row.names = FALSE) #Save file with number of DML per gene ``` ## ploidy:pH-DML ```{r} genePloidypHOverlaps <- read.table("DML-ploidypH-DSS-Gene-wb.bed", header = FALSE, sep = "\t") #Import table of gene-DML overlaps head(genePloidypHOverlaps) #Confirm import ``` ```{r} geneIDPloidypH <- read.table("geneID-ploidypH-DML-overlap.tab") #Import column with gene ID information already separated nrow(geneIDPloidypH) == nrow(genePloidypHOverlaps) #Check that both dataframes have the same number of rows head(geneIDPloidypH) #Confim import ``` ```{r} genePloidypHOverlaps <- cbind(genePloidypHOverlaps[,-c(4:6,9:12)], geneIDPloidypH) #Add gene ID column to overlap information colnames(genePloidypHOverlaps) <- c("chr", "DMLstart", "DMLend", "geneStart", "geneEnd", "geneID") #Add column names head(genePloidypHOverlaps) #Confirm formatting ``` ```{r} genePloidypHDMLCounts <- as.data.frame(table(genePloidypHOverlaps$geneID)) #Create a dataframe with the number of DML/gene colnames(genePloidypHDMLCounts) <- c("geneID", "numDML") #Add column names head(genePloidypHDMLCounts) #Confirm formatting ``` ```{r} range(genePloidypHDMLCounts$numDML) #Range of number DML/gene: 1-23 hist(genePloidypHDMLCounts$numDML) #Plot the frequency of multiple DML in genes. Most genes have 1 DML ``` ```{r} write.csv(genePloidypHDMLCounts, "Number-of-ploidypH-DML-per-Gene.csv", quote = FALSE, row.names = FALSE) #Save file with number of DML per gene ``` ```{r} save.image("../../project-oyster-oa.RData") #Save R data ```