--- title: "Chacterizing CpG Methylation (5x Individual Samples)" output: github_document --- In this script, I'll create summary tables and figures to characterize CpG methylation in *M. capitata* and *P. acuta* using WGBS, RRBS, and MBD-BSSeq. I used individual samples with data for 5x CpGs in [this Jupyter notebook](https://github.com/hputnam/Meth_Compare/blob/master/scripts/Characterizing-CpG-Methylation-5x.ipynb) to identify methylation status and genomic location. I will use the output in this script. # Set up R Markdown Document ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Install packages ```{r} #install.packages("RColorBrewer") #Use for color palletes #install.packages("dichromat") #Discern color blind accessibility of figures #install.packages("compositions") #Compositional data analysis package #install.packages("vegan") #Multivariate analysis package #install.packages("cluster") #Multivariate analysis package #install.packages("glmmTMB") #Linear modeling package #install.packages("emmeans") ``` ```{r} require(RColorBrewer) require(dichromat) require(compositions) require(vegan) require(cluster) require(glmmTMB) require(emmeans) ``` # Session information ```{r} sessionInfo() ``` # CpG methylation status ## Mcap ### Import file counts ```{r} McapAll <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-bedgraph-counts.txt", header = FALSE, col.names = c("totalLines", "filename")) #Import file using space as a delimiter. Columns are the number of lines and the filename McapAll <- McapAll[-10,] #Remove last row (total lines for all files) tail(McapAll) #Confirm import ``` ```{r} McapMeth <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-Meth-counts.txt", header = FALSE, col.names = c("Meth", "filename")) #Import file using space as a delimiter. Columns are the number of lines and the filename McapMeth <- McapMeth[-10,] #Remove last row (total lines for all files) tail(McapMeth) #Confirm import ``` ```{r} McapSparseMeth <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-sparseMeth-counts.txt", header = FALSE, col.names = c("sparseMeth", "filename")) #Import file using space as a delimiter. Columns are the number of lines and the filename McapSparseMeth <- McapSparseMeth[-10,] #Remove last row (total lines for all files) tail(McapSparseMeth) #Confirm import ``` ```{r} McapUnMeth <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-unMeth-counts.txt", header = FALSE, col.names = c("unMeth", "filename")) #Import file using space as a delimiter. Columns are the number of lines and the filename McapUnMeth <- McapUnMeth[-10,] #Remove last row (total lines for all files) tail(McapUnMeth) #Confirm import ``` ### Create summary table ```{r} McapCpGType <- cbind(McapAll, McapMeth, McapSparseMeth, McapUnMeth) #Mash tables together by column rownames(McapCpGType) <- substr(McapAll$filename, start = 1, stop = 6) #Use the first 6 characters of the filename to add sample ID as row names McapCpGType <- McapCpGType[,-c(2,4,6,8)] #Remove filename columns tail(McapCpGType) #Confirm table mashing ``` ```{r} McapCpGType$percentMeth <- (McapCpGType$Meth / McapCpGType$totalLines) * 100 #Calculate percent methylated loci McapCpGType$percentSparseMeth <- (McapCpGType$sparseMeth / McapCpGType$totalLines) * 100 #Calculate percent sparsely methylated loci McapCpGType$percentUnMeth <- (McapCpGType$unMeth / McapCpGType$totalLines) * 100 #Calculate percent unmethylated loci McapCpGType <- McapCpGType[,c(1, 2, 5, 3, 6, 4, 7)] #Reorganize columns tail(McapCpGType) #Confirm calculations ``` ```{r} write.table(McapCpGType, "../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-CpG-Type.txt", sep = "\t", quote = FALSE, row.names = TRUE) #Save summary table ``` ### Reorganize data ```{r} McapCpGTypePercents <- McapCpGType[,c(3,5,7)] #Keep only columns with % total CpG information head(McapCpGTypePercents) #Check reorganization ``` ```{r} #Create test plots barplot(t(McapCpGTypePercents[1,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("High (">="50%)"), "Moderate (10-50%)", expression("Weak ("<="10%)")), axes = FALSE) #Sample 1, WGBS axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80") barplot(t(McapCpGTypePercents[4,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("High (">="50%)"), "Moderate (10-50%)", expression("Weak ("<="10%)")), axes = FALSE) #Sample 1, RRBS axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80") barplot(t(McapCpGTypePercents[7,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("High (">="50%)"), "Moderate (10-50%)", expression("Weak ("<="10%)")), axes = FALSE) #Sample 1, MBD-BSSeq axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80") ``` ### Create multipanel figure with all samples ```{r} #pdf("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-CpG-Type.pdf", height = 8.5, width = 11) #Save file as pdf par(mfcol = c(3,3), mar = c(2, 2, 2, 0), oma = c(5, 5, 2, 0)) #Fill in multipanel plot by column and adjust inner and outer margins barplot(t(McapCpGTypePercents[1,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 1, WGBS axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80", cex.axis = 1.3) #Add y-axis mtext(side = 3, "WBGS", adj = 0, line = 1) #Add sequencing information barplot(t(McapCpGTypePercents[4,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 1, RRBS mtext(side = 3, "RRBS", adj = 0, line = 1) #Add sequencing information axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80", cex.axis = 1.3) #Add y-axis mtext(side = 2, outer = TRUE, "% 5x CpG with Data", cex = 1.5, line = 2) #Add y-axis label barplot(t(McapCpGTypePercents[7,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("">= "50%"), "10-50%", "< 10%"), cex.names = 1.3, axes = FALSE) #Sample 1, MBD-BSSeq mtext(side = 3, "MBD-BSSeq", adj = 0, line = 1) #Add sequencing information axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80", cex.axis = 1.3) #Add y-axis barplot(t(McapCpGTypePercents[2,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 2, WGBS barplot(t(McapCpGTypePercents[5,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 2, RRBS barplot(t(McapCpGTypePercents[8,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("">= "50%"), "10-50%", "< 10%"), cex.names = 1.3, axes = FALSE) #Sample 2, MBD-BSSeq mtext(side = 1, outer = TRUE, "% Methylation", cex = 1.5, line = 2) #Add x-axis label barplot(t(McapCpGTypePercents[3,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 3, WGBS barplot(t(McapCpGTypePercents[6,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 3, RRBS barplot(t(McapCpGTypePercents[9,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("">= "50%"), "10-50%", "< 10%"), cex.names = 1.3, axes = FALSE) #Sample 3, MBD-BSSeq #dev.off() #Turn off plotting device ``` ### Multivariate data analysis with compositional data We are interested in how sequencing method affects the proportion, or composition, of CpGs in various methylation statuses and genomic features. For this reason, I will use a combined compositional analysis and multivariate approach. For each sample, I will have separate columns for each methylation status. #### Format data ```{r} McapCpGPercentsTrans <- data.frame(clr(McapCpGTypePercents / 100)) #Use centered log-ratio transformation on proportion data tail(McapCpGPercentsTrans) # Confirm transformation ``` #### PCoA and perMANOVA ##### Conduct PCoA ```{r} dissimMcapCpGPercentsTrans <- vegdist(McapCpGPercentsTrans, "euclidean") #Calculate euclidean dissimilarity matrix ``` ```{r} McapCpGPercentsPCoA <- cmdscale(dissimMcapCpGPercentsTrans, eig = TRUE, add = TRUE) #Perform the PCoA. Include eigenvalues for each PC, and add a constant so default eigenvalues are non-negative. McapCpGPercentsPCoA$points #View PC scores ``` ##### Understand eigenvalues and loadings ```{r} McapCpGPercentsPCoA$eig #View eigenvalues (McapCpGPercentsPCoA$eig / sum(McapCpGPercentsPCoA$eig)) * 100 #Calculate percent variation explained by each PC ``` ```{r} plot(McapCpGPercentsPCoA$eig/sum(McapCpGPercentsPCoA$eig)*100, type = "b",lwd = 2,col = "blue", xlab = "Principal Component from PCoA", ylab = "% variation explained", main = "% variation explained by PCoA (blue) vs. random expectation (red)") #Plot eigenvalues lines(bstick(35)*100, type = "b",lwd = 2, col = "red") #Compare eigenvalues to expectations according to the broken stick model ``` ```{r} vec.McapCpGPercentsPCoA <- envfit(scores(McapCpGPercentsPCoA), McapCpGPercentsTrans, perm = 1000) #Extract PCs to calculate PC loadings (variable weights) vec.McapCpGPercentsPCoA #Look at statistical results ``` ##### Global perMANOVA ```{r} sampleInformation <- c(rep("WGBS", times = 3), rep("RRBS", times = 3), rep("MBDBS", times = 3)) #Create a vector with grouping information ``` ```{r} McapCpGPercentsTest <- adonis(dissimMcapCpGPercentsTrans ~ sampleInformation) #Conduct perMANOVA by method McapCpGPercentsTest #Look at test output. ``` ##### Beta dispersion model ```{r} disp.McapCpGPercentsTrans <- betadisper(dissimMcapCpGPercentsTrans,group=sampleInformation,type='centroid') #Run a beta dispersion model to assess if significant differences are due to differences in group centroid or variance anova(disp.McapCpGPercentsTrans) #Variance is the same across all groups. Significance in perMANOVA due to centroid differences, not variance ``` ##### Create plot ```{r} ordiplot(McapCpGPercentsPCoA, choices = c(1,2), type = "text", display = "sites", xlab = "PC 1 (83.1%)", ylab = "PC 2 (16.9%)") #Plot basic PCoA plot(vec.McapCpGPercentsPCoA, p.max = 0.05, col = "blue") #Plot loadings that are significant at the 0.05 level ``` #### Pairwise perMANVOA ##### WGBS vs. RRBS ```{r} McapCpGPercentsWGRR <- vegdist(McapCpGPercentsTrans[c(1:3, 4:6),], "euclidean") #Subset WGBS and RRBS data and calculate dissimilarity matrix ``` ```{r} McapCpGPercentsWGRRTest <- adonis(McapCpGPercentsWGRR ~ sampleInformation[1:6]) #Conduct pairwise perMANOVA for WGBS and RRBS data. Only use sequencing method metadata for these samples McapCpGPercentsWGRRTest ``` ##### WGBS vs. MBD-BS ```{r} McapCpGPercentsWGMB <- vegdist(McapCpGPercentsTrans[c(1:3, 7:9),], "euclidean") #Subset WGBS and MBD-BS data and calculate dissimilarity matrix ``` ```{r} McapCpGPercentsWGMBTest <- adonis(McapCpGPercentsWGMB ~ sampleInformation[c(1:3, 7:9)]) #Conduct pairwise perMANOVA for WGBS and MBD-BS data. Only use sequencing method metadata for these samples McapCpGPercentsWGMBTest ``` ##### RRBS vs. MBD-BS ```{r} McapCpGPercentsRRMB <- vegdist(McapCpGPercentsTrans[c(4:9),], "euclidean") #Subset RRBS and MBD-BS data and calculate dissimilarity matrix ``` ```{r} McapCpGPercentsRRMBTest <- adonis(McapCpGPercentsRRMB ~ sampleInformation[c(4:9)]) #Conduct pairwise perMANOVA for WGBS and RRBS data. Only use sequencing method metadata for these samples McapCpGPercentsRRMBTest ``` ### Generalized linear model analysis To complement my multivariate analysis, I will use a series of generalized linear models. I will run separate models for each methylation status since the proportions of each add up to 1. Does sequencing method influence the proportion of high, moderate, or low CpGs detected? #### High methylation ```{r} McapCpGPercentsHigh <- data.frame("seqMethod" = c(rep("A", times = 3), rep("B", times = 3), rep("C", times = 3)), "replicate" = rep(c("1", "2", "3"), times = 3), "highMeth" = McapCpGTypePercents[,1] / 100) #Remove moderate and low methylation information and add sequencing metadata. A = WGBS, B = RRBS, C = MBD-BS. Needed to alphabetize so RRBS and MBD-BS are compared to WGBS (model default is alphabetized) head(McapCpGPercentsHigh) #Confirm dataframe creation ``` ```{r} McapCpGHighModel <- glmmTMB(highMeth ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = McapCpGPercentsHigh) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(McapCpGHighModel) #Look at model output. ``` ```{r} McapCpGHighPostHoc <- data.frame(emmeans(McapCpGHighModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(McapCpGHighPostHoc) #Look at log odd ratio results ``` #### Moderate methylation ```{r} McapCpGPercentsMod <- data.frame("seqMethod" = c(rep("A", times = 3), rep("B", times = 3), rep("C", times = 3)), "replicate" = rep(c("1", "2", "3"), times = 3), "modMeth" = McapCpGTypePercents[,2] / 100) #Remove high and low methylation information and add sequencing metadata head(McapCpGPercentsMod) #Confirm dataframe creation ``` ```{r} McapCpGModModel <- glmmTMB(modMeth ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = McapCpGPercentsMod) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(McapCpGModModel) #Look at model output. ``` ```{r} McapCpGModPostHoc <- data.frame(emmeans(McapCpGModModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(McapCpGModPostHoc) #Look at log odd ratio results ``` #### Low methylation ```{r} McapCpGPercentsLow <- data.frame("seqMethod" = c(rep("A", times = 3), rep("B", times = 3), rep("C", times = 3)), "replicate" = rep(c("1", "2", "3"), times = 3), "lowMeth" = McapCpGTypePercents[,3] / 100) #Remove moderate and low methylation information and add sequencing metadata head(McapCpGPercentsLow) #Confirm dataframe creation ``` ```{r} McapCpGLowModel <- glmmTMB(lowMeth ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = McapCpGPercentsLow) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(McapCpGLowModel) #Look at model output. ``` ```{r} McapCpGLowPostHoc <- data.frame(emmeans(McapCpGLowModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(McapCpGLowPostHoc) #Look at log odd ratio results ``` #### Save statistical output ```{r} McapCpGMethStatusStatOutput <- rbind(McapCpGHighPostHoc, McapCpGModPostHoc, McapCpGLowPostHoc) #Create a dataframe with logs odd ratio output for each model McapCpGMethStatusStatOutput$model <- c(rep("High", times = 3), rep("Mod", times = 3), rep("Low", times = 3)) #Add model information head(McapCpGMethStatusStatOutput) #Confirm dataframe creation ``` ```{r} write.table(McapCpGMethStatusStatOutput, "../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-CpG-Type-StatResults.txt", quote = FALSE, row.names = FALSE) #Save table ``` ## Pact ### Import file counts ```{r} PactAll <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-bedgraph-counts.txt", header = FALSE, col.names = c("totalLines", "filename")) #Import file using space as a delimiter. Columns are the number of lines and the filename PactAll <- PactAll[-10,] #Remove last row (total lines for all files) tail(PactAll) #Confirm import ``` ```{r} PactMeth <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-Meth-counts.txt", header = FALSE, col.names = c("Meth", "filename")) #Import file using space as a delimiter. Columns are the number of lines and the filename PactMeth <- PactMeth[-10,] #Remove last row (total lines for all files) tail(PactMeth) #Confirm import ``` ```{r} PactSparseMeth <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-sparseMeth-counts.txt", header = FALSE, col.names = c("sparseMeth", "filename")) #Import file using space as a delimiter. Columns are the number of lines and the filename PactSparseMeth <- PactSparseMeth[-10,] #Remove last row (total lines for all files) tail(PactSparseMeth) #Confirm import ``` ```{r} PactUnMeth <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-unMeth-counts.txt", header = FALSE, col.names = c("unMeth", "filename")) #Import file using space as a delimiter. Columns are the number of lines and the filename PactUnMeth <- PactUnMeth[-10,] #Remove last row (total lines for all files) tail(PactUnMeth) #Confirm import ``` ### Create summary table ```{r} PactCpGType <- cbind(PactAll, PactMeth, PactSparseMeth, PactUnMeth) #Mash tables together by column rownames(PactCpGType) <- substr(PactAll$filename, start = 1, stop = 5) #Use the first 5 characters of the filename to add sample ID to row names PactCpGType <- PactCpGType[,-c(2,4,6,8)] #Remove filename columns tail(PactCpGType) #Confirm table mashing ``` ```{r} PactCpGType$percentMeth <- (PactCpGType$Meth / PactCpGType$totalLines) * 100 #Calculate percent methylated loci PactCpGType$percentSparseMeth <- (PactCpGType$sparseMeth / PactCpGType$totalLines) * 100 #Calculate percent sparsely methylated loci PactCpGType$percentUnMeth <- (PactCpGType$unMeth / PactCpGType$totalLines) * 100 #Calculate percent unmethylated loci PactCpGType <- PactCpGType[,c(1, 2, 5, 3, 6, 4, 7)] #Reorganize columns tail(PactCpGType) #Confirm calculations ``` ```{r} write.table(PactCpGType, "../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-CpG-Type.txt", sep = "\t", quote = FALSE, row.names = TRUE) #Save summary table ``` ```{r} PactCpGType <- read.delim("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-CpG-Type.txt", sep = "\t", header = TRUE) #Import summary table head(PactCpGType) #Check import ``` ### Reorganize data ```{r} PactCpGTypePercents <- PactCpGType[,c(3,5,7)] #Keep only columns with % total CpG information tail(PactCpGTypePercents) #Check reorganization ``` ```{r} #Create test plots barplot(t(PactCpGTypePercents[1,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("High (">="50%)"), "Moderate (10-50%)", expression("Weak ("<="10%)")), axes = FALSE) #Sample 1, WGBS axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80") barplot(t(PactCpGTypePercents[4,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("High (">="50%)"), "Moderate (10-50%)", expression("Weak ("<="10%)")), axes = FALSE) #Sample 1, RRBS axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80") barplot(t(PactCpGTypePercents[7,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("High (">="50%)"), "Moderate (10-50%)", expression("Weak ("<="10%)")), axes = FALSE) #Sample 1, MBD-BSSeqc(expression("High (">="50%)"), "Moderate (10-50%)", expression("Weak ("<="10%)")) axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80") ``` ### Create multipanel figure with all samples ```{r} #pdf("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-CpG-Type.pdf", height = 8.5, width = 11) #Save file as pdf par(mfcol = c(3,3), mar = c(2, 2, 2, 0), oma = c(5, 5, 2, 0)) #Fill in multipanel plot by column and adjust inner and outer margins barplot(t(PactCpGTypePercents[1,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 1, WGBS mtext(side = 3, "WBGS", adj = 0, line = 1) #Add sequencing information axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80", cex.axis = 1.3) #Add y-axis barplot(t(PactCpGTypePercents[4,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 1, RRBS mtext(side = 3, "RRBS", adj = 0, line = 1) #Add sequencing information axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80", cex.axis = 1.3) #Add y-axis mtext(side = 2, outer = TRUE, "% 5x CpG with Data", cex = 1.5, line = 2) #Add y-axis label barplot(t(PactCpGTypePercents[7,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("">= "50%"), "10-50%", "< 10%"), cex.names = 1.3, axes = FALSE) #Sample 1, MBD-BSSeq mtext(side = 3, "MBD-BSSeq", adj = 0, line = 1) #Add sequencing information axis(side = 2, at = seq(0, 100, by = 25), las = 2, col = "grey80", cex.axis = 1.3) #Add y-axis barplot(t(PactCpGTypePercents[2,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 2, WGBS barplot(t(PactCpGTypePercents[5,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 2, RRBS barplot(t(PactCpGTypePercents[8,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("">= "50%"), "10-50%", "< 10%"), cex.names = 1.3, axes = FALSE) #Sample 2, MBD-BSSeq mtext(side = 1, outer = TRUE, "% Methylation", cex = 1.5, line = 2) #Add x-axis label barplot(t(PactCpGTypePercents[3,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 3, WGBS barplot(t(PactCpGTypePercents[6,]), beside = TRUE, ylim = c(0,100), names.arg = FALSE, axes = FALSE) #Sample 3, RRBS barplot(t(PactCpGTypePercents[9,]), beside = TRUE, ylim = c(0,100), names.arg = c(expression("">= "50%"), "10-50%", "< 10%"), cex.names = 1.3, axes = FALSE) #Sample 3, MBD-BSSeq #dev.off() #Turn off plotting device ``` ### Multivariate data analysis with compositional data We are interested in how sequencing method affects the proportion, or composition, of CpGs in various methylation statuses and genomic features. For this reason, I will use a combined compositional analysis and multivariate approach. For each sample, I will have separate columns for each methylation status. #### Format data ```{r} PactCpGPercentsTrans <- data.frame(clr(PactCpGTypePercents / 100)) #Use centered log-ratio transformation on proportion data tail(PactCpGPercentsTrans) # Confirm transformation ``` #### PCoA and perMANOVA ##### Conduct PCoA ```{r} dissimPactCpGPercentsTrans <- vegdist(PactCpGPercentsTrans, "euclidean") #Calculate euclidean dissimilarity matrix ``` ```{r} PactCpGPercentsPCoA <- cmdscale(dissimPactCpGPercentsTrans, eig = TRUE, add = TRUE) #Perform the PCoA. Include eigenvalues for each PC, and add a constant so default eigenvalues are non-negative. PactCpGPercentsPCoA$points #View PC scores ``` ##### Understand eigenvalues and loadings ```{r} PactCpGPercentsPCoA$eig #View eigenvalues (PactCpGPercentsPCoA$eig / sum(PactCpGPercentsPCoA$eig)) * 100 #Calculate percent variation explained by each PC ``` ```{r} plot(PactCpGPercentsPCoA$eig/sum(PactCpGPercentsPCoA$eig)*100, type = "b",lwd = 2,col = "blue", xlab = "Principal Component from PCoA", ylab = "% variation explained", main = "% variation explained by PCoA (blue) vs. random expectation (red)") #Plot eigenvalues lines(bstick(35)*100, type = "b",lwd = 2, col = "red") #Compare eigenvalues to expectations according to the broken stick model ``` ```{r} vec.PactCpGPercentsPCoA <- envfit(scores(PactCpGPercentsPCoA), PactCpGPercentsTrans, perm = 1000) #Extract PCs to calculate PC loadings (variable weights) vec.PactCpGPercentsPCoA #Look at statistical results ``` ##### Global perMANOVA ```{r} PactCpGPercentsTest <- adonis(dissimPactCpGPercentsTrans ~ sampleInformation) #Conduct perMANOVA by method PactCpGPercentsTest #Look at test output. ``` ##### Beta dispersion model ```{r} disp.PactCpGPercentsTrans <- betadisper(dissimPactCpGPercentsTrans,group=sampleInformation,type='centroid') #Run a beta dispersion model to assess if significant differences are due to differences in group centroid or variance anova(disp.PactCpGPercentsTrans) #Variance is different between groups. Significance in perMANOVA can be due to centroid and variance differences. ``` ##### Create plot ```{r} ordiplot(PactCpGPercentsPCoA, choices = c(1,2), type = "text", display = "sites", xlab = "PC 1 (97.9%)", ylab = "PC 2 (2.1%)") #Plot basic PCoA plot(vec.PactCpGPercentsPCoA, p.max = 0.05, col = "blue") #Plot loadings that are significant at the 0.05 level ``` #### Pairwise perMANVOA ##### WGBS vs. RRBS ```{r} PactCpGPercentsWGRR <- vegdist(PactCpGPercentsTrans[c(1:3, 4:6),], "euclidean") #Subset WGBS and RRBS data and calculate dissimilarity matrix ``` ```{r} PactCpGPercentsWGRRTest <- adonis(PactCpGPercentsWGRR ~ sampleInformation[1:6]) #Conduct pairwise perMANOVA for WGBS and RRBS data. Only use sequencing method metadata for these samples PactCpGPercentsWGRRTest ``` ```{r} disp.PactCpGPercentsWGRR <- betadisper(PactCpGPercentsWGRR, group = sampleInformation[1:6], type = 'centroid') #Run a beta dispersion model to assess if differences are due to differences in group centroid or variance anova(disp.PactCpGPercentsWGRR) #Variance is the same between groups. ``` ##### WGBS vs. MBD-BS ```{r} PactCpGPercentsWGMB <- vegdist(PactCpGPercentsTrans[c(1:3, 7:9),], "euclidean") #Subset WGBS and MBD-BS data and calculate dissimilarity matrix ``` ```{r} PactCpGPercentsWGMBTest <- adonis(PactCpGPercentsWGMB ~ sampleInformation[c(1:3, 7:9)]) #Conduct pairwise perMANOVA for WGBS and MBD-BS data. Only use sequencing method metadata for these samples PactCpGPercentsWGMBTest ``` ```{r} disp.PactCpGPercentsWGMB <- betadisper(PactCpGPercentsWGMB, group = sampleInformation[c(1:3, 7:9)], type = 'centroid') #Run a beta dispersion model to assess if differences are due to differences in group centroid or variance anova(disp.PactCpGPercentsWGMB) #Variance is significantly different between groups. ``` ##### RRBS vs. MBD-BS ```{r} PactCpGPercentsRRMB <- vegdist(PactCpGPercentsTrans[c(4:9),], "euclidean") #Subset RRBS and MBD-BS data and calculate dissimilarity matrix ``` ```{r} PactCpGPercentsRRMBTest <- adonis(PactCpGPercentsRRMB ~ sampleInformation[c(4:9)]) #Conduct pairwise perMANOVA for WGBS and RRBS data. Only use sequencing method metadata for these samples PactCpGPercentsRRMBTest ``` ```{r} disp.PactCpGPercentsRRMB <- betadisper(PactCpGPercentsRRMB, group = sampleInformation[c(4:9)], type = 'centroid') #Run a beta dispersion model to assess if differences are due to differences in group centroid or variance anova(disp.PactCpGPercentsRRMB) #Variance is marginally different between groups. ``` ### Generalized linear model analysis #### High methylation ```{r} PactCpGPercentsHigh <- data.frame("seqMethod" = c(rep("A", times = 3), rep("B", times = 3), rep("C", times = 3)), "replicate" = rep(c("1", "2", "3"), times = 3), "highMeth" = PactCpGTypePercents[,1] / 100) #Remove moderate and low methylation information and add sequencing metadata. A = WGBS, B = RRBS, C = MBD-BS. Needed to alphabetize so RRBS and MBD-BS are compared to WGBS (model default is alphabetized) head(PactCpGPercentsHigh) #Confirm dataframe creation ``` ```{r} PactCpGHighModel <- glmmTMB(highMeth ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = PactCpGPercentsHigh) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(PactCpGHighModel) #Look at model output. ``` ```{r} PactCpGHighPostHoc <- data.frame(emmeans(PactCpGHighModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(PactCpGHighPostHoc) #Look at log odd ratio results ``` #### Moderate methylation ```{r} PactCpGPercentsMod <- data.frame("seqMethod" = c(rep("A", times = 3), rep("B", times = 3), rep("C", times = 3)), "replicate" = rep(c("1", "2", "3"), times = 3), "modMeth" = PactCpGTypePercents[,2] / 100) #Remove high and low methylation information and add sequencing metadata head(PactCpGPercentsMod) #Confirm dataframe creation ``` ```{r} PactCpGModModel <- glmmTMB(modMeth ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = PactCpGPercentsMod) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(PactCpGModModel) #Look at model output. ``` ```{r} PactCpGModPostHoc <- data.frame(emmeans(PactCpGModModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(PactCpGModPostHoc) #Look at log odd ratio results ``` #### Low methylation ```{r} PactCpGPercentsLow <- data.frame("seqMethod" = c(rep("A", times = 3), rep("B", times = 3), rep("C", times = 3)), "replicate" = rep(c("1", "2", "3"), times = 3), "lowMeth" = PactCpGTypePercents[,3] / 100) #Remove moderate and low methylation information and add sequencing metadata head(PactCpGPercentsLow) #Confirm dataframe creation ``` ```{r} PactCpGLowModel <- glmmTMB(lowMeth ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = PactCpGPercentsLow) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(PactCpGLowModel) #Look at model output. ``` ```{r} PactCpGLowPostHoc <- data.frame(emmeans(PactCpGLowModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(PactCpGLowPostHoc) #Look at log odd ratio results ``` #### Save statistical output ```{r} PactCpGMethStatusStatOutput <- rbind(PactCpGHighPostHoc, PactCpGModPostHoc, PactCpGLowPostHoc) #Create a dataframe with logs odd ratio output for each model PactCpGMethStatusStatOutput$model <- c(rep("High", times = 3), rep("Mod", times = 3), rep("Low", times = 3)) #Add model information head(PactCpGMethStatusStatOutput) #Confirm dataframe creation ``` ```{r} write.table(PactCpGMethStatusStatOutput, "../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-CpG-Type-StatResults.txt", quote = FALSE, row.names = FALSE) #Save table ``` # CpG genomic location ## Mcap ### Import file counts ```{r} McapGenomeFeatures <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-CGMotif-Overlaps-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with CG motif and feature track overlaps McapGenomeFeatures <- McapGenomeFeatures[-8,] #Remove final row tail(McapGenomeFeatures) #Check import ``` ```{r} McapGeneOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-mcGenes-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-gene overlaps McapGeneOverlaps <- McapGeneOverlaps[-37,] #Remove final row tail(McapGeneOverlaps) #Confirm import ``` ```{r} McapCDSOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-mcCDS-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-CDS overlaps McapCDSOverlaps <- McapCDSOverlaps[-37,] #Remove final row tail(McapCDSOverlaps) #Confirm import ``` ```{r} McapIntronsOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-mcIntrons-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-Introns overlaps McapIntronsOverlaps <- McapIntronsOverlaps[-37,] #Remove final row tail(McapIntronsOverlaps) #Confirm import ``` ```{r} McapFlanksOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-mcFlanks-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-flank overlaps McapFlanksOverlaps <- McapFlanksOverlaps[-37,] #Remove final row tail(McapFlanksOverlaps) #Confirm import ``` ```{r} McapFlanksUpstreamOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-mcFlanksUpstream-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-upstream flank overlaps McapFlanksUpstreamOverlaps <- McapFlanksUpstreamOverlaps[-37,] #Remove final row tail(McapFlanksUpstreamOverlaps) #Confirm import ``` ```{r} McapFlanksDownstreamOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-mcFlanksDownstream-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-Introns overlaps McapFlanksUpstreamOverlaps <- McapFlanksUpstreamOverlaps[-37,] #Remove final row tail(McapFlanksDownstreamOverlaps) #Confirm import ``` ```{r} McapIntergenicOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-5x-mcIntergenic-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-Intergenic overlaps McapIntergenicOverlaps <- McapIntergenicOverlaps[-37,] #Remove final row tail(McapIntergenicOverlaps) #Confirm import ``` ### Create summary table ```{r} McapFeatureOverlaps <- data.frame("allCpGs" = rep(0, times = 7), "Meth10Meth" = rep(0, times = 7), "Meth10sparseMeth" = rep(0, times = 7), "Meth10unMeth" = rep(0, times = 7), "Meth10" = rep(0, times = 7), "Meth11Meth" = rep(0, times = 7), "Meth11sparseMeth" = rep(0, times = 7), "Meth11unMeth" = rep(0, times = 7), "Meth11" = rep(0, times = 7), "Meth12Meth" = rep(0, times = 7), "Meth12sparseMeth" = rep(0, times = 7), "Meth12unMeth" = rep(0, times = 7), "Meth12" = rep(0, times = 7), "Meth13Meth" = rep(0, times = 7), "Meth13sparseMeth" = rep(0, times = 7), "Meth13unMeth" = rep(0, times = 7), "Meth13" = rep(0, times = 7), "Meth14Meth" = rep(0, times = 7), "Meth14sparseMeth" = rep(0, times = 7), "Meth14unMeth" = rep(0, times = 7), "Meth14" = rep(0, times = 7), "Meth15Meth" = rep(0, times = 7), "Meth15sparseMeth" = rep(0, times = 7), "Meth15unMeth" = rep(0, times = 7), "Meth15" = rep(0, times = 7), "Meth16Meth" = rep(0, times = 7), "Meth16sparseMeth" = rep(0, times = 7), "Meth16unMeth" = rep(0, times = 7), "Meth16" = rep(0, times = 7), "Meth17Meth" = rep(0, times = 7), "Meth17sparseMeth" = rep(0, times = 7), "Meth17unMeth" = rep(0, times = 7), "Meth17" = rep(0, times = 7), "Meth18Meth" = rep(0, times = 7), "Meth18sparseMeth" = rep(0, times = 7), "Meth18unMeth" = rep(0, times = 7), "Meth18" = rep(0, times = 7)) #Create blank dataframe with information for various CpG categories and methylation status. Match columns to the order of columns in the overlap count files row.names(McapFeatureOverlaps) <- c("Genes", "CDS", "Introns", "Flanking Regions", "Upstream Flanks", "Downstream Flanks", "Intergenic") #Assign row names head(McapFeatureOverlaps) #Confirm changes ``` ```{r} McapFeatureOverlaps$allCpGs <- c(McapGenomeFeatures$counts[5], McapGenomeFeatures$counts[1], McapGenomeFeatures$counts[7], McapGenomeFeatures$counts[3], McapGenomeFeatures$counts[4], McapGenomeFeatures$counts[2], McapGenomeFeatures$counts[6]) #Assign information for CG motif overlaps with genome features. head(McapFeatureOverlaps) #Confirm modification ``` ```{r} for (i in 1:length(McapGeneOverlaps$counts)) { McapFeatureOverlaps[1,i+1] <- McapGeneOverlaps[i,1] McapFeatureOverlaps[2,i+1] <- McapCDSOverlaps[i,1] McapFeatureOverlaps[3,i+1] <- McapIntronsOverlaps[i,1] McapFeatureOverlaps[4,i+1] <- McapFlanksOverlaps[i,1] McapFeatureOverlaps[5,i+1] <- McapFlanksUpstreamOverlaps[i,1] McapFeatureOverlaps[6,i+1] <- McapFlanksDownstreamOverlaps[i,1] McapFeatureOverlaps[7,i+1] <- McapIntergenicOverlaps[i,1] } #For each table with feature overlap information, paste the contents of the count column in the assigned row tail(McapFeatureOverlaps) #Check summary table ``` ```{r} write.table(McapFeatureOverlaps, "../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap_union-Genomic-Location-Counts.txt", sep = "\t", quote = FALSE, row.names = TRUE) #Save file ``` ```{r} McapFeatureOverlapsPercents <- McapFeatureOverlaps[-c(1,4),] #Duplicate dataframe but remove gene and total flank rows for (i in 1:length(McapFeatureOverlaps)) { McapFeatureOverlapsPercents[,i] <- ((McapFeatureOverlapsPercents[,i] / sum(McapFeatureOverlapsPercents[,i]))) * 100 } #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information head(McapFeatureOverlapsPercents) #Check calculations ``` ```{r} write.table(McapFeatureOverlapsPercents, "../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap_union-Genomic-Location-Percents.txt", sep = "\t", quote = FALSE, row.names = TRUE) #Save file ``` ### Multivariate data analysis with compositional data We are interested in how sequencing method affects the proportion, or composition, of CpGs in various methylation statuses and genomic features. For this reason, I will use a combined compositional analysis and multivariate approach. For each sample, I will have separate columns for each methylation status. #### Format data ```{r} McapFeatureOverlapsTrans <- data.frame(clr(t(McapFeatureOverlapsPercents[,seq(5, 37, 4)] / 100))) #Use centered log-ratio transformation on proportion data tail(McapFeatureOverlapsTrans) # Confirm transformation ``` #### PCoA and perMANOVA ##### Conduct PCoA ```{r} dissimMcapFeatureOverlapsTrans <- vegdist(McapFeatureOverlapsTrans, "euclidean") #Calculate euclidean dissimilarity matrix ``` ```{r} McapFeatureOverlapsPCoA <- cmdscale(dissimMcapFeatureOverlapsTrans, eig = TRUE, add = TRUE) #Perform the PCoA. Include eigenvalues for each PC, and add a constant so default eigenvalues are non-negative. McapFeatureOverlapsPCoA$points #View PC scores ``` ##### Understand eigenvalues and loadings ```{r} McapFeatureOverlapsPCoA$eig #View eigenvalues (McapFeatureOverlapsPCoA$eig / sum(McapFeatureOverlapsPCoA$eig)) * 100 #Calculate percent variation explained by each PC ``` ```{r} plot(McapFeatureOverlapsPCoA$eig/sum(McapFeatureOverlapsPCoA$eig)*100, type = "b",lwd = 2,col = "blue", xlab = "Principal Component from PCoA", ylab = "% variation explained", main = "% variation explained by PCoA (blue) vs. random expectation (red)") #Plot eigenvalues lines(bstick(35)*100, type = "b",lwd = 2, col = "red") #Compare eigenvalues to expectations according to the broken stick model ``` ```{r} vec.McapFeatureOverlapsPCoA <- envfit(scores(McapFeatureOverlapsPCoA), McapFeatureOverlapsTrans, perm = 1000) #Extract PCs to calculate PC loadings (variable weights) vec.McapFeatureOverlapsPCoA #Look at statistical results ``` ##### Global perMANOVA ```{r} McapFeatureOverlapsTest <- adonis(dissimMcapFeatureOverlapsTrans ~ sampleInformation) #Conduct perMANOVA by method McapFeatureOverlapsTest #Look at test output. ``` ##### Beta dispersion model ```{r} disp.McapFeatureOverlapsTrans <- betadisper(dissimMcapFeatureOverlapsTrans, group = sampleInformation, type = 'centroid') #Run a beta dispersion model to assess if significant differences are due to differences in group centroid or variance anova(disp.McapFeatureOverlapsTrans) #Variance is the same between groups. Significance in perMANOVA due to centroid differences, not variance ``` ##### Create plot ```{r} ordiplot(McapFeatureOverlapsPCoA, choices = c(1,2), type = "text", display = "sites", xlab = "PC 1 (92.3%)", ylab = "PC 2 (7.4%)") #Plot basic PCoA plot(vec.McapFeatureOverlapsPCoA, p.max = 0.05, col = "blue") #Plot loadings that are significant at the 0.05 level ``` #### Pairwise perMANVOA ##### WGBS vs. RRBS ```{r} McapFeatureOverlapsWGRR <- vegdist(McapFeatureOverlapsTrans[c(1:3, 4:6),], "euclidean") #Subset WGBS and RRBS data and calculate dissimilarity matrix ``` ```{r} McapFeatureOverlapsWGRRTest <- adonis(McapFeatureOverlapsWGRR ~ sampleInformation[1:6]) #Conduct pairwise perMANOVA for WGBS and RRBS data. Only use sequencing method metadata for these samples McapFeatureOverlapsWGRRTest ``` ##### WGBS vs. MBD-BS ```{r} McapFeatureOverlapsWGMB <- vegdist(McapFeatureOverlapsTrans[c(1:3, 7:9),], "euclidean") #Subset WGBS and MBD-BS data and calculate dissimilarity matrix ``` ```{r} McapFeatureOverlapsWGMBTest <- adonis(McapFeatureOverlapsWGMB ~ sampleInformation[c(1:3, 7:9)]) #Conduct pairwise perMANOVA for WGBS and MBD-BS data. Only use sequencing method metadata for these samples McapFeatureOverlapsWGMBTest ``` ##### RRBS vs. MBD-BS ```{r} McapFeatureOverlapsRRMB <- vegdist(McapFeatureOverlapsTrans[c(4:9),], "euclidean") #Subset RRBS and MBD-BS data and calculate dissimilarity matrix ``` ```{r} McapFeatureOverlapsRRMBTest <- adonis(McapFeatureOverlapsRRMB ~ sampleInformation[c(4:9)]) #Conduct pairwise perMANOVA for WGBS and RRBS data. Only use sequencing method metadata for these samples McapFeatureOverlapsRRMBTest ``` ### Generalized linear model analysis Does sequencing method influence the proportion of CpGs detected in various genome features? #### Format data ```{r} McapFeatureOverlapsGLMData <- cbind("seqMethod" = c(rep("A", times = 3), rep("B", times = 3), rep("C", times = 3)), "replicate" = rep(seq(1:3), times = 3), data.frame(t(McapFeatureOverlapsPercents[,seq(5, 37, 4)] / 100))) #Create master dataframe for GLM head(McapFeatureOverlapsGLMData) #Confirm dataframe creation ``` #### CDS ```{r} McapCDSModel <- glmmTMB(CDS ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = McapFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(McapCDSModel) #Look at model output. ``` ```{r} McapCDSPostHoc <- data.frame(emmeans(McapCDSModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(McapCDSPostHoc) #Look at log odd ratio results ``` #### Introns ```{r} McapIntronsModel <- glmmTMB(Introns ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = McapFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(McapIntronsModel) #Look at model output. ``` ```{r} McapIntronsPostHoc <- data.frame(emmeans(McapIntronsModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(McapIntronsPostHoc) #Look at log odd ratio results ``` #### Upstream Flanks ```{r} McapUpstreamFlanksModel <- glmmTMB(Upstream.Flanks ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = McapFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(McapUpstreamFlanksModel) #Look at model output ``` ```{r} McapUpstreamFlanksPostHoc <- data.frame(emmeans(McapUpstreamFlanksModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(McapUpstreamFlanksPostHoc) #Look at log odd ratio results ``` #### Downstream Flanks ```{r} McapDownstreamFlanksModel <- glmmTMB(Downstream.Flanks ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = McapFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(McapDownstreamFlanksModel) #Look at model output ``` ```{r} McapDownstreamFlanksPostHoc <- data.frame(emmeans(McapDownstreamFlanksModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(McapDownstreamFlanksPostHoc) #Look at log odd ratio results ``` #### Intergenic regions ```{r} McapIntergenicModel <- glmmTMB(Intergenic ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = McapFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(McapIntergenicModel) #Look at model output ``` ```{r} McapIntergenicPostHoc <- data.frame(emmeans(McapIntergenicModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(McapIntergenicPostHoc) #Look at log odd ratio results ``` #### Save statistical output ```{r} McapCpGFeatureOverlapStatOutput <- rbind(McapCDSPostHoc, McapIntronsPostHoc, McapUpstreamFlanksPostHoc, McapDownstreamFlanksPostHoc, McapIntergenicPostHoc) #Create a dataframe with logs odd ratio output for each model McapCpGFeatureOverlapStatOutput$model <- c(rep("CDS", times = 3), rep("Introns", times = 3), rep("UpstreamFlanks", times = 3), rep("DownstreamFlanks", times = 3), rep("Intergenic", times = 3)) #Add model information head(McapCpGFeatureOverlapStatOutput) #Confirm dataframe creation ``` ```{r} write.table(McapCpGFeatureOverlapStatOutput, "../analyses/Characterizing-CpG-Methylation-5x/Mcap/Mcap-CpG-Overlap-StatResults.txt", quote = FALSE, row.names = FALSE) #Save table ``` ## Pact ### Import file counts ```{r} PactGenomeFeatures <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-CGMotif-Overlaps-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with CG motif and feature track overlaps PactGenomeFeatures <- PactGenomeFeatures[-8,] #Remove final row tail(PactGenomeFeatures) #Check import ``` ```{r} PactGeneOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-paGenes-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-gene overlaps PactGeneOverlaps <- PactGeneOverlaps[-37,] #Remove final row tail(PactGeneOverlaps) #Confirm import ``` ```{r} PactCDSOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-paCDS-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-CDS overlaps PactCDSOverlaps <- PactCDSOverlaps[-37,] #Remove final row tail(PactCDSOverlaps) #Confirm import ``` ```{r} PactIntronsOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-paIntron-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-Introns overlaps PactIntronsOverlaps <- PactIntronsOverlaps[-37,] #Remove final row tail(PactIntronsOverlaps) #Confirm import ``` ```{r} PactFlanksOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-paFlanks-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-flanks overlaps PactFlanksOverlaps <- PactFlanksOverlaps[-37,] #Remove final row tail(PactFlanksOverlaps) #Confirm import ``` ```{r} PactFlanksUpstreamOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-paFlanksUpstream-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-upstream flanks overlaps PactFlanksUpstreamOverlaps <- PactFlanksUpstreamOverlaps[-37,] #Remove final row tail(PactFlanksUpstreamOverlaps) #Confirm import ``` ```{r} PactFlanksDownstreamOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-paFlanksDownstream-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-downstream flanks overlaps PactFlanksDownstreamOverlaps <- PactFlanksDownstreamOverlaps[-37,] #Remove final row tail(PactFlanksDownstreamOverlaps) #Confirm import ``` ```{r} PactIntergenicOverlaps <- read.table("../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-5x-paIntergenic-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file-Intergenic overlaps PactIntergenicOverlaps <- PactIntergenicOverlaps[-37,] #Remove final row tail(PactIntergenicOverlaps) #Confirm import ``` ### Create summary table ```{r} PactFeatureOverlaps <- data.frame("allCpGs" = rep(0, times = 7), "Meth1Meth" = rep(0, times = 7), "Meth1sparseMeth" = rep(0, times = 7), "Meth1unMeth" = rep(0, times = 7), "Meth1" = rep(0, times = 7), "Meth2Meth" = rep(0, times = 7), "Meth2sparseMeth" = rep(0, times = 7), "Meth2unMeth" = rep(0, times = 7), "Meth2" = rep(0, times = 7), "Meth3Meth" = rep(0, times = 7), "Meth3sparseMeth" = rep(0, times = 7), "Meth3unMeth" = rep(0, times = 7), "Meth3" = rep(0, times = 7), "Meth4Meth" = rep(0, times = 7), "Meth4sparseMeth" = rep(0, times = 7), "Meth4unMeth" = rep(0, times = 7), "Meth4" = rep(0, times = 7), "Meth5Meth" = rep(0, times = 7), "Meth5sparseMeth" = rep(0, times = 7), "Meth5unMeth" = rep(0, times = 7), "Meth5" = rep(0, times = 7), "Meth6Meth" = rep(0, times = 7), "Meth6sparseMeth" = rep(0, times = 7), "Meth6unMeth" = rep(0, times = 7), "Meth6" = rep(0, times = 7), "Meth7Meth" = rep(0, times = 7), "Meth7sparseMeth" = rep(0, times = 7), "Meth7unMeth" = rep(0, times = 7), "Meth7" = rep(0, times = 7), "Meth8Meth" = rep(0, times = 7), "Meth8sparseMeth" = rep(0, times = 7), "Meth8unMeth" = rep(0, times = 7), "Meth8" = rep(0, times = 7), "Meth9Meth" = rep(0, times = 7), "Meth9sparseMeth" = rep(0, times = 7), "Meth9unMeth" = rep(0, times = 7), "Meth9" = rep(0, times = 7)) #Create blank dataframe with information for various CpG categories and methylation status. Match columns to the order of columns in the overlap count files row.names(PactFeatureOverlaps) <- c("Genes", "CDS", "Introns", "Flanking Regions", "Upstream Flanks", "Downstream Flanks", "Intergenic") #Assign row names head(PactFeatureOverlaps) #Confirm changes ``` ```{r} PactFeatureOverlaps$allCpGs <- c(PactGenomeFeatures$counts[5], PactGenomeFeatures$counts[1], PactGenomeFeatures$counts[7], PactGenomeFeatures$counts[3], PactGenomeFeatures$counts[4], PactGenomeFeatures$counts[2], PactGenomeFeatures$counts[6]) #Assign information for CG motif overlaps with genome features. head(PactFeatureOverlaps) #Confirm modification ``` ```{r} for (i in 1:length(PactGeneOverlaps$counts)) { PactFeatureOverlaps[1,i+1] <- PactGeneOverlaps[i,1] PactFeatureOverlaps[2,i+1] <- PactCDSOverlaps[i,1] PactFeatureOverlaps[3,i+1] <- PactIntronsOverlaps[i,1] PactFeatureOverlaps[4,i+1] <- PactFlanksOverlaps[i,1] PactFeatureOverlaps[5,i+1] <- PactFlanksUpstreamOverlaps[i,1] PactFeatureOverlaps[6,i+1] <- PactFlanksDownstreamOverlaps[i,1] PactFeatureOverlaps[7,i+1] <- PactIntergenicOverlaps[i,1] } #For each table with feature overlap information, paste the contents of the count column in the assigned row tail(PactFeatureOverlaps) #Check summary table ``` ```{r} write.table(PactFeatureOverlaps, "../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact_union-Genomic-Location-Counts.txt", sep = "\t", quote = FALSE, row.names = TRUE) #Save file ``` ```{r} PactFeatureOverlapsPercents <- PactFeatureOverlaps[-c(1,4),] #Duplicate dataframe but remove gene and total flank rows for (i in 1:length(PactFeatureOverlaps)) { PactFeatureOverlapsPercents[,i] <- (PactFeatureOverlapsPercents[,i] / (sum(PactFeatureOverlapsPercents[,i]))) * 100 } #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information head(PactFeatureOverlapsPercents) #Check calculations ``` ```{r} write.table(PactFeatureOverlapsPercents, "../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact_union-Genomic-Location-Percents.txt", sep = "\t", quote = FALSE, row.names = TRUE) #Save file ``` ### Multivariate data analysis with compositional data We are interested in how sequencing method affects the proportion, or composition, of CpGs in various methylation statuses and genomic features. For this reason, I will use a combined compositional analysis and multivariate approach. For each sample, I will have separate columns for each methylation status. #### Format data ```{r} PactFeatureOverlapsTrans <- data.frame(clr(t(PactFeatureOverlapsPercents[,seq(5, 37, 4)] / 100))) #Use centered log-ratio transformation on proportion data tail(PactFeatureOverlapsTrans) # Confirm transformation ``` #### PCoA and perMANOVA ##### Conduct PCoA ```{r} dissimPactFeatureOverlapsTrans <- vegdist(PactFeatureOverlapsTrans, "euclidean") #Calculate euclidean dissimilarity matrix ``` ```{r} PactFeatureOverlapsPCoA <- cmdscale(dissimPactFeatureOverlapsTrans, eig = TRUE, add = TRUE) #Perform the PCoA. Include eigenvalues for each PC, and add a constant so default eigenvalues are non-negative. PactFeatureOverlapsPCoA$points #View PC scores ``` ##### Understand eigenvalues and loadings ```{r} PactFeatureOverlapsPCoA$eig #View eigenvalues (PactFeatureOverlapsPCoA$eig / sum(PactFeatureOverlapsPCoA$eig)) * 100 #Calculate percent variation explained by each PC ``` ```{r} plot(PactFeatureOverlapsPCoA$eig/sum(PactFeatureOverlapsPCoA$eig)*100, type = "b",lwd = 2,col = "blue", xlab = "Principal Component from PCoA", ylab = "% variation explained", main = "% variation explained by PCoA (blue) vs. random expectation (red)") #Plot eigenvalues lines(bstick(35)*100, type = "b",lwd = 2, col = "red") #Compare eigenvalues to expectations according to the broken stick model ``` ```{r} vec.PactFeatureOverlapsPCoA <- envfit(scores(PactFeatureOverlapsPCoA), PactFeatureOverlapsTrans, perm = 1000) #Extract PCs to calculate PC loadings (variable weights) vec.PactFeatureOverlapsPCoA #Look at statistical results ``` ##### Global perMANOVA ```{r} PactFeatureOverlapsTest <- adonis(dissimPactFeatureOverlapsTrans ~ sampleInformation) #Conduct perMANOVA by method PactFeatureOverlapsTest #Look at test output. ``` ##### Beta dispersion model ```{r} disp.PactFeatureOverlapsTrans <- betadisper(dissimPactFeatureOverlapsTrans, group = sampleInformation, type = 'centroid') #Run a beta dispersion model to assess if significant differences are due to differences in group centroid or variance anova(disp.PactFeatureOverlapsTrans) #Variance is different between groups. Significance in perMANOVA is due to either centroid or variance differences ``` ##### Create plot ```{r} ordiplot(PactFeatureOverlapsPCoA, choices = c(1,2), type = "text", display = "sites", xlab = "PC 1 (82.5%)", ylab = "PC 2 (15.0%)") #Plot basic PCoA plot(vec.PactFeatureOverlapsPCoA, p.max = 0.05, col = "blue") #Plot loadings that are significant at the 0.05 level ``` #### Pairwise perMANVOA ##### WGBS vs. RRBS ```{r} PactFeatureOverlapsWGRR <- vegdist(PactFeatureOverlapsTrans[c(1:3, 4:6),], "euclidean") #Subset WGBS and RRBS data and calculate dissimilarity matrix ``` ```{r} PactFeatureOverlapsWGRRTest <- adonis(PactFeatureOverlapsWGRR ~ sampleInformation[1:6]) #Conduct pairwise perMANOVA for WGBS and RRBS data. Only use sequencing method metadata for these samples PactFeatureOverlapsWGRRTest ``` ```{r} disp.PactFeatureOverlapsWGRR <- betadisper(PactFeatureOverlapsWGRR, group = sampleInformation[1:6], type = 'centroid') #Run a beta dispersion model to assess if differences are due to differences in group centroid or variance anova(disp.PactFeatureOverlapsWGRR) #Variance is the same between groups. ``` ##### WGBS vs. MBD-BS ```{r} PactFeatureOverlapsWGMB <- vegdist(PactFeatureOverlapsTrans[c(1:3, 7:9),], "euclidean") #Subset WGBS and MBD-BS data and calculate dissimilarity matrix ``` ```{r} PactFeatureOverlapsWGMBTest <- adonis(PactFeatureOverlapsWGMB ~ sampleInformation[c(1:3, 7:9)]) #Conduct pairwise perMANOVA for WGBS and MBD-BS data. Only use sequencing method metadata for these samples PactFeatureOverlapsWGMBTest ``` ```{r} disp.PactFeatureOverlapsWGMB <- betadisper(PactFeatureOverlapsWGMB, group = sampleInformation[c(1:3, 7:9)], type = 'centroid') #Run a beta dispersion model to assess if differences are due to differences in group centroid or variance anova(disp.PactFeatureOverlapsWGMB) #Variance is different between groups. ``` ##### RRBS vs. MBD-BS ```{r} PactFeatureOverlapsRRMB <- vegdist(PactFeatureOverlapsTrans[c(4:9),], "euclidean") #Subset RRBS and MBD-BS data and calculate dissimilarity matrix ``` ```{r} PactFeatureOverlapsRRMBTest <- adonis(PactFeatureOverlapsRRMB ~ sampleInformation[c(4:9)]) #Conduct pairwise perMANOVA for WGBS and RRBS data. Only use sequencing method metadata for these samples PactFeatureOverlapsRRMBTest ``` ```{r} disp.PactFeatureOverlapsRRMB <- betadisper(PactFeatureOverlapsRRMB, group = sampleInformation[c(4:9)], type = 'centroid') #Run a beta dispersion model to assess if differences are due to differences in group centroid or variance anova(disp.PactFeatureOverlapsRRMB) #Variance is different between groups. ``` ### Generalized linear model analysis Does sequencing method influence the proportion of CpGs detected in various genome features? #### Format data ```{r} PactFeatureOverlapsGLMData <- cbind("seqMethod" = c(rep("A", times = 3), rep("B", times = 3), rep("C", times = 3)), "replicate" = rep(seq(1:3), times = 3), data.frame(t(PactFeatureOverlapsPercents[,seq(5, 37, 4)] / 100))) #Create master dataframe for GLM head(PactFeatureOverlapsGLMData) #Confirm dataframe creation ``` #### CDS ```{r} PactCDSModel <- glmmTMB(CDS ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = PactFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(PactCDSModel) #Look at model output. ``` ```{r} PactCDSPostHoc <- data.frame(emmeans(PactCDSModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(PactCDSPostHoc) #Look at log odd ratio results ``` #### Introns ```{r} PactIntronsModel <- glmmTMB(Introns ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = PactFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(PactIntronsModel) #Look at model output. ``` ```{r} PactIntronsPostHoc <- data.frame(emmeans(PactIntronsModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(PactIntronsPostHoc) #Look at log odd ratio results ``` #### Upstream Flanks ```{r} PactUpstreamFlanksModel <- glmmTMB(Upstream.Flanks ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = PactFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(PactUpstreamFlanksModel) #Look at model output ``` ```{r} PactUpstreamFlanksPostHoc <- data.frame(emmeans(PactUpstreamFlanksModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(PactUpstreamFlanksPostHoc) #Look at log odd ratio results ``` #### Downstream Flanks ```{r} PactDownstreamFlanksModel <- glmmTMB(Downstream.Flanks ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = PactFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(PactDownstreamFlanksModel) #Look at model output ``` ```{r} PactDownstreamFlanksPostHoc <- data.frame(emmeans(PactDownstreamFlanksModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(PactDownstreamFlanksPostHoc) #Look at log odd ratio results ``` #### Intergenic regions ```{r} PactIntergenicModel <- glmmTMB(Intergenic ~ seqMethod + (1|replicate), family = beta_family(link = "logit"), data = PactFeatureOverlapsGLMData) #Run the model using a beta distribution and a logit link. Use replicate as a random effect summary(PactIntergenicModel) #Look at model output ``` ```{r} PactIntergenicPostHoc <- data.frame(emmeans(PactIntergenicModel, pairwise ~ seqMethod, adjust = "FDR")$contrasts) #Run pairwise comparisons (estimated marginal means). Obtain log odd ratio results and not confidence intervals for individual methods in a dataframe format. Specify FDR instead of Tukey post-hoc test (default) head(PactIntergenicPostHoc) #Look at log odd ratio results ``` #### Save statistical output ```{r} PactCpGFeatureOverlapStatOutput <- rbind(PactCDSPostHoc, PactIntronsPostHoc, PactUpstreamFlanksPostHoc, PactDownstreamFlanksPostHoc, PactIntergenicPostHoc) #Create a dataframe with logs odd ratio output for each model PactCpGFeatureOverlapStatOutput$model <- c(rep("CDS", times = 3), rep("Introns", times = 3), rep("UpstreamFlanks", times = 3), rep("DownstreamFlanks", times = 3), rep("Intergenic", times = 3)) #Add model information head(PactCpGFeatureOverlapStatOutput) #Confirm dataframe creation ``` ```{r} write.table(PactCpGFeatureOverlapStatOutput, "../analyses/Characterizing-CpG-Methylation-5x/Pact/Pact-CpG-Overlap-StatResults.txt", quote = FALSE, row.names = FALSE) #Save table ```