--- title: "Proportion Test" author: "Yaamini Venkataraman" date: "1/15/2019" output: html_document --- I will use a proportion test to compare the proportion of genome feature overlaps between various CpG categories. ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Install packages ```{r} #install.packages("RColorBrewer") require(RColorBrewer) #Load RColorBrewer #install.packages("dichromat") require(dichromat) ``` # Obtain session information ```{r} sessionInfo() ``` # Import data ```{r} overlapData <- read.csv("../analyses/2018-12-02-Gene-Enrichment-Analysis/2019-01-15-Overlap-Proportions.csv", header = TRUE) rownames(overlapData) <- overlapData$genomicFeature #Set genomic feature indication and rownames overlapData <- overlapData[,-1] #Remove genomic feature indication column head(overlapData) #Confirm import ``` ## Reformat data ```{r} proportionData <- overlapData #Copy overlap data as a new dataframe nLength <- length(proportionData$totalCpG) #Count number of rows for(i in 1:nLength) { proportionData[8,i] <- sum(proportionData[,i], na.rm = TRUE) } #Count the total number of entries for each column tail(proportionData) #Confirm changes ``` ```{r} nLength <- length(proportionData$totalCpG) #Count number of rows for(i in 1:nLength) { proportionData[i,] <- (proportionData[i,]/proportionData[8,])*100 } #Divide each column of proportionData by respective totalLines entry. Multiply by 100 and and save the percentage tail(proportionData) #Confirm changes ``` ```{r} proportionData <- proportionData[-8,] #Remove totalLines row tail(proportionData) #Confirm changes ``` # DML ## Conduct chi-squared tests of homogeneity The null hypothesis is that loci distributions in the genome are the same between different categories. ### Total CpGs vs. methylated CpGs ```{r} totalVersusMethylated <- overlapData[, c(1,3)] #Subset data for statistical testing totalVersusMethylated <- totalVersusMethylated[-6, ] #Remove totalLines row head(totalVersusMethylated) #Confirm changes ``` ```{r} totalVersusMethylatedTest <- chisq.test(t(totalVersusMethylated)) #Conduct a chi-squared test totalVersusMethylatedTest #The distribution of methylated CpGs is significantly different from all CpG ``` ### Enriched CpGs vs. DML The MBD-Enriched loci are the background for DML. ```{r} enrichedVersusDML <- overlapData[,c(2,6)] #Subset data for statistical testing enrichedVersusDML <- enrichedVersusDML[-6,] #Remove totalLines row head(enrichedVersusDML) #Confirm changes ``` ```{r} enrichedVersusDMLTest <- chisq.test(t(enrichedVersusDML)) #Conduct a chi-squared test enrichedVersusDMLTest #The distribution of DML is significantly different from methylated CpGs. ``` ## Create figures ### Define color scheme ```{r} RColorBrewer::display.brewer.all() #Show all RColorBrewer palettes. I will choose greens. plotColors <- rev(RColorBrewer::brewer.pal(5, "GnBu")) #Create a color palette for the barplots. Use 5 green-blue shades from RColorBrewer. Reverse theorder so the darkest shade is used first. barplot(t(t(proportionData)), col = plotColors) #See what plot looks like with new scheme barplot(t(t(proportionData)), col = dichromat(plotColors)) #Check that the plot colors will be easy to interpret for those with color blindess ``` ### All CpGs vs. Methylated CpGs ```{r} allCpGVersusMethylated <- proportionData[,c(1,3)] #Subset data head(allCpGVersusMethylated) #Confirm subset ``` ```{r} #pdf("../analyses/2018-12-02-Gene-Enrichment-Analysis/2019-04-10-All-CpGs-Versus-Methylated-CpGs.pdf", height = 8.5, width = 11) par(mar = c(3,5,1,1), oma = c(1, 1, 1, 15)) #Change figure boundaries par(mar = c(3,5,1,1), oma = c(1, 1, 1, 15)) #Change figure boundaries barplot(t(t(allCpGVersusMethylated)), axes = FALSE, names.arg = c("All CpGs", "Methylated CpGs"), cex.names = 1.5, ylim = c(0,100), col = plotColors) #Create a stacked barplot. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex.axis = 1.2) mtext(side = 2, "Proportion Overlap", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend("topright", xpd = TRUE, legend = c("Putative Promoters", "UTRs", "Exons", "Introns", "Transposable Elements", "Intergenic", "Other"), pch = 22, col = "black", pt.bg = plotColors, bty = "n", cex = 1.5) #Place a legend in the top right of the figure with no box #dev.off() ``` ### Enriched CpGs vs. DML ```{r} enrichedVersusDMLProportions <- proportionData[,c(2,7)] #Subset data head(enrichedVersusDMLProportions) #Confirm subset ``` ```{r} #pdf("../analyses/2018-12-02-Gene-Enrichment-Analysis/2019-04-10-Enriched-Versus-DML.pdf", height = 8.5, width = 11) par(mar = c(3,5,1,1), oma = c(1, 1, 1, 15)) #Change figure boundaries par(mar = c(3,5,1,1), oma = c(1, 1, 1, 15)) #Change figure boundaries barplot(t(t(enrichedVersusDMLProportions)), axes = FALSE, names.arg = c("Loci with 5x Coverage", "DML"), cex.names = 1.5, ylim = c(0,100), col = plotColors) #Create a stacked barplot. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex.axis = 1.2) #Add y-axis mtext(side = 2, "Proportion CpGs", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend("topright", xpd = TRUE, legend = c("Putative Promoters", "UTRs", "Exons", "Introns", "Transposable Elements", "Intergenic", "Other"), pch = 22, col = "black", pt.bg = plotColors, bty = "n", cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ```