--- title: "General Methylation Landscape" author: "Yaamini Venkataraman" output: html_document --- In this script, I'll create summary tables for genomic location information, run statistcal tests, and visualize the distribution of DML in the genome. # Prepare R Markdown file ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) knitr::opts_knit$set(root.dir = "/Users/yaaminivenkataraman/Documents/project-oyster-oa/analyses/Haws_06-methylation-landscape/") #Set root directory ``` # Install packages ```{r} #Packages for visualization #install.packages("dichromat") #install.packages("broom") require(dichromat) require(broom) ``` # Obtain session information ```{r} sessionInfo() ``` # Establish color scheme ```{r} plotColors <- c("#B54E00", "#E38100", "#FFA801", "#FFCB5E", "#A9DDFF", "#21ADFA", "#0A72AB", "#004B70") #Create a color palette for the barplots. ``` # Genomic location of methylated CpGs My goal is to 1) understand statistical differences between distribution of CG motifs and methylated CpGs in various genomic features and 2) visualize the genomic locations of these loci. ## Import file counts ```{r} exonUTROverlaps <- read.table("zr3644_5x-exonUTR-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file overlaps exonUTROverlaps <- exonUTROverlaps[73:75,] #Retain final rows head(exonUTROverlaps) #Confirm import ``` ```{r} CDSOverlaps <- read.table("zr3644_5x-CDS-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file overlaps CDSOverlaps <- CDSOverlaps[73:75,] #Retain final rows head(CDSOverlaps) #Confirm import ``` ```{r} intronOverlaps <- read.table("zr3644_5x-intron-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file overlaps intronOverlaps <- intronOverlaps[73:75,] #Retain final rows head(intronOverlaps) #Confirm import ``` ```{r} upstreamOverlaps <- read.table("zr3644_5x-upstreamFlanks-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file overlaps upstreamOverlaps <- upstreamOverlaps[73:75,] #Retain final rows head(upstreamOverlaps) #Confirm import ``` ```{r} downstreamOverlaps <- read.table("zr3644_5x-downstreamFlanks-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file overlaps downstreamOverlaps <- downstreamOverlaps[73:75,] #Retain final rows head(downstreamOverlaps) #Confirm import ``` ```{r} lncRNAOverlaps <- read.table("zr3644_5x-lncRNA-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file overlaps lncRNAOverlaps <- lncRNAOverlaps[73:75,] #Retain final rows head(lncRNAOverlaps) #Confirm import ``` ```{r} TEOverlaps <- read.table("zr3644_5x-TE-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file overlaps TEOverlaps <- TEOverlaps[73:75,] #Retain final rows head(TEOverlaps) #Confirm import ``` ```{r} intergenicOverlaps <- read.table("zr3644_5x-intergenic-counts.txt", header = FALSE, col.names = c("counts", "filename")) #Import file with all file overlaps intergenicOverlaps <- intergenicOverlaps[73:75,] #Retain final rows head(intergenicOverlaps) #Confirm import ``` ## Create summary table ```{r} cpgFeatureOverlaps <- cbind(exonUTROverlaps, CDSOverlaps, intronOverlaps, upstreamOverlaps, downstreamOverlaps, lncRNAOverlaps, TEOverlaps, intergenicOverlaps) #Combine information from all genome features rownames(cpgFeatureOverlaps) <- c("Meth", "sparseMeth", "unMeth") #Assign row names ncol(cpgFeatureOverlaps) #Count columns cpgFeatureOverlaps <- cpgFeatureOverlaps[,seq(1, 16, 2)] #Keep odd-numbered columns (counts) colnames(cpgFeatureOverlaps) <- c("exonUTR", "CDS", "intron", "upstream", "downstream", "lncRNA", "TE", "intergenic") #Add column names head(cpgFeatureOverlaps) #Confirm formatting ``` ```{r} cpgFeatureOverlaps <- as.data.frame(t(cpgFeatureOverlaps)) #Transpose so column names are row names cpgFeatureOverlaps$allCpGs <- c(700167, 1516641, 5221932, 592834, 549787, 454393, 5424483, 4730728) #Add CpG motif overlap information cpgFeatureOverlaps <- cpgFeatureOverlaps[,c(4, 1:3)] #Reorganize columns head(cpgFeatureOverlaps) #Confirm formatting ``` ```{r} write.csv(cpgFeatureOverlaps, "CpG-feature-overlaps.csv", quote = FALSE, row.names = TRUE) #Save data ``` ## Contingency tests ### Format data ```{r} cpgLocationStatTest <- data.frame(t(cpgFeatureOverlaps)) #Transpose for statistical testing cpgLocationStatTest <- cpgLocationStatTest[1:2,] #keep only allCpGs and Meth data head(cpgLocationStatTest) #Confirm formatting ``` ### All vs. Meth ```{r} CpGLocationAllMeth <- data.frame() #Create empty dataframe to store chi-squared results for(i in 1:ncol(cpgLocationStatTest)) { #For each genome feature AllFeature <- cpgLocationStatTest[1,i] #Variable for # genome feature overlaps for genomic CpGs with data MethFeature <- cpgLocationStatTest[2,i] #Variable for # genome feature overlaps for methylated CpGs AllNotFeature <- sum(cpgLocationStatTest[1,-i]) #Variable for # other CpG types for genomic CpGs with data MethNotFeature <- sum(cpgLocationStatTest[2,-i]) #Variable for # other CpG types for methylated CpGs ct <- matrix(c(AllFeature, MethFeature, AllNotFeature, MethNotFeature), ncol = 2) #Create contingency table colnames(ct) <- c(as.character(colnames(cpgLocationStatTest[i])), paste0("Not", colnames(cpgLocationStatTest[i]))) #Assign column names: type, not type rownames(ct) <- c(as.character(row.names(cpgLocationStatTest)[c(1,2)])) #Assign row names: genomic CpGs with data, methylated CpGs print(ct) #Confirm table is correct ctResults <- data.frame(broom::tidy(chisq.test(ct))) #Create dataframe storing chi-sq stats results. Use broom::tidy to extract results from test output ctResults$GenomeFeature <- as.character(colnames(cpgLocationStatTest)[i]) #Add CpG type to results CpGLocationAllMeth <- rbind(CpGLocationAllMeth, ctResults) #Add test statistics to master table } ``` ```{r} head(CpGLocationAllMeth) ``` ```{r} CpGLocationAllMeth$p.adj <- p.adjust(CpGLocationAllMeth$p.value, method = "fdr") #Correct p-value using FDR range(CpGLocationAllMeth$p.adj) #Look at range of p-values head(CpGLocationAllMeth) #Confirm changes ``` ```{r} write.csv(CpGLocationAllMeth, "CpG-location-statResults.txt", quote = FALSE, row.names = TRUE) #Save statistical output ``` ## Create barplot ```{r} head(cpgFeatureOverlaps) ``` ```{r} cpgFeatureOverlapsPercents <- cpgFeatureOverlaps #Duplicate dataframe for (i in 1:length(cpgFeatureOverlaps)) { cpgFeatureOverlapsPercents[,i] <- (cpgFeatureOverlapsPercents[,i] / (sum(cpgFeatureOverlapsPercents[,i]))) * 100 } #Divide every entry by sum of the column and multiply by 100 to get percentages. Do not include gene information head(cpgFeatureOverlapsPercents) #Check calculations ``` ```{r} #pdf("figures/CpG-feature-overlaps.pdf", width = 11, height = 8.5) par(mar = c(1,5,0,1), oma = c(3, 1, 1, 11)) #Change figure boundaries barplot(t(t(cpgFeatureOverlapsPercents[,c(1:4)])), col= rev(plotColors), axes = FALSE, names.arg = c("All 5x CpGs", "High", "Moderate", "Low"), cex.names = 1.5, ylim = c(0, 110)) #Create base plot. Everything should be white. Do not plot axes. Include sequencing type as labels and set size. Set y-axis specs. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex.axis = 1.2) #Add y-axis mtext(side = 2, "% CpGs", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend(x = 0.57, y = 0.87, xpd = TRUE, legend = c("Exon UTR", "CDS", "Introns", "Upstream Flank", "Downstream Flank", "lncRNA", "TE", "Intergenic"), pch = 22, col = "black", pt.bg = rev(plotColors), bty = "n", cex = 1.5, x.intersp = 0.7, xjust = 0) #Place a legend in the top right of the figure with no box text("Genome Feature", x = 0.785, y = 0.879, cex = 1.5) #Add legend title that is aligned with legend #dev.off() ``` ```{r} save.image("../../project-oyster-oa.RData") #Save RData ```