--- title: "Chacterizing CpG Methylation" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` In this script, I'll create figures to characterize CpG methylation in my *C. virginica* gonad samples. I will use a list of CpGs with at least 5x coverage in any samples to look at the distribution of methylated CpGs. I'll also use individual sample bedgraphs to calculate percent methylation across genome features. # Session information ```{r} sessionInfo() ``` # Distribution of CpGs ## Import data ```{r} cpgMethylation <- read.csv("2019-04-09-All-5x-CpGs.csv", header = FALSE) #Import file with CpG methylation for all loci with 5x coverage colnames(cpgMethylation) <- c("chromosome", "start", "end", "methylation") #Add column names head(cpgMethylation) #Confirm import ``` ## Create figure ```{r} RColorBrewer::display.brewer.all() #Show all RColorBrewer palettes. I will choose greens. plotColors <- rev(RColorBrewer::brewer.pal(7, "GnBu")) #Create a color palette for the barplots. Use 5 green-blue shades from RColorBrewer. Reverse theorder so the darkest shade is used first. barplot(t(t(proportionData)), col = plotColors) #See what plot looks like with new scheme barplot(t(t(proportionData)), col = dichromat(plotColors)) #Check that the plot colors will be easy to interpret for those with color blindess ``` ```{r} #pdf("2019-04-10-5x-CpG-Frequency-Distribution.pdf", width = 11, height = 8.5) hist(x = cpgMethylation$methylation, axes = FALSE, xlab = "", ylab = "", main = "", col = plotColors[2], xaxs = "i", yaxs = "i") #Create base plot axis(side = 1, col = "grey80", at = seq(from = 0, to = 100, by = 10), cex.axis = 1.2) #Add x-axis mtext(side = 1, text = "Methylation (%)", line = 3, cex = 1.5) #Add x-axis label axis(side = 2, col = "grey80", las = 2, labels = c("0", "2", "4", "6"), at = c(0, 2e+05, 4e+05, 6e+05), cex.axis = 1.2) #add y-axis mtext(side = 2, text = "Frequency (x100,000)", line = 2.5, cex = 1.5) #Add y-axis label #dev.off() ``` # Percent methylation of genome features For each genome feature, I will do the following: 1. Interect sample bedgraphs with genome feature track 2. Add percent methylation information 2. Calculate median percent methylation for the entire matrix ## Check format of previously-generated files and copy to current folder ```{bash} head ../2019-08-14-Differentially-Methylated-Genes/zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph #Individual sample bedgraph with all information ``` ```{bash} cp ../2019-08-14-Differentially-Methylated-Genes/*5x.bedgraph . ``` ```{bash} find *5x.bedgraph ``` ```{bash} head ../2019-08-14-Differentially-Methylated-Genes/zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly #Individual sample bedgraph with chromosome, start, and end position only ``` ```{bash} cp ../2019-08-14-Differentially-Methylated-Genes/*posOnly . ``` ```{bash} find *posOnly ``` ```{bash} head ../2019-08-14-Differentially-Methylated-Genes/zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.sorted #Indvidual sample bedgraph with merged chr, start, end information to join files ``` ```{bash} cp ../2019-08-14-Differentially-Methylated-Genes/*bedgraph.sorted . ``` ```{bash} find *bedgraph.sorted ``` ## Reformat global methylation file ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph #Check current file ``` ```{bash} awk '{print $1"\t"$2"\t"$3}' 2019-04-09-All-5x-CpGs.bedgraph > 2019-04-09-All-5x-CpGs.bedgraph.posOnly #Remove percent methylation information ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph.posOnly ``` ```{bash} #Create a version of the file that can be used to add percent methylation information back with a join command awk '{print $1"-"$2"-"$3"\t"$1"\t"$2"\t"$3"\t"$4}' 2019-04-09-All-5x-CpGs.bedgraph \ | sort -k1,1 \ > 2019-04-09-All-5x-CpGs.bedgraph.sorted ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph.sorted ``` ## Putative promoters ```{bash} head -n 1 ../2018-11-01-DML-and-DMR-Analysis/2019-05-29-Flanking-Analysis/2019-05-29-mRNA-Promoter-Track.bed #Check promoter track ``` ### Run `intersectBed` ```{bash} find *posOnly ``` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: promoter track #Only keep specified columns #Save output in a new file that has the same base name and ends in -promoterOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2018-11-01-DML-and-DMR-Analysis/2019-05-29-Flanking-Analysis/2019-05-29-mRNA-Promoter-Track.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$8}' \ > ${f}-promoterOverlap done ``` ```{bash} find *promoterOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-promoterOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in promoterOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *promoterOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *promoterOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-promoterOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-promoterOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-promoterOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-promoterOverlap-percentMeth done ``` ```{bash} find *promoterOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-promoterOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *promoterOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *promoterOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-promoterOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-promoterOverlap-percentMeth-Medians.txt ``` ## UTRs ```{bash} head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exonUTR_yrv.bed #Check promoter track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: promoter track #Only keep specified columns #Save output in a new file that has the same base name and ends in -promoterOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exonUTR_yrv.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \ > ${f}-UTROverlap done ``` ```{bash} find *UTROverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-UTROverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in UTROverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *UTROverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *UTROverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-UTROverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-UTROverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-UTROverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-UTROverlap-percentMeth done ``` ```{bash} find *UTROverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-UTROverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *UTROverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *UTROverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-UTROverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-UTROverlap-percentMeth-Medians.txt ``` ## Exons ```{bash} head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exon_sorted_yrv.bed #Check track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: exon track #Only keep specified columns #Save output in a new file that has the same base name and ends in -promoterOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exon_sorted_yrv.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \ > ${f}-exonOverlap done ``` ```{bash} find *exonOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-exonOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in exonOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *exonOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *exonOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-exonOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-exonOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-exonOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-exonOverlap-percentMeth done ``` ```{bash} find *exonOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-exonOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *exonOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *exonOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-exonOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-exonOverlap-percentMeth-Medians.txt ``` ## Introns ```{bash} head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intron_yrv.bed #Check track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: exon track #Only keep specified columns #Save output in a new file that has the same base name and ends in -intronOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intron_yrv.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \ > ${f}-intronOverlap done ``` ```{bash} find *intronOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intronOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in intronOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *intronOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *intronOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intronOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-intronOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-intronOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-intronOverlap-percentMeth done ``` ```{bash} find *intronOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-intronOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *intronOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *intronOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-intronOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-intronOverlap-percentMeth-Medians.txt ``` ## Transposable elements ```{bash} head -n 7 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_TE-all.gff #Check track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: exon track #Only keep specified columns #Save output in a new file that has the same base name and ends in -intronOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_TE-all.gff \ | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$8}' \ > ${f}-TEOverlap done ``` ```{bash} find *TEOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-TEOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in TEOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *TEOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *TEOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-TEOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-TEOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-TEOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-TEOverlap-percentMeth done ``` ```{bash} find *TEOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-TEOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *TEOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *TEOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-TEOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-TEOverlap-percentMeth-Medians.txt ``` ## Intergenic regions ```{bash} head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intergenic_yrv.bed #Check track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: exon track #Only keep specified columns #Save output in a new file that has the same base name and ends in -intergenicOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intergenic_yrv.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \ > ${f}-intergenicOverlap done ``` ```{bash} find *intergenicOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intergenicOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in intergenicOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *intergenicOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *intergenicOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intergenicOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-intergenicOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-intergenicOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-intergenicOverlap-percentMeth done ``` ```{bash} find *intergenicOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-intergenicOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *intergenicOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *intergenicOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-intergenicOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-intergenicOverlap-percentMeth-Medians.txt ``` ## Plot median methylation ### Reformat data ```{r} #Import median methylation information promoterMedMeth <- read.delim("2020-02-25-promoterOverlap-percentMeth-Medians.txt", header = FALSE) UTRMedMeth <- read.delim("2020-02-25-UTROverlap-percentMeth-Medians.txt", header = FALSE) exonMedMeth <- read.delim("2020-02-25-exonOverlap-percentMeth-Medians.txt", header = FALSE) intronMedMeth <- read.delim("2020-02-25-intronOverlap-percentMeth-Medians.txt", header = FALSE) TEMedMeth <- read.delim("2020-02-25-TEOverlap-percentMeth-Medians.txt", header = FALSE) intergenicMedMeth <- read.delim("2020-02-25-intergenicOverlap-percentMeth-Medians.txt", header = FALSE) ``` ```{r} featureMedMeth <- cbind(promoterMedMeth, UTRMedMeth, exonMedMeth, intronMedMeth, TEMedMeth, intergenicMedMeth) #Create a dataframe with percent methylation information across features colnames(featureMedMeth) <- c("promoter", "UTR", "exon", "intron", "TE", "intergenic") #Add column names featureMedMeth[12,] <- featureMedMeth[2,] #Move sample 10 information to the end featureMedMeth <- featureMedMeth[-2,] #Remove sample 10 information in the incorrect position row.names(featureMedMeth) <- c("global", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10") #Add row names with sample distinctions featureMedMeth <- as.data.frame(t(featureMedMeth)) #Transpose dataframe tail(featureMedMeth) #Confirm formatting ``` ### All median methylation for all features ```{r} #pdf("2020-02-25-All-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11) par(mar = c(3,5,1,8)) #Change figure boundaries barplot(t(featureMedMeth), beside = TRUE, axes = FALSE, names.arg = c("Promoters", "UTR", "Exons", "Introns", "TE", "Intergenic"), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[4], times = 5)), ylim = c(0,100)) #Create a grouped barplot (beside = TRUE) using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2) mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend("topright", xpd = TRUE, legend = c("Global", "Control", "Treatment"), pch = 22, col = "black", pt.bg = c(plotColors[1], plotColors[2], plotColors[4]), bty = "n", cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ``` ```{r} #pdf("2020-02-25-All-Median-Methylation-Across-Split-Features.pdf", height = 8.5, width = 11) par(mar = c(3,4,0,0), oma = c(0, 2.5, 2, 0), mfrow = c(3,2)) #Change figure boundaries barplot(t(featureMedMeth[1,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the promoter percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " a. Putative promoters", line = -0.5, adj = 0) #Add figure label mtext(side = 2, "Methylation (%)", line = 0, cex = 1.5, outer = TRUE) #Add y-axis outer label barplot(t(featureMedMeth[2,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the UTR percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " b. UTR", line = -0.5, adj = 0) #Add figure label barplot(t(featureMedMeth[3,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the exon percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " c. Exons", line = -0.5, adj = 0) #Add figure label barplot(t(featureMedMeth[4,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intron percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " d. Introns", line = -0.5, adj = 0) #Add figure label barplot(t(featureMedMeth[5,]), beside = TRUE, axes = FALSE, names.arg = c("G", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the TE percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " e. Transposable elements", line = -0.5, adj = 0) #Add figure label barplot(t(featureMedMeth[6,]), beside = TRUE, axes = FALSE, names.arg = c("G", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intergenic percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " f. Intergenic regions", line = -0.5, adj = 0) #Add figure label #dev.off() ``` ### Global methylation only ```{r} #pdf("2020-02-25-Global-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11) par(mar = c(3,5,1,15)) #Change figure boundaries barplot(featureMedMeth$global, axes = FALSE, names.arg = c("", "", "", "", "", ""), col = plotColors[1:6], ylim = c(0,100)) #Create a barplot using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2) mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend("topright", xpd = TRUE, legend = c("Putative Promoters", "UTRs", "Exons", "Introns", "Transposable Elements", "Intergenic"), pch = 22, col = "black", pt.bg = plotColors[1:6], bty = "n", cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ``` ### Samples only ```{r} #pdf("2020-02-25-Sample-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11) par(mar = c(3,5,1,8)) #Change figure boundaries barplot(t(featureMedMeth[,2:11]), beside = TRUE, axes = FALSE, names.arg = c("Promoters", "UTR", "Exons", "Introns", "TE", "Intergenic"), col = c(rep(plotColors[2], times = 5), rep(plotColors[4], times = 5)), ylim = c(0,100)) #Create a grouped barplot (beside = TRUE) using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2) mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend("topright", xpd = TRUE, legend = c("Control", "Treatment"), pch = 22, col = "black", pt.bg = c(plotColors[2], plotColors[4]), bty = "n", cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ``` # Create multipanel plot I'm going to create a multipanel plot with the frequency distribution of CpG types as well as median percent methylation across genome features. ```{r} #pdf("2020-02-25-Methylation-Distribution-Median-Methylation-Across-Features-Multipanel.pdf", height = 8.5, width = 11) plotMatrix <- matrix(c(1,1,0,2,3, 1,1,0,4,5, 1,1,0,6,7), nrow = 3, ncol = 5, byrow = TRUE) #Create a matrix with plot location information. Fill matrix by row. par(mar = c(2, 1, 0, 0), oma = c(4, 5, 2, 2)) #Specify inner and outer margins layout(mat = plotMatrix, widths = c(2, 2, 0.7, 2, 2)) #Create a layout based on plotMatrix. The first and second plots (first column) should be wider. layout.show(n = 7) #Confirm layout looks good hist(x = cpgMethylation$methylation, axes = FALSE, xlab = "", ylab = "", main = "", col = plotColors[1], xaxs = "i", yaxs = "i") #Create base plot axis(side = 1, col = "grey80", at = seq(from = 0, to = 100, by = 10), cex.axis = 1.2) #Add x-axis mtext(side = 1, text = "Methylation (%)", line = 3.5, cex = 1.5) #Add x-axis label axis(side = 2, col = "grey80", las = 2, labels = c("0", "2", "4", "6"), at = c(0, 2e+05, 4e+05, 6e+05), cex.axis = 1.2) #add y-axis mtext(side = 2, text = "Frequency (x100,000)", line = 3, cex = 1.5) #Add y-axis label mtext(side = 3, "a. CpG distribution", line = -1.5, adj = 1) #Add figure label barplot(t(featureMedMeth[1,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the promoter percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " b. Putative promoters", line = -1.5, adj = 0) #Add figure label barplot(t(featureMedMeth[2,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the UTR percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. mtext(side = 3, " c. UTR", line = -1.5, adj = 0) #Add figure label barplot(t(featureMedMeth[3,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the exon percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " d. Exons", line = -1.5, adj = 0) #Add figure label mtext(side = 2, "Methylation (%)", line = 3.5, cex = 1.5) #Add y-axis outer label barplot(t(featureMedMeth[4,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intron percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. mtext(side = 3, " e. Introns", line = -1.5, adj = 0) #Add figure label barplot(t(featureMedMeth[5,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the TE percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " f. Transposable elements", line = -1.5, adj = 0) #Add figure label barplot(t(featureMedMeth[6,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intergenic percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. mtext(side = 3, " g. Intergenic regions", line = -1.5, adj = 0) #Add figure label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend(x = 0.16, y = -0.9, xpd = TRUE, horiz = TRUE, x.intersp = 0.5, legend = c("Global", "Control", "Treatment"), pch = 22, col = "black", pt.bg = plotColors[c(1,2,5)], bty = "n", cex = 2.2) #Place a horizontal legend at the bottom of genome feature plots with no box using manual x and y coordinates. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ``` # Methylated 5x CpGs ## Import data ```{r} allMethLoci <- read.csv("2019-04-09-All-5x-CpG-Loci-Methylated.csv", header = FALSE) #Import file with positions of all methylated loci with 5x coverage colnames(allMethLoci) <- c("chromosome", "start", "end", "methylation") #Add column names head(allMethLoci) #Confirm import ``` ```{r} methLociGeneOverlaps <- read.delim("2019-05-29-All5xCpGs-Genes.txt", header = FALSE, sep = "\t") #Import file with overlaps for methylated loci and genes methLociGeneOverlaps <- methLociGeneOverlaps[,-c(4)] #Remove extra column colnames(methLociGeneOverlaps) <- c("chromosome", "start", "end", "gene-start", "gene-end") #Add column names head(methLociGeneOverlaps) #Confirm import ``` ## Scaled methylated CpG distribution Scaling each gene from 0% to 100%, I want to see where methylated CpG are located. This is useful to see if methylation is occuring in any consistent location for each gene. ```{r} methLociGeneOverlaps$geneLength <- methLociGeneOverlaps$`gene-end` - methLociGeneOverlaps$`gene-start` #Calculate gene length methLociGeneOverlaps$absPosition <- methLociGeneOverlaps$start - methLociGeneOverlaps$`gene-start` #Calculate the absolute position of the methylated CpG in the gene methLociGeneOverlaps$scaledPosition <- methLociGeneOverlaps$absPosition / methLociGeneOverlaps$geneLength #Calculate the scaled position of the methylated CpG in the gene head(methLociGeneOverlaps) #Confirm calculations ``` ```{r} #pdf("2019-10-10-Scaled-Gene-Methylated-Loci.pdf", height = 8.5, width = 11) par(mar = c(5, 6, 2, 2)) #Change figure dimensions hist(methLociGeneOverlaps$scaledPosition, breaks = 100, axes = FALSE, xlab = "", ylab = "", main = "", ylim = c(0,40000), col = "grey80", xaxs = "i", yaxs = "i") #Create base plot with no axes or labels. Include breaks at each percent. axis(side = 1, at = seq(from = 0, to = 1, by = 0.1), col = "grey80", cex = 1.2) #Add x-axis mtext(side = 1, "Scaled Position on Gene", line = 3, cex = 1.5) #Add x-axis label axis(side = 2, at = seq(from = 0, to = 40000, by = 10000), col = "grey80", las = 2, cex = 1.2) #Add y-axis mtext(side = 2, "Number of Methylated CpG", line = 4, cex = 1.5) #Add y-axis label #dev.off() ```