--- title: "Chacterizing CpG Methylation" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` In this script, I'll create figures to characterize CpG methylation in my *C. virginica* gonad samples. I will use a list of CpGs with at least 5x coverage in any samples. I'll also calculate relevant statistics for methylation islands. **MOVE CODE TO ANALYSIS DIRECTORY `2019-03-18-Characterizing-CpG-Methylation` BEFORE PROCEEDING.** # Session information ```{r} sessionInfo() ``` # All 5x CpGs ## Import data ```{r} cpgMethylation <- read.csv("../analyses/2019-03-18-Characterizing-CpG-Methylation/2019-04-09-All-5x-CpGs.csv", header = FALSE) #Import file with CpG methylation for all loci with 5x coverage colnames(cpgMethylation) <- c("chromosome", "start", "end", "methylation") #Add column names head(cpgMethylation) #Confirm import ``` ## Create figure ```{r} RColorBrewer::display.brewer.all() #Show all RColorBrewer palettes. I will choose greens. plotColors <- rev(RColorBrewer::brewer.pal(5, "GnBu")) #Create a color palette for the barplots. Use 5 green-blue shades from RColorBrewer. Reverse theorder so the darkest shade is used first. barplot(t(t(proportionData)), col = plotColors) #See what plot looks like with new scheme barplot(t(t(proportionData)), col = dichromat(plotColors)) #Check that the plot colors will be easy to interpret for those with color blindess ``` ```{r} #pdf("2019-04-10-5x-CpG-Frequency-Distribution.pdf", width = 11, height = 8.5) hist(x = cpgMethylation$methylation, axes = FALSE, xlab = "", ylab = "", main = "", col = plotColors[2], xaxs = "i", yaxs = "i") #Create base plot axis(side = 1, col = "grey80", at = seq(from = 0, to = 100, by = 10), cex.axis = 1.2) #Add x-axis mtext(side = 1, text = "Methylation (%)", line = 3, cex = 1.5) #Add x-axis label axis(side = 2, col = "grey80", las = 2, labels = c("0", "2", "4", "6"), at = c(0, 2e+05, 4e+05, 6e+05), cex.axis = 1.2) #add y-axis mtext(side = 2, text = "Frequency (x100,000)", line = 2.5, cex = 1.5) #Add y-axis label #dev.off() ``` # Percent methylation of genome features For each genome feature, I will do the following: 1. Interect sample bedgraphs with genome feature track 2. Add percent methylation information 2. Calculate median percent methylation for the entire matrix ## Annotate sample coverage files ### Download 5x coverage files ```{bash, echo = FALSE} wget -r -l1 --no-parent -A_5x.bedgraph https://gannet.fish.washington.edu/spartina/2018-10-10-project-virginica-oa-Large-Files/2019-03-07-Yaamini-Virginica-Repository/analyses/2019-03-07-IGV-Verification/ #Download 5x bedgraphs from gannet ``` ```{bash} mv gannet.fish.washington.edu/spartina/2018-10-10-project-virginica-oa-Large-Files/2019-03-07-Yaamini-Virginica-Repository/analyses/2019-03-07-IGV-Verification/* . #Move files to current directory. Where they are stored mimics file structure on gannet ``` ```{bash} rm -r ../gannet.fish.washington.edu #Remove gannet directory ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph #Columns: chr, start, end, percent methylation. Tab-delimited. ``` ### Create new files without percent methylation information To match loci with the gene they're in, I'm going to use `intersectBed` from `bedtools`. Before I can use files with `intersectBed`, I need to remove the last column from the coverage files, percent methylation. This is because `intersectBed` will only accept chromosome and positions, and will not accept columns with numbers that are not positions. ```{bash} #For each file that ends in bedgraph #Print the first three columns, tab-delimited, and save the columns in a new file that has the same base name and ends in posOnly for f in *bedgraph do awk '{print $1"\t"$2"\t"$3}' ${f} > ${f}.posOnly done ``` ```{bash} find *posOnly #Confirm files were created ``` ```{bash} head zr2096_5_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly #Confirm file formatting ``` ### Format and sort files for final `join` The original files aren't tab-delimited while the others are, so I need to get the coverage files in a tab-delimited format so my output will be tab-delimited. ```{bash} #For each file that ends in bedgraph (the original coverage files) #Print the first three columns (chr, start, end) with dashes ("-") in between. Then, print the rest of the columns (chr, start, end, percentMeth) in the file with tabs ("\t") inbetween. #Save the file with the base name + .sorted for f in *bedgraph do awk '{print $1"-"$2"-"$3"\t"$1"\t"$2"\t"$3"\t"$4}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *bedgraph.sorted ``` ```{bash} head ../2019-08-14-Differentially-Methylated-Genes/zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.sorted #Indvidual sample bedgraph with merged chr, start, end information to join files ``` ## Reformat global methylation file ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph #Check current file ``` ```{bash} awk '{print $1"\t"$2"\t"$3}' 2019-04-09-All-5x-CpGs.bedgraph > 2019-04-09-All-5x-CpGs.bedgraph.posOnly #Remove percent methylation information ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph.posOnly ``` ```{bash} #Create a version of the file that can be used to add percent methylation information back with a join command awk '{print $1"-"$2"-"$3"\t"$1"\t"$2"\t"$3"\t"$4}' 2019-04-09-All-5x-CpGs.bedgraph \ | sort -k1,1 \ > 2019-04-09-All-5x-CpGs.bedgraph.sorted ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph.sorted ``` ## Putative promoters ```{bash} head -n 1 ../2018-11-01-DML-and-DMR-Analysis/2019-05-29-Flanking-Analysis/2019-05-29-mRNA-Promoter-Track.bed #Check promoter track ``` ### Run `intersectBed` ```{bash} find *posOnly ``` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: promoter track #Only keep specified columns #Save output in a new file that has the same base name and ends in -promoterOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2018-11-01-DML-and-DMR-Analysis/2019-05-29-Flanking-Analysis/2019-05-29-mRNA-Promoter-Track.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$8}' \ > ${f}-promoterOverlap done ``` ```{bash} find *promoterOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-promoterOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in promoterOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *promoterOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *promoterOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-promoterOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-promoterOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-promoterOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-promoterOverlap-percentMeth done ``` ```{bash} find *promoterOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-promoterOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *promoterOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *promoterOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-promoterOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-promoterOverlap-percentMeth-Medians.txt ``` ## UTRs ```{bash} head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exonUTR_yrv.bed #Check promoter track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: promoter track #Only keep specified columns #Save output in a new file that has the same base name and ends in -promoterOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exonUTR_yrv.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \ > ${f}-UTROverlap done ``` ```{bash} find *UTROverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-UTROverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in UTROverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *UTROverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *UTROverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-UTROverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-UTROverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-UTROverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-UTROverlap-percentMeth done ``` ```{bash} find *UTROverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-UTROverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *UTROverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *UTROverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-UTROverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-UTROverlap-percentMeth-Medians.txt ``` ## Exons ```{bash} head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exon_sorted_yrv.bed #Check track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: exon track #Only keep specified columns #Save output in a new file that has the same base name and ends in -promoterOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exon_sorted_yrv.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \ > ${f}-exonOverlap done ``` ```{bash} find *exonOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-exonOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in exonOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *exonOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *exonOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-exonOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-exonOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-exonOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-exonOverlap-percentMeth done ``` ```{bash} find *exonOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-exonOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *exonOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *exonOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-exonOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-exonOverlap-percentMeth-Medians.txt ``` ## Introns ```{bash} head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intron_yrv.bed #Check track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: exon track #Only keep specified columns #Save output in a new file that has the same base name and ends in -intronOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intron_yrv.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \ > ${f}-intronOverlap done ``` ```{bash} find *intronOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intronOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in intronOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *intronOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *intronOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intronOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-intronOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-intronOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-intronOverlap-percentMeth done ``` ```{bash} find *intronOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-intronOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *intronOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *intronOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-intronOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-intronOverlap-percentMeth-Medians.txt ``` ## Transposable elements ```{bash} head -n 7 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_TE-all.gff #Check track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: exon track #Only keep specified columns #Save output in a new file that has the same base name and ends in -intronOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_TE-all.gff \ | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$8}' \ > ${f}-TEOverlap done ``` ```{bash} find *TEOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-TEOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in TEOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *TEOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *TEOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-TEOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-TEOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-TEOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-TEOverlap-percentMeth done ``` ```{bash} find *TEOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-TEOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *TEOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *TEOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-TEOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-TEOverlap-percentMeth-Medians.txt ``` ## Intergenic regions ```{bash} head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intergenic_yrv.bed #Check track ``` ### Run `intersectBed` ```{bash} #For each file that ends in posOnly #Use intersectBed to find where loci and promoters intersect #wb: Print all lines in the second file #a: file that ends in posOnly #b: exon track #Only keep specified columns #Save output in a new file that has the same base name and ends in -intergenicOverlap for f in *posOnly do /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \ -wb \ -a ${f} \ -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intergenic_yrv.bed \ | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \ > ${f}-intergenicOverlap done ``` ```{bash} find *intergenicOverlap ``` ```{bash} head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intergenicOverlap #Confirm file is annotated ``` ### Add back percent methylation information ```{bash} #For each file that ends in intergenicOverlap #Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file #Save the file with the base name + .sorted for f in *intergenicOverlap do awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \ | sort -k1,1 \ > ${f}.sorted done ``` ```{bash} find *intergenicOverlap.sorted ``` ```{bash} head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intergenicOverlap.sorted ``` ```{bash} #For each file that ends in bedgraph #Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited #The first file ends with .sorted #The second file ends with .posOnly-intergenicOverlap.sorted #Add .annotated.percentMeth to the base name of the output file for f in *bedgraph do join -j 1 -t $'\t' \ ${f}.sorted \ ${f}.posOnly-intergenicOverlap.sorted \ | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \ > ${f}-intergenicOverlap-percentMeth done ``` ```{bash} find *intergenicOverlap-percentMeth ``` ```{bash} head 2019-04-09-All-5x-CpGs.bedgraph-intergenicOverlap-percentMeth ``` ### Calculate median methylation ```{bash} #For each file that ends in *intergenicOverlap-percentMeth #Sort numerically #Calculate the median of the sixth column (percent methylation) for each file #Save output in a new file (for f in *intergenicOverlap-percentMeth do sort -n ${f} \ | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' done) > 2020-02-25-intergenicOverlap-percentMeth-Medians.txt ``` ```{bash} cat 2020-02-25-intergenicOverlap-percentMeth-Medians.txt ``` ## Plot median methylation ### Reformat data ```{r} #Import median methylation information promoterMedMeth <- read.delim("2020-02-25-promoterOverlap-percentMeth-Medians.txt", header = FALSE) UTRMedMeth <- read.delim("2020-02-25-UTROverlap-percentMeth-Medians.txt", header = FALSE) exonMedMeth <- read.delim("2020-02-25-exonOverlap-percentMeth-Medians.txt", header = FALSE) intronMedMeth <- read.delim("2020-02-25-intronOverlap-percentMeth-Medians.txt", header = FALSE) TEMedMeth <- read.delim("2020-02-25-TEOverlap-percentMeth-Medians.txt", header = FALSE) intergenicMedMeth <- read.delim("2020-02-25-intergenicOverlap-percentMeth-Medians.txt", header = FALSE) ``` ```{r} featureMedMeth <- cbind(promoterMedMeth, UTRMedMeth, exonMedMeth, intronMedMeth, TEMedMeth, intergenicMedMeth) #Create a dataframe with percent methylation information across features colnames(featureMedMeth) <- c("promoter", "UTR", "exon", "intron", "TE", "intergenic") #Add column names featureMedMeth[12,] <- featureMedMeth[2,] #Move sample 10 information to the end featureMedMeth <- featureMedMeth[-2,] #Remove sample 10 information in the incorrect position row.names(featureMedMeth) <- c("global", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10") #Add row names with sample distinctions featureMedMeth <- as.data.frame(t(featureMedMeth)) #Transpose dataframe tail(featureMedMeth) #Confirm formatting ``` ### All median methylation for all features ```{r} #pdf("2020-02-25-All-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11) par(mar = c(3,5,1,8)) #Change figure boundaries barplot(t(featureMedMeth), beside = TRUE, axes = FALSE, names.arg = c("Promoters", "UTR", "Exons", "Introns", "TE", "Intergenic"), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[4], times = 5)), ylim = c(0,100)) #Create a grouped barplot (beside = TRUE) using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2) mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend("topright", xpd = TRUE, legend = c("Global", "Control", "Treatment"), pch = 22, col = "black", pt.bg = c(plotColors[1], plotColors[2], plotColors[4]), bty = "n", cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ``` ```{r} #pdf("2020-02-25-All-Median-Methylation-Across-Split-Features.pdf", height = 8.5, width = 11) par(mar = c(3,4,0,0), oma = c(0, 2.5, 2, 0), mfrow = c(3,2)) #Change figure boundaries barplot(t(featureMedMeth[1,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the promoter percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " a. Putative promoters", line = -0.5, adj = 0) #Add figure label mtext(side = 2, "Methylation (%)", line = 0, cex = 1.5, outer = TRUE) #Add y-axis outer label barplot(t(featureMedMeth[2,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the UTR percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " b. UTR", line = -0.5, adj = 0) #Add figure label barplot(t(featureMedMeth[3,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the exon percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " c. Exons", line = -0.5, adj = 0) #Add figure label barplot(t(featureMedMeth[4,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intron percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " d. Introns", line = -0.5, adj = 0) #Add figure label barplot(t(featureMedMeth[5,]), beside = TRUE, axes = FALSE, names.arg = c("G", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the TE percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " e. Transposable elements", line = -0.5, adj = 0) #Add figure label barplot(t(featureMedMeth[6,]), beside = TRUE, axes = FALSE, names.arg = c("G", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intergenic percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " f. Intergenic regions", line = -0.5, adj = 0) #Add figure label #dev.off() ``` ### Global methylation only ```{r} #pdf("2020-02-25-Global-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11) par(mar = c(3,5,1,15)) #Change figure boundaries barplot(featureMedMeth$global, axes = FALSE, names.arg = c("", "", "", "", "", ""), col = plotColors[1:6], ylim = c(0,100)) #Create a barplot using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2) mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend("topright", xpd = TRUE, legend = c("Putative Promoters", "UTRs", "Exons", "Introns", "Transposable Elements", "Intergenic"), pch = 22, col = "black", pt.bg = plotColors[1:6], bty = "n", cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ``` ### Samples only ```{r} #pdf("2020-02-25-Sample-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11) par(mar = c(3,5,1,8)) #Change figure boundaries barplot(t(featureMedMeth[,2:11]), beside = TRUE, axes = FALSE, names.arg = c("Promoters", "UTR", "Exons", "Introns", "TE", "Intergenic"), col = c(rep(plotColors[2], times = 5), rep(plotColors[4], times = 5)), ylim = c(0,100)) #Create a grouped barplot (beside = TRUE) using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2) mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend("topright", xpd = TRUE, legend = c("Control", "Treatment"), pch = 22, col = "black", pt.bg = c(plotColors[2], plotColors[4]), bty = "n", cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ``` # Create multipanel plot I'm going to create a multipanel plot with the frequency distribution of CpG types as well as median percent methylation across genome features. ```{r} #pdf("2020-02-25-Methylation-Distribution-Median-Methylation-Across-Features-Multipanel.pdf", height = 8.5, width = 11) plotMatrix <- matrix(c(1,1,0,2,3, 1,1,0,4,5, 1,1,0,6,7), nrow = 3, ncol = 5, byrow = TRUE) #Create a matrix with plot location information. Fill matrix by row. par(mar = c(2, 1, 0, 0), oma = c(4, 5, 2, 2)) #Specify inner and outer margins layout(mat = plotMatrix, widths = c(2, 2, 0.7, 2, 2)) #Create a layout based on plotMatrix. The first and second plots (first column) should be wider. layout.show(n = 7) #Confirm layout looks good hist(x = cpgMethylation$methylation, axes = FALSE, xlab = "", ylab = "", main = "", col = plotColors[1], xaxs = "i", yaxs = "i") #Create base plot axis(side = 1, col = "grey80", at = seq(from = 0, to = 100, by = 10), cex.axis = 1.2) #Add x-axis mtext(side = 1, text = "Methylation (%)", line = 3.5, cex = 1.5) #Add x-axis label axis(side = 2, col = "grey80", las = 2, labels = c("0", "2", "4", "6"), at = c(0, 2e+05, 4e+05, 6e+05), cex.axis = 1.2) #add y-axis mtext(side = 2, text = "Frequency (x100,000)", line = 3, cex = 1.5) #Add y-axis label mtext(side = 3, "a. CpG distribution", line = -1.5, adj = 1) #Add figure label barplot(t(featureMedMeth[1,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the promoter percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " b. Putative promoters", line = -1.5, adj = 0) #Add figure label barplot(t(featureMedMeth[2,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the UTR percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. mtext(side = 3, " c. UTR", line = -1.5, adj = 0) #Add figure label barplot(t(featureMedMeth[3,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the exon percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " d. Exons", line = -1.5, adj = 0) #Add figure label mtext(side = 2, "Methylation (%)", line = 3.5, cex = 1.5) #Add y-axis outer label barplot(t(featureMedMeth[4,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intron percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. mtext(side = 3, " e. Introns", line = -1.5, adj = 0) #Add figure label barplot(t(featureMedMeth[5,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the TE percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis mtext(side = 3, " f. Transposable elements", line = -1.5, adj = 0) #Add figure label barplot(t(featureMedMeth[6,]), beside = TRUE, axes = FALSE, names.arg = rep("", times = 11), col = c(plotColors[1], rep(plotColors[2], times = 5), rep(plotColors[5], times = 5)), ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intergenic percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis. mtext(side = 3, " g. Intergenic regions", line = -1.5, adj = 0) #Add figure label par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot legend(x = 0.16, y = -0.9, xpd = TRUE, horiz = TRUE, x.intersp = 0.5, legend = c("Global", "Control", "Treatment"), pch = 22, col = "black", pt.bg = plotColors[c(1,2,5)], bty = "n", cex = 2.2) #Place a horizontal legend at the bottom of genome feature plots with no box using manual x and y coordinates. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill #dev.off() ``` # Methylated 5x CpGs ## Import data ```{r} allMethLoci <- read.csv("../analyses/2019-03-18-Characterizing-CpG-Methylation/2019-04-09-All-5x-CpG-Loci-Methylated.csv", header = FALSE) #Import file with positions of all methylated loci with 5x coverage colnames(allMethLoci) <- c("chromosome", "start", "end", "methylation") #Add column names head(allMethLoci) #Confirm import ``` ```{r} methLociGeneOverlaps <- read.delim("../analyses/2019-03-18-Characterizing-CpG-Methylation/2019-05-29-All5xCpGs-Genes.txt", header = FALSE, sep = "\t") #Import file with overlaps for methylated loci and genes methLociGeneOverlaps <- methLociGeneOverlaps[,-c(4)] #Remove extra column colnames(methLociGeneOverlaps) <- c("chromosome", "start", "end", "gene-start", "gene-end") #Add column names head(methLociGeneOverlaps) #Confirm import ``` ## Scaled methylated CpG distribution Scaling each gene from 0% to 100%, I want to see where methylated CpG are located. This is useful to see if methylation is occuring in any consistent location for each gene. ```{r} methLociGeneOverlaps$geneLength <- methLociGeneOverlaps$`gene-end` - methLociGeneOverlaps$`gene-start` #Calculate gene length methLociGeneOverlaps$absPosition <- methLociGeneOverlaps$start - methLociGeneOverlaps$`gene-start` #Calculate the absolute position of the methylated CpG in the gene methLociGeneOverlaps$scaledPosition <- methLociGeneOverlaps$absPosition / methLociGeneOverlaps$geneLength #Calculate the scaled position of the methylated CpG in the gene head(methLociGeneOverlaps) #Confirm calculations ``` ```{r} #pdf("2019-10-10-Scaled-Gene-Methylated-Loci.pdf", height = 8.5, width = 11) par(mar = c(5, 6, 2, 2)) #Change figure dimensions hist(methLociGeneOverlaps$scaledPosition, breaks = 100, axes = FALSE, xlab = "", ylab = "", main = "", ylim = c(0,40000), col = "grey80", xaxs = "i", yaxs = "i") #Create base plot with no axes or labels. Include breaks at each percent. axis(side = 1, at = seq(from = 0, to = 1, by = 0.1), col = "grey80", cex = 1.2) #Add x-axis mtext(side = 1, "Scaled Position on Gene", line = 3, cex = 1.5) #Add x-axis label axis(side = 2, at = seq(from = 0, to = 40000, by = 10000), col = "grey80", las = 2, cex = 1.2) #Add y-axis mtext(side = 2, "Number of Methylated CpG", line = 4, cex = 1.5) #Add y-axis label #dev.off() ``` # Methylation island characteristics ## Import data ```{r} methylationIslands <- read.delim("2020-02-06-Methylation-Islands-500_0.02_25-filtered.tab", header = FALSE) #Import methylation island list colnames(methylationIslands) <- c("chr", "MI.start", "MI.end", "mCpGs", "MI.length") #Add column names head(methylationIslands) #Confirm file format ``` ## Calculate statistics ```{r} range(methylationIslands$mCpGs) #Calculate the range of the number of mCpGs median(methylationIslands$mCpGs) #Calculate the median number of mCpGs in an island ``` ```{r} range(methylationIslands$MI.length) #Calculate the range MI length median(methylationIslands$MI.length) #Calculate the median MI length ``` ```{r} methIslandLengthDistr <- hist(methylationIslands$MI.length) #Create histogram and save output to characterize length distribution methIslandLengthDistr$breaks #Look at bins for methylation island length methIslandLengthDistr$counts #Look at counts for number of islands in bins ```