if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("methylKit")
## Bioconductor version 3.8 (BiocManager 1.30.4), R 3.5.1 (2018-07-02)
## Installing package(s) 'methylKit'
##
## The downloaded binary packages are in
## /var/folders/kn/b_4h8b912q3cg5w8v5zg8wfc0000gn/T//RtmpJMWxNq/downloaded_packages
## Update old packages: 'ashr', 'backports', 'bayesm', 'betareg',
## 'BiocManager', 'blob', 'bookdown', 'boot', 'callr', 'car', 'carData',
## 'checkmate', 'clipr', 'cluster', 'coda', 'coin', 'compositions', 'covr',
## 'cowplot', 'curl', 'data.table', 'dbplyr', 'ddalpha', 'dendextend',
## 'DescTools', 'devtools', 'digest', 'DT', 'e1071', 'ellipsis', 'energy',
## 'evaluate', 'FactoMineR', 'farver', 'foreach', 'foreign', 'formatR',
## 'fpc', 'FSA', 'geometry', 'gganimate', 'ggplot2', 'ggpmisc', 'ggpubr',
## 'ggsignif', 'ggthemes', 'gower', 'haven', 'hexbin', 'HH', 'Hmisc',
## 'hms', 'htmlTable', 'htmltools', 'htmlwidgets', 'httpuv', 'httr',
## 'iterators', 'jpeg', 'kernlab', 'KernSmooth', 'knitr', 'lambda.r',
## 'later', 'lava', 'libcoin', 'lpSolve', 'maptools', 'matrixStats',
## 'mclust', 'mgcv', 'mime', 'mixsqp', 'modelr', 'mvtnorm', 'nlme',
## 'numDeriv', 'openssl', 'openxlsx', 'pillar', 'pkgbuild', 'pkgconfig',
## 'plotly', 'plotrix', 'pls', 'prabclus', 'processx', 'prodlim',
## 'progress', 'promises', 'purrr', 'quantreg', 'R.oo', 'R.utils', 'R6',
## 'rcompanion', 'Rcpp', 'RcppArmadillo', 'RcppEigen', 'RcppNumerical',
## 'recipes', 'rlang', 'rmarkdown', 'rngtools', 'robustbase', 'roxygen2',
## 'RSQLite', 'rvest', 'scales', 'selectr', 'sfsmisc', 'shiny',
## 'shinyFiles', 'sp', 'survival', 'sys', 'testthat', 'tibble', 'tinytex',
## 'vegan', 'whisker', 'xfun', 'XML', 'xml2', 'zCompositions', 'zip', 'zoo'
# require needed packages
lapply(c("methylKit","reshape2, here"), require, character.only = TRUE)
## Loading required package: methylKit
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 3.5.2
## Loading required package: reshape2, here
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'reshape2, here'
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] FALSE
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] methylKit_1.8.1 GenomicRanges_1.34.0 GenomeInfoDb_1.18.2
## [4] IRanges_2.16.0 S4Vectors_0.20.1 BiocGenerics_0.28.0
##
## loaded via a namespace (and not attached):
## [1] mclust_5.4.3 Rcpp_1.0.1
## [3] lattice_0.20-38 Rsamtools_1.34.1
## [5] Biostrings_2.50.2 gtools_3.8.1
## [7] assertthat_0.2.1 digest_0.6.18
## [9] R6_2.4.0 plyr_1.8.4
## [11] emdbook_1.3.11 evaluate_0.13
## [13] coda_0.19-2 ggplot2_3.1.1
## [15] pillar_1.3.1 zlibbioc_1.28.0
## [17] rlang_0.4.0 lazyeval_0.2.2
## [19] data.table_1.12.2 R.utils_2.8.0
## [21] R.oo_1.22.0 Matrix_1.2-17
## [23] bbmle_1.0.20 qvalue_2.14.1
## [25] rmarkdown_1.15 splines_3.5.1
## [27] BiocParallel_1.16.6 stringr_1.4.0
## [29] fastseg_1.28.0 RCurl_1.95-4.12
## [31] munsell_0.5.0 DelayedArray_0.8.0
## [33] compiler_3.5.1 numDeriv_2016.8-1
## [35] rtracklayer_1.42.2 xfun_0.6
## [37] pkgconfig_2.0.2 htmltools_0.3.6
## [39] tidyselect_0.2.5 SummarizedExperiment_1.12.0
## [41] tibble_2.1.1 GenomeInfoDbData_1.2.0
## [43] matrixStats_0.54.0 XML_3.98-1.19
## [45] crayon_1.3.4 dplyr_0.8.3
## [47] GenomicAlignments_1.18.1 MASS_7.3-51.4
## [49] bitops_1.0-6 R.methodsS3_1.7.1
## [51] grid_3.5.1 gtable_0.3.0
## [53] magrittr_1.5 scales_1.0.0
## [55] stringi_1.4.3 XVector_0.22.0
## [57] reshape2_1.4.3 limma_3.38.3
## [59] tools_3.5.1 Biobase_2.42.0
## [61] glue_1.3.1 purrr_0.3.2
## [63] yaml_2.2.0 colorspace_1.4-1
## [65] BiocManager_1.30.4 knitr_1.22
Laura Spencer created this notebook to become familiar with the basic methylKit commands, starting from a methylRawList object, which was created in a separate notebook (“01-methylkit.Rmd”). This step takes a long time, so I bypass it here.
Note: most of the text in this notebook is copied directly from the methylKit: User Guide v1.10.0, by Altuna Akalin1 (2019-05-02).
Methylation percentage calls can be calculated from sorted SAM or BAM file(s) from Bismark aligner and read-in to the memory. Bismark is a popular aligner for bisulfite sequencing reads, available here (Krueger and Andrews 2011). processBismarkAln function is designed to read-in Bismark SAM/BAM files as methylRaw or methylRawList objects which store per base methylation calls. SAM files must be sorted by chromosome and read position columns, using ‘sort’ command in unix-like machines will accomplish such a sort easily. BAM files should be sorted and indexed. This could be achieved with samtools (http://www.htslib.org/doc/samtools.html).
File download from here: https://d.pr/f/vRtrqZ
load(here::here("analyses","myobj_18"))
Since we read the methylation data now, we can check the basic stats about the methylation data such as coverage and percent methylation. We now have a methylRawList object which contains methylation information per sample.
The following command prints out percent methylation statistics for sample #1, 2, 3, 9, 10, 11
getMethylationStats(myobj_18[[1]],plot=FALSE,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 75.00 100.00 84.24 100.00 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60%
## 0.00000 50.00000 66.66667 83.33333 100.00000 100.00000 100.00000
## 70% 80% 90% 95% 99% 99.5% 99.9%
## 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
## 100%
## 100.00000
getMethylationStats(myobj_18[[2]],plot=FALSE,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 75.00 100.00 84.82 100.00 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60%
## 0.00000 50.00000 66.66667 83.33333 100.00000 100.00000 100.00000
## 70% 80% 90% 95% 99% 99.5% 99.9%
## 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
## 100%
## 100.00000
getMethylationStats(myobj_18[[3]],plot=FALSE,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 75.00 100.00 84.24 100.00 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60%
## 0.00000 50.00000 66.66667 83.33333 100.00000 100.00000 100.00000
## 70% 80% 90% 95% 99% 99.5% 99.9%
## 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
## 100%
## 100.00000
getMethylationStats(myobj_18[[9]],plot=FALSE,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 66.67 100.00 80.11 100.00 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60%
## 0.00000 33.33333 50.00000 75.00000 87.50000 100.00000 100.00000
## 70% 80% 90% 95% 99% 99.5% 99.9%
## 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
## 100%
## 100.00000
getMethylationStats(myobj_18[[10]],plot=FALSE,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 75.00 100.00 82.85 100.00 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60%
## 0.00000 50.00000 66.66667 81.25000 93.54839 100.00000 100.00000
## 70% 80% 90% 95% 99% 99.5% 99.9%
## 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
## 100%
## 100.00000
getMethylationStats(myobj_18[[11]],plot=FALSE,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 80.00 100.00 84.36 100.00 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60%
## 0.00000 50.00000 66.66667 85.71429 100.00000 100.00000 100.00000
## 70% 80% 90% 95% 99% 99.5% 99.9%
## 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
## 100%
## 100.00000
The following command plots the histogram for percent methylation distribution.The figure below is the histogram and numbers on bars denote what percentage of locations are contained in that bin. Typically, percent methylation histogram should have two peaks on both ends. In any given cell, any given base are either methylated or not. Therefore, looking at many cells should yield a similar pattern where we see lots of locations with high methylation and lots of locations with low methylation.
(Interestingly, our samples do not have a peak at 0% methylated)
for (i in 1:18) {
getMethylationStats(myobj_18[[i]],plot=TRUE,both.strands=FALSE)
}
We can also plot the read coverage per base information in a similar way, again numbers on bars denote what percentage of locations are contained in that bin. Experiments that are highly suffering from PCR duplication bias will have a secondary peak towards the right hand side of the histogram.
for (i in 1:18) {
getCoverageStats(myobj_18[[i]],plot=TRUE,both.strands=FALSE)
}
It might be useful to filter samples based on coverage. Particularly, if our samples are suffering from PCR bias it would be useful to discard bases with very high read coverage. Furthermore, we would also like to discard bases that have low read coverage, a high enough read coverage will increase the power of the statistical tests. The code below creates two objects by filtering the methylRawList and discards bases that have coverage below 10X, and below 5x (but no max coverage is set b/c/ hi.perc=100. Could consider discarding the bases that have more than 99.9th percentile of coverage in each sample.)
filtered10.myobj=filterByCoverage(myobj_18,lo.count=10,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
filtered5.myobj=filterByCoverage(myobj_18,lo.count=5,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
In order to do further analysis, we will need to get the bases covered in all samples. The following function will merge all samples to one object for base-pair locations that are covered in all samples. Setting destrand=TRUE (the default is FALSE) will merge reads on both strands of a CpG dinucleotide. This provides better coverage, but only advised when looking at CpG methylation (for CpH methylation this will cause wrong results in subsequent analyses). In addition, setting destrand=TRUE will only work when operating on base-pair resolution, otherwise setting this option TRUE will have no effect. The unite() function will return a methylBase object which will be our main object for all comparative analysis. The methylBase object contains methylation information for regions/bases that are covered in all samples.
By default, unite function produces bases/regions covered in all samples. That requirement can be relaxed using “min.per.group” option in unite function. For example, to create a methylBase object where only CpGs covered with at least 1 sample per group (i.e. treatment, defined upon reading in Bismark alignment file) will be returned: meth.min=unite(myobj,min.per.group=1L)
In this case, we use the default which is to require coverage across all samples. Also, we create 2 methylBase objects: coverage >=10x, and coverage >=5x
meth_filter10=unite(filtered10.myobj, destrand=TRUE)
meth_filter5=unite(filtered5.myobj, destrand=TRUE)
We can check the correlation (default=Pearson) between samples using getCorrelation. This function will either plot scatter plot and correlation coefficients or just print a correlation matrix. (I’m showing w/o plotting; takes too much time)
getCorrelation(meth_filter10,plot=FALSE) #correlation matrix, 10x coverage
## 1 2 3 4 5 6 7
## 1 1.0000000 0.6474827 0.6666848 0.6762287 0.6392971 0.7017533 0.6541630
## 2 0.6474827 1.0000000 0.6800761 0.7053758 0.6570977 0.6345348 0.6716664
## 3 0.6666848 0.6800761 1.0000000 0.7151097 0.6663944 0.6442118 0.7302352
## 4 0.6762287 0.7053758 0.7151097 1.0000000 0.6602668 0.6587420 0.7051690
## 5 0.6392971 0.6570977 0.6663944 0.6602668 1.0000000 0.6326074 0.7128529
## 6 0.7017533 0.6345348 0.6442118 0.6587420 0.6326074 1.0000000 0.6221249
## 7 0.6541630 0.6716664 0.7302352 0.7051690 0.7128529 0.6221249 1.0000000
## 8 0.6820820 0.6466106 0.6528931 0.6701398 0.6183613 0.6793876 0.6345836
## 9 0.6473843 0.6400475 0.6647439 0.6472976 0.6463275 0.6175413 0.6511168
## 10 0.6532630 0.6483994 0.6618212 0.6665623 0.6579780 0.6329433 0.6705518
## 11 0.6253457 0.6196995 0.6440825 0.6519309 0.6281219 0.6067272 0.6447527
## 12 0.6300573 0.6315207 0.6250652 0.6247597 0.6524651 0.6132448 0.6179600
## 13 0.6582075 0.6595815 0.6690174 0.6605324 0.6575865 0.6471854 0.6610663
## 14 0.6591828 0.6448005 0.6657464 0.6578911 0.6430044 0.6347740 0.6555664
## 15 0.6434660 0.6474434 0.6648987 0.6678681 0.6271321 0.6268320 0.6715743
## 16 0.6619737 0.6517417 0.6542004 0.6521972 0.6514878 0.6253665 0.6544498
## 17 0.6509435 0.6494889 0.6538636 0.6569427 0.6445617 0.6374211 0.6532621
## 18 0.6599287 0.6569770 0.6667105 0.6592379 0.6670609 0.6401146 0.6605606
## 8 9 10 11 12 13 14
## 1 0.6820820 0.6473843 0.6532630 0.6253457 0.6300573 0.6582075 0.6591828
## 2 0.6466106 0.6400475 0.6483994 0.6196995 0.6315207 0.6595815 0.6448005
## 3 0.6528931 0.6647439 0.6618212 0.6440825 0.6250652 0.6690174 0.6657464
## 4 0.6701398 0.6472976 0.6665623 0.6519309 0.6247597 0.6605324 0.6578911
## 5 0.6183613 0.6463275 0.6579780 0.6281219 0.6524651 0.6575865 0.6430044
## 6 0.6793876 0.6175413 0.6329433 0.6067272 0.6132448 0.6471854 0.6347740
## 7 0.6345836 0.6511168 0.6705518 0.6447527 0.6179600 0.6610663 0.6555664
## 8 1.0000000 0.6172210 0.6218287 0.6107849 0.6028061 0.6419895 0.6435372
## 9 0.6172210 1.0000000 0.6528998 0.6222314 0.6280387 0.6537493 0.6546545
## 10 0.6218287 0.6528998 1.0000000 0.7213455 0.6938524 0.7325784 0.7147908
## 11 0.6107849 0.6222314 0.7213455 1.0000000 0.6514273 0.7242441 0.7053775
## 12 0.6028061 0.6280387 0.6938524 0.6514273 1.0000000 0.7190977 0.6804995
## 13 0.6419895 0.6537493 0.7325784 0.7242441 0.7190977 1.0000000 0.7316778
## 14 0.6435372 0.6546545 0.7147908 0.7053775 0.6804995 0.7316778 1.0000000
## 15 0.6355816 0.6199579 0.6802428 0.6789533 0.6280124 0.6837751 0.6827601
## 16 0.6271593 0.6392529 0.7161549 0.7154545 0.6646392 0.7421678 0.7437519
## 17 0.6690379 0.6383191 0.6925906 0.6609847 0.6457725 0.6842226 0.6784860
## 18 0.6367324 0.6486887 0.7138594 0.6949911 0.7104654 0.7641980 0.7360778
## 15 16 17 18
## 1 0.6434660 0.6619737 0.6509435 0.6599287
## 2 0.6474434 0.6517417 0.6494889 0.6569770
## 3 0.6648987 0.6542004 0.6538636 0.6667105
## 4 0.6678681 0.6521972 0.6569427 0.6592379
## 5 0.6271321 0.6514878 0.6445617 0.6670609
## 6 0.6268320 0.6253665 0.6374211 0.6401146
## 7 0.6715743 0.6544498 0.6532621 0.6605606
## 8 0.6355816 0.6271593 0.6690379 0.6367324
## 9 0.6199579 0.6392529 0.6383191 0.6486887
## 10 0.6802428 0.7161549 0.6925906 0.7138594
## 11 0.6789533 0.7154545 0.6609847 0.6949911
## 12 0.6280124 0.6646392 0.6457725 0.7104654
## 13 0.6837751 0.7421678 0.6842226 0.7641980
## 14 0.6827601 0.7437519 0.6784860 0.7360778
## 15 1.0000000 0.6837052 0.6635478 0.6530284
## 16 0.6837052 1.0000000 0.6856313 0.7071251
## 17 0.6635478 0.6856313 1.0000000 0.6825180
## 18 0.6530284 0.7071251 0.6825180 1.0000000
getCorrelation(meth_filter5,plot=FALSE) #correlation matrix, 5x coverage
## 1 2 3 4 5 6 7
## 1 1.0000000 0.5203324 0.5282404 0.5277955 0.5022175 0.5899275 0.5270635
## 2 0.5203324 1.0000000 0.5589683 0.5914152 0.5341853 0.4954366 0.5759710
## 3 0.5282404 0.5589683 1.0000000 0.5906174 0.5438585 0.4999331 0.6128457
## 4 0.5277955 0.5914152 0.5906174 1.0000000 0.5232693 0.5060093 0.5862341
## 5 0.5022175 0.5341853 0.5438585 0.5232693 1.0000000 0.4852987 0.6087552
## 6 0.5899275 0.4954366 0.4999331 0.5060093 0.4852987 1.0000000 0.4903991
## 7 0.5270635 0.5759710 0.6128457 0.5862341 0.6087552 0.4903991 1.0000000
## 8 0.5715679 0.5176748 0.5164720 0.5317901 0.4933246 0.5579952 0.5107628
## 9 0.5302612 0.5317613 0.5352212 0.5247550 0.5151886 0.5025897 0.5288700
## 10 0.5278785 0.5134400 0.5230536 0.5257591 0.5040114 0.5017013 0.5304511
## 11 0.5148343 0.5012486 0.5179590 0.5152580 0.4876532 0.4854534 0.5200991
## 12 0.5054842 0.5073348 0.4942885 0.4958185 0.5077958 0.4901367 0.5000016
## 13 0.5513353 0.5347206 0.5499918 0.5495005 0.5202891 0.5164031 0.5558191
## 14 0.5477871 0.5303936 0.5488113 0.5432307 0.5190625 0.5168156 0.5493498
## 15 0.5349181 0.5160514 0.5289485 0.5418185 0.5065675 0.5084176 0.5309964
## 16 0.5398558 0.5198102 0.5376549 0.5354271 0.5108783 0.5010364 0.5408960
## 17 0.5334805 0.5227479 0.5262968 0.5345332 0.5021054 0.5064655 0.5215438
## 18 0.5297419 0.5291267 0.5375875 0.5319430 0.5223619 0.5088418 0.5348182
## 8 9 10 11 12 13 14
## 1 0.5715679 0.5302612 0.5278785 0.5148343 0.5054842 0.5513353 0.5477871
## 2 0.5176748 0.5317613 0.5134400 0.5012486 0.5073348 0.5347206 0.5303936
## 3 0.5164720 0.5352212 0.5230536 0.5179590 0.4942885 0.5499918 0.5488113
## 4 0.5317901 0.5247550 0.5257591 0.5152580 0.4958185 0.5495005 0.5432307
## 5 0.4933246 0.5151886 0.5040114 0.4876532 0.5077958 0.5202891 0.5190625
## 6 0.5579952 0.5025897 0.5017013 0.4854534 0.4901367 0.5164031 0.5168156
## 7 0.5107628 0.5288700 0.5304511 0.5200991 0.5000016 0.5558191 0.5493498
## 8 1.0000000 0.5123594 0.5097867 0.5048310 0.4844548 0.5262421 0.5302390
## 9 0.5123594 1.0000000 0.5365803 0.5173040 0.5248426 0.5529726 0.5488203
## 10 0.5097867 0.5365803 1.0000000 0.6216062 0.5828120 0.6515680 0.6316178
## 11 0.5048310 0.5173040 0.6216062 1.0000000 0.5355448 0.6369405 0.6098844
## 12 0.4844548 0.5248426 0.5828120 0.5355448 1.0000000 0.6250909 0.5949646
## 13 0.5262421 0.5529726 0.6515680 0.6369405 0.6250909 1.0000000 0.6630315
## 14 0.5302390 0.5488203 0.6316178 0.6098844 0.5949646 0.6630315 1.0000000
## 15 0.5285563 0.5288093 0.5607748 0.5503544 0.5095783 0.5853971 0.5788881
## 16 0.5182491 0.5428453 0.6283167 0.6233602 0.5583828 0.6595183 0.6573833
## 17 0.5292731 0.5383151 0.5647206 0.5511178 0.5263066 0.5815727 0.5732395
## 18 0.5146142 0.5439974 0.5990934 0.5769738 0.6198649 0.6810999 0.6498659
## 15 16 17 18
## 1 0.5349181 0.5398558 0.5334805 0.5297419
## 2 0.5160514 0.5198102 0.5227479 0.5291267
## 3 0.5289485 0.5376549 0.5262968 0.5375875
## 4 0.5418185 0.5354271 0.5345332 0.5319430
## 5 0.5065675 0.5108783 0.5021054 0.5223619
## 6 0.5084176 0.5010364 0.5064655 0.5088418
## 7 0.5309964 0.5408960 0.5215438 0.5348182
## 8 0.5285563 0.5182491 0.5292731 0.5146142
## 9 0.5288093 0.5428453 0.5383151 0.5439974
## 10 0.5607748 0.6283167 0.5647206 0.5990934
## 11 0.5503544 0.6233602 0.5511178 0.5769738
## 12 0.5095783 0.5583828 0.5263066 0.6198649
## 13 0.5853971 0.6595183 0.5815727 0.6810999
## 14 0.5788881 0.6573833 0.5732395 0.6498659
## 15 1.0000000 0.5760357 0.5620237 0.5574024
## 16 0.5760357 1.0000000 0.5793753 0.6167757
## 17 0.5620237 0.5793753 1.0000000 0.5624481
## 18 0.5574024 0.6167757 0.5624481 1.0000000
We can cluster the samples based on the similarity of their methylation profiles. The following function will cluster the samples and draw a dendrogram.
clusterSamples(meth_filter10, dist="correlation", method="ward", plot=TRUE) #cluster analysis 10x, correlation matrix
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 18
clusterSamples(meth_filter5, dist="correlation", method="ward", plot=TRUE) #cluster analysis 5x, correlation matrix
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 18
clusterSamples(meth_filter10, dist="manhattan", method="ward", plot=TRUE) #cluster analysis 10x, manhattan distance matrix
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : manhattan
## Number of objects: 18
clusterSamples(meth_filter5, dist="manhattan", method="ward", plot=TRUE) #cluster analysis 5x, manhattan distance matrix
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : manhattan
## Number of objects: 18
PCASamples(meth_filter10)
PCASamples(meth_filter5)
Percent methylation matrix (rows=region/base, columns=sample) can be extracted from methylBase object by using percMethylation function. This can be useful for downstream analyses.
perc.meth10=percMethylation(meth_filter10, rowids=T)
perc.meth5=percMethylation(meth_filter5, rowids=T)
Extract correlation matrix (pearson) and distance matrix (manhattan) from the % methylated matrix FYI I checked that the correlation matrix in getCorrelation
is derived from the percent methylated matrix. It is!
cor.pears.10 <- cor(perc.meth10, method="pearson") # create pearson correlation matrix.
dist.manhat.10 <- dist(t(perc.meth10), method = "manhattan", diag = T, upper = F)
dist.manhat.10.df <- reshape2::melt(as.matrix(dist.manhat.10), varnames = c("row", "col"))
cor.pears.5 <- cor(perc.meth5, method="pearson") # create pearson correlation matrix.
dist.manhat.5 <- dist(t(perc.meth5), method = "manhattan", diag = T, upper = F) #must transform % matrix (samples are rows) for the dist() function
dist.manhat.5.df <- reshape2::melt(as.matrix(dist.manhat.5), varnames = c("row", "col"))
key <- read.csv(here::here("data", "sample-key.csv"))
dist.manhat.10.df <- merge(x=merge(x=dist.manhat.10.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
colnames(dist.manhat.10.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
write.csv(dist.manhat.10.df, file=here::here("analyses", "dist.manhat.10.csv"), row.names=FALSE)
dist.manhat.5.df <- merge(x=merge(x=dist.manhat.5.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
colnames(dist.manhat.5.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
write.csv(dist.manhat.5.df, file=here::here("analyses","dist.manhat.5.csv"), row.names=FALSE)