list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] vegan_2.5-6 lattice_0.20-38 permute_0.9-5
## [4] plotly_4.9.1 viridis_0.5.1 viridisLite_0.3.0
## [7] Pstat_1.2 matrixStats_0.55.0 ggforce_0.3.1
## [10] methylKit_1.8.1 GenomicRanges_1.34.0 GenomeInfoDb_1.18.2
## [13] IRanges_2.16.0 S4Vectors_0.20.1 BiocGenerics_0.28.0
## [16] here_0.1 reshape2_1.4.3 forcats_0.4.0
## [19] stringr_1.4.0 dplyr_0.8.3 purrr_0.3.3
## [22] readr_1.3.1 tidyr_1.0.0 tibble_2.1.3
## [25] ggplot2_3.2.1 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 mclust_5.4.5
## [3] rprojroot_1.3-2 qvalue_2.14.1
## [5] XVector_0.22.0 fs_1.3.1
## [7] rstudioapi_0.10 farver_2.0.1
## [9] fansi_0.4.0 mvtnorm_1.0-11
## [11] lubridate_1.7.4 xml2_1.2.2
## [13] splines_3.5.1 R.methodsS3_1.7.1
## [15] knitr_1.26 polyclip_1.10-0
## [17] zeallot_0.1.0 jsonlite_1.6
## [19] Rsamtools_1.34.1 broom_0.5.3
## [21] cluster_2.1.0 dbplyr_1.4.2
## [23] R.oo_1.23.0 compiler_3.5.1
## [25] httr_1.4.1 backports_1.1.5
## [27] assertthat_0.2.1 Matrix_1.2-18
## [29] lazyeval_0.2.2 limma_3.38.3
## [31] cli_2.0.0 tweenr_1.0.1
## [33] htmltools_0.4.0 tools_3.5.1
## [35] coda_0.19-3 gtable_0.3.0
## [37] glue_1.3.1 GenomeInfoDbData_1.2.0
## [39] Rcpp_1.0.3 bbmle_1.0.22
## [41] Biobase_2.42.0 cellranger_1.1.0
## [43] vctrs_0.2.1 Biostrings_2.50.2
## [45] nlme_3.1-143 rtracklayer_1.42.2
## [47] xfun_0.11 fastseg_1.28.0
## [49] rvest_0.3.5 lifecycle_0.1.0
## [51] gtools_3.8.1 XML_3.98-1.20
## [53] zlibbioc_1.28.0 MASS_7.3-51.5
## [55] scales_1.1.0 hms_0.5.2
## [57] SummarizedExperiment_1.12.0 yaml_2.2.0
## [59] gridExtra_2.3 emdbook_1.3.11
## [61] bdsmatrix_1.3-3 stringi_1.4.3
## [63] BiocParallel_1.16.6 rlang_0.4.2
## [65] pkgconfig_2.0.3 bitops_1.0-6
## [67] evaluate_0.14 htmlwidgets_1.5.1
## [69] GenomicAlignments_1.18.1 tidyselect_0.2.5
## [71] plyr_1.8.5 magrittr_1.5
## [73] R6_2.4.1 generics_0.0.2
## [75] DelayedArray_0.8.0 DBI_1.1.0
## [77] mgcv_1.8-31 pillar_1.4.3
## [79] haven_2.2.0 withr_2.1.2
## [81] RCurl_1.95-4.12 modelr_0.1.5
## [83] crayon_1.3.4 rmarkdown_2.0
## [85] grid_3.5.1 readxl_1.3.1
## [87] data.table_1.12.8 reprex_0.3.0
## [89] digest_0.6.23 numDeriv_2016.8-1.1
## [91] R.utils_2.9.2 munsell_0.5.0
getwd()
## [1] "/Users/laura/Documents/roberts-lab/paper-oly-mbdbs-gen/code"
gannet
Files were downloaded from gannet on March 9th, 2020 from https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/ using curl -O [URL]
:
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_1_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_2_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_3_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_4_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_5_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_6_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_7_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_8_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_9_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_10_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_11_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_12_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_13_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_14_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_15_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_16_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_17_s456_trimmed_bismark_bt2.deduplicated.sorted.bam https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_18_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
file.list_18=list("/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_1_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_2_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_3_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_4_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_5_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_6_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_7_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_8_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_9_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_10_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_11_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_12_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_13_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_14_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_15_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_16_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_17_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Peach\ Backup/oly-mbdseq-bismark-files_sorted/zr1394_18_s456_trimmed_bismark_bt2.deduplicated.sorted.bam")
It also assigns minimum coverage of 2x and the treatments (in this case, the Olympia oyster population)
myobj_18 = processBismarkAln(location = file.list_18, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18"), assembly = "v071", read.context="CpG", mincov=2, treatment = c(0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1))
Object prepared by Steven in 2019 is also available at https://d.pr/f/vRtrqZ Object prepared by Laura in March 2020 is available at TBD
save(myobj_18, file = "../analyses/myobj_18")
#zip ../analyses/myobj_18.zip ../analyses/myobj_18
Load object if needed
load("../analyses/myobj_18")
getMethylationStats(myobj_18[[2]],plot=F,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 75.00 100.00 84.82 100.00 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 50.00000 66.66667 83.33333 100.00000 100.00000 100.00000 100.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
head(myobj_18[[5]])
## chr start end strand coverage numCs numTs
## 1 Contig0 37 37 - 2 1 1
## 2 Contig0 483 483 - 3 3 0
## 3 Contig0 1003 1003 + 2 2 0
## 4 Contig0 1036 1036 + 3 1 2
## 5 Contig0 955 955 + 2 2 0
## 6 Contig0 1004 1004 - 2 2 0
getMethylationStats(myobj_18[[13]],plot=T,both.strands=FALSE)
getCoverageStats(myobj_18[[5]],plot=TRUE,both.strands=FALSE)
getCoverageStats(myobj_18[[13]],plot=TRUE,both.strands=FALSE)
filtered.myobj=filterByCoverage(myobj_18,lo.count=10,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
meth_filter=methylKit::unite(filtered.myobj, destrand=TRUE, min.per.group=7L)
save(meth_filter, file = "../analyses/methylation-filtered/R-objects/meth_filter") #save object to file
Percent methylation matrix (rows=region/base, columns=sample) can be extracted from methylBase object by using percMethylation function. This can be useful for downstream analyses.
Here I create % methylation matrices from the filtered object, and use it to do another cluster analysis
perc.meth=percMethylation(meth_filter, rowids=T)
hist(perc.meth, col="gray50" )
# Save % methylation df to object and .tab file
save(perc.meth, file = "../analyses/methylation-filtered/R-objects/perc.meth") #save object to file
write.table((as.data.frame(perc.meth) %>% tibble::rownames_to_column("contig")), file = here::here("analyses", "methylation-filtered", "percent-methylation-filtered.tab"), sep = '\t', na = "NA", row.names = FALSE, col.names = TRUE)
# What percentage of loci have ZERO methylation?
100*table(perc.meth==0)[2]/table(perc.meth==0)[1] # Only 0.8% of loci unmethylated, averaged across all samples
## TRUE
## 0.8116747
# Mean % methylation across all samples
mean(perc.meth, na.rm=TRUE) # 89.6%
## [1] 89.5939
# Mean % methylation for each populaiton
mean(perc.meth[,1:9], na.rm=TRUE) # 89.6% <-- Hood Canal
## [1] 89.58982
mean(perc.meth[,10:18], na.rm=TRUE) # 89.6% <-- South Sound
## [1] 89.59771
perc.meth.summ <- data.frame(sample=1:18, methylated=1:18, coverage=1:18, mean.perc=1:18)
for (i in 1:18) {
perc.meth.summ[i,"sample"] <- i
perc.meth.summ[i,"methylated"] <-mean(as.vector(getData(meth_filter)[,grep(c("numCs"), colnames(meth_filter))][i][,1]), na.rm=T)
perc.meth.summ[i,"coverage"] <- mean(as.vector(getData(meth_filter)[,grep(c("coverage"), colnames(meth_filter))][i][,1]), na.rm=T)
perc.meth.summ[i,"mean.perc"] <- mean(as.vector(getData(meth_filter)[,grep(c("numCs"), colnames(meth_filter))][i][,1]) /
as.vector(getData(meth_filter)[,grep(c("coverage"), colnames(meth_filter))][i][,1]), na.rm=T)
}
mean(perc.meth.summ[1:9,"mean.perc"])
## [1] 0.8950195
mean(perc.meth.summ[10:18,"mean.perc"])
## [1] 0.8955731
myDiff=calculateDiffMeth(meth_filter,mc.cores=4)
## two groups detected:
## will calculate methylation difference as the difference of
## treatment (group: 1) - control (group: 0)
# get hyper methylated bases
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
#
# get hypo methylated bases
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
#
#
# get all differentially methylated bases
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)
write.table(myDiff25p, file = "../analyses/DMLs/myDiff25p.tab", sep = "\t", col.names = TRUE)
save(myDiff25p, file="../analyses/DMLs/myDiff25p")
# Write out bedfile for DMLs
dml25 <- myDiff25p %>%
mutate(start = start-1, end = end+1) %>%
dplyr::select(chr, start, end, meth.diff)
## Warning in class(x) <- c(subclass, tibble_class): Setting class(x) to multiple
## strings ("tbl_df", "tbl", ...); result will no longer be an S4 object
write_delim(dml25, "../analyses/DMLs/dml25.bed", delim = '\t', col_names = TRUE)
save(dml25, file="../analyses/DMLs/R-objects/dml25")
# Write out bedfile for all loci fed into DML analysis, to use as background for enrichment
mydiff.all <- mutate(myDiff, start = start -1, end = end + 1) %>% dplyr::select(chr, start, end, meth.diff) %>%
mutate_if(is.numeric, as.integer)
## Warning in class(x) <- c(subclass, tibble_class): Setting class(x) to multiple
## strings ("tbl_df", "tbl", ...); result will no longer be an S4 object
write_delim(mydiff.all, "../analyses/DMLs/mydiff-all.bed", delim = '\t', col_names = TRUE)
save(mydiff.all, file="../analyses/DMLs/R-objects/mydiff.all")
nrow(dml25)
## [1] 359
dml25_counts <- selectByOverlap(meth_filter,as(myDiff25p,"GRanges"))
class(dml25_counts) #class should be methylBase
## [1] "methylBase"
## attr(,"package")
## [1] "methylKit"
perc.methDMLs= percMethylation(dml25_counts, rowids=T) #create a percent methylated object for DMLs
# save count and percent objects to be used in subsequent notebook
save(dml25_counts, file = "../analyses/DMLs/R-objects/dml25_counts")
save(perc.methDMLs, file="../analyses/DMLs/R-objects/perc.methDMLs")
FYI I checked that the correlation matrix in getCorrelation
is derived from the percent methylated matrix. It is!
cor.pears <- cor(perc.meth, method="pearson") # create pearson correlation matrix.
dist.manhat <- dist(t(perc.meth), method = "manhattan", diag = T, upper = F)
dist.manhat.df <- reshape2::melt(as.matrix(dist.manhat), varnames = c("row", "col"))
cor.pears.DMLs <- cor(perc.methDMLs, method="pearson")
dist.manhat.DMLs <- dist(t(perc.methDMLs), method = "manhattan", diag = T, upper = F)
dist.manhat.DMLs.df <- reshape2::melt(as.matrix(dist.manhat.DMLs), varnames = c("row", "col"))
key <- read.csv(here::here("data", "sample-key.csv"))
dist.manhat.df <- merge(x=merge(x=dist.manhat.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
colnames(dist.manhat.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
write.csv(dist.manhat.df, file=here::here("analyses", "methylation-filtered", "dist.manhat.csv"), row.names=FALSE)
dist.manhat.DMLs.df <- merge(x=merge(x=dist.manhat.DMLs.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
colnames(dist.manhat.DMLs.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
write.csv(dist.manhat.DMLs.df, file=here::here("analyses", "methylation-filtered", "dist.manhat.DMLs.csv"), row.names=FALSE)
Pst was defined by Johnson & Kelly, 2019:
“[Pst is] … a measurement of epigenetic divergence (st). We defined PST as the methylation analogue of Wrights FST and calculated it for each locus by subtracting the total variance in methylation in all populations from the variance within a single population and divided by the variance in all populations (PST = (VarianceTotal − VarianceSub)/VarianceTotal; Leinonen, McCairns, O’Hara, & Merilä, 2013).”
I looked through their code, and it’s not totally clear how they did this calculation. BUT I believe they worked with % methylation data at each locus, so I will do the same.
head(perc.meth, 3)
## 1 2 3 4 5 6
## Contig0.39226.39226 52.38095 77.27273 45.45455 60.86957 39.13043 61.90476
## Contig0.39234.39234 41.66667 25.92593 37.03704 19.23077 42.85714 44.00000
## Contig0.64179.64179 90.00000 71.42857 NA 94.44444 72.72727 100.00000
## 7 8 9 10 11 12 13
## Contig0.39226.39226 45.00000 62.50000 58.06452 52.94118 NA 65.21739 58.82353
## Contig0.39234.39234 30.43478 33.33333 22.22222 36.84211 NA 34.48276 42.10526
## Contig0.64179.64179 38.46154 91.66667 61.53846 NA NA 73.68421 72.72727
## 14 15 16 17 18
## Contig0.39226.39226 50.00000 40.0 61.53846 42.10526 33.33333
## Contig0.39234.39234 41.66667 48.0 29.41176 52.17391 31.25000
## Contig0.64179.64179 76.19048 87.5 100.00000 100.00000 46.66667
The following R code was provided by Kevin Johnson
perc.meth.Pst <- read_delim(file = here::here("analyses", "methylation-filtered", "percent-methylation-filtered.tab"), delim = "\t", col_names = TRUE)
## Parsed with column specification:
## cols(
## contig = col_character(),
## `1` = col_double(),
## `2` = col_double(),
## `3` = col_double(),
## `4` = col_double(),
## `5` = col_double(),
## `6` = col_double(),
## `7` = col_double(),
## `8` = col_double(),
## `9` = col_double(),
## `10` = col_double(),
## `11` = col_double(),
## `12` = col_double(),
## `13` = col_double(),
## `14` = col_double(),
## `15` = col_double(),
## `16` = col_double(),
## `17` = col_double(),
## `18` = col_double()
## )
apply(perc.meth.Pst, 2, function(x) any(is.na(x))) #any NA values?
## contig 1 2 3 4 5 6 7 8 9 10
## FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 11 12 13 14 15 16 17 18
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#Here I removed rows with NA values, the Pstat function will work with some NA values, but to speed things up I removed them
perc.meth.Pst2 <- perc.meth.Pst[c(complete.cases(perc.meth.Pst)),]
apply(perc.meth.Pst2, 2, function(x) any(is.na(x))) #confirm no more NA values
## contig 1 2 3 4 5 6 7 8 9 10
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 11 12 13 14 15 16 17 18
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#I also removed rows that had a mean methylation value of 0 as these throw off the calculation
perc.meth.Pst2 <- perc.meth.Pst2[c(rowMeans(perc.meth.Pst2[-1] )>0),]
#If you want to see how this influences the Pst.hc vs Pst.ss comparison run the next line:
#perc.meth.Pst <- perc.meth.Pst2
#Now I modify the data a little bit, first setting contig as row name
#perc.meth.Pst2 <- as.data.frame(perc.meth.Pst)
row.names(perc.meth.Pst2) <- perc.meth.Pst2$contig
## Warning: Setting row names on a tibble is deprecated.
perc.meth.Pst2$contig = NULL
#Now I flip the data frame so that each column is a unique loci
perc.meth.Pst2 <- as.data.frame(t(perc.meth.Pst2))
#Now add a new column that has population assignment
perc.meth.Pst2$Population <- c("HC","HC","HC","HC","HC","HC","HC","HC","HC","SS","SS","SS","SS","SS","SS","SS","SS","SS")
ncol(perc.meth.Pst2)
## [1] 7826
#reorder these columns so that population is the first column
perc.meth.Pst2 <- perc.meth.Pst2[,c(7826,1:7825)]
require(Pstat)
#Now run the following line and it will provide Pst estimates for every gene. NOTE: this can take a long time.
perc.meth.Pst.done <- Pst(perc.meth.Pst2)
## [1] "Populations sizes are:"
## HC SS
## 9 9
save(perc.meth.Pst.done, file="../analyses/methylation-filtered/R-objects/perc.meth.Pst.done")
# Load Pst dataframe if needed.
#load("../analyses/methylation-filtered/R-objects/perc.meth.Pst.done")
hist(perc.meth.Pst.done$Pst_Values, col = "gray50")
summary(perc.meth.Pst.done$Pst_Values)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.06509 0.23269 0.28724 0.47375 0.95411
nrow(perc.meth.Pst.done)
## [1] 7825
# Write out Pst results
write.table(perc.meth.Pst.done, file = here::here("analyses", "methylation-filtered", "Pst_locus.tab"), sep = '\t', na = "NA", row.names = FALSE, col.names = FALSE)
The function clusters samples using hclust function and various distance metrics derived from percent methylation per base or per region for each sample.
clusterSamples(meth_filter, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 18
Customized methylKit
biplots from Yaamini Venkataraman’s code, https://github.com/epigeneticstoocean/paper-gonad-meth/blob/master/code/04-methylkit.Rmd#L270
#Create dataframe with sample treatment information, color & symbol scheme
plotCustomization <- data.frame(sample = 1:18, population=c(
rep("HC", times=9),
rep("SS", times=9)),
color=c(rep("firebrick3", times=9),
rep("dodgerblue3", times=9)),
symbol=c(rep(16, times=9),
rep(17, times=9)))
PCA.filtered <- PCASamples(meth_filter, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
summary(PCA.filtered) #Look at summary statistics. The first PC explains 21.9% of variation, the second PC explains 18.3% of variation
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 21.8973 18.26734 17.60077 16.93541 16.19327 15.19167
## Proportion of Variance 0.1225 0.08524 0.07913 0.07326 0.06698 0.05895
## Cumulative Proportion 0.1225 0.20771 0.28684 0.36010 0.42708 0.48603
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 14.8991 14.75004 14.30407 14.06376 13.85301 13.55712
## Proportion of Variance 0.0567 0.05557 0.05226 0.05052 0.04902 0.04695
## Cumulative Proportion 0.5427 0.59830 0.65056 0.70108 0.75010 0.79705
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 13.39880 12.83144 12.67249 12.20035 11.872 3.201e-14
## Proportion of Variance 0.04586 0.04206 0.04102 0.03802 0.036 0.000e+00
## Cumulative Proportion 0.84290 0.88496 0.92598 0.96400 1.000 1.000e+00
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure <- ordiplot(PCA.filtered, choices = c(1, 2), type = "none", display = "sites", cex = 0.5, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 3) #Add each sample. Darker samples are ambient, lighter samples are elevated pCO2
#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (21.9%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (18.3%)", line = 3, cex = 1.5) #Add y-axis label
legend("topleft",
pch = c(16, 17),
legend = c("Hood Canal", "South Sound"),
col = c(c("firebrick3", "dodgerblue3")),
cex = 1.6, bty = "n") #Add a legend with information about ambient and elevated samples
NOTE: this PCA was run with all methylated loci data after filtering for 10-100x across 7 of 9 samples per pop.
save(PCA.filtered, file="../analyses/methylation-filtered/R-objects/PCA.filtered")
write_delim(data.frame(PCA.filtered$x) %>% tibble::rownames_to_column("sample"), "../analyses/methylation-filtered/R-objects/PC-scores-filtered-methylation.tab", delim = '\t', col_names = TRUE)
PCA.DMLs <- PCASamples(dml25_counts, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
summary(PCA.DMLs) #Look at summary statistics. The first PC explains 3.7% of variation, the second PC explains 1.9% of variation
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.2374 1.6346 1.47196 1.36849 1.2932 1.20820 1.02255
## Proportion of Variance 0.4192 0.1069 0.08667 0.07491 0.0669 0.05839 0.04182
## Cumulative Proportion 0.4192 0.5261 0.61277 0.68768 0.7546 0.81297 0.85480
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.94244 0.84787 0.70428 0.62673 0.5701 0.53070 0.46986
## Proportion of Variance 0.03553 0.02876 0.01984 0.01571 0.0130 0.01127 0.00883
## Cumulative Proportion 0.89032 0.91908 0.93892 0.95463 0.9676 0.97890 0.98773
## PC15 PC16 PC17 PC18
## Standard deviation 0.42949 0.30375 0.1734 4.54e-16
## Proportion of Variance 0.00738 0.00369 0.0012 0.00e+00
## Cumulative Proportion 0.99511 0.99880 1.0000 1.00e+00
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure <- ordiplot(PCA.DMLs, choices = c(1, 2), type = "none", display = "sites", cex = 0.5, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 3) #Add each sample. Darker samples are ambient, lighter samples are elevated pCO2
#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
ordiellipse(PCA.DMLs, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
ordiellipse(PCA.DMLs, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (3.7%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (1.9%)", line = 3, cex = 1.5) #Add y-axis label
legend("bottomright",
pch = c(16, 17),
legend = c("Hood Canal", "South Sound"),
col = c(c("firebrick3", "dodgerblue3")),
cex = 1.6, bty = "n") #Add a legend with information about ambient and elevated samples
NOTE: this PCA was run with DMLs only
save(PCA.DMLs, file="../analyses/DMLs/R-objects/PCA.DMLs")
write_delim(data.frame(PCA.DMLs$x) %>% tibble::rownames_to_column("sample"), "../analyses/DMLs/PC-scores-DMLs.tab", delim = '\t', col_names = TRUE)
Pst.100 <- perc.meth.Pst.done %>%
mutate(Quant_Varia=as.character(Quant_Varia)) %>%
separate(Quant_Varia, c("contig", "start", "end"), sep = "\\.") %>%
mutate_at(vars(start, end), funs(as.integer)) %>%
mutate(contig_start=paste(contig, start, sep="_")) %>%
top_n(100, Pst_Values)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
Pst.100.ranges=GRanges(seqnames=Pst.100$contig,
ranges=IRanges(start=Pst.100$start,width=1))
# Run PCA and simultaneously select methylKit records that lie inside the Pst100 regions
PCA.Pst100 <- PCASamples(selectByOverlap(meth_filter,Pst.100.ranges),
obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
summary(PCA.Pst100) #The first PC explains 5.1% of variation, the second PC explains 2.1% of variation
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 5.0605 2.05969 1.7000 1.54940 1.45867 1.38136 1.30626
## Proportion of Variance 0.5122 0.08485 0.0578 0.04801 0.04255 0.03816 0.03413
## Cumulative Proportion 0.5122 0.59701 0.6548 0.70283 0.74538 0.78354 0.81767
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.20860 1.17439 1.15832 1.06358 1.00689 0.95990 0.81261
## Proportion of Variance 0.02921 0.02758 0.02683 0.02262 0.02028 0.01843 0.01321
## Cumulative Proportion 0.84688 0.87447 0.90130 0.92393 0.94420 0.96263 0.97584
## PC15 PC16 PC17 PC18
## Standard deviation 0.72859 0.62211 0.53879 1.076e-15
## Proportion of Variance 0.01062 0.00774 0.00581 0.000e+00
## Cumulative Proportion 0.98645 0.99419 1.00000 1.000e+00
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure <- ordiplot(PCA.Pst100, choices = c(1, 2), type = "none", display = "sites", cex = 0.5, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 3) #Add each sample. Darker samples are ambient, lighter samples are elevated pCO2
#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
ordiellipse(PCA.Pst100, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
ordiellipse(PCA.Pst100, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (5.1%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (2.1%)", line = 3, cex = 1.5) #Add y-axis label
legend("top",
pch = c(16, 17),
legend = c("Hood Canal", "South Sound"),
col = c(c("firebrick3", "dodgerblue3")),
cex = 1.6, bty = "n") #Add a legend with information about ambient and elevated samples
NOTE: this PCA was run with loci with 100-highest Pst values only
save(PCA.Pst100, file="../analyses/methylation-filtered/R-objects/PCA.Pst100")
write_delim(data.frame(PCA.Pst100$x) %>% tibble::rownames_to_column("sample"), "../analyses/methylation-filtered/PC-scores-Pst100.tab", delim = '\t', col_names = TRUE)
Pst.200 <- perc.meth.Pst.done %>%
mutate(Quant_Varia=as.character(Quant_Varia)) %>%
separate(Quant_Varia, c("contig", "start", "end"), sep = "\\.") %>%
mutate_at(vars(start, end), funs(as.integer)) %>%
mutate(contig_start=paste(contig, start, sep="_")) %>%
top_n(200, Pst_Values)
Pst.200.ranges=GRanges(seqnames=Pst.200$contig,
ranges=IRanges(start=Pst.200$start,width=1))
# Run PCA and simultaneously select methylKit records that lie inside the Pst200 regions
PCA.Pst200 <- PCASamples(selectByOverlap(meth_filter,Pst.200.ranges),
obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
summary(PCA.Pst200) #The first PC explains 6.6% of variation, the second PC explains 2.7% of variation
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 6.6246 2.68787 2.40727 2.28591 2.16600 2.1118 1.91507
## Proportion of Variance 0.4389 0.07225 0.05795 0.05225 0.04692 0.0446 0.03667
## Cumulative Proportion 0.4389 0.51110 0.56905 0.62130 0.66822 0.7128 0.74949
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.84514 1.81047 1.80512 1.72831 1.62635 1.53280 1.43904
## Proportion of Variance 0.03405 0.03278 0.03258 0.02987 0.02645 0.02349 0.02071
## Cumulative Proportion 0.78354 0.81631 0.84890 0.87877 0.90522 0.92871 0.94942
## PC15 PC16 PC17 PC18
## Standard deviation 1.42171 1.27435 1.18853 2.29e-15
## Proportion of Variance 0.02021 0.01624 0.01413 0.00e+00
## Cumulative Proportion 0.96963 0.98587 1.00000 1.00e+00
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure <- ordiplot(PCA.Pst200, choices = c(1, 2), type = "none", display = "sites", cex = 0.5, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 3) #Add each sample. Darker samples are ambient, lighter samples are elevated pCO2
#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
ordiellipse(PCA.Pst200, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
ordiellipse(PCA.Pst200, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (6.6%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (2.7%)", line = 3, cex = 1.5) #Add y-axis label
legend("top",
pch = c(16, 17),
legend = c("Hood Canal", "South Sound"),
col = c(c("firebrick3", "dodgerblue3")),
cex = 1.6, bty = "n") #Add a legend with information about ambient and elevated samples
NOTE: this PCA was run with loci with 200-highest Pst values only
save(PCA.Pst200, file="../analyses/methylation-filtered/R-objects/PCA.Pst200")
write_delim(data.frame(PCA.Pst200$x) %>% tibble::rownames_to_column("sample"), "../analyses/methylation-filtered/PC-scores-Pst200.tab", delim = '\t', col_names = TRUE)
meth_filter_reshaped <- melt(data=meth_filter, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
tidyr::separate(col = "variable" , into = c("variable", "sample"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
dcast(chr+start+end+strand+sample ~ variable) %>%
mutate(percMeth = 100*(numCs/coverage)) %>%
mutate(population=as.character(sample))
## Using count as value column: use value.var to override.
meth_filter_reshaped$population <- car::recode(meth_filter_reshaped$population, "c('1', '2', '3', '4', '5', '6', '7', '8', '9')='HC'")
meth_filter_reshaped$population <- car::recode(meth_filter_reshaped$population, "c('10','11','12','13','14','15','16','17','18')='SS'")
readr::write_delim(meth_filter_reshaped, here::here("analyses", "methylation-filtered", "meth_filter_long.tab"), delim = '\t', col_names = FALSE)
save(meth_filter_reshaped, file="../analyses/methylation-filtered/R-objects/meth_filter_reshaped")
# mean % methylation across samples
dml25_counts_unique <- unique(getData(dml25_counts)[,c("chr", "start")]) %>%
mutate(chr_start=paste(chr, start, sep="_"))
#Calculate ratio between mean % methylation in Hood Canal : South Sound
DML.ratios <- meth_filter_reshaped %>%
mutate(chr_start=paste(chr, start, sep="_")) %>%
filter(chr_start %in% dml25_counts_unique$chr_start) %>%
group_by(population, chr, start) %>%
summarise(mean_percentMeth = mean(percMeth, na.rm = TRUE)) %>% #could instead use ... allCs_percent = 100*(sum(numCs)/sum(coverage)),
dcast(chr + start ~ population) %>%
mutate(ratio_HC.SS = HC/SS, #in this ratio column, values <1 = HC hypomethylated, values >1 = SS hypomethylated
chr_start=paste(chr, start, sep="_"))
## Using mean_percentMeth as value column: use value.var to override.
nrow(DML.ratios)==nrow(dml25_counts_unique) #confirm same # loci; should=0
## [1] TRUE
save(DML.ratios, file="../analyses/DMLs/R-objects/DML.ratios")
# For all loci, calculate mean and sd percent methylation across samples by population
DML.calcs <- meth_filter_reshaped %>%
group_by(population, chr, start) %>%
dplyr::summarise(
mean_percMeth = mean(percMeth, na.rm=TRUE),
sd_percMeth=sd(percMeth, na.rm=TRUE),
n()) %>%
filter(chr %in% c(DML.ratios$chr) &
start %in% c(DML.ratios$start))
DMLs_heatmap <- meth_filter_reshaped %>%
mutate(sample=factor(sample, levels = c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18)),
chr_start=paste(chr, start, sep="_")) %>%
filter(
start %in% dml25_counts_unique$start &
chr %in% DML.ratios$chr)
save(DMLs_heatmap, file="../analyses/DMLs/R-objects/DMLs_heatmap")
#ggplotly(
ggplot(DMLs_heatmap, aes(sample, chr_start, fill= percMeth)) + xlab("Sample") + geom_tile(na.rm = T) +
scale_y_discrete(limits=(DML.ratios[order(DML.ratios$ratio_HC.SS),]$chr_start)) +
#scale_fill_viridis(discrete=FALSE)
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
scale_fill_distiller(palette = "YlGnBu", direction = 1) #)
#scale_fill_gradient(low="gray5", high="white")
# First look at coverage for each DML, by population
# use this to call specific contigs
DML.calcs %>%
filter(chr=="Contig26041") %>%
ggplot(aes(x = population, y = mean_percMeth, fill = population,
label=paste0(round(mean_percMeth, digits = 2), "%"))) +
geom_bar(stat="identity", width = 0.5) + ylim(0,110) +
geom_pointrange(aes(ymin=mean_percMeth,
ymax=mean_percMeth+sd_percMeth, width=0.15)) +
geom_text(size=3, vjust=-0.5, hjust=1.25) +
theme_light() + ggtitle("Tonsoku-like protein, DNA repair") +
scale_fill_manual(values=c("firebrick3","dodgerblue3"))
## Warning: Ignoring unknown aesthetics: width
# For loop to plot a list of loci (this plots all of them, but there are >400 DMLs, so I only plot the first 3 here)
for (i in 1:3) {
temp <- meth_filter_reshaped %>%
mutate(chr_start=paste(chr, start, sep="_")) %>%
filter(chr_start == dml25_counts_unique[i,"chr_start"]) %>%
group_by(population, chr, start) %>%
summarise(mean_percentMeth = mean(percMeth, na.rm=TRUE))
print((ggplot(temp, aes(x = population, y = mean_percentMeth, fill = population)) +
geom_bar(stat="identity") + ylim(0,100) +
theme_light() + ggtitle(paste("Locus = ", dml25_counts_unique[i, "chr"], "_", dml25_counts_unique[i, "start"], sep="")) +
scale_fill_manual(values=c("firebrick3","dodgerblue3"))))
}
Old version of dml25 bed file saved at http://gannet.fish.washington.edu/seashell/snaps/dml25.bed
FactoMineR::PCA(perc.meth)
## Warning in FactoMineR::PCA(perc.meth): Missing values are imputed by the mean of
## the variable: you should use the imputePCA function of the missMDA package
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 33738 individuals, described by 18 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
FactoMineR::PCA(perc.methDMLs)
## Warning in FactoMineR::PCA(perc.methDMLs): Missing values are imputed by the
## mean of the variable: you should use the imputePCA function of the missMDA
## package
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 359 individuals, described by 18 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
DML_plots <- list()
for (i in 1:nrow(DML.ratios)) {
test <- DML.calcs %>%
filter(chr==DML.ratios$chr[i] &
start==DML.ratios$start[i])
hypo_SS_plots[[i]] <- ggplot(test, aes(x = population, y = mean_percMeth, fill = population,
label=paste0(round(mean_percMeth, digits = 2), "%"))) +
geom_bar(stat="identity", width = 0.5) + ylim(0,110) +
geom_pointrange(aes(ymin=mean_percMeth,
ymax=mean_percMeth+sd_percMeth, width=0.15)) +
geom_text(size=3, vjust=-0.5, hjust=1.25) +
theme_light() + ggtitle(paste("Contig = ", DML.ratios[i, "chr"], ", Locus = ",
DML.ratios[i, "start"], sep="")) +
scale_fill_manual(values=c("firebrick3","dodgerblue3"))
}
do.call("grid.arrange", DML_plots)
Summarize methylation information over tiling windows, to then assess differentially methylated regions (DMRs). The function below tiles the genome with windows 1000bp length and 1000bp step-size and summarizes the methylation information on those tiles. In this case, it returns a methylRawList object which can be fed into unite and calculateDiffMeth functions consecutively to get differentially methylated regions. The tilling function adds up C and T counts from each covered cytosine and returns a total C and T count for each tile.
#tiles_1000=tileMethylCounts(filtered.myobj,win.size=1000,step.size=1000, mc.cores=4)
head(tiles_1000[[1]],3)
#tiles_200=tileMethylCounts(filtered.myobj,win.size=200,step.size=200, mc.cores=4)
#head(tiles_200)
meth_tile_filter=methylKit::unite(tiles_1000, destrand=TRUE)
meth_tile_filter
clusterSamples(meth_tile_filter, dist="correlation", method="ward", plot=TRUE)
PCASamples(meth_tile_filter)
myDiff_tiled=calculateDiffMeth(meth_tile_filter,mc.cores=4)
# get hyper methylated regions
myDiff_tiled_25p.hyper=getMethylDiff(myDiff_tiled,difference=25,qvalue=0.01,type="hyper")
#
# get hypo methylated regions
myDiff_tiled_25p.hypo=getMethylDiff(myDiff_tiled,difference=25,qvalue=0.01,type="hypo")
#
# get all differentially methylated regions (25% different)
myDiff_tiled_25p=getMethylDiff(myDiff_tiled,difference=25,qvalue=0.01) #only 2 regions!
write.table((as.data.frame(perc.meth) %>% tibble::rownames_to_column("contig")), file = here::here("analyses", "percent-methylation-filtered.tab"), sep = '\t', na = "NA", row.names = FALSE, col.names = TRUE)
require(matrixStats)
require(readr)
require(tidyverse)
perc.meth.Pst <- read_delim(file = "../analyses/percent-methylation-filtered.tab", delim = "\t", col_names = TRUE)
perc.meth.Pst$var_all <- rowVars((perc.meth.Pst[-1] %>% as.matrix()), na.rm = TRUE) #calculate % methylation variance across all samples
perc.meth.Pst$var_hc <- rowVars((perc.meth.Pst[,2:10] %>% as.matrix()), na.rm = TRUE) #calculate % methylation variance across Hood Canal samples (sample #1 - #9)
perc.meth.Pst$var_ss <- rowVars((perc.meth.Pst[,11:19] %>% as.matrix()), na.rm = TRUE) #calculate % methylation variance across South Sound samples (sample #10 - #18)
perc.meth.Pst$Pst.hc <- (perc.meth.Pst$var_all - perc.meth.Pst$var_hc) / perc.meth.Pst$var_all #calculate Pst using Hood Canal variance as the within-group variance
perc.meth.Pst$Pst.ss <- (perc.meth.Pst$var_all - perc.meth.Pst$var_ss) / perc.meth.Pst$var_all #calculate Pst using South Sound variance as the within-group variance
plot(perc.meth.Pst$Pst.hc ~ perc.meth.Pst$Pst.ss) #Do Pst calculated using HC variance and SS variance differ? Yes, not 1:1
hist(perc.meth.Pst$Pst.hc) # Slightly right skewed
hist(perc.meth.Pst$Pst.ss) # Slightly left skewed