list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra", "PerformanceAnalytics", "corrplot", "janitor", "googlesheets4", "ggrepel", "clipr", "plotly") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
# Load MethylKit
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("methylKit")
require(methylKit)
`%!in%` = Negate(`%in%`)
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] BiocManager_1.30.19 clipr_0.8.0
## [3] ggrepel_0.9.2 googlesheets4_1.0.1
## [5] janitor_2.1.0 corrplot_0.92
## [7] PerformanceAnalytics_2.0.4 xts_0.12.2
## [9] zoo_1.8-11 factoextra_1.0.7
## [11] vegan_2.6-4 lattice_0.20-45
## [13] permute_0.9-7 viridis_0.6.2
## [15] viridisLite_0.4.1 Pstat_1.2
## [17] matrixStats_0.62.0 ggforce_0.4.1
## [19] methylKit_1.24.0 GenomicRanges_1.49.0
## [21] GenomeInfoDb_1.34.2 IRanges_2.32.0
## [23] S4Vectors_0.36.0 BiocGenerics_0.44.0
## [25] here_1.0.1 reshape2_1.4.4
## [27] forcats_0.5.2 stringr_1.4.1
## [29] dplyr_1.0.10 purrr_0.3.5
## [31] readr_2.1.3 tidyr_1.2.1
## [33] tibble_3.1.8 ggplot2_3.4.0
## [35] tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1
## [3] plyr_1.8.7 splines_4.2.2
## [5] BiocParallel_1.32.1 digest_0.6.30
## [7] htmltools_0.5.3 fansi_1.0.3
## [9] magrittr_2.0.3 cluster_2.1.4
## [11] tzdb_0.3.0 limma_3.54.0
## [13] Biostrings_2.66.0 modelr_0.1.9
## [15] R.utils_2.12.2 bdsmatrix_1.3-6
## [17] timechange_0.1.1 colorspace_2.0-3
## [19] rvest_1.0.3 haven_2.5.1
## [21] xfun_0.34 crayon_1.5.2
## [23] RCurl_1.98-1.9 jsonlite_1.8.3
## [25] glue_1.6.2 polyclip_1.10-4
## [27] gtable_0.3.1 gargle_1.2.1
## [29] zlibbioc_1.44.0 XVector_0.38.0
## [31] DelayedArray_0.23.2 scales_1.2.1
## [33] mvtnorm_1.1-3 DBI_1.1.3
## [35] Rcpp_1.0.9 emdbook_1.3.12
## [37] mclust_6.0.0 httr_1.4.4
## [39] ellipsis_0.3.2 pkgconfig_2.0.3
## [41] XML_3.99-0.12 R.methodsS3_1.8.2
## [43] farver_2.1.1 sass_0.4.2
## [45] dbplyr_2.2.1 utf8_1.2.2
## [47] tidyselect_1.2.0 rlang_1.0.6
## [49] munsell_0.5.0 cellranger_1.1.0
## [51] tools_4.2.2 cachem_1.0.6
## [53] cli_3.4.1 generics_0.1.3
## [55] fastseg_1.44.0 broom_1.0.1
## [57] evaluate_0.18 fastmap_1.1.0
## [59] yaml_2.3.6 knitr_1.40
## [61] fs_1.5.2 nlme_3.1-160
## [63] R.oo_1.25.0 xml2_1.3.3
## [65] compiler_4.2.2 rstudioapi_0.14
## [67] reprex_2.0.2 tweenr_2.0.2
## [69] bslib_0.4.1 stringi_1.7.8
## [71] Matrix_1.5-1 vctrs_0.5.0
## [73] pillar_1.8.1 lifecycle_1.0.3
## [75] jquerylib_0.1.4 data.table_1.14.4
## [77] bitops_1.0-7 rtracklayer_1.58.0
## [79] qvalue_2.30.0 R6_2.5.1
## [81] BiocIO_1.8.0 gridExtra_2.3
## [83] codetools_0.2-18 MASS_7.3-58.1
## [85] gtools_3.9.3 assertthat_0.2.1
## [87] SummarizedExperiment_1.28.0 rprojroot_2.0.3
## [89] rjson_0.2.21 withr_2.5.0
## [91] GenomicAlignments_1.34.0 Rsamtools_2.14.0
## [93] GenomeInfoDbData_1.2.9 mgcv_1.8-41
## [95] parallel_4.2.2 hms_1.1.2
## [97] quadprog_1.5-8 grid_4.2.2
## [99] coda_0.19-4 snakecase_0.11.0
## [101] rmarkdown_2.17 MatrixGenerics_1.10.0
## [103] googledrive_2.0.0 bbmle_1.0.25
## [105] numDeriv_2016.8-1.1 Biobase_2.58.0
## [107] lubridate_1.9.0 restfulr_0.0.15
I ran this code interactively on Sedna
srun --pty /bin/bash
# First, I entered an interactive
Sedna node module load R
# Then, load R R
#start R interactively Now proceed with the following code
# ## AGAIN- this code was run interactively on Sedna!
# # Install and load programs
#
# # Load MethylKit
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("methylKit")
#
# require(methylKit)
# require(tidyverse)
#
# # Load sample info dataframe
# load(file="/home/lspencer/DuMOAR/R/sample.info")
#
# # Triple check that treatments are in the order of sequence sample number (1-20)
# (sample.info %>% arrange(sample_seq))$sample # List out library prep sample number
# (sample.info %>% arrange(sample_seq))$sample_seq # List out sequence sample
# (sample.info %>% arrange(sample_seq))$high_or_low_co2 # List out treatment
#
# # Read .bam files into Methylkit on Sedna
# (file.list=list("/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_06_S1_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_14_S2_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_22_S3_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_38_S4_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_04_S5_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_15_S6_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_33_S7_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_01_S8_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_06_S9_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_21_S10_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_24_S11_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_06_S12_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_11_S13_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_24_S14_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_02_S15_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_13_S16_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_28_S17_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_01_S18_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_08_S19_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_11_S20_R1_bismark_bt2_pe.deduplicated.sorted.bam"))
#
# #### The following command reads sorted BAM files and calls methylation percentage per base, and creates a methylRaw object for CpG methylation. It also assigns minimum coverage of 2x and the treatments (in this case, the CO2 treatment)
#
# meth_obj = processBismarkAln(location = file.list, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20"),
# assembly = "PGA_assembly.fasta", read.context="CpG", mincov=2,
# treatment = as.numeric((sample.info %>% arrange(sample_seq))$high_or_low_co2))
#
# #### Save the MethylKit object; re-doing the previous step is memory/time intensive, so best to use the saved object moving forward.
#
# save(meth_obj, file = "/home/lspencer/DuMOAR/R/meth_obj")
getwd()
## [1] "C:/Users/laura.spencer/Work/DuMOAR/scripts"
# Read in data from GoogleSheet
sample.info <- read_sheet("https://docs.google.com/spreadsheets/d/1ym0XnYVts98tIUCn0kIaU6VuvqxzV7LoSx9RHwLdiIs/edit?usp=sharing", "DNAisolations_only") %>%
clean_names() %>%
mutate_at(vars(tank, treatment, high_or_low_co2, developmental_stage, dna_siolation_batch), as.factor) %>%
# Join with bismark summary report
right_join(
# Read in bismark summary report
read_delim(file="../results/bismark/scaffold-only/bismark_summary_report.txt", delim="\t") %>% mutate(File = str_replace_all(File, '_R1_bismark_bt2_pe.bam', "")) %>%
clean_names() %>%
separate(file, into = c("a", "b", "sample_seq"), remove = F, sep = "_") %>%
mutate(sample_prep=paste(a, b, sep = "-")) %>%
mutate(sample_seq=as.numeric(str_replace_all(sample_seq, "S", ""))) %>%
dplyr::select(file, sample_prep, sample_seq, everything(), -a, -b) %>%
mutate(perc_totalread_unique=unique_reads_remaining/total_reads,
CpGs_Meth=100*methylated_cp_gs/(methylated_cp_gs+unmethylated_cp_gs),
CHGs_Meth=100*methylated_chgs/(methylated_chgs+unmethylated_chgs),
CHHs_Meth=100*methylated_ch_hs/(methylated_ch_hs+unmethylated_ch_hs)),
by = c("sample"="sample_prep")) %>%
# add mbd librar prep stats
left_join(
read_sheet("https://docs.google.com/spreadsheets/d/1ym0XnYVts98tIUCn0kIaU6VuvqxzV7LoSx9RHwLdiIs/edit?usp=sharing", "MBD_calcs") %>%
clean_names() %>%
dplyr::select(sample, total_recovery_ng, percent_recovery_percent, library_bioanlyzer_mean_fragment_size_bp,
library_bioanalyzer_molarity_pmol_l, qubit_concentration_ng_u_l),
by = "sample")
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <]8;;https://gargle.r-lib.org/articles/non-interactive-auth.htmlhttps://gargle.r-lib.org/articles/non-interactive-auth.html]8;;>
## ℹ The googlesheets4 package is using a cached token for
## ']8;;mailto:laura.spencer@noaa.govlaura.spencer@noaa.gov]8;;'.
## ✔ Reading from "OA Crab Sample Collection 071119".
## ✔ Range ''DNAisolations_only''.
## Rows: 20 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): File
## dbl (14): Total Reads, Aligned Reads, Unaligned Reads, Ambiguously Aligned R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ✔ Reading from "OA Crab Sample Collection 071119".
##
## ✔ Range ''MBD_calcs''.
##
## New names:
save(sample.info, file="../data/sample.info")
load(file="../data/sample.info")
cor(sample.info %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=0.65)
#trace("chart.Correlation", edit=T) #to edit function
# In line 17, change the cex value for what you want, i changed it to 2
chart.Correlation(sample.info %>%
dplyr::select(CpGs_Meth, a260_280, mean_mw_highest_peak, aligned_reads,
unaligned_reads, ambiguously_aligned_reads, no_genomic_sequence,
unique_reads_remaining, total_cs, perc_totalread_unique, CHGs_Meth, CHHs_Meth),
histogram=F, pch=19)
Question asked by Zymo (creater of PicoMethyl kit) - any correlation between the samples with abnormal CpG methylation and samples that had more yield from the MBD enrichment step?
Answer: no not really
chart.Correlation(sample.info %>%
dplyr::select(CpGs_Meth, total_recovery_ng, percent_recovery_percent,
library_bioanlyzer_mean_fragment_size_bp, library_bioanalyzer_molarity_pmol_l,
qubit_concentration_ng_u_l),
histogram=F, pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
# CpG methylation is not associated with MBD yield
# Use this code to generate
# sample.info %>% dplyr::select(sample_seq, high_or_low_co2, CpGs_Meth,
# total_recovery_ng, percent_recovery_percent, library_bioanlyzer_mean_fragment_size_bp,
# library_bioanalyzer_molarity_pmol_l, qubit_concentration_ng_u_l) %>%
# pivot_longer(cols = c("total_recovery_ng", "percent_recovery_percent", "library_bioanlyzer_mean_fragment_size_bp",
# "library_bioanalyzer_molarity_pmol_l", "qubit_concentration_ng_u_l"), names_to = "MBD.stat") %>%
# mutate(MBD.stat=as.factor(MBD.stat)) %>%
#
# ggplot(aes(x=value, y=CpGs_Meth)) +
# geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
# scale_color_manual(values=c("red", "blue")) +
# geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
# theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
# ggtitle("% CpG Methylation ~ MBD Stats") +
# facet_wrap(~MBD.stat, scales = "free")
# Weak correlation with MBD library fragment size, but not convincing
ggplot(sample.info, aes(x=library_bioanlyzer_mean_fragment_size_bp, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 500, y = 17, label = "r = 0.42", size=6) +
ggtitle("% CpG Methylation ~ library_bioanlyzer_mean_fragment_size_bp")
## Warning in geom_point(aes(label = sample_seq, color = high_or_low_co2), :
## Ignoring unknown aesthetics: label
## `geom_smooth()` using formula = 'y ~ x'
# Low CpG methylation is associated with high CHH methylation
ggplot(sample.info, aes(x=CHHs_Meth, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 33, y = 70, label = "r = -0.59", size=6) +
ggtitle("% CpG Methylation ~ CHHs_Meth")
## Warning in geom_point(aes(label = sample_seq, color = high_or_low_co2), :
## Ignoring unknown aesthetics: label
## `geom_smooth()` using formula = 'y ~ x'
Correlations between CpGs_Meth and the following:
- a260_280: 0.52* - meh
- mean_mw_highest_peak: 0.51* - not predictive, but low CpG meth samples
have lower mean MW.
- aligned_reads: 0.66** - low CpG meth = fewer aligned reads
- unaligned_reads: -0.77***
- ambiguously_aligned_reads: -0.60**
- unique_reads_remaining: 0.73***
- total_cs: 0.72***
- perc_totalread_unique: 0.82***
- CHGs_Meth: -0.38
- CHHs_Meth: -0.59**
ggplot(sample.info, aes(x=a260_280, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 1.5, y = 75, label = "r = 0.52", size=6) +
ggtitle("% CpG Methylation ~ a260_280")
ggplot(sample.info, aes(x=mean_mw_highest_peak, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 10000, y = 100, label = "r = 0.51", size=6) +
ggtitle("% CpG Methylation ~ Mean MW (highest peak)")
ggplot(sample.info, aes(x=aligned_reads, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 6e6, y = 85, label = "r = 0.66", size=6) +
ggtitle("% CpG Methylation ~ # of aligned reads")
ggplot(sample.info, aes(x=unaligned_reads, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 1e7, y = 85, label = "r = -0.77", size=6) +
ggtitle("% CpG Methylation ~ # unaligned reads")
ggplot(sample.info, aes(x=ambiguously_aligned_reads, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 5e6, y = 85, label = "r = -0.60", size=6) +
ggtitle("% CpG Methylation ~ # ambiguously aligned reads")
ggplot(sample.info, aes(x=no_genomic_sequence, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
#annotate("text", x = 285, y = 75, label = "r = -0.93", size=6) +
ggtitle("% CpG Methylation ~ # unaligned- no genomic sequence")
No. of reads not aligning due to no genomic sequence corrlates very well with % methylation, but the actual # of reads is super low across all samples (<300!). What does this mean?!
ggplot(sample.info, aes(x=unique_reads_remaining, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 3.5e6, y = 95, label = "r = 0.73", size=6) +
ggtitle("% CpG Methylation ~ # unique aligned reads")
ggplot(sample.info, aes(x=total_cs, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 6e7, y = 90, label = "r = 0.72", size=6) +
ggtitle("% CpG Methylation ~ total # Cs")
ggplot(sample.info, aes(x=perc_totalread_unique, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = .15, y = 85, label = "r = 0.82", size=6) +
ggtitle("% CpG Methylation ~ % total reads uniquely aligned")
ggplot(sample.info, aes(x=CHGs_Meth, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) +
ggtitle("% CpG Methylation ~ % CHG Methylation")
ggplot(sample.info, aes(x=CHHs_Meth, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 30, y = 75, label = "r = -0.59", size=6) +
ggtitle("% CpG Methylation ~ % CHH Methylation")
load("../results/methylkit/meth_obj")
# Check out data for sample #1
head(meth_obj[[1]])
## chr start end strand coverage numCs
## 1 PGA_scaffold0__40_contigs__length_4818635 458 458 + 2 0
## 2 PGA_scaffold0__40_contigs__length_4818635 464 464 + 2 0
## 3 PGA_scaffold0__40_contigs__length_4818635 469 469 + 2 0
## 4 PGA_scaffold0__40_contigs__length_4818635 508 508 + 2 0
## 5 PGA_scaffold0__40_contigs__length_4818635 512 512 + 2 0
## 6 PGA_scaffold0__40_contigs__length_4818635 530 530 + 2 0
## numTs
## 1 2
## 2 2
## 3 2
## 4 2
## 5 2
## 6 2
for (i in 1:20) {
getMethylationStats(meth_obj[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
#### Now look at coverage for each sample
for (i in 1:20) {
getCoverageStats(meth_obj[[i]],plot=TRUE,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
meth_5x=filterByCoverage(meth_obj,lo.count=5,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
save(meth_5x, file="../results/methylkit/meth_5x")
for (i in 1:20) {
getMethylationStats(meth_5x[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
meth_10x=filterByCoverage(meth_obj,lo.count=10,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
save(meth_10x, file="../results/methylkit/meth_10x")
for (i in 1:20) {
getMethylationStats(meth_10x[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
meth_unite=methylKit::unite(meth_obj, destrand=TRUE, min.per.group=1L)
## destranding...
## uniting...
save(meth_unite, file = "../results/methylkit/meth_unite") #save object to file
#load(file = "../results/methylkit/meth_unite") #load object from file if needed
clusterSamples(meth_unite, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 20
PCASamples(meth_unite, screeplot = T)
PCASamples(meth_unite)
# Here - use all methylation data
#load(file = "../results/methylkit/meth_unite") #save object to file
# NOTE: since the sample IDs in the meth_unite() object are sequential from 1:20, the sample IDs in the column names containing data (coverage, numCs, numTs) are correct in this instance.
meth_unite_reshaped <- meth_unite %>%
melt(id=c("chr", "start", "end", "strand"), value.name = "count") %>%
drop_na(count) %>%
mutate(stat=gsub('[0-9]+', '', variable)) %>%
mutate(sample=gsub('[a-zA-Z]+', '', variable)) %>%
dplyr::select(-variable) %>%
pivot_wider(names_from = stat, values_from = count) %>%
mutate(percMeth = 100*(numCs/coverage)) %>%
mutate(sample=as.numeric(sample))
save(meth_unite_reshaped, file = "../results/methylkit/meth_unite_reshaped")
#load(file = "../results/methylkit/meth_unite_reshaped")
# # Here - use a smaller data set, already filtered for min coverage 5x
# meth_unite_5x <- methylKit::unite(meth_5x, destrand=TRUE, min.per.group=1L)
# save(meth_unite_5x, file = "../results/methylkit/meth_unite_5x") #save object to file
# load(file = "../results/methylkit/meth_unite_5x") #save object to file
#
# meth_unite_5x_reshaped <- meth_unite_5x %>%
# melt(id=c("chr", "start", "end", "strand"), value.name = "count") %>%
# drop_na(count) %>%
# mutate(stat=gsub('[0-9]+', '', variable)) %>%
# mutate(sample=gsub('[a-zA-Z]+', '', variable)) %>%
# dplyr::select(-variable) %>%
# pivot_wider(names_from = stat, values_from = count) %>%
# mutate(percMeth = 100*(numCs/coverage)) %>%
# mutate(sample=as.numeric(sample))
#
# save(meth_unite_5x_reshaped, file = "meth_unite_5x_reshaped")
# save(meth_unite_5x_reshaped, file = "../results/methylkit/meth_unite_5x_reshaped")
meth_unite_reshaped %>%
group_by(sample) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) +
geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
ggtitle("% methylation ~ # loci, no coverage minimum") +
scale_color_manual(values = c("red", "blue")) +
geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'
meth_unite_reshaped %>%
group_by(sample) %>%
filter(coverage>=5) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
ggplot(aes(x=nloci
, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) +
geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
ggtitle("% methylation ~ # loci, 5x coverage minimum") +
scale_color_manual(values = c("red", "blue")) +
geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'
meth_unite_reshaped %>%
group_by(sample) %>%
filter(coverage>=10) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) +
geom_label_repel(label.size=0.1, min.segment.length = 0.5, aes(colour=high_or_low_co2)) + theme_minimal() +
ggtitle("% methylation ~ # loci, 10x coverage minimum") + xlab("# Loci") + ylab("Mean % CpG Methylation") +
scale_color_manual(values = c("red", "blue")) +
geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'
meth_unite_reshaped %>%
group_by(sample) %>%
filter(coverage>=15) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) +
geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
ggtitle("% methylation ~ # loci, 15x coverage minimum") +
scale_color_manual(values = c("red", "blue")) +
geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'
#### Is % methylation a function of coverage within a sample?
The following samples have low CpG methylation: 5, 6, 8 (kinda), 9, 13, 14, 16, 19. Check out each sample individually
Weird samples have lots of loci with 0% methylation
meth_unite_reshaped %>%
filter(coverage>=5) %>%
ggplot() + geom_point(aes(x=coverage, y=percMeth), size=0.05) +
theme_minimal() +
facet_wrap(~sample)
Bad samples: all have mean % methylation <70% control pco2: 5, 6, 13, 14 high pco2: 9, 16, 19, 8 (kinda)
Good samples: all have mean % methylation >= 70% and nloci meeting 10x > 3e5 control pCO2: 1, 2, 3, 4, 7, 12 high pCO2: 10, 11, 15, 17, 18, 20
At 10x coverage, here’s the mean % methylation and number of loci per sample
meth_unite_reshaped %>%
group_by(sample) %>%
filter(coverage>=10) %>%
#filter(sample %in% c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>%
#filter(sample %!in% c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
arrange(sample) #%>% dplyr::select(nloci) %>% write_clip()
## # A tibble: 20 × 3
## sample mean nloci
## <dbl> <dbl> <int>
## 1 1 77.7 414469
## 2 2 76.9 395542
## 3 3 71.7 390654
## 4 4 75.6 472769
## 5 5 15.3 66376
## 6 6 50.3 142489
## 7 7 77.3 413071
## 8 8 70.3 515296
## 9 9 3.62 37374
## 10 10 73.1 524988
## 11 11 75.6 555113
## 12 12 74.3 411316
## 13 13 6.07 58666
## 14 14 5.31 64674
## 15 15 77.5 507968
## 16 16 66.1 217503
## 17 17 70.2 431186
## 18 18 76.5 445896
## 19 19 63.6 280965
## 20 20 75.8 568630
# Summary stats by "good" and "bad" samples
# Bad samples
sample.info %>% filter(sample_seq %!in% c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>%
summarise(mean.aligned=mean(perc_totalread_unique), sd.aligned=sd(perc_totalread_unique),
min.aligned=min(perc_totalread_unique), max.aligned=max(perc_totalread_unique),
mean.reads=mean(total_reads), sd.reads=sd(total_reads),
mean.CpGs=mean(total_cs), sd.CpGs=sd(total_cs))
## # A tibble: 1 × 8
## mean.aligned sd.aligned min.aligned max.alig…¹ mean.…² sd.re…³ mean.…⁴ sd.CpGs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.204 0.0573 0.120 0.282 2.03e7 2.90e6 7.17e7 1.20e7
## # … with abbreviated variable names ¹max.aligned, ²mean.reads, ³sd.reads,
## # ⁴mean.CpGs
# Good samples
sample.info %>% filter(sample_seq %in% c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>%
summarise(mean.aligned=mean(perc_totalread_unique), sd.aligned=sd(perc_totalread_unique),
min.aligned=min(perc_totalread_unique), max.aligned=max(perc_totalread_unique),
mean.reads=mean(total_reads), sd.reads=sd(total_reads),
mean.CpGs=mean(total_cs), sd.CpGs=sd(total_cs))
## # A tibble: 1 × 8
## mean.aligned sd.aligned min.aligned max.alig…¹ mean.…² sd.re…³ mean.…⁴ sd.CpGs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.309 0.0200 0.265 0.353 1.91e7 2.79e6 1.06e8 1.86e7
## # … with abbreviated variable names ¹max.aligned, ²mean.reads, ³sd.reads,
## # ⁴mean.CpGs
# sample.info %>% arrange(sample_seq) %>% dplyr::select(sample_seq, sample) %>%
# filter(sample_seq %!in% c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>% write_clip()
Good samples: control pCO2: 1, 2, 3, 4, 7, 12 high pCO2: 10, 11, 15, 17, 18, 20
meth_unite_nolows <- reorganize(methylObj = meth_unite,
sample.ids = c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20"),
treatment = as.numeric((sample.info %>%
arrange(sample_seq))$high_or_low_co2)[c(1:4, 7, 10, 11, 12, 15, 17, 18, 20)])
PCASamples(meth_unite_nolows)
(sample.info %>% arrange(sample_seq))[c(1:4, 7, 10, 11, 12, 15, 17, 18, 20),]
## # A tibble: 12 × 37
## sample tank treatm…¹ high_…² devel…³ dna_s…⁴ ng_u_l yield…⁵ a260_…⁶ a260_…⁷
## <chr> <fct> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 CH01-06 3 LC L J7 1a 36.8 1.65 1.92 1.51
## 2 CH01-14 3 LC L J6 3a 39.0 1.76 1.82 1.1
## 3 CH01-22 1 LB L J6 2b 24.0 1.08 1.69 1.27
## 4 CH01-38 1 LB L J6 1b 29.9 1.35 1.88 1.39
## 5 CH03-33 1 LB L J6 3b 40.6 1.83 1.87 1.22
## 6 CH05-21 2 HB H J7 3a 120. 5.38 1.75 1.26
## 7 CH05-24 4 HC H J6 1a 40.7 1.83 1.79 1.28
## 8 CH07-06 3 LC L J7 2a 106. 4.78 1.81 1.96
## 9 CH09-02 4 HC H J6 3b 79.5 3.58 1.87 1.32
## 10 CH09-28 6 HA H J6 4a 22.8 1.03 1.93 1.52
## 11 CH10-01 2 HB H J6 4a 23.2 1.04 1.8 1.59
## 12 CH10-11 6 HA H J6 3b 38.5 1.73 1.84 1
## # … with 27 more variables: mean_mw_highest_peak <dbl>, notes <chr>,
## # file <chr>, sample_seq <dbl>, total_reads <dbl>, aligned_reads <dbl>,
## # unaligned_reads <dbl>, ambiguously_aligned_reads <dbl>,
## # no_genomic_sequence <dbl>, duplicate_reads_removed <dbl>,
## # unique_reads_remaining <dbl>, total_cs <dbl>, methylated_cp_gs <dbl>,
## # unmethylated_cp_gs <dbl>, methylated_chgs <dbl>, unmethylated_chgs <dbl>,
## # methylated_ch_hs <dbl>, unmethylated_ch_hs <dbl>, …
meth_obj_nolow <- reorganize(methylObj = meth_obj, sample.ids = c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20"),
treatment = as.numeric((sample.info %>%
arrange(sample_seq))$high_or_low_co2)[c(1:4, 7, 10, 11, 12, 15, 17, 18, 20)])
meth_5x=filterByCoverage(meth_obj_nolow,
lo.count=5,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
meth_unite_5x=methylKit::unite(meth_5x, destrand=TRUE, min.per.group=5L)
## destranding...
## uniting...
# For some reason the sample id's aren't accurately assigned to each column.
PCA.5x <- PCASamples(meth_unite_5x, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
summary(PCA.5x)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 116.173 111.021 109.443 105.22253 103.70801 103.23718
## Proportion of Variance 0.116 0.106 0.103 0.09518 0.09246 0.09163
## Cumulative Proportion 0.116 0.222 0.325 0.42015 0.51261 0.60424
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 98.63485 97.69303 95.6181 94.83148 92.88020 7.635e-13
## Proportion of Variance 0.08364 0.08205 0.0786 0.07731 0.07416 0.000e+00
## Cumulative Proportion 0.68787 0.76992 0.8485 0.92584 1.00000 1.000e+00
#PC1=11.6% PC2=10.6%
meth_10x=filterByCoverage(meth_obj_nolow,
lo.count=10,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
meth_unite_10x=methylKit::unite(meth_10x, destrand=TRUE) # by not setting min.per.group manually, all samples must meet threshold for locus to be retained
## destranding...
## uniting...
PCA.10x <- PCASamples(meth_unite_10x, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
summary(PCA.10x)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 93.6610 87.3212 84.89460 84.42583 83.37979 81.42982
## Proportion of Variance 0.1196 0.1039 0.09822 0.09713 0.09474 0.09036
## Cumulative Proportion 0.1196 0.2235 0.32167 0.41881 0.51355 0.60391
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 78.4163 77.59839 76.4255 75.88959 72.76045 4.696e-13
## Proportion of Variance 0.0838 0.08206 0.0796 0.07849 0.07215 0.000e+00
## Cumulative Proportion 0.6877 0.76977 0.8494 0.92785 1.00000 1.000e+00
#PC1=12.0% PC2=10.4%
save(meth_10x, file="../results/methylkit/meth_10x")
save(meth_unite_10x, file="../results/methylkit/meth_unite_10x")
Important note! The column names in the meth_unite_10x object are NOT the sample IDs.
THE UNITE() FUNCTION ASSIGNS COLUMN NAMES USING THE CONSECUTIVE NUMBERS 1-13, WHILE THE SAMPLE ID’S ARE 1,2,3,4,7,10,11,12,15,17,18,20.
getSampleID(meth_unite_10x) #correct sample IDs
## [1] "1" "2" "3" "4" "7" "10" "11" "12" "15" "17" "18" "20"
colnames(meth_unite_10x) # columns containing data do NOT have sample iDs, but are rather the n'th item in the vector of sample IDs
## [1] "chr" "start" "end" "strand" "coverage1"
## [6] "numCs1" "numTs1" "coverage2" "numCs2" "numTs2"
## [11] "coverage3" "numCs3" "numTs3" "coverage4" "numCs4"
## [16] "numTs4" "coverage5" "numCs5" "numTs5" "coverage6"
## [21] "numCs6" "numTs6" "coverage7" "numCs7" "numTs7"
## [26] "coverage8" "numCs8" "numTs8" "coverage9" "numCs9"
## [31] "numTs9" "coverage10" "numCs10" "numTs10" "coverage11"
## [36] "numCs11" "numTs11" "coverage12" "numCs12" "numTs12"
PCA analysis and figure
plotCustomization <- (sample.info %>% arrange(sample_seq))[c(1:4, 7, 10, 11, 12, 15, 17, 18, 20),c("high_or_low_co2", "sample_seq", "tank")] %>%
mutate(color=case_when(high_or_low_co2 == "H" ~ "firebrick3", TRUE ~ "dodgerblue3"),
symbol=case_when(high_or_low_co2 == "H" ~ 16, TRUE ~ 17))
# Save size: width=650 height=600
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure <- ordiplot(PCA.10x, choices = c(1, 2), type = "none", display = "sites", cex = 0.4, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 2.2) #use points
#text(PCA.figure$sites, col = as.character(plotCustomization$color), labels=plotCustomization$sample_seq, cex=1.5) #use sample number
#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (12.0%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (10.4%)", line = 3, cex = 1.5) #Add y-axis label
legend("topleft",
pch = c(16, 17),
legend = c("High pCO2", "Control pCO2"),
col = c(c("firebrick3", "dodgerblue3")),
cex = 1.3, bty = "n") #Add a legend with information about ambient and elevated samples
NOTE: this PCA was run with all methylated loci data after filtering for 10x across all retained samples.
save(PCA.10x, file="../results/methylkit/PCA.10x")
write_delim(data.frame(PCA.10x$x) %>% tibble::rownames_to_column("sample"), "../results/methylkit/PC-scores-methylation-10x.tab", delim = '\t', col_names = TRUE)
Filter for 10x across ALL samples (this is including the low-CpG methylation samples) to assess clustering. Yes, the weird samples still are separated along PC1, which explains the vast majority of variation (68%), from the other samples. Removal of those samples is confirmed.
meth_10x_allsamples=filterByCoverage(meth_obj,
lo.count=10,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
meth_unite_10x_allsamples=methylKit::unite(meth_10x_allsamples, destrand=TRUE, min.per.group=10L) #require all 10 samples to have 10x
## destranding...
## uniting...
PCA.10x_allsamples <- PCASamples(meth_unite_10x_allsamples, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
summary(PCA.10x_allsamples)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 17.7089 5.59402 3.73028 3.53793 3.38165 3.20095 3.03176
## Proportion of Variance 0.6803 0.06788 0.03018 0.02715 0.02481 0.02223 0.01994
## Cumulative Proportion 0.6803 0.74815 0.77833 0.80549 0.83029 0.85252 0.87246
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 2.78400 2.77715 2.72782 2.60744 2.50167 2.37892 2.21148
## Proportion of Variance 0.01681 0.01673 0.01614 0.01475 0.01358 0.01228 0.01061
## Cumulative Proportion 0.88927 0.90600 0.92214 0.93689 0.95046 0.96274 0.97335
## PC15 PC16 PC17 PC18 PC19 PC20
## Standard deviation 2.07597 1.70969 1.57960 1.14887 1.11316 3.298e-15
## Proportion of Variance 0.00935 0.00634 0.00541 0.00286 0.00269 0.000e+00
## Cumulative Proportion 0.98270 0.98904 0.99445 0.99731 1.00000 1.000e+00
#PC1=68.0% PC2=6.8%
plotCustomization_allsamples <- (sample.info %>% arrange(sample_seq))[,c("high_or_low_co2", "sample_seq", "tank")] %>%
mutate(color=case_when(high_or_low_co2 == "H" ~ "firebrick3", TRUE ~ "dodgerblue3"),
symbol=case_when(high_or_low_co2 == "H" ~ 16, TRUE ~ 17))
# Save size: width=650 height=600
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure_allsamples <- ordiplot(PCA.10x_allsamples, choices = c(1, 2), type = "none", display = "sites", cex = 0.4, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
#points(PCA.figure_allsamples, "sites", col = as.character(plotCustomization_allsamples$color), pch = plotCustomization_allsamples$symbol, cex = 2.5) #use points
text(PCA.figure_allsamples$sites, col = as.character(plotCustomization_allsamples$color), labels=plotCustomization_allsamples$sample_seq, cex=1.3) #use tank number
#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (60%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (9.5%)", line = 3, cex = 1.5) #Add y-axis label
legend("bottomright",
pch = c(16, 17),
legend = c("High pCO2", "Low pCO2"),
col = c(c("firebrick3", "dodgerblue3")),
cex = 1.3, bty = "n") #Add a legend with information about ambient and elevated samples
getCorrelation(meth_unite_10x_allsamples,plot=TRUE)
## 1 2 3 4 5 6 7
## 1 1.0000000 0.9204265 0.8780766 0.9036344 0.4048809 0.4814508 0.9217204
## 2 0.9204265 1.0000000 0.8395954 0.9026195 0.4073819 0.4607784 0.8850474
## 3 0.8780766 0.8395954 1.0000000 0.8328876 0.4055727 0.5138931 0.8697469
## 4 0.9036344 0.9026195 0.8328876 1.0000000 0.4070781 0.4578477 0.8596412
## 5 0.4048809 0.4073819 0.4055727 0.4070781 1.0000000 0.6896490 0.3857924
## 6 0.4814508 0.4607784 0.5138931 0.4578477 0.6896490 1.0000000 0.4889302
## 7 0.9217204 0.8850474 0.8697469 0.8596412 0.3857924 0.4889302 1.0000000
## 8 0.8404887 0.8360214 0.7721337 0.8651078 0.5280643 0.5594744 0.7970856
## 9 0.3896555 0.3998853 0.3959884 0.4138474 0.5666359 0.6283754 0.3931891
## 10 0.9175444 0.9028906 0.8375860 0.9294828 0.4580994 0.4995762 0.8584545
## 11 0.9204629 0.8947195 0.8486419 0.8977719 0.4561448 0.5151380 0.8828099
## 12 0.8970112 0.8721374 0.8613525 0.8520427 0.4390870 0.5611729 0.8919687
## 13 0.4219498 0.4335411 0.4248621 0.4534204 0.5027114 0.5268636 0.4125169
## 14 0.3549197 0.3599255 0.4201602 0.3865198 0.6215416 0.6332094 0.3524266
## 15 0.8937873 0.8746116 0.8173156 0.8950938 0.4624917 0.5177194 0.8438624
## 16 0.7949671 0.7825378 0.7353708 0.8247615 0.4106378 0.4532041 0.7468411
## 17 0.8768790 0.8551198 0.8639592 0.8338317 0.3825381 0.5091718 0.8654018
## 18 0.8964975 0.8826445 0.8489631 0.8789646 0.4074075 0.4773450 0.8741046
## 19 0.7016148 0.7002250 0.6539892 0.7255738 0.6508480 0.6780934 0.6718843
## 20 0.8925928 0.8856902 0.8042061 0.9023818 0.4283288 0.4802889 0.8487865
## 8 9 10 11 12 13 14
## 1 0.8404887 0.3896555 0.9175444 0.9204629 0.8970112 0.4219498 0.3549197
## 2 0.8360214 0.3998853 0.9028906 0.8947195 0.8721374 0.4335411 0.3599255
## 3 0.7721337 0.3959884 0.8375860 0.8486419 0.8613525 0.4248621 0.4201602
## 4 0.8651078 0.4138474 0.9294828 0.8977719 0.8520427 0.4534204 0.3865198
## 5 0.5280643 0.5666359 0.4580994 0.4561448 0.4390870 0.5027114 0.6215416
## 6 0.5594744 0.6283754 0.4995762 0.5151380 0.5611729 0.5268636 0.6332094
## 7 0.7970856 0.3931891 0.8584545 0.8828099 0.8919687 0.4125169 0.3524266
## 8 1.0000000 0.4982531 0.8901618 0.8728944 0.8201237 0.5346438 0.5065826
## 9 0.4982531 1.0000000 0.4380341 0.4506208 0.4541685 0.5432220 0.6011789
## 10 0.8901618 0.4380341 1.0000000 0.9109196 0.8682796 0.4684503 0.4081516
## 11 0.8728944 0.4506208 0.9109196 1.0000000 0.8913509 0.4706310 0.4231644
## 12 0.8201237 0.4541685 0.8682796 0.8913509 1.0000000 0.4857654 0.4422424
## 13 0.5346438 0.5432220 0.4684503 0.4706310 0.4857654 1.0000000 0.5398804
## 14 0.5065826 0.6011789 0.4081516 0.4231644 0.4422424 0.5398804 1.0000000
## 15 0.8969565 0.4656602 0.9205118 0.9177343 0.8581328 0.4883840 0.4337526
## 16 0.8217936 0.4350830 0.8355187 0.8197600 0.7626254 0.4554694 0.4148755
## 17 0.7916514 0.4160851 0.8484988 0.8591937 0.8714400 0.4776570 0.3842985
## 18 0.8442023 0.3788949 0.9034909 0.8998645 0.8707650 0.4506587 0.3985575
## 19 0.8197620 0.5644667 0.7671241 0.7448981 0.7005226 0.5554938 0.5360734
## 20 0.8901958 0.4296343 0.9196465 0.9148622 0.8499677 0.4467872 0.4023128
## 15 16 17 18 19 20
## 1 0.8937873 0.7949671 0.8768790 0.8964975 0.7016148 0.8925928
## 2 0.8746116 0.7825378 0.8551198 0.8826445 0.7002250 0.8856902
## 3 0.8173156 0.7353708 0.8639592 0.8489631 0.6539892 0.8042061
## 4 0.8950938 0.8247615 0.8338317 0.8789646 0.7255738 0.9023818
## 5 0.4624917 0.4106378 0.3825381 0.4074075 0.6508480 0.4283288
## 6 0.5177194 0.4532041 0.5091718 0.4773450 0.6780934 0.4802889
## 7 0.8438624 0.7468411 0.8654018 0.8741046 0.6718843 0.8487865
## 8 0.8969565 0.8217936 0.7916514 0.8442023 0.8197620 0.8901958
## 9 0.4656602 0.4350830 0.4160851 0.3788949 0.5644667 0.4296343
## 10 0.9205118 0.8355187 0.8484988 0.9034909 0.7671241 0.9196465
## 11 0.9177343 0.8197600 0.8591937 0.8998645 0.7448981 0.9148622
## 12 0.8581328 0.7626254 0.8714400 0.8707650 0.7005226 0.8499677
## 13 0.4883840 0.4554694 0.4776570 0.4506587 0.5554938 0.4467872
## 14 0.4337526 0.4148755 0.3842985 0.3985575 0.5360734 0.4023128
## 15 1.0000000 0.8423870 0.8368522 0.8909114 0.7800942 0.9252607
## 16 0.8423870 1.0000000 0.7552273 0.8033447 0.7232328 0.8291582
## 17 0.8368522 0.7552273 1.0000000 0.8603223 0.6840687 0.8273257
## 18 0.8909114 0.8033447 0.8603223 1.0000000 0.7209976 0.8956864
## 19 0.7800942 0.7232328 0.6840687 0.7209976 1.0000000 0.7717557
## 20 0.9252607 0.8291582 0.8273257 0.8956864 0.7717557 1.0000000
The function clusters samples using hclust function and various distance metrics derived from percent methylation per base or per region for each sample.
clusterSamples(meth_unite_10x, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 12
getData(meth_unite_5x) %>% nrow() #340,394 loci @ 5x coverage
## [1] 340394
getData(meth_unite_10x) %>% nrow() #146,761 loci @ 10x coverage
## [1] 146761
Percent methylation matrix (rows=region/base, columns=sample) can be extracted from methylBase object by using percMethylation function. This can be useful for downstream analyses.
Here I create % methylation matrices from the filtered object, and use it to do another cluster analysis
perc.meth=percMethylation(meth_unite_10x, rowids=T)
hist(perc.meth, col="gray50" )
# # Save % methylation df to object and .tab file
# save(perc.meth, file = "../analyses/methylation-filtered/R-objects/perc.meth") #save object to file
# #load(file = "../analyses/methylation-filtered/R-objects/perc.meth") #load object if needed
# write.table((as.data.frame(perc.meth) %>% tibble::rownames_to_column("contig")), file = here::here("analyses", "methylation-filtered", "percent-methylation-filtered.tab"), sep = '\t', na = "NA", row.names = FALSE, col.names = TRUE)
# What percentage of loci have ZERO methylation?
100*table(perc.meth==0)[2]/table(perc.meth==0)[1]
## TRUE
## 1.780652
# 1.8% of loci unmethylated, averaged across all samples
# Mean % methylation across all samples
mean(perc.meth, na.rm=TRUE)
## [1] 79.2303
#79.2%
sample.info %>% dplyr::select(high_or_low_co2, sample_seq) %>% arrange(sample_seq)
## # A tibble: 20 × 2
## high_or_low_co2 sample_seq
## <fct> <dbl>
## 1 L 1
## 2 L 2
## 3 L 3
## 4 L 4
## 5 L 5
## 6 L 6
## 7 L 7
## 8 H 8
## 9 H 9
## 10 H 10
## 11 H 11
## 12 L 12
## 13 L 13
## 14 L 14
## 15 H 15
## 16 H 16
## 17 H 17
## 18 H 18
## 19 H 19
## 20 H 20
(perc.meth %>% as.matrix())[,c("1","2","3","4","7","12")] %>% head() # low pCO2
## 1 2
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 95.65217 95.83333
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 64.10256 73.68421
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 78.57143 75.00000
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 89.79592 68.42105
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 53.03030 64.28571
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 96.07843 72.72727
## 3 4
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 60.00000 94.28571
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 84.61538 80.43478
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 84.61538 78.84615
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 80.00000 73.07692
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 60.00000 56.25000
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 72.22222 100.00000
## 7 12
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 86.53846 48.14815
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 90.69767 82.14286
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 82.35294 76.47059
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 96.66667 82.75862
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 73.01587 67.50000
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 100.00000 100.00000
(perc.meth %>% as.matrix())[,c("10","11","15", "17", "18", "20")] %>% head() # high pCO2
## 10 11
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 97.22222 93.93939
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 89.13043 74.10714
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 84.78261 84.40367
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 55.88235 91.52542
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 58.13953 36.95652
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 81.81818 90.32258
## 15 17
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 85.18519 100.00000
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 75.75758 85.71429
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 70.37037 100.00000
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 45.83333 100.00000
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 66.66667 37.50000
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 81.81818 87.50000
## 18 20
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 100.00000 90.47619
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 65.51724 66.66667
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 61.53846 91.66667
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 86.20690 80.00000
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 20.00000 36.11111
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 50.00000 90.47619
# Mean % methylation for each treatment
mean((perc.meth %>% as.matrix())[,c("1","2","3","4","7","12")], na.rm=TRUE) # Low CO2 exposed - 79.5%
## [1] 79.53615
mean((perc.meth %>% as.matrix())[,c("10","11","15", "17", "18", "20")], na.rm=TRUE) # High CO2 exposed - 78.9%
## [1] 78.92444
perc.meth.summ <- data.frame(sample=1:12, methylated=1:12, coverage=1:12, mean.perc=1:12)
for (i in 1:12) {
perc.meth.summ[i,"sample"] <- getSampleID(meth_unite_10x)[i]
perc.meth.summ[i,"methylated"] <-mean(as.vector(getData(meth_unite_10x)[,grep(c("numCs"), colnames(meth_unite_10x))][i][,1]), na.rm=T)
perc.meth.summ[i,"coverage"] <- mean(as.vector(getData(meth_unite_10x)[,grep(c("coverage"), colnames(meth_unite_10x))][i][,1]), na.rm=T)
perc.meth.summ[i,"mean.perc"] <- mean(as.vector(getData(meth_unite_10x)[,grep(c("numCs"), colnames(meth_unite_10x))][i][,1]) /
as.vector(getData(meth_unite_10x)[,grep(c("coverage"), colnames(meth_unite_10x))][i][,1]), na.rm=T)
}
mean(perc.meth.summ[perc.meth.summ$sample %in% c("1","2","3","4","7","12"),"mean.perc"]) #Loe pCO2 treatment = 79.5%
## [1] 0.7953615
mean(perc.meth.summ[perc.meth.summ$sample %in% c("10","11","15", "17", "18", "20"),"mean.perc"]) #78.9%
## [1] 0.7892444
paste("No. of loci in filtered dataset:", nrow(perc.meth)) #146,761 loci
## [1] "No. of loci in filtered dataset: 146761"
paste("No. of samples:", ncol(perc.meth)) #12 sampes
## [1] "No. of samples: 12"
myDiff=calculateDiffMeth(meth_unite_10x)
## two groups detected:
## will calculate methylation difference as the difference of
## treatment (group: 2) - control (group: 1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# get deferentially methylated bases using 12% difference and save to files - 12,971 loci
myDiff12p=getMethylDiff(myDiff,difference=12,qvalue=0.01)
paste("Number diff. methylated loci with 12% difference: ", getData(myDiff12p) %>% nrow())
## [1] "Number diff. methylated loci with 12% difference: 12971"
# get all differentially methylated bases - 1,427 loci
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)
paste("Number diff. methylated loci with 25% difference: ", getData(myDiff25p) %>% nrow())
## [1] "Number diff. methylated loci with 25% difference: 1427"
# Extract hypo and hyper methylated loci. Here "type" must think that the low pCO2 is the experimental treatment, since hypo and hyper are switched.
# get hypo methylated bases - 843 loci
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
paste("Number loci hypo-methylated in high pCO2 with 25% difference: ", getData(myDiff25p.hyper) %>% nrow())
## [1] "Number loci hypo-methylated in high pCO2 with 25% difference: 834"
# get hypo methylated bases - 593 loci
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
paste("Number loci hyper-methylated in high pCO2 with 25% difference: ", getData(myDiff25p.hypo) %>% nrow())
## [1] "Number loci hyper-methylated in high pCO2 with 25% difference: 593"
#write.table(myDiff12p, file = "../analyses/DMLs/myDiff12p.tab", sep = "\t", col.names = TRUE)
#write_delim(getData(myDiff12p) %>% mutate(start = start-1, end = end+1) %>% dplyr::select(chr, start, end, meth.diff),
# "../analyses/DMLs/dml12.bed", delim = '\t', col_names = TRUE)
write.table(myDiff25p, file = "../results/methylkit/myDiff25p.tab", sep = "\t", col.names = TRUE)
save(myDiff25p, file="../results/methylkit/myDiff25p")
#load(file="../analyses/DMLs/myDiff25p") #load if needed
# Write out bedfile for DMLs
dml25 <- myDiff25p %>% getData() %>%
mutate(start = start-1, end = end+1) %>%
dplyr::select(chr, start, end, meth.diff)
write_delim(dml25, "../results/methylkit/dml25.bed", delim = '\t', col_names = TRUE)
write_delim(dml25, "../results/methylkit/dml25_forIGV.bed", delim = '\t', col_names = FALSE) #no column names for IGV
save(dml25, file="../results/methylkit/dml25")
#load(file="../results/methylkit/dml25") #load dml25 if needed
# Write out bedfile for all loci fed into DML analysis, to use as background for enrichment
mydiff.all <- mutate(getData(myDiff), start = start -1, end = end + 1) %>% dplyr::select(chr, start, end, meth.diff) %>% mutate_if(is.numeric, as.integer)
write_delim(mydiff.all, "../results/methylkit/mydiff-all.bed", delim = '\t', col_names = TRUE)
save(mydiff.all, file="../results/methylkit/mydiff.all")
#load(file="../results/methylkit/mydiff.all")
paste("No. of DMLs that had higher methylation in high pCO2 = ", dml25 %>%filter(meth.diff<0)%>%nrow(), " (", 100*round(dml25 %>%filter(meth.diff<0)%>%nrow()/nrow(dml25), digits = 2), "%)", sep="")
## [1] "No. of DMLs that had higher methylation in high pCO2 = 593 (42%)"
paste("No. of DMLs that had lower methylation in high pCO2= ", dml25 %>%filter(meth.diff>0)%>%nrow(), " (", 100*round(dml25 %>%filter(meth.diff>0)%>%nrow()/nrow(dml25), digits = 2), "%)", sep="")
## [1] "No. of DMLs that had lower methylation in high pCO2= 834 (58%)"
# Save DML bed files with hyper- and hypo-methylated loci in high pCO2 treatment
# meth.diff < 0 = loci where methylation rate is HIGHER in high pCO2 treatment
write_delim(dml25 %>%filter(meth.diff<0), "../results/methylkit/dml25-hypermeth_in_OA.bed", delim = '\t', col_names = TRUE)
write_delim(dml25 %>%filter(meth.diff>0), "../results/methylkit/dml25-hypometh_in_OA.bed", delim = '\t', col_names = TRUE)
Plot mean % methylation for loci by those that hypo- and hyper-methylated in OA-exposed crab
meth_unite_reshaped %>%
filter(coverage>=10) %>%
filter(sample %in% c(1,2,3,4,7,12, #low pCO3
8,10,11,15,17,18,20)) %>% #high pCO2
mutate(locus=paste(chr, start-1, sep=".")) %>% # Create locus column; subtract 1 from the "start" column to match dml25
filter(locus %in% (dml25 %>%filter(meth.diff<0) %>% mutate(locus=paste(chr, start, sep=".")))$locus) %>% #keep loci with meth.diff < 0, higher meth in high pCO2
left_join(sample.info[,c("high_or_low_co2", "sample_seq")], by=c("sample"="sample_seq")) %>%
group_by(high_or_low_co2, locus) %>%
summarise(mean = mean(percMeth)) %>%
ggplot(aes(x=high_or_low_co2, y=mean)) + geom_violin() + geom_jitter() +
theme_minimal() + xlab("pCO2 treatment") +
ggtitle("Loci with higher methylation in OA (n=593)")
## `summarise()` has grouped output by 'high_or_low_co2'. You can override using
## the `.groups` argument.
#dml25 %>%filter(meth.diff>0) %>% nrow()
meth_unite_reshaped %>%
filter(coverage>=10) %>%
filter(sample %in% c(1,2,3,4,7,12, #low pCO3
8,10,11,15,17,18,20)) %>% #high pCO2
mutate(locus=paste(chr, start-1, sep=".")) %>% # Create locus column; subtract 1 from the "start" column to match dml25
filter(locus %in% (dml25 %>%filter(meth.diff>0) %>% mutate(locus=paste(chr, start, sep=".")))$locus) %>% #keep loci with meth.diff > 0, lower meth in high pCO2
left_join(sample.info[,c("high_or_low_co2", "sample_seq")], by=c("sample"="sample_seq")) %>%
group_by(high_or_low_co2, locus) %>%
summarise(mean = mean(percMeth)) %>%
ggplot(aes(x=high_or_low_co2, y=mean)) + geom_violin() + geom_jitter() +
theme_minimal() + xlab("pCO2 treatment") +
ggtitle("Loci with lower methylation in OA (n=834)")
## `summarise()` has grouped output by 'high_or_low_co2'. You can override using
## the `.groups` argument.
### Subset the filtered methylation count dataframe for only those that are differentially methylated. Save object to file.
dml25_counts <- selectByOverlap(meth_unite_10x,as(myDiff25p,"GRanges"))
class(dml25_counts) #class should be methylBase
## [1] "methylBase"
## attr(,"package")
## [1] "methylKit"
perc.methDMLs= percMethylation(dml25_counts, rowids=T) #create a percent methylated object for DMLs
# save count and percent objects to be used in subsequent notebook
save(dml25_counts, file = "../results/methylkit/dml25_counts")
save(perc.methDMLs, file="../results/methylkit/perc.methDMLs")
perc.meth.d<-vegdist(data.frame(t(perc.meth)),"euclidean", na.rm = TRUE)
meth.perm <- adonis(data.frame(t(perc.meth)) ~ high_or_low_co2, data=plotCustomization,
permutations=1000, method='euclidean', na.rm = TRUE) #yes, global population difference
## 'adonis' will be deprecated: use 'adonis2' instead
meth.perm$aov.tab #
## Permutation: free
## Number of permutations: 1000
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## high_or_low_co2 1 23530917 23530917 1.0419 0.09436 0.1059
## Residuals 10 225841244 22584124 0.90564
## Total 11 249372161 1.00000
hist(meth.perm$f.perms, main="Histogram of pseudo-F values under the null model",
xlab="pseudo=F values", col="gray", xlim=c(0.5,2.5))
abline(v=1.9743, col="red") # indicates F-value is significant.
source("../references/biostats.R")
pca.eigenval(PCA.10x) #The Proporation of Variance = %variance explained
## Importance of components:
## PC1 PC2 PC3 PC4
## Variance(eigenvalue) 8772.3902168 7624.9967243 7207.0922603 7127.7209667
## Proportion of Variance 0.1195474 0.1039111 0.0982160 0.0971344
## Cumulative Proportion 0.1195474 0.2234585 0.3216746 0.4188089
## Broken-stick value 3.1032107 2.1032107 1.6032107 1.2698773
## PC5 PC6 PC7 PC8
## Variance(eigenvalue) 6952.1896370 6630.8151862 6149.1144766 6021.5108973
## Proportion of Variance 0.0947423 0.0903627 0.0837982 0.0820593
## Cumulative Proportion 0.5135512 0.6039139 0.6877122 0.7697715
## Broken-stick value 1.0198773 0.8198773 0.6532107 0.5103535
## PC9 PC10 PC11 PC12
## Variance(eigenvalue) 5840.8562662 5759.2298241 5294.0835444 0.0000000
## Proportion of Variance 0.0795974 0.0784850 0.0721461 0.0000000
## Cumulative Proportion 0.8493689 0.9278539 1.0000000 1.0000000
## Broken-stick value 0.3853535 0.2742424 0.1742424 0.0833333
screeplot(PCA.10x, bstick=TRUE) #that's weird- PC's 1-5 aren't sign. but the higher PCs are.
# # check sign. of PCs - can't get this to work, though.
# ordi.monte(PCA.10x, ord="pca", dim=2, perm = 1000, scale = F, plot = F)
anova(meth.bdp<-betadisper(perc.meth.d,plotCustomization$high_or_low_co2)) #no sign. difference in dispersion, which is good
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 1 84611 84611 1.5642 0.2395
## Residuals 10 540928 54093
boxplot(meth.bdp)
# # perform PCoA just for fun on distance matrix
# meth.pcoa<-cmdscale(perc.meth.d, eig=TRUE, add=T)
# ordiplot(meth.pcoa, choices = c(1, 2), type="text", display="sites", xlab="PC 1", ylab="PC 2")
# Get sample ID vector from methylBase object (created from unite() function)
samples <- getSampleID(meth_unite_10x)
meth_unite_10x_reshaped <- melt(data=meth_unite_10x, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
tidyr::separate(col = "variable" , into = c("variable", "sample_temp"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
dcast(chr+start+end+strand+sample_temp ~ variable) %>%
#Correct sample IDs extracted from unite() function
mutate(sample=case_when(
sample_temp== 1 ~ samples[1],
sample_temp== 2 ~ samples[2],
sample_temp== 3 ~ samples[3],
sample_temp== 4 ~ samples[4],
sample_temp== 5 ~ samples[5],
sample_temp== 6 ~ samples[6],
sample_temp== 7 ~ samples[7],
sample_temp== 8 ~ samples[8],
sample_temp== 9 ~ samples[9],
sample_temp== 10 ~ samples[10],
sample_temp== 11 ~ samples[11],
sample_temp== 12 ~ samples[12],
sample_temp== 13 ~ samples[13])) %>%
dplyr::select(-sample_temp) %>%
mutate(percMeth = 100*(numCs/coverage)) %>%
left_join(sample.info[c("sample_seq", "high_or_low_co2")] %>%
mutate(sample_seq=as.character(sample_seq)), by=c("sample"="sample_seq")) %>%
dplyr::rename("treatment"="high_or_low_co2")
## Using count as value column: use value.var to override.
save(meth_unite_10x_reshaped, file="../results/methylkit/meth_unite_10x_reshaped")
#load(file="../results/methylkit/meth_unite_10x_reshaped")
# save to tab file if needed
#readr::write_delim(meth_unite_10x_reshaped, "../results/methylkit/meth_unite_10x_reshaped.tab"), delim = '\t', col_names = FALSE)
Any trends? No, similar distribution of % CpG methyation
meth_unite_10x_reshaped %>%
ggplot(aes(x= treatment, y=percMeth, fill= treatment)) +
geom_violin(width=1, alpha=0.8) + theme_minimal() +
ylab("% CpG Methylation") + theme(axis.title.x = element_blank(), legend.position = "none",
text = element_text(size=16)) +
scale_x_discrete(labels=c("H" = "High pCO2", "L" = "Control pCO2")) +
scale_fill_manual(values=c("firebrick3", "dodgerblue3"))
View distributions in density plot
Looks as though there is slightly more loci with higher methylation rates in the High pCO2 treatment (red distribution is shifted to the right a bit)
#ggplotly(
meth_unite_10x_reshaped %>%
ggplot(aes(x=percMeth, fill= treatment)) +
geom_density(alpha=0.5) +
scale_fill_manual(values=c("firebrick3", "dodgerblue3"),
labels=c("H" = "High pCO2", "L" = "Control pCO2"), name=NULL) +
#xlim(90,100) + #use to find location on x axis of second highest peak
theme_minimal() +
ylab("Density") + xlab("% CpG Methylation") +
theme(text = element_text(size=18), legend.position = "top") #)
Find location of peaks in each treatment in the above figure. Used this as a reference: http://ianmadd.github.io/pages/PeakDensityDistribution.html
Answer:
- Largest peak for high pCO2 treatment is at 99.98% - Largest peak for
low pCO2 treatment (control) is 94.56%
## Find largest peak
# Find largest peak for High pCO2
density(subset(meth_unite_10x_reshaped, treatment=="H" & percMeth!="NA")$percMeth) # Y maximum is 4.780e-02
##
## Call:
## density.default(x = subset(meth_unite_10x_reshaped, treatment == "H" & percMeth != "NA")$percMeth)
##
## Data: subset(meth_unite_10x_reshaped, treatment == "H" & percMeth != "NA")$percMeth (880566 obs.); Bandwidth 'bw' = 1.003
##
## x y
## Min. : -3.01 Min. :8.356e-05
## 1st Qu.: 23.49 1st Qu.:1.981e-03
## Median : 50.00 Median :4.248e-03
## Mean : 50.00 Mean :9.423e-03
## 3rd Qu.: 76.51 3rd Qu.:9.761e-03
## Max. :103.01 Max. :4.780e-02
# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_unite_10x_reshaped, treatment=="H" & percMeth!="NA")$percMeth)$y) #answer = index #497
## [1] 497
# Find the x value where the largest Y value is
density(subset(meth_unite_10x_reshaped, treatment=="H" & percMeth!="NA")$percMeth)$x[497] #H max (4.895e-02) is @ 99.90%
## [1] 99.89823
##--------------
# Find largest peak for Low pCO2, i.e. control
density(subset(meth_unite_10x_reshaped, treatment=="L" & percMeth!="NA")$percMeth) # Y maximum is 4.663e-02
##
## Call:
## density.default(x = subset(meth_unite_10x_reshaped, treatment == "L" & percMeth != "NA")$percMeth)
##
## Data: subset(meth_unite_10x_reshaped, treatment == "L" & percMeth != "NA")$percMeth (880566 obs.); Bandwidth 'bw' = 0.9293
##
## x y
## Min. : -2.788 Min. :8.979e-05
## 1st Qu.: 23.606 1st Qu.:1.869e-03
## Median : 50.000 Median :3.985e-03
## Mean : 50.000 Mean :9.463e-03
## 3rd Qu.: 76.394 3rd Qu.:9.571e-03
## Max. :102.788 Max. :4.663e-02
which.max(density(subset(meth_unite_10x_reshaped, treatment=="L" & percMeth!="NA")$percMeth)$y) #answer = 472
## [1] 472
density(subset(meth_unite_10x_reshaped, treatment=="L" & percMeth!="NA")$percMeth)$x[472] #L max (0.0469560 ) is 94.52%
## [1] 94.52361
## --------------
## Find 2nd largest peak. NOTE: I looked at the ggplotly density figure above and noted that the 2nd peaks end before ~98%, so I subset the percMeth data to retain only those <98%, then find the peak
# Find 2nd largest peak for High pCO2
density(subset(meth_unite_10x_reshaped, treatment=="H" & percMeth!="NA" & percMeth<98)$percMeth) # Y maximum is .04997
##
## Call:
## density.default(x = subset(meth_unite_10x_reshaped, treatment == "H" & percMeth != "NA" & percMeth < 98)$percMeth)
##
## Data: subset(meth_unite_10x_reshaped, treatment == "H" & percMeth != "NA" & percMeth < 98)$percMeth (766028 obs.); Bandwidth 'bw' = 1.129
##
## x y
## Min. : -3.388 Min. :0.0000502
## 1st Qu.: 22.803 1st Qu.:0.0021764
## Median : 48.993 Median :0.0046571
## Mean : 48.993 Mean :0.0095358
## 3rd Qu.: 75.184 3rd Qu.:0.0097195
## Max. :101.375 Max. :0.0499662
# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_unite_10x_reshaped, treatment=="H" & percMeth!="NA" & percMeth<98)$percMeth)$y) #answer = index #478
## [1] 478
# Find the x value where the largest Y value is
density(subset(meth_unite_10x_reshaped, treatment=="H" & percMeth!="NA" & percMeth<98)$percMeth)$x[478] #H max (5.281e-02) is @ 94.40%
## [1] 94.40423
#---
# Find 2nd largest peak for Low pCO2
density(subset(meth_unite_10x_reshaped, treatment=="L" & percMeth!="NA" & percMeth>95)$percMeth) # Y maximum is 1.1686
##
## Call:
## density.default(x = subset(meth_unite_10x_reshaped, treatment == "L" & percMeth != "NA" & percMeth > 95)$percMeth)
##
## Data: subset(meth_unite_10x_reshaped, treatment == "L" & percMeth != "NA" & percMeth > 95)$percMeth (213494 obs.); Bandwidth 'bw' = 0.1402
##
## x y
## Min. : 94.61 Min. :0.0001824
## 1st Qu.: 96.06 1st Qu.:0.0633958
## Median : 97.51 Median :0.1470459
## Mean : 97.51 Mean :0.1717644
## 3rd Qu.: 98.97 3rd Qu.:0.1829494
## Max. :100.42 Max. :1.1686438
# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_unite_10x_reshaped, treatment=="L" & percMeth!="NA" & percMeth>95)$percMeth)$y) #answer = index #475
## [1] 475
# Find the x value where the largest Y value is
density(subset(meth_unite_10x_reshaped, treatment=="L" & percMeth!="NA" & percMeth>95)$percMeth)$x[475] #L max (4.953e-02) is @ 100.00%
## [1] 99.99955
# Very similar 2nd peak CpG methylation - High pCO2 = 94.24%, and Low pCO2 = 94.61% (only slightly higher in low pCO2 treatment)
#----- Double check # of loci
meth_unite_10x_reshaped %>%
filter( treatment=="H" & percMeth!="NA") %>%
dplyr::select(chr, start) %>% unique() %>% nrow() #146,761
## [1] 146761
meth_unite_10x_reshaped %>%
filter( treatment=="L" & percMeth!="NA") %>%
dplyr::select(chr, start) %>% unique() %>% nrow() #146,761 (good they are the same)
## [1] 146761
#----- average % meth by pop
# High pCO2 mean = 78.92%, median = 88.27%
meth_unite_10x_reshaped %>%
filter( treatment=="H" & percMeth!="NA") %>%
dplyr::select(percMeth) %>% summary()
## percMeth
## Min. : 0.00
## 1st Qu.: 71.83
## Median : 88.27
## Mean : 78.92
## 3rd Qu.: 94.92
## Max. :100.00
# Low (control) pCO2 mean = 79.54%, median = 88.71%
meth_unite_10x_reshaped %>%
filter( treatment=="L" & percMeth!="NA") %>%
dplyr::select(percMeth) %>% summary()
## percMeth
## Min. : 0.00
## 1st Qu.: 73.49
## Median : 88.71
## Mean : 79.54
## 3rd Qu.: 94.87
## Max. :100.00
Takeaway = high and low (control) pCO2 have similar %CpG distributions on average, but the distribution of %meth varies a bit - both treatments’ % methylation peak around 94% and 100%, but in the high pCO2 treatment the 100% peak is higher, and the 94% peak is lower. So, high pCO2 has more instances of 100% methylation.
slightly higher methylation rates in the high pCO2 reared crab.
# mean % methylation across samples
dml25_counts_unique <- unique(getData(dml25_counts)[,c("chr", "start")]) %>%
mutate(chr_start=paste(chr, start, sep="_"))
#Calculate ratio between mean % methylation in High pCO2 : Low pCO2
DML.ratios <- meth_unite_10x_reshaped %>%
mutate(chr_start=paste(chr, start, sep="_")) %>%
filter(chr_start %in% dml25_counts_unique$chr_start) %>%
group_by( treatment, chr, start) %>%
summarise(median_percentMeth = median(percMeth, na.rm = TRUE)) %>% #mean_percentMeth = mean(percMeth, na.rm = TRUE),
dcast(chr + start ~ treatment) %>%
mutate(ratio_H.L = H/L, #in this ratio column, values <1 = High pCO2 hypomethylated, values >1 = Low pCO2 hypomethylated
chr_start=paste(chr, start, sep="_"),
diff_H.L = H-L) #this is the difference between High pCO2 & Low pCO2 (H - L)
## `summarise()` has grouped output by 'treatment', 'chr'. You can override using
## the `.groups` argument.
## Using median_percentMeth as value column: use value.var to override.
nrow(DML.ratios)==nrow(dml25_counts_unique) #confirm same # loci; should=TRUE
## [1] TRUE
save(DML.ratios, file="../results/methylkit/DML.ratios")
#load(file="../analyses/methykit/DML.ratios")
# For all loci, calculate mean and sd percent methylation across samples by treatment
DML.calcs <- meth_unite_10x_reshaped %>%
group_by( treatment, chr, start) %>%
dplyr::summarise(
mean_percMeth = mean(percMeth, na.rm=TRUE),
sd_percMeth=sd(percMeth, na.rm=TRUE),
n()) %>%
filter(chr %in% c(DML.ratios$chr) &
start %in% c(DML.ratios$start))
## `summarise()` has grouped output by 'treatment', 'chr'. You can override using
## the `.groups` argument.
Interesting! DMLs on average have lower % methylation in the high pCO2 treatment
DML.ratios %>%
pivot_longer(cols = c(H, L),
names_to = "treatment",
values_to = "mean_meth") %>%
mutate(treatment=as.factor(treatment)) %>%
ggplot(aes(x=treatment, y=mean_meth, fill=treatment)) +
geom_violin(alpha=0.2) + geom_boxplot(alpha=0.4) +
scale_fill_manual(values=c("firebrick3", "dodgerblue3"),
labels=c("H" = "High pCO2", "L" = "Control pCO2"), name=NULL) +
theme_minimal() + ggtitle("Methylation rates of DMLs") +
ylab("Methylation (mean within treatment)") + xlab(NULL)
DML.ratios %>%
pivot_longer(cols = c(H, L),
names_to = "treatment",
values_to = "mean_meth") %>%
mutate(treatment=as.factor(treatment)) %>%
group_by(treatment) %>%
summarise(mean=mean(mean_meth),
median=median(mean_meth),)
## # A tibble: 2 × 3
## treatment mean median
## <fct> <dbl> <dbl>
## 1 H 57.4 57.5
## 2 L 62.3 67.1
#Calculate ratio between mean % methylation in High pCO2 : Low pCO2 (i.e. control)
all.loci.ratios <- meth_unite_10x_reshaped %>%
mutate(chr_start=paste(chr, start, sep="_")) %>%
group_by( treatment, chr, start) %>%
summarise(mean_percentMeth = mean(percMeth, na.rm = TRUE)) %>% #could instead use ... allCs_percent = 100*(sum(numCs)/sum(coverage)),
dcast(chr + start ~ treatment) %>%
mutate(ratio_H.L = H/L, #in this ratio column, values <1 = HC hypomethylated, values >1 = SS hypomethylated
chr_start=paste(chr, start, sep="_"),
diff_H.L = H-L) #this is the difference between H & L (H - L)
## `summarise()` has grouped output by 'treatment', 'chr'. You can override using
## the `.groups` argument.
## Using mean_percentMeth as value column: use value.var to override.
# Any majuor differences in global methylatin among the treatments? Here are summarys tats of % methylation across all shared loci:
summary(all.loci.ratios$H)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 72.48 87.04 78.92 93.24 100.00
summary(all.loci.ratios$L)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 74.06 87.70 79.54 93.25 100.00
# As determined in previous code chunks % meth is slightly higher in the high pCO2 treatment group, while % meth is slightly lower in DMLs in high pCO2 group
# Do %methylation (all loci) distributions differ among treatment using all loci?
#ggplotly(
all.loci.ratios %>%
pivot_longer(cols = c(H, L),
names_to = "treatment",
values_to = "mean_meth") %>%
ggplot(aes(x=mean_meth, fill= treatment)) +
geom_density(alpha=0.2) +
scale_fill_manual(values=c("firebrick3", "dodgerblue3"),
labels=c("H" = "High pCO2", "L" = "Control pCO2"), name=NULL) +
theme_minimal() #)
# Using DMLs
DML.ratios %>%
pivot_longer(cols = c(H, L),
names_to = "treatment",
values_to = "mean_meth") %>%
ggplot(aes(x=mean_meth, fill= treatment)) +
geom_density(alpha=0.2) +
scale_fill_manual(values=c("firebrick3", "dodgerblue3"),
labels=c("H" = "High pCO2", "L" = "Control pCO2"), name=NULL) +
theme_minimal() + xlab("Mean methylation (within treatment)") +
ggtitle("% Methylation distribution for DMLs by treatment")
#load("../analyses/methylation-filtered/R-objects/meth_unite_10x_reshaped")
meth_unite_10x_reshaped %>% dplyr::select(sample, treatment) %>% distinct()
## sample treatment
## 1 1 L
## 2 2 L
## 3 3 L
## 4 4 L
## 5 7 L
## 6 10 H
## 7 11 H
## 8 12 L
## 9 15 H
## 10 17 H
## 11 18 H
## 12 20 H
DMLs_heatmap <-
meth_unite_10x_reshaped %>%
left_join(sample.info[c("sample_seq", "tank")] %>% mutate(sample_seq=as.character(sample_seq)), by=c("sample"="sample_seq")) %>%
arrange(treatment, tank) %>%
mutate(sample=factor(sample, ordered = T, levels = c(3,4,7,1,2,12,20,17,8,10,18,11,15)),
chr_start=paste(chr, start, sep="_")) %>%
filter(
start %in% dml25_counts_unique$start &
chr %in% DML.ratios$chr)
#ggplotly(
ggplot(DMLs_heatmap, aes(sample, chr_start, fill= percMeth)) + xlab("Sample") + geom_tile(na.rm = T) +
scale_y_discrete(limits=(DML.ratios[order(DML.ratios$ratio_H.L),]$chr_start)) +
#scale_fill_viridis(discrete=FALSE)
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
scale_fill_distiller(palette = "YlGnBu", direction = 1) #)
#scale_fill_gradient(low="gray5", high="white")
sample.info[c("high_or_low_co2", "sample_seq", "tank")]
## # A tibble: 20 × 3
## high_or_low_co2 sample_seq tank
## <fct> <dbl> <fct>
## 1 H 11 4
## 2 H 15 4
## 3 H 8 2
## 4 H 10 2
## 5 H 19 6
## 6 H 20 6
## 7 H 9 6
## 8 L 3 1
## 9 L 4 1
## 10 L 5 1
## 11 L 6 1
## 12 L 7 1
## 13 L 14 1
## 14 L 1 3
## 15 L 2 3
## 16 L 13 3
## 17 L 12 3
## 18 H 16 6
## 19 H 18 2
## 20 H 17 6
# Join sample info with bismark summary report
(sample.info.mito <- sample.info %>%
dplyr::select(sample, tank, treatment, high_or_low_co2, sample_seq, developmental_stage,
dna_siolation_batch, perc_totalread_unique, CpGs_Meth, CHGs_Meth, CHHs_Meth) %>%
right_join(
# Read in bismark summary report
read_delim(file="../results/bismark/mito/bismark_summary_report.txt", delim="\t") %>% mutate(File = str_replace_all(File, '_R1_bismark_bt2_pe.bam', "")) %>%
clean_names() %>%
separate(file, into = c("a", "b", "sample_seq"), remove = F, sep = "_") %>%
mutate(sample_prep=paste(a, b, sep = "-")) %>%
mutate(sample_seq=as.numeric(str_replace_all(sample_seq, "S", ""))) %>%
#dplyr::select(file, sample_prep, sample_seq, everything(), -a, -b) %>%
mutate(perc_totalread_unique_mito=unique_reads_remaining/total_reads,
CpGs_Meth_mito=100*methylated_cp_gs/(methylated_cp_gs+unmethylated_cp_gs),
CHGs_Meth_mito=100*methylated_chgs/(methylated_chgs+unmethylated_chgs),
CHHs_Meth_mito=100*methylated_ch_hs/(methylated_ch_hs+unmethylated_ch_hs)) %>%
dplyr::select(sample_prep, perc_totalread_unique_mito, CpGs_Meth_mito, CHGs_Meth_mito, CHHs_Meth_mito),
by = c("sample"="sample_prep")) %>%
mutate(included=case_when(
sample_seq== 1 ~ "yes",
sample_seq== 2 ~ "yes",
sample_seq== 3 ~ "yes",
sample_seq== 4 ~ "yes",
sample_seq== 7 ~ "yes",
sample_seq== 12 ~ "yes",
sample_seq== 10 ~ "yes",
sample_seq== 11 ~ "yes",
sample_seq== 15 ~ "yes",
sample_seq== 17 ~ "yes",
sample_seq== 18 ~ "yes",
sample_seq== 20 ~ "yes",
TRUE ~ "no")))
## Rows: 20 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): File
## dbl (14): Total Reads, Aligned Reads, Unaligned Reads, Ambiguously Aligned R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 20 × 16
## sample tank treat…¹ high_…² sampl…³ devel…⁴ dna_s…⁵ perc_…⁶ CpGs_…⁷ CHGs_…⁸
## <chr> <fct> <fct> <fct> <dbl> <fct> <fct> <dbl> <dbl> <dbl>
## 1 CH05-24 4 HC H 11 J6 1a 0.315 71.6 1.59
## 2 CH09-02 4 HC H 15 J6 3b 0.297 68.4 1.81
## 3 CH05-01 2 HB H 8 J7 1a 0.282 51.8 2.05
## 4 CH05-21 2 HB H 10 J7 3a 0.311 64.1 1.28
## 5 CH10-08 6 HA H 19 J6 2b 0.279 31.2 1.82
## 6 CH10-11 6 HA H 20 J6 3b 0.319 67.7 1.52
## 7 CH05-06 6 HA H 9 J7 3a 0.160 7.01 1.40
## 8 CH01-22 1 LB L 3 J6 2b 0.300 66.2 1.17
## 9 CH01-38 1 LB L 4 J6 1b 0.304 68.1 1.63
## 10 CH03-04 1 LB L 5 J7 1b 0.235 11.4 2.48
## 11 CH03-15 1 LB L 6 J6 2a 0.191 19.4 1.53
## 12 CH03-33 1 LB L 7 J6 3b 0.308 72.5 1.15
## 13 CH07-24 1 LB L 14 J7 2a 0.181 7.00 1.58
## 14 CH01-06 3 LC L 1 J7 1a 0.311 73.4 1.25
## 15 CH01-14 3 LC L 2 J6 3a 0.265 67.7 1.67
## 16 CH07-11 3 LC L 13 J6 3a 0.181 7.80 1.57
## 17 CH07-06 3 LC L 12 J7 2a 0.307 64.1 1.34
## 18 CH09-13 6 HA H 16 J7 2b 0.120 45.8 1.91
## 19 CH10-01 2 HB H 18 J6 4a 0.353 70.8 1.44
## 20 CH09-28 6 HA H 17 J6 4a 0.317 67.6 1.73
## # … with 6 more variables: CHHs_Meth <dbl>, perc_totalread_unique_mito <dbl>,
## # CpGs_Meth_mito <dbl>, CHGs_Meth_mito <dbl>, CHHs_Meth_mito <dbl>,
## # included <chr>, and abbreviated variable names ¹treatment,
## # ²high_or_low_co2, ³sample_seq, ⁴developmental_stage, ⁵dna_siolation_batch,
## # ⁶perc_totalread_unique, ⁷CpGs_Meth, ⁸CHGs_Meth
# **Good samples:** all have mean % methylation >= 70% and nloci meeting 10x > 3e5
# control pCO2: 1, 2, 3, 4, 7, 12
# high pCO2: 10, 11, 15, 17, 18, 20
ggplot(sample.info.mito, aes(x=CpGs_Meth_mito, y=CpGs_Meth)) +
geom_point(aes(shape=high_or_low_co2, color=included), size=3) +
scale_color_manual(values=c("red", "blue")) +
#geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
#annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) +
ggtitle("% CpG Methylation full genome ~ mitochondria") +
xlab("Mitochindria CpG meth %") + ylab("Full Genome CpG % meth")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(sample.info.mito, aes(x=perc_totalread_unique_mito, y=CpGs_Meth_mito)) +
geom_point(aes(shape=high_or_low_co2, color=included), size=3) +
scale_color_manual(values=c("red", "blue")) +
#geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
#annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) +
ggtitle("mitochondria % CpG methylation ~ alignment rate") +
xlab("Mitochindria alignment rate") + ylab("Mitochindria CpG meth %")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(sample.info.mito, aes(x=included, y=CpGs_Meth_mito, color=included)) +
geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.15, aes(shape=high_or_low_co2)) +
theme_minimal() + scale_color_manual(values=c("red", "blue")) #+
#geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
#theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
#annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) +
#ggtitle("% CpG Methylation full genome ~ mitochondria") +
# xlab("Mitochindria CpG meth %") + ylab("Full Genome CpG % meth")
summary(sample.info.mito$CpGs_Meth_mito)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.828 6.274 13.773 13.900 20.622 27.509
Divergent Bar plot of DMLs
# DML.ratios %>%
# mutate(hypermethylated=ifelse(diff_H.L>0, "H", "L")) %>%
# ggplot(aes(x = chr_start, y = diff_H.L, fill=hypermethylated)) +
# geom_bar(stat = "identity", width=0.41) +
# scale_x_discrete(limits=(DML.ratios[order(DML.ratios$diff_H.L),]$chr_start)) +
# scale_y_continuous(limits=c(-80,80), breaks=c(-75, -50, -25, 0),
# labels=c("-75"="75%", "-50"="50%", "-25"="25%", "0"="0%"),
# sec.axis = sec_axis(trans=~.*1, breaks=c(75, 25, 50, 0),
# labels=c("-75"="75%", "-50"="50%", "-25"="25%", "0"="0%"))) +
# theme(axis.text.y=element_blank(), axis.ticks.y = element_blank(),
# legend.position = "none", text = element_text(size=14, color="gray30"),
# panel.background = element_blank(),
# panel.border = element_rect(colour = "gray30", fill=NA, size=.4)) +
# #ggtitle("DMLs, showing % methylation difference among treatments") +
# scale_fill_manual(values=c("firebrick3", "dodgerblue3")) +
# #annotate(geom="text", x=-20, y=-65, label="Loci hypermethylated in\nSouth Sound", color="dodgerblue3") +
# #annotate(geom="text", x=260, y=62, label="Loci hypermethylated in\nHood Canal", color="firebrick3") +
# ylab("% methylation difference among treatments") + xlab("Locus") + coord_flip()
# ggsave(filename = "../analyses/DMLs/DMLs-diff-barplot.pdf", width=7, height=5)
FYI I checked that the correlation matrix in
getCorrelation
is derived from the percent methylated
matrix. It is!
# cor.pears <- cor(perc.meth, method="pearson") # create pearson correlation matrix.
# dist.manhat <- dist(t(perc.meth), method = "manhattan", diag = T, upper = F)
# dist.manhat.df <- reshape2::melt(as.matrix(dist.manhat), varnames = c("row", "col"))
#
# cor.pears.DMLs <- cor(perc.methDMLs, method="pearson")
# dist.manhat.DMLs <- dist(t(perc.methDMLs), method = "manhattan", diag = T, upper = F)
# dist.manhat.DMLs.df <- reshape2::melt(as.matrix(dist.manhat.DMLs), varnames = c("row", "col"))
#key <- read.csv(here::here("data", "sample-key.csv"))
# dist.manhat.df <- merge(x=merge(x=dist.manhat.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
# colnames(dist.manhat.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
# write.csv(dist.manhat.df, file=here::here("analyses", "methylation-filtered", "dist.manhat.csv"), row.names=FALSE)
#
# dist.manhat.DMLs.df <- merge(x=merge(x=dist.manhat.DMLs.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
# colnames(dist.manhat.DMLs.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
# write.csv(dist.manhat.DMLs.df, file=here::here("analyses", "methylation-filtered", "dist.manhat.DMLs.csv"), row.names=FALSE)
# load("../analyses/DMLs/R-objects/dml25_counts")
#
# PCA.DMLs <- PCASamples(dml25_counts, obj.return = TRUE, sd.filter = T) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
# nrow(PCA.DMLs$rotation)
#
# summary(PCA.DMLs) #Look at summary statistics. The first PC explains 36.3% of variation, the second PC explains 8.5% of variation. # new analysis/filtering AND >5x replace with NA: PC1=36.5% PC2=9.3%
# save(PCA.DMLs, file="../analyses/DMLs/R-objects/PCA.DMLs")
# write_delim(data.frame(PCA.DMLs$x) %>% tibble::rownames_to_column("sample"), "../analyses/DMLs/PC-scores-DMLs.tab", delim = '\t', col_names = TRUE)
# par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
# PCA.figure <- ordiplot(PCA.DMLs, choices = c(1, 2), type = "none", display = "sites", cex = 0.5, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
# points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 2.5) #Add each sample. Darker samples are ambient, lighter samples are elevated pCO2
# #Add multiple white boxes on top of the default black box to manually change the color
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# #ordiellipse(PCA.DMLs, plotCustomization$ treatment, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
# #ordiellipse(PCA.DMLs, plotCustomization$ treatment, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
# axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
# mtext(side = 1, text = "PC 1 (36.5%)", line = 3, cex = 1.5) #Add x-axis label
# axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
# mtext(side = 2, text = "PC 2 (9.3%)", line = 3, cex = 1.5) #Add y-axis label
# legend("topright",
# pch = c(16, 17),
# legend = c("Hood Canal", "South Sound"),
# col = c(c("firebrick3", "dodgerblue3")),
# cex = 1.4, bty = "n") #Add a legend with information about ambient and elevated samples
#
# save(PCA.figure, file="../analyses/DMLs/R-objects/PCA.figure")
OTHER POSSIBLE ANALYSIS OPTIONS - randomize which samples are “Treatment” and run 10 permutations to see now many loci are DMLs then - Try to pull SNPs from RNASeq data to identify pop structure - Research cool software for integrating meth+rnaseq. AI options for integration?