MethylKit

Load libraries

list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra", "PerformanceAnalytics", "corrplot", "janitor", "googlesheets4", "ggrepel", "clipr", "plotly") #add new libraries here 

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

# Load MethylKit
if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("methylKit")
require(methylKit)

`%!in%` = Negate(`%in%`)

Obtain session information

sessionInfo()

## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] BiocManager_1.30.19        clipr_0.8.0               
##  [3] ggrepel_0.9.2              googlesheets4_1.0.1       
##  [5] janitor_2.1.0              corrplot_0.92             
##  [7] PerformanceAnalytics_2.0.4 xts_0.12.2                
##  [9] zoo_1.8-11                 factoextra_1.0.7          
## [11] vegan_2.6-4                lattice_0.20-45           
## [13] permute_0.9-7              viridis_0.6.2             
## [15] viridisLite_0.4.1          Pstat_1.2                 
## [17] matrixStats_0.62.0         ggforce_0.4.1             
## [19] methylKit_1.24.0           GenomicRanges_1.49.0      
## [21] GenomeInfoDb_1.34.2        IRanges_2.32.0            
## [23] S4Vectors_0.36.0           BiocGenerics_0.44.0       
## [25] here_1.0.1                 reshape2_1.4.4            
## [27] forcats_0.5.2              stringr_1.4.1             
## [29] dplyr_1.0.10               purrr_0.3.5               
## [31] readr_2.1.3                tidyr_1.2.1               
## [33] tibble_3.1.8               ggplot2_3.4.0             
## [35] tidyverse_1.3.2           
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1                backports_1.4.1            
##   [3] plyr_1.8.7                  splines_4.2.2              
##   [5] BiocParallel_1.32.1         digest_0.6.30              
##   [7] htmltools_0.5.3             fansi_1.0.3                
##   [9] magrittr_2.0.3              cluster_2.1.4              
##  [11] tzdb_0.3.0                  limma_3.54.0               
##  [13] Biostrings_2.66.0           modelr_0.1.9               
##  [15] R.utils_2.12.2              bdsmatrix_1.3-6            
##  [17] timechange_0.1.1            colorspace_2.0-3           
##  [19] rvest_1.0.3                 haven_2.5.1                
##  [21] xfun_0.34                   crayon_1.5.2               
##  [23] RCurl_1.98-1.9              jsonlite_1.8.3             
##  [25] glue_1.6.2                  polyclip_1.10-4            
##  [27] gtable_0.3.1                gargle_1.2.1               
##  [29] zlibbioc_1.44.0             XVector_0.38.0             
##  [31] DelayedArray_0.23.2         scales_1.2.1               
##  [33] mvtnorm_1.1-3               DBI_1.1.3                  
##  [35] Rcpp_1.0.9                  emdbook_1.3.12             
##  [37] mclust_6.0.0                httr_1.4.4                 
##  [39] ellipsis_0.3.2              pkgconfig_2.0.3            
##  [41] XML_3.99-0.12               R.methodsS3_1.8.2          
##  [43] farver_2.1.1                sass_0.4.2                 
##  [45] dbplyr_2.2.1                utf8_1.2.2                 
##  [47] tidyselect_1.2.0            rlang_1.0.6                
##  [49] munsell_0.5.0               cellranger_1.1.0           
##  [51] tools_4.2.2                 cachem_1.0.6               
##  [53] cli_3.4.1                   generics_0.1.3             
##  [55] fastseg_1.44.0              broom_1.0.1                
##  [57] evaluate_0.18               fastmap_1.1.0              
##  [59] yaml_2.3.6                  knitr_1.40                 
##  [61] fs_1.5.2                    nlme_3.1-160               
##  [63] R.oo_1.25.0                 xml2_1.3.3                 
##  [65] compiler_4.2.2              rstudioapi_0.14            
##  [67] reprex_2.0.2                tweenr_2.0.2               
##  [69] bslib_0.4.1                 stringi_1.7.8              
##  [71] Matrix_1.5-1                vctrs_0.5.0                
##  [73] pillar_1.8.1                lifecycle_1.0.3            
##  [75] jquerylib_0.1.4             data.table_1.14.4          
##  [77] bitops_1.0-7                rtracklayer_1.58.0         
##  [79] qvalue_2.30.0               R6_2.5.1                   
##  [81] BiocIO_1.8.0                gridExtra_2.3              
##  [83] codetools_0.2-18            MASS_7.3-58.1              
##  [85] gtools_3.9.3                assertthat_0.2.1           
##  [87] SummarizedExperiment_1.28.0 rprojroot_2.0.3            
##  [89] rjson_0.2.21                withr_2.5.0                
##  [91] GenomicAlignments_1.34.0    Rsamtools_2.14.0           
##  [93] GenomeInfoDbData_1.2.9      mgcv_1.8-41                
##  [95] parallel_4.2.2              hms_1.1.2                  
##  [97] quadprog_1.5-8              grid_4.2.2                 
##  [99] coda_0.19-4                 snakecase_0.11.0           
## [101] rmarkdown_2.17              MatrixGenerics_1.10.0      
## [103] googledrive_2.0.0           bbmle_1.0.25               
## [105] numDeriv_2016.8-1.1         Biobase_2.58.0             
## [107] lubridate_1.9.0             restfulr_0.0.15

Import bismark alignment .bams into MethylKit. NOTE: This was done on Sedna!

I ran this code interactively on Sedna srun --pty /bin/bash # First, I entered an interactive Sedna node module load R # Then, load R R #start R interactively Now proceed with the following code

# ## AGAIN- this code was run interactively on Sedna! 
# # Install and load programs
# 
# # Load MethylKit
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("methylKit")
# 
# require(methylKit)
# require(tidyverse)
# 
# # Load sample info dataframe 
# load(file="/home/lspencer/DuMOAR/R/sample.info")
# 
# # Triple check that treatments are in the order of sequence sample number (1-20)
# (sample.info %>% arrange(sample_seq))$sample # List out library prep sample number 
# (sample.info %>% arrange(sample_seq))$sample_seq # List out sequence sample 
# (sample.info %>% arrange(sample_seq))$high_or_low_co2 # List out treatment
# 
# # Read .bam files into Methylkit on Sedna 
# (file.list=list("/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_06_S1_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_14_S2_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_22_S3_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_38_S4_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_04_S5_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_15_S6_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_33_S7_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_01_S8_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_06_S9_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_21_S10_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_24_S11_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_06_S12_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_11_S13_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_24_S14_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_02_S15_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_13_S16_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_28_S17_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_01_S18_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_08_S19_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_11_S20_R1_bismark_bt2_pe.deduplicated.sorted.bam"))
# 
# #### The following command reads sorted BAM files and calls methylation percentage per base, and creates a methylRaw object for CpG methylation. It also assigns minimum coverage of 2x and the treatments (in this case, the CO2 treatment)
# 
# meth_obj = processBismarkAln(location = file.list, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20"), 
#                           assembly = "PGA_assembly.fasta", read.context="CpG", mincov=2, 
#                           treatment = as.numeric((sample.info %>% arrange(sample_seq))$high_or_low_co2))
# 
# #### Save the MethylKit object; re-doing the previous step is memory/time intensive, so best to use the saved object moving forward. 
# 
# save(meth_obj, file = "/home/lspencer/DuMOAR/R/meth_obj")

getwd()

## [1] "C:/Users/laura.spencer/Work/DuMOAR/scripts"

Read in Bismark summary stats and some library prep stats

# Read in data from GoogleSheet 
sample.info <- read_sheet("https://docs.google.com/spreadsheets/d/1ym0XnYVts98tIUCn0kIaU6VuvqxzV7LoSx9RHwLdiIs/edit?usp=sharing", "DNAisolations_only") %>% 
  clean_names() %>% 
  mutate_at(vars(tank, treatment, high_or_low_co2, developmental_stage, dna_siolation_batch), as.factor) %>%
  
  # Join with bismark summary report 
  right_join(

  # Read in bismark summary report
    read_delim(file="../results/bismark/scaffold-only/bismark_summary_report.txt", delim="\t") %>% mutate(File = str_replace_all(File, '_R1_bismark_bt2_pe.bam', "")) %>%
  clean_names() %>% 
   separate(file, into = c("a", "b", "sample_seq"), remove = F, sep = "_") %>%
   mutate(sample_prep=paste(a, b, sep = "-")) %>%
    mutate(sample_seq=as.numeric(str_replace_all(sample_seq, "S", ""))) %>%
   dplyr::select(file, sample_prep, sample_seq, everything(), -a, -b) %>%
  mutate(perc_totalread_unique=unique_reads_remaining/total_reads, 
         CpGs_Meth=100*methylated_cp_gs/(methylated_cp_gs+unmethylated_cp_gs), 
         CHGs_Meth=100*methylated_chgs/(methylated_chgs+unmethylated_chgs), 
         CHHs_Meth=100*methylated_ch_hs/(methylated_ch_hs+unmethylated_ch_hs)),

  by = c("sample"="sample_prep")) %>%

# add mbd librar prep stats
  left_join(
  
  read_sheet("https://docs.google.com/spreadsheets/d/1ym0XnYVts98tIUCn0kIaU6VuvqxzV7LoSx9RHwLdiIs/edit?usp=sharing", "MBD_calcs") %>% 
  clean_names() %>% 
  dplyr::select(sample, total_recovery_ng, percent_recovery_percent, library_bioanlyzer_mean_fragment_size_bp,
         library_bioanalyzer_molarity_pmol_l, qubit_concentration_ng_u_l), 

  by = "sample")

## ! Using an auto-discovered, cached token.

##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.

##   See gargle's "Non-interactive auth" vignette for more details:

##   <]8;;https://gargle.r-lib.org/articles/non-interactive-auth.htmlhttps://gargle.r-lib.org/articles/non-interactive-auth.html]8;;>

## ℹ The googlesheets4 package is using a cached token for
##   ']8;;mailto:laura.spencer@noaa.govlaura.spencer@noaa.gov]8;;'.

## ✔ Reading from "OA Crab Sample Collection 071119".

## ✔ Range ''DNAisolations_only''.

## Rows: 20 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): File
## dbl (14): Total Reads, Aligned Reads, Unaligned Reads, Ambiguously Aligned R...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ✔ Reading from "OA Crab Sample Collection 071119".
## 
## ✔ Range ''MBD_calcs''.
## 
## New names:

save(sample.info, file="../data/sample.info")
load(file="../data/sample.info")

Investigate possible factors influencing low %CpGmeth

Correlation bubble plot

cor(sample.info %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=0.65)

Look closer at factors that appear to correlate with CpGs_Meth to get correlation statistics

#trace("chart.Correlation", edit=T) #to edit function
# In line 17, change the cex value for what you want, i changed it to 2

chart.Correlation(sample.info %>% 
                    dplyr::select(CpGs_Meth, a260_280, mean_mw_highest_peak, aligned_reads, 
                           unaligned_reads, ambiguously_aligned_reads, no_genomic_sequence, 
                           unique_reads_remaining, total_cs, perc_totalread_unique, CHGs_Meth, CHHs_Meth), 
                  histogram=F, pch=19)

Question asked by Zymo (creater of PicoMethyl kit) - any correlation between the samples with abnormal CpG methylation and samples that had more yield from the MBD enrichment step?

Answer: no not really

chart.Correlation(sample.info %>% 
                    dplyr::select(CpGs_Meth, total_recovery_ng, percent_recovery_percent, 
                           library_bioanlyzer_mean_fragment_size_bp, library_bioanalyzer_molarity_pmol_l, 
                           qubit_concentration_ng_u_l), 
                  histogram=F, pch=19)

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

# CpG methylation is not associated with MBD yield


# Use this code to generate 
# sample.info %>% dplyr::select(sample_seq, high_or_low_co2, CpGs_Meth, 
#                        total_recovery_ng, percent_recovery_percent, library_bioanlyzer_mean_fragment_size_bp, 
#                        library_bioanalyzer_molarity_pmol_l, qubit_concentration_ng_u_l) %>%
#   pivot_longer(cols = c("total_recovery_ng", "percent_recovery_percent", "library_bioanlyzer_mean_fragment_size_bp", 
#                        "library_bioanalyzer_molarity_pmol_l", "qubit_concentration_ng_u_l"), names_to = "MBD.stat") %>%
#   mutate(MBD.stat=as.factor(MBD.stat)) %>%
# 
# ggplot(aes(x=value, y=CpGs_Meth)) + 
#   geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
#   scale_color_manual(values=c("red", "blue")) +
#            geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
#   theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
# ggtitle("% CpG Methylation ~ MBD Stats") + 
#   facet_wrap(~MBD.stat, scales = "free")

# Weak correlation with MBD library fragment size, but not convincing
ggplot(sample.info, aes(x=library_bioanlyzer_mean_fragment_size_bp, y=CpGs_Meth)) +
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
    annotate("text", x = 500, y = 17, label = "r = 0.42", size=6) + 
  ggtitle("% CpG Methylation ~ library_bioanlyzer_mean_fragment_size_bp")

## Warning in geom_point(aes(label = sample_seq, color = high_or_low_co2), :
## Ignoring unknown aesthetics: label

## `geom_smooth()` using formula = 'y ~ x'

# Low CpG methylation is associated with high CHH methylation 
ggplot(sample.info, aes(x=CHHs_Meth, y=CpGs_Meth)) +
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
    annotate("text", x = 33, y = 70, label = "r = -0.59", size=6) + 
  ggtitle("% CpG Methylation ~ CHHs_Meth")

## Warning in geom_point(aes(label = sample_seq, color = high_or_low_co2), :
## Ignoring unknown aesthetics: label

## `geom_smooth()` using formula = 'y ~ x'

Correlations between CpGs_Meth and the following:
- a260_280: 0.52* - meh
- mean_mw_highest_peak: 0.51* - not predictive, but low CpG meth samples have lower mean MW.
- aligned_reads: 0.66** - low CpG meth = fewer aligned reads
- unaligned_reads: -0.77***
- ambiguously_aligned_reads: -0.60**
- unique_reads_remaining: 0.73***
- total_cs: 0.72***
- perc_totalread_unique: 0.82***
- CHGs_Meth: -0.38
- CHHs_Meth: -0.59**

ggplot(sample.info, aes(x=a260_280, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 1.5, y = 75, label = "r = 0.52", size=6) + 
ggtitle("% CpG Methylation ~ a260_280")

ggplot(sample.info, aes(x=mean_mw_highest_peak, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 10000, y = 100, label = "r = 0.51", size=6) + 
ggtitle("% CpG Methylation ~ Mean MW (highest peak)")

ggplot(sample.info, aes(x=aligned_reads, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 6e6, y = 85, label = "r = 0.66", size=6) + 
ggtitle("% CpG Methylation ~ # of aligned reads")

ggplot(sample.info, aes(x=unaligned_reads, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 1e7, y = 85, label = "r = -0.77", size=6) + 
ggtitle("% CpG Methylation ~ # unaligned reads")

ggplot(sample.info, aes(x=ambiguously_aligned_reads, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 5e6, y = 85, label = "r = -0.60", size=6) + 
ggtitle("% CpG Methylation ~ # ambiguously aligned reads")

ggplot(sample.info, aes(x=no_genomic_sequence, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  #annotate("text", x = 285, y = 75, label = "r = -0.93", size=6) + 
ggtitle("% CpG Methylation ~ # unaligned- no genomic sequence")

No. of reads not aligning due to no genomic sequence corrlates very well with % methylation, but the actual # of reads is super low across all samples (<300!). What does this mean?!

ggplot(sample.info, aes(x=unique_reads_remaining, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 3.5e6, y = 95, label = "r = 0.73", size=6) + 
ggtitle("% CpG Methylation ~ # unique aligned reads")

ggplot(sample.info, aes(x=total_cs, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 6e7, y = 90, label = "r = 0.72", size=6) + 
ggtitle("% CpG Methylation ~ total # Cs")

ggplot(sample.info, aes(x=perc_totalread_unique, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = .15, y = 85, label = "r = 0.82", size=6) + 
ggtitle("% CpG Methylation ~ % total reads uniquely aligned")

ggplot(sample.info, aes(x=CHGs_Meth, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) + 
ggtitle("% CpG Methylation ~ % CHG Methylation")

ggplot(sample.info, aes(x=CHHs_Meth, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 30, y = 75, label = "r = -0.59", size=6) + 
ggtitle("% CpG Methylation ~ % CHH Methylation")

Load methylKit object that was created on Sedna

load("../results/methylkit/meth_obj")

What does the resulting methylkit object look like?

# Check out data for sample #1 
head(meth_obj[[1]])

##                                         chr start end strand coverage numCs
## 1 PGA_scaffold0__40_contigs__length_4818635   458 458      +        2     0
## 2 PGA_scaffold0__40_contigs__length_4818635   464 464      +        2     0
## 3 PGA_scaffold0__40_contigs__length_4818635   469 469      +        2     0
## 4 PGA_scaffold0__40_contigs__length_4818635   508 508      +        2     0
## 5 PGA_scaffold0__40_contigs__length_4818635   512 512      +        2     0
## 6 PGA_scaffold0__40_contigs__length_4818635   530 530      +        2     0
##   numTs
## 1     2
## 2     2
## 3     2
## 4     2
## 5     2
## 6     2

Check the basic stats about the methylation data - coverage and percent methylation.

First look at % CpG methylation for each sampe

for (i in 1:20) {
getMethylationStats(meth_obj[[i]],plot=T,both.strands=TRUE)
     mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

#### Now look at coverage for each sample

for (i in 1:20) {
 getCoverageStats(meth_obj[[i]],plot=TRUE,both.strands=TRUE) 
   mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Generate CpG methylation histograms after filtering for coverage

Filter out loci where coverage is less than 5x or greater than 100x.

meth_5x=filterByCoverage(meth_obj,lo.count=5,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)
save(meth_5x, file="../results/methylkit/meth_5x")

Now look at CpG methylation after filtering for 5x coverage

for (i in 1:20) {
getMethylationStats(meth_5x[[i]],plot=T,both.strands=TRUE)
  mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Filter out loci where coverage is less than 10x or greater than 100x.

meth_10x=filterByCoverage(meth_obj,lo.count=10,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)
save(meth_10x, file="../results/methylkit/meth_10x")

Now look at CpG methylation after filtering for 10x coverage

for (i in 1:20) {
getMethylationStats(meth_10x[[i]],plot=T,both.strands=TRUE)
  mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Unite methylation data into one object for methylkit functions

column bind all samples, and destrand

meth_unite=methylKit::unite(meth_obj, destrand=TRUE, min.per.group=1L)

## destranding...

## uniting...

save(meth_unite, file = "../results/methylkit/meth_unite") #save object to file
#load(file = "../results/methylkit/meth_unite") #load object from file if needed

Cluster analysis

clusterSamples(meth_unite, dist="correlation", method="ward", plot=TRUE)

## The "ward" method has been renamed to "ward.D"; note new "ward.D2"

## 
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
## 
## Cluster method   : ward.D 
## Distance         : pearson 
## Number of objects: 20

PCASamples(meth_unite, screeplot = T)

PCASamples(meth_unite)

Reshape methylation to inspect % methylation at various filtering thresholds

# Here - use all methylation data
#load(file = "../results/methylkit/meth_unite") #save object to file

# NOTE: since the sample IDs in the meth_unite() object are sequential from 1:20, the sample IDs in the column names containing data (coverage, numCs, numTs) are correct in this instance. 

meth_unite_reshaped <- meth_unite %>% 
  melt(id=c("chr", "start", "end", "strand"), value.name = "count") %>%
  drop_na(count) %>%
  mutate(stat=gsub('[0-9]+', '', variable)) %>%
  mutate(sample=gsub('[a-zA-Z]+', '', variable)) %>%
  dplyr::select(-variable) %>%
  pivot_wider(names_from = stat, values_from = count) %>%
  mutate(percMeth = 100*(numCs/coverage)) %>%
  mutate(sample=as.numeric(sample))

save(meth_unite_reshaped, file = "../results/methylkit/meth_unite_reshaped")
#load(file = "../results/methylkit/meth_unite_reshaped")

# # Here - use a smaller data set, already filtered for min coverage 5x 
# meth_unite_5x <- methylKit::unite(meth_5x, destrand=TRUE, min.per.group=1L)
# save(meth_unite_5x, file = "../results/methylkit/meth_unite_5x") #save object to file
# load(file = "../results/methylkit/meth_unite_5x") #save object to file
# 
# meth_unite_5x_reshaped <- meth_unite_5x %>%
#   melt(id=c("chr", "start", "end", "strand"), value.name = "count") %>%
#   drop_na(count) %>%
#   mutate(stat=gsub('[0-9]+', '', variable)) %>%
#   mutate(sample=gsub('[a-zA-Z]+', '', variable)) %>%
#   dplyr::select(-variable) %>%
#   pivot_wider(names_from = stat, values_from = count) %>%
#   mutate(percMeth = 100*(numCs/coverage)) %>%
#   mutate(sample=as.numeric(sample))
# 
# save(meth_unite_5x_reshaped, file = "meth_unite_5x_reshaped")
# save(meth_unite_5x_reshaped, file = "../results/methylkit/meth_unite_5x_reshaped")

No coverage threshold, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>%
  left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
  ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) + 
  geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
  ggtitle("% methylation ~ # loci, no coverage minimum") +
  scale_color_manual(values = c("red", "blue")) + 
  geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")

## `geom_smooth()` using formula = 'y ~ x'

5x coverage, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  filter(coverage>=5) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>%
  left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
  ggplot(aes(x=nloci
             , y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) + 
  geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
  ggtitle("% methylation ~ # loci, 5x coverage minimum") +
  scale_color_manual(values = c("red", "blue")) + 
  geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")

## `geom_smooth()` using formula = 'y ~ x'

10x coverage, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  filter(coverage>=10) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>% 
  left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
  ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) + 
  geom_label_repel(label.size=0.1, min.segment.length = 0.5, aes(colour=high_or_low_co2)) + theme_minimal() +
  ggtitle("% methylation ~ # loci, 10x coverage minimum") + xlab("# Loci") + ylab("Mean % CpG Methylation") +
  scale_color_manual(values = c("red", "blue")) + 
  geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")

## `geom_smooth()` using formula = 'y ~ x'

What is mean % methylated with 15x coverage?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  filter(coverage>=15) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>%
  left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
  ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) + 
  geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
  ggtitle("% methylation ~ # loci, 15x coverage minimum") +
  scale_color_manual(values = c("red", "blue")) + 
  geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")

## `geom_smooth()` using formula = 'y ~ x'

#### Is % methylation a function of coverage within a sample?

The following samples have low CpG methylation: 5, 6, 8 (kinda), 9, 13, 14, 16, 19. Check out each sample individually

Weird samples have lots of loci with 0% methylation

meth_unite_reshaped %>%
  filter(coverage>=5) %>%
  ggplot() + geom_point(aes(x=coverage, y=percMeth), size=0.05) +
  theme_minimal() + 
  facet_wrap(~sample)

Bad samples: all have mean % methylation <70% control pco2: 5, 6, 13, 14 high pco2: 9, 16, 19, 8 (kinda)

Good samples: all have mean % methylation >= 70% and nloci meeting 10x > 3e5 control pCO2: 1, 2, 3, 4, 7, 12 high pCO2: 10, 11, 15, 17, 18, 20

At 10x coverage, here’s the mean % methylation and number of loci per sample

meth_unite_reshaped %>% 
  group_by(sample) %>%
  filter(coverage>=10) %>%
  #filter(sample %in%  c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>%
  #filter(sample %!in%  c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>% 
  arrange(sample) #%>% dplyr::select(nloci) %>% write_clip()

## # A tibble: 20 × 3
##    sample  mean  nloci
##     <dbl> <dbl>  <int>
##  1      1 77.7  414469
##  2      2 76.9  395542
##  3      3 71.7  390654
##  4      4 75.6  472769
##  5      5 15.3   66376
##  6      6 50.3  142489
##  7      7 77.3  413071
##  8      8 70.3  515296
##  9      9  3.62  37374
## 10     10 73.1  524988
## 11     11 75.6  555113
## 12     12 74.3  411316
## 13     13  6.07  58666
## 14     14  5.31  64674
## 15     15 77.5  507968
## 16     16 66.1  217503
## 17     17 70.2  431186
## 18     18 76.5  445896
## 19     19 63.6  280965
## 20     20 75.8  568630

# Summary stats by "good" and "bad" samples  

# Bad samples
sample.info %>% filter(sample_seq %!in%  c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>%
  summarise(mean.aligned=mean(perc_totalread_unique), sd.aligned=sd(perc_totalread_unique),
            min.aligned=min(perc_totalread_unique), max.aligned=max(perc_totalread_unique),
            mean.reads=mean(total_reads), sd.reads=sd(total_reads),
            mean.CpGs=mean(total_cs), sd.CpGs=sd(total_cs))

## # A tibble: 1 × 8
##   mean.aligned sd.aligned min.aligned max.alig…¹ mean.…² sd.re…³ mean.…⁴ sd.CpGs
##          <dbl>      <dbl>       <dbl>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1        0.204     0.0573       0.120      0.282  2.03e7  2.90e6  7.17e7  1.20e7
## # … with abbreviated variable names ¹max.aligned, ²mean.reads, ³sd.reads,
## #   ⁴mean.CpGs

# Good samples
sample.info %>% filter(sample_seq %in%  c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>%
  summarise(mean.aligned=mean(perc_totalread_unique), sd.aligned=sd(perc_totalread_unique),
            min.aligned=min(perc_totalread_unique), max.aligned=max(perc_totalread_unique),
            mean.reads=mean(total_reads), sd.reads=sd(total_reads),
            mean.CpGs=mean(total_cs), sd.CpGs=sd(total_cs))

## # A tibble: 1 × 8
##   mean.aligned sd.aligned min.aligned max.alig…¹ mean.…² sd.re…³ mean.…⁴ sd.CpGs
##          <dbl>      <dbl>       <dbl>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1        0.309     0.0200       0.265      0.353  1.91e7  2.79e6  1.06e8  1.86e7
## # … with abbreviated variable names ¹max.aligned, ²mean.reads, ³sd.reads,
## #   ⁴mean.CpGs

# sample.info %>% arrange(sample_seq) %>% dplyr::select(sample_seq, sample) %>% 
#   filter(sample_seq %!in%  c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20")) %>% write_clip()

Run analyses without the low-CpG methylation samples

Good samples: control pCO2: 1, 2, 3, 4, 7, 12 high pCO2: 10, 11, 15, 17, 18, 20

Generate PCA without weird samples (those with very low meth rates)

meth_unite_nolows <- reorganize(methylObj = meth_unite, 
                                sample.ids = c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20"),
                                treatment = as.numeric((sample.info %>% 
                                                          arrange(sample_seq))$high_or_low_co2)[c(1:4, 7, 10, 11, 12, 15, 17, 18, 20)])
PCASamples(meth_unite_nolows)

(sample.info %>% arrange(sample_seq))[c(1:4, 7, 10, 11, 12, 15, 17, 18, 20),]

## # A tibble: 12 × 37
##    sample  tank  treatm…¹ high_…² devel…³ dna_s…⁴ ng_u_l yield…⁵ a260_…⁶ a260_…⁷
##    <chr>   <fct> <fct>    <fct>   <fct>   <fct>    <dbl>   <dbl>   <dbl>   <dbl>
##  1 CH01-06 3     LC       L       J7      1a        36.8    1.65    1.92    1.51
##  2 CH01-14 3     LC       L       J6      3a        39.0    1.76    1.82    1.1 
##  3 CH01-22 1     LB       L       J6      2b        24.0    1.08    1.69    1.27
##  4 CH01-38 1     LB       L       J6      1b        29.9    1.35    1.88    1.39
##  5 CH03-33 1     LB       L       J6      3b        40.6    1.83    1.87    1.22
##  6 CH05-21 2     HB       H       J7      3a       120.     5.38    1.75    1.26
##  7 CH05-24 4     HC       H       J6      1a        40.7    1.83    1.79    1.28
##  8 CH07-06 3     LC       L       J7      2a       106.     4.78    1.81    1.96
##  9 CH09-02 4     HC       H       J6      3b        79.5    3.58    1.87    1.32
## 10 CH09-28 6     HA       H       J6      4a        22.8    1.03    1.93    1.52
## 11 CH10-01 2     HB       H       J6      4a        23.2    1.04    1.8     1.59
## 12 CH10-11 6     HA       H       J6      3b        38.5    1.73    1.84    1   
## # … with 27 more variables: mean_mw_highest_peak <dbl>, notes <chr>,
## #   file <chr>, sample_seq <dbl>, total_reads <dbl>, aligned_reads <dbl>,
## #   unaligned_reads <dbl>, ambiguously_aligned_reads <dbl>,
## #   no_genomic_sequence <dbl>, duplicate_reads_removed <dbl>,
## #   unique_reads_remaining <dbl>, total_cs <dbl>, methylated_cp_gs <dbl>,
## #   unmethylated_cp_gs <dbl>, methylated_chgs <dbl>, unmethylated_chgs <dbl>,
## #   methylated_ch_hs <dbl>, unmethylated_ch_hs <dbl>, …

meth_obj_nolow <- reorganize(methylObj = meth_obj, sample.ids = c("1", "2", "3", "4", "7", "10", "11", "12", "15", "17", "18", "20"),
                             treatment = as.numeric((sample.info %>% 
                                                          arrange(sample_seq))$high_or_low_co2)[c(1:4, 7, 10, 11, 12, 15, 17, 18, 20)])

meth_5x=filterByCoverage(meth_obj_nolow,
                         lo.count=5,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)

meth_unite_5x=methylKit::unite(meth_5x, destrand=TRUE, min.per.group=5L)

## destranding...

## uniting...

# For some reason the sample id's aren't accurately assigned to each column. 


PCA.5x <- PCASamples(meth_unite_5x, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix

summary(PCA.5x)

## Importance of components:
##                            PC1     PC2     PC3       PC4       PC5       PC6
## Standard deviation     116.173 111.021 109.443 105.22253 103.70801 103.23718
## Proportion of Variance   0.116   0.106   0.103   0.09518   0.09246   0.09163
## Cumulative Proportion    0.116   0.222   0.325   0.42015   0.51261   0.60424
##                             PC7      PC8     PC9     PC10     PC11      PC12
## Standard deviation     98.63485 97.69303 95.6181 94.83148 92.88020 7.635e-13
## Proportion of Variance  0.08364  0.08205  0.0786  0.07731  0.07416 0.000e+00
## Cumulative Proportion   0.68787  0.76992  0.8485  0.92584  1.00000 1.000e+00

#PC1=11.6% PC2=10.6% 


meth_10x=filterByCoverage(meth_obj_nolow,
                         lo.count=10,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)

meth_unite_10x=methylKit::unite(meth_10x, destrand=TRUE)  # by not setting min.per.group manually, all samples must meet threshold for locus to be retained

## destranding...
## uniting...

PCA.10x <- PCASamples(meth_unite_10x, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix

summary(PCA.10x)

## Importance of components:
##                            PC1     PC2      PC3      PC4      PC5      PC6
## Standard deviation     93.6610 87.3212 84.89460 84.42583 83.37979 81.42982
## Proportion of Variance  0.1196  0.1039  0.09822  0.09713  0.09474  0.09036
## Cumulative Proportion   0.1196  0.2235  0.32167  0.41881  0.51355  0.60391
##                            PC7      PC8     PC9     PC10     PC11      PC12
## Standard deviation     78.4163 77.59839 76.4255 75.88959 72.76045 4.696e-13
## Proportion of Variance  0.0838  0.08206  0.0796  0.07849  0.07215 0.000e+00
## Cumulative Proportion   0.6877  0.76977  0.8494  0.92785  1.00000 1.000e+00

#PC1=12.0% PC2=10.4% 

save(meth_10x, file="../results/methylkit/meth_10x")
save(meth_unite_10x, file="../results/methylkit/meth_unite_10x")

Important note! The column names in the meth_unite_10x object are NOT the sample IDs.

THE UNITE() FUNCTION ASSIGNS COLUMN NAMES USING THE CONSECUTIVE NUMBERS 1-13, WHILE THE SAMPLE ID’S ARE 1,2,3,4,7,10,11,12,15,17,18,20.

getSampleID(meth_unite_10x) #correct sample IDs

##  [1] "1"  "2"  "3"  "4"  "7"  "10" "11" "12" "15" "17" "18" "20"

colnames(meth_unite_10x) # columns containing data do NOT have sample iDs, but are rather the n'th item in the vector of sample IDs

##  [1] "chr"        "start"      "end"        "strand"     "coverage1" 
##  [6] "numCs1"     "numTs1"     "coverage2"  "numCs2"     "numTs2"    
## [11] "coverage3"  "numCs3"     "numTs3"     "coverage4"  "numCs4"    
## [16] "numTs4"     "coverage5"  "numCs5"     "numTs5"     "coverage6" 
## [21] "numCs6"     "numTs6"     "coverage7"  "numCs7"     "numTs7"    
## [26] "coverage8"  "numCs8"     "numTs8"     "coverage9"  "numCs9"    
## [31] "numTs9"     "coverage10" "numCs10"    "numTs10"    "coverage11"
## [36] "numCs11"    "numTs11"    "coverage12" "numCs12"    "numTs12"

PCA analysis and figure

plotCustomization <- (sample.info %>% arrange(sample_seq))[c(1:4, 7, 10, 11, 12, 15, 17, 18, 20),c("high_or_low_co2", "sample_seq", "tank")] %>%
  mutate(color=case_when(high_or_low_co2 == "H" ~ "firebrick3", TRUE ~ "dodgerblue3"),
         symbol=case_when(high_or_low_co2 == "H" ~ 16, TRUE ~ 17))

# Save size: width=650 height=600
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure <- ordiplot(PCA.10x, choices = c(1, 2), type = "none", display = "sites", cex = 0.4, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 2.2)  #use points
#text(PCA.figure$sites, col = as.character(plotCustomization$color), labels=plotCustomization$sample_seq, cex=1.5) #use sample number 

#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side =  1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (12.0%)", line = 3, cex = 1.5) #Add x-axis label
axis(side =  2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (10.4%)", line = 3, cex = 1.5) #Add y-axis label
legend("topleft", 
       pch = c(16, 17), 
       legend = c("High pCO2", "Control pCO2"), 
       col = c(c("firebrick3", "dodgerblue3")), 
       cex = 1.3, bty = "n") #Add a legend with information about ambient and elevated samples

Save PC scores and R object for possible integration with other data

NOTE: this PCA was run with all methylated loci data after filtering for 10x across all retained samples.

save(PCA.10x, file="../results/methylkit/PCA.10x")
write_delim(data.frame(PCA.10x$x) %>% tibble::rownames_to_column("sample"), "../results/methylkit/PC-scores-methylation-10x.tab",  delim = '\t', col_names = TRUE)

Quick check to confirm that removed samples should definitely be removed:

Filter for 10x across ALL samples (this is including the low-CpG methylation samples) to assess clustering. Yes, the weird samples still are separated along PC1, which explains the vast majority of variation (68%), from the other samples. Removal of those samples is confirmed.

meth_10x_allsamples=filterByCoverage(meth_obj,
                         lo.count=10,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)

meth_unite_10x_allsamples=methylKit::unite(meth_10x_allsamples, destrand=TRUE, min.per.group=10L) #require all 10 samples to have 10x

## destranding...

## uniting...

PCA.10x_allsamples <- PCASamples(meth_unite_10x_allsamples, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix

summary(PCA.10x_allsamples)

## Importance of components:
##                            PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     17.7089 5.59402 3.73028 3.53793 3.38165 3.20095 3.03176
## Proportion of Variance  0.6803 0.06788 0.03018 0.02715 0.02481 0.02223 0.01994
## Cumulative Proportion   0.6803 0.74815 0.77833 0.80549 0.83029 0.85252 0.87246
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     2.78400 2.77715 2.72782 2.60744 2.50167 2.37892 2.21148
## Proportion of Variance 0.01681 0.01673 0.01614 0.01475 0.01358 0.01228 0.01061
## Cumulative Proportion  0.88927 0.90600 0.92214 0.93689 0.95046 0.96274 0.97335
##                           PC15    PC16    PC17    PC18    PC19      PC20
## Standard deviation     2.07597 1.70969 1.57960 1.14887 1.11316 3.298e-15
## Proportion of Variance 0.00935 0.00634 0.00541 0.00286 0.00269 0.000e+00
## Cumulative Proportion  0.98270 0.98904 0.99445 0.99731 1.00000 1.000e+00

#PC1=68.0% PC2=6.8%

plotCustomization_allsamples <- (sample.info %>% arrange(sample_seq))[,c("high_or_low_co2", "sample_seq", "tank")] %>%
  mutate(color=case_when(high_or_low_co2 == "H" ~ "firebrick3", TRUE ~ "dodgerblue3"),
         symbol=case_when(high_or_low_co2 == "H" ~ 16, TRUE ~ 17))

# Save size: width=650 height=600
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure_allsamples <- ordiplot(PCA.10x_allsamples, choices = c(1, 2), type = "none", display = "sites", cex = 0.4, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
#points(PCA.figure_allsamples, "sites", col = as.character(plotCustomization_allsamples$color), pch = plotCustomization_allsamples$symbol, cex = 2.5)  #use points
text(PCA.figure_allsamples$sites, col = as.character(plotCustomization_allsamples$color), labels=plotCustomization_allsamples$sample_seq, cex=1.3) #use tank number 

#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side =  1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (60%)", line = 3, cex = 1.5) #Add x-axis label
axis(side =  2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (9.5%)", line = 3, cex = 1.5) #Add y-axis label
legend("bottomright", 
       pch = c(16, 17), 
       legend = c("High pCO2", "Low pCO2"), 
       col = c(c("firebrick3", "dodgerblue3")), 
       cex = 1.3, bty = "n") #Add a legend with information about ambient and elevated samples

getCorrelation(meth_unite_10x_allsamples,plot=TRUE)

##            1         2         3         4         5         6         7
## 1  1.0000000 0.9204265 0.8780766 0.9036344 0.4048809 0.4814508 0.9217204
## 2  0.9204265 1.0000000 0.8395954 0.9026195 0.4073819 0.4607784 0.8850474
## 3  0.8780766 0.8395954 1.0000000 0.8328876 0.4055727 0.5138931 0.8697469
## 4  0.9036344 0.9026195 0.8328876 1.0000000 0.4070781 0.4578477 0.8596412
## 5  0.4048809 0.4073819 0.4055727 0.4070781 1.0000000 0.6896490 0.3857924
## 6  0.4814508 0.4607784 0.5138931 0.4578477 0.6896490 1.0000000 0.4889302
## 7  0.9217204 0.8850474 0.8697469 0.8596412 0.3857924 0.4889302 1.0000000
## 8  0.8404887 0.8360214 0.7721337 0.8651078 0.5280643 0.5594744 0.7970856
## 9  0.3896555 0.3998853 0.3959884 0.4138474 0.5666359 0.6283754 0.3931891
## 10 0.9175444 0.9028906 0.8375860 0.9294828 0.4580994 0.4995762 0.8584545
## 11 0.9204629 0.8947195 0.8486419 0.8977719 0.4561448 0.5151380 0.8828099
## 12 0.8970112 0.8721374 0.8613525 0.8520427 0.4390870 0.5611729 0.8919687
## 13 0.4219498 0.4335411 0.4248621 0.4534204 0.5027114 0.5268636 0.4125169
## 14 0.3549197 0.3599255 0.4201602 0.3865198 0.6215416 0.6332094 0.3524266
## 15 0.8937873 0.8746116 0.8173156 0.8950938 0.4624917 0.5177194 0.8438624
## 16 0.7949671 0.7825378 0.7353708 0.8247615 0.4106378 0.4532041 0.7468411
## 17 0.8768790 0.8551198 0.8639592 0.8338317 0.3825381 0.5091718 0.8654018
## 18 0.8964975 0.8826445 0.8489631 0.8789646 0.4074075 0.4773450 0.8741046
## 19 0.7016148 0.7002250 0.6539892 0.7255738 0.6508480 0.6780934 0.6718843
## 20 0.8925928 0.8856902 0.8042061 0.9023818 0.4283288 0.4802889 0.8487865
##            8         9        10        11        12        13        14
## 1  0.8404887 0.3896555 0.9175444 0.9204629 0.8970112 0.4219498 0.3549197
## 2  0.8360214 0.3998853 0.9028906 0.8947195 0.8721374 0.4335411 0.3599255
## 3  0.7721337 0.3959884 0.8375860 0.8486419 0.8613525 0.4248621 0.4201602
## 4  0.8651078 0.4138474 0.9294828 0.8977719 0.8520427 0.4534204 0.3865198
## 5  0.5280643 0.5666359 0.4580994 0.4561448 0.4390870 0.5027114 0.6215416
## 6  0.5594744 0.6283754 0.4995762 0.5151380 0.5611729 0.5268636 0.6332094
## 7  0.7970856 0.3931891 0.8584545 0.8828099 0.8919687 0.4125169 0.3524266
## 8  1.0000000 0.4982531 0.8901618 0.8728944 0.8201237 0.5346438 0.5065826
## 9  0.4982531 1.0000000 0.4380341 0.4506208 0.4541685 0.5432220 0.6011789
## 10 0.8901618 0.4380341 1.0000000 0.9109196 0.8682796 0.4684503 0.4081516
## 11 0.8728944 0.4506208 0.9109196 1.0000000 0.8913509 0.4706310 0.4231644
## 12 0.8201237 0.4541685 0.8682796 0.8913509 1.0000000 0.4857654 0.4422424
## 13 0.5346438 0.5432220 0.4684503 0.4706310 0.4857654 1.0000000 0.5398804
## 14 0.5065826 0.6011789 0.4081516 0.4231644 0.4422424 0.5398804 1.0000000
## 15 0.8969565 0.4656602 0.9205118 0.9177343 0.8581328 0.4883840 0.4337526
## 16 0.8217936 0.4350830 0.8355187 0.8197600 0.7626254 0.4554694 0.4148755
## 17 0.7916514 0.4160851 0.8484988 0.8591937 0.8714400 0.4776570 0.3842985
## 18 0.8442023 0.3788949 0.9034909 0.8998645 0.8707650 0.4506587 0.3985575
## 19 0.8197620 0.5644667 0.7671241 0.7448981 0.7005226 0.5554938 0.5360734
## 20 0.8901958 0.4296343 0.9196465 0.9148622 0.8499677 0.4467872 0.4023128
##           15        16        17        18        19        20
## 1  0.8937873 0.7949671 0.8768790 0.8964975 0.7016148 0.8925928
## 2  0.8746116 0.7825378 0.8551198 0.8826445 0.7002250 0.8856902
## 3  0.8173156 0.7353708 0.8639592 0.8489631 0.6539892 0.8042061
## 4  0.8950938 0.8247615 0.8338317 0.8789646 0.7255738 0.9023818
## 5  0.4624917 0.4106378 0.3825381 0.4074075 0.6508480 0.4283288
## 6  0.5177194 0.4532041 0.5091718 0.4773450 0.6780934 0.4802889
## 7  0.8438624 0.7468411 0.8654018 0.8741046 0.6718843 0.8487865
## 8  0.8969565 0.8217936 0.7916514 0.8442023 0.8197620 0.8901958
## 9  0.4656602 0.4350830 0.4160851 0.3788949 0.5644667 0.4296343
## 10 0.9205118 0.8355187 0.8484988 0.9034909 0.7671241 0.9196465
## 11 0.9177343 0.8197600 0.8591937 0.8998645 0.7448981 0.9148622
## 12 0.8581328 0.7626254 0.8714400 0.8707650 0.7005226 0.8499677
## 13 0.4883840 0.4554694 0.4776570 0.4506587 0.5554938 0.4467872
## 14 0.4337526 0.4148755 0.3842985 0.3985575 0.5360734 0.4023128
## 15 1.0000000 0.8423870 0.8368522 0.8909114 0.7800942 0.9252607
## 16 0.8423870 1.0000000 0.7552273 0.8033447 0.7232328 0.8291582
## 17 0.8368522 0.7552273 1.0000000 0.8603223 0.6840687 0.8273257
## 18 0.8909114 0.8033447 0.8603223 1.0000000 0.7209976 0.8956864
## 19 0.7800942 0.7232328 0.6840687 0.7209976 1.0000000 0.7717557
## 20 0.9252607 0.8291582 0.8273257 0.8956864 0.7717557 1.0000000

Hierarchical Clustering using filtered methylation data

The function clusters samples using hclust function and various distance metrics derived from percent methylation per base or per region for each sample.

clusterSamples(meth_unite_10x, dist="correlation", method="ward", plot=TRUE)

## The "ward" method has been renamed to "ward.D"; note new "ward.D2"

## 
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
## 
## Cluster method   : ward.D 
## Distance         : pearson 
## Number of objects: 12

getData(meth_unite_5x) %>% nrow() #340,394 loci @ 5x coverage

## [1] 340394

getData(meth_unite_10x) %>% nrow() #146,761 loci @ 10x coverage

## [1] 146761

Percent methylation matrix (rows=region/base, columns=sample) can be extracted from methylBase object by using percMethylation function. This can be useful for downstream analyses.

Here I create % methylation matrices from the filtered object, and use it to do another cluster analysis

perc.meth=percMethylation(meth_unite_10x, rowids=T)
hist(perc.meth, col="gray50" )

# # Save % methylation df to object and .tab file 
# save(perc.meth, file = "../analyses/methylation-filtered/R-objects/perc.meth") #save object to file 
# #load(file = "../analyses/methylation-filtered/R-objects/perc.meth") #load object if needed
# write.table((as.data.frame(perc.meth) %>% tibble::rownames_to_column("contig")), file = here::here("analyses", "methylation-filtered", "percent-methylation-filtered.tab"), sep = '\t', na = "NA", row.names = FALSE, col.names = TRUE)

# What percentage of loci have ZERO methylation? 
100*table(perc.meth==0)[2]/table(perc.meth==0)[1]

##     TRUE 
## 1.780652

# 1.8% of loci unmethylated, averaged across all samples 

# Mean % methylation across all samples 
mean(perc.meth, na.rm=TRUE)

## [1] 79.2303

#79.2%

sample.info %>% dplyr::select(high_or_low_co2, sample_seq) %>% arrange(sample_seq)

## # A tibble: 20 × 2
##    high_or_low_co2 sample_seq
##    <fct>                <dbl>
##  1 L                        1
##  2 L                        2
##  3 L                        3
##  4 L                        4
##  5 L                        5
##  6 L                        6
##  7 L                        7
##  8 H                        8
##  9 H                        9
## 10 H                       10
## 11 H                       11
## 12 L                       12
## 13 L                       13
## 14 L                       14
## 15 H                       15
## 16 H                       16
## 17 H                       17
## 18 H                       18
## 19 H                       19
## 20 H                       20

(perc.meth %>% as.matrix())[,c("1","2","3","4","7","12")] %>% head() # low pCO2

##                                                              1        2
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 95.65217 95.83333
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 64.10256 73.68421
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 78.57143 75.00000
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 89.79592 68.42105
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 53.03030 64.28571
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 96.07843 72.72727
##                                                              3         4
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 60.00000  94.28571
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 84.61538  80.43478
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 84.61538  78.84615
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 80.00000  73.07692
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 60.00000  56.25000
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 72.22222 100.00000
##                                                               7        12
## PGA_scaffold0__40_contigs__length_4818635.21392.21392  86.53846  48.14815
## PGA_scaffold0__40_contigs__length_4818635.21416.21416  90.69767  82.14286
## PGA_scaffold0__40_contigs__length_4818635.21429.21429  82.35294  76.47059
## PGA_scaffold0__40_contigs__length_4818635.21455.21455  96.66667  82.75862
## PGA_scaffold0__40_contigs__length_4818635.24587.24587  73.01587  67.50000
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 100.00000 100.00000

(perc.meth %>% as.matrix())[,c("10","11","15", "17", "18", "20")] %>% head() # high pCO2

##                                                             10       11
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 97.22222 93.93939
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 89.13043 74.10714
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 84.78261 84.40367
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 55.88235 91.52542
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 58.13953 36.95652
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 81.81818 90.32258
##                                                             15        17
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 85.18519 100.00000
## PGA_scaffold0__40_contigs__length_4818635.21416.21416 75.75758  85.71429
## PGA_scaffold0__40_contigs__length_4818635.21429.21429 70.37037 100.00000
## PGA_scaffold0__40_contigs__length_4818635.21455.21455 45.83333 100.00000
## PGA_scaffold0__40_contigs__length_4818635.24587.24587 66.66667  37.50000
## PGA_scaffold0__40_contigs__length_4818635.24668.24668 81.81818  87.50000
##                                                              18       20
## PGA_scaffold0__40_contigs__length_4818635.21392.21392 100.00000 90.47619
## PGA_scaffold0__40_contigs__length_4818635.21416.21416  65.51724 66.66667
## PGA_scaffold0__40_contigs__length_4818635.21429.21429  61.53846 91.66667
## PGA_scaffold0__40_contigs__length_4818635.21455.21455  86.20690 80.00000
## PGA_scaffold0__40_contigs__length_4818635.24587.24587  20.00000 36.11111
## PGA_scaffold0__40_contigs__length_4818635.24668.24668  50.00000 90.47619

# Mean % methylation for each treatment 
mean((perc.meth %>% as.matrix())[,c("1","2","3","4","7","12")], na.rm=TRUE) # Low CO2 exposed - 79.5%

## [1] 79.53615

mean((perc.meth %>% as.matrix())[,c("10","11","15", "17", "18", "20")], na.rm=TRUE) # High CO2 exposed - 78.9%

## [1] 78.92444

perc.meth.summ <- data.frame(sample=1:12, methylated=1:12, coverage=1:12, mean.perc=1:12)
for (i in 1:12) {
  perc.meth.summ[i,"sample"] <- getSampleID(meth_unite_10x)[i]
  perc.meth.summ[i,"methylated"] <-mean(as.vector(getData(meth_unite_10x)[,grep(c("numCs"), colnames(meth_unite_10x))][i][,1]), na.rm=T)
  perc.meth.summ[i,"coverage"] <- mean(as.vector(getData(meth_unite_10x)[,grep(c("coverage"), colnames(meth_unite_10x))][i][,1]), na.rm=T)
  perc.meth.summ[i,"mean.perc"] <- mean(as.vector(getData(meth_unite_10x)[,grep(c("numCs"), colnames(meth_unite_10x))][i][,1]) / 
                                     as.vector(getData(meth_unite_10x)[,grep(c("coverage"), colnames(meth_unite_10x))][i][,1]), na.rm=T)
}
mean(perc.meth.summ[perc.meth.summ$sample %in% c("1","2","3","4","7","12"),"mean.perc"]) #Loe pCO2 treatment = 79.5%

## [1] 0.7953615

mean(perc.meth.summ[perc.meth.summ$sample %in% c("10","11","15", "17", "18", "20"),"mean.perc"]) #78.9%

## [1] 0.7892444

paste("No. of loci in filtered dataset:", nrow(perc.meth)) #146,761 loci

## [1] "No. of loci in filtered dataset: 146761"

paste("No. of samples:", ncol(perc.meth)) #12 sampes

## [1] "No. of samples: 12"

myDiff=calculateDiffMeth(meth_unite_10x)

## two groups detected:
##  will calculate methylation difference as the difference of
## treatment (group: 2) - control (group: 1)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# get  deferentially methylated bases using 12% difference and save to files - 12,971 loci 
myDiff12p=getMethylDiff(myDiff,difference=12,qvalue=0.01)
paste("Number diff. methylated loci with 12% difference: ", getData(myDiff12p) %>% nrow())

## [1] "Number diff. methylated loci with 12% difference:  12971"

# get all differentially methylated bases - 1,427 loci
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)
paste("Number diff. methylated loci with 25% difference: ", getData(myDiff25p) %>% nrow())

## [1] "Number diff. methylated loci with 25% difference:  1427"

# Extract hypo and hyper methylated loci. Here "type" must think that the low pCO2 is the experimental treatment, since hypo and hyper are switched. 
# get hypo methylated bases - 843 loci
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper") 
paste("Number loci hypo-methylated in high pCO2 with 25% difference: ", getData(myDiff25p.hyper) %>% nrow())

## [1] "Number loci hypo-methylated in high pCO2 with 25% difference:  834"

# get hypo methylated bases - 593 loci 
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
paste("Number loci hyper-methylated in high pCO2 with 25% difference: ", getData(myDiff25p.hypo) %>% nrow())

## [1] "Number loci hyper-methylated in high pCO2 with 25% difference:  593"

#write.table(myDiff12p, file = "../analyses/DMLs/myDiff12p.tab", sep = "\t", col.names = TRUE)
#write_delim(getData(myDiff12p) %>%  mutate(start = start-1, end = end+1) %>% dplyr::select(chr, start, end, meth.diff),
#            "../analyses/DMLs/dml12.bed",  delim = '\t', col_names = TRUE)

write.table(myDiff25p, file = "../results/methylkit/myDiff25p.tab", sep = "\t", col.names = TRUE)
save(myDiff25p, file="../results/methylkit/myDiff25p")
#load(file="../analyses/DMLs/myDiff25p") #load if needed

Take the DMLs to a bed

# Write out bedfile for DMLs 
dml25 <- myDiff25p %>% getData() %>% 
  mutate(start = start-1, end = end+1) %>% 
  dplyr::select(chr, start, end, meth.diff)

write_delim(dml25, "../results/methylkit/dml25.bed",  delim = '\t', col_names = TRUE) 
write_delim(dml25, "../results/methylkit/dml25_forIGV.bed",  delim = '\t', col_names = FALSE) #no column names for IGV
save(dml25, file="../results/methylkit/dml25")
#load(file="../results/methylkit/dml25") #load dml25 if needed

# Write out bedfile for all loci fed into DML analysis, to use as background for enrichment 
mydiff.all <-  mutate(getData(myDiff), start = start -1, end = end + 1) %>% dplyr::select(chr, start, end, meth.diff) %>% mutate_if(is.numeric, as.integer)
write_delim(mydiff.all, "../results/methylkit/mydiff-all.bed",  delim = '\t', col_names = TRUE)
save(mydiff.all, file="../results/methylkit/mydiff.all")
#load(file="../results/methylkit/mydiff.all")

paste("No. of DMLs that had higher methylation in high pCO2 = ", dml25 %>%filter(meth.diff<0)%>%nrow(), " (", 100*round(dml25 %>%filter(meth.diff<0)%>%nrow()/nrow(dml25), digits = 2), "%)", sep="")

## [1] "No. of DMLs that had higher methylation in high pCO2 = 593 (42%)"

paste("No. of DMLs that had lower methylation in high pCO2= ", dml25 %>%filter(meth.diff>0)%>%nrow(), " (", 100*round(dml25 %>%filter(meth.diff>0)%>%nrow()/nrow(dml25), digits = 2), "%)", sep="")

## [1] "No. of DMLs that had lower methylation in high pCO2= 834 (58%)"

# Save DML bed files with hyper- and hypo-methylated loci in high pCO2 treatment 
# meth.diff < 0 = loci where methylation rate is HIGHER in high pCO2 treatment 

write_delim(dml25 %>%filter(meth.diff<0), "../results/methylkit/dml25-hypermeth_in_OA.bed",  delim = '\t', col_names = TRUE) 
write_delim(dml25 %>%filter(meth.diff>0), "../results/methylkit/dml25-hypometh_in_OA.bed",  delim = '\t', col_names = TRUE)

Plot mean % methylation for loci by those that hypo- and hyper-methylated in OA-exposed crab

meth_unite_reshaped %>%
  filter(coverage>=10) %>%
  filter(sample %in% c(1,2,3,4,7,12, #low pCO3
                       8,10,11,15,17,18,20)) %>% #high pCO2
  mutate(locus=paste(chr, start-1, sep=".")) %>%  # Create locus column; subtract 1 from the "start" column to match dml25
  filter(locus %in% (dml25 %>%filter(meth.diff<0) %>% mutate(locus=paste(chr, start, sep=".")))$locus) %>% #keep loci with meth.diff < 0, higher meth in high pCO2
  left_join(sample.info[,c("high_or_low_co2", "sample_seq")], by=c("sample"="sample_seq")) %>%
  group_by(high_or_low_co2, locus) %>%
  summarise(mean = mean(percMeth)) %>%
  ggplot(aes(x=high_or_low_co2, y=mean)) + geom_violin() + geom_jitter() + 
  theme_minimal() + xlab("pCO2 treatment") +
  ggtitle("Loci with higher methylation in OA (n=593)")

## `summarise()` has grouped output by 'high_or_low_co2'. You can override using
## the `.groups` argument.

#dml25 %>%filter(meth.diff>0) %>% nrow()

meth_unite_reshaped %>%
  filter(coverage>=10) %>%
  filter(sample %in% c(1,2,3,4,7,12, #low pCO3
                     8,10,11,15,17,18,20)) %>% #high pCO2
  mutate(locus=paste(chr, start-1, sep=".")) %>%  # Create locus column; subtract 1 from the "start" column to match dml25
  filter(locus %in% (dml25 %>%filter(meth.diff>0) %>% mutate(locus=paste(chr, start, sep=".")))$locus) %>% #keep loci with meth.diff > 0, lower meth in high pCO2
  left_join(sample.info[,c("high_or_low_co2", "sample_seq")], by=c("sample"="sample_seq")) %>%
  group_by(high_or_low_co2, locus) %>%
  summarise(mean = mean(percMeth)) %>%
  ggplot(aes(x=high_or_low_co2, y=mean)) + geom_violin() + geom_jitter() + 
  theme_minimal() + xlab("pCO2 treatment") +
  ggtitle("Loci with lower methylation in OA (n=834)")

## `summarise()` has grouped output by 'high_or_low_co2'. You can override using
## the `.groups` argument.

### Subset the filtered methylation count dataframe for only those that are differentially methylated. Save object to file.

dml25_counts <- selectByOverlap(meth_unite_10x,as(myDiff25p,"GRanges"))
class(dml25_counts) #class should be methylBase

## [1] "methylBase"
## attr(,"package")
## [1] "methylKit"

perc.methDMLs= percMethylation(dml25_counts, rowids=T) #create a percent methylated object for DMLs

# save count and percent objects to be used in subsequent notebook 
save(dml25_counts, file = "../results/methylkit/dml25_counts")
save(perc.methDMLs, file="../results/methylkit/perc.methDMLs")

permANOVA and dispersion test, % methylation by treatment

perc.meth.d<-vegdist(data.frame(t(perc.meth)),"euclidean", na.rm = TRUE)
meth.perm <- adonis(data.frame(t(perc.meth)) ~ high_or_low_co2, data=plotCustomization,
                          permutations=1000, method='euclidean', na.rm = TRUE) #yes, global population difference

## 'adonis' will be deprecated: use 'adonis2' instead

meth.perm$aov.tab #

## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##                 Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
## high_or_low_co2  1  23530917 23530917  1.0419 0.09436 0.1059
## Residuals       10 225841244 22584124         0.90564       
## Total           11 249372161                  1.00000

hist(meth.perm$f.perms, main="Histogram of pseudo-F values under the null model",
     xlab="pseudo=F values", col="gray", xlim=c(0.5,2.5))
abline(v=1.9743, col="red") # indicates F-value is significant.

source("../references/biostats.R")
pca.eigenval(PCA.10x) #The Proporation of Variance = %variance explained

## Importance of components:

##                                 PC1          PC2          PC3          PC4
## Variance(eigenvalue)   8772.3902168 7624.9967243 7207.0922603 7127.7209667
## Proportion of Variance    0.1195474    0.1039111    0.0982160    0.0971344
## Cumulative Proportion     0.1195474    0.2234585    0.3216746    0.4188089
## Broken-stick value        3.1032107    2.1032107    1.6032107    1.2698773
##                                 PC5          PC6          PC7          PC8
## Variance(eigenvalue)   6952.1896370 6630.8151862 6149.1144766 6021.5108973
## Proportion of Variance    0.0947423    0.0903627    0.0837982    0.0820593
## Cumulative Proportion     0.5135512    0.6039139    0.6877122    0.7697715
## Broken-stick value        1.0198773    0.8198773    0.6532107    0.5103535
##                                 PC9         PC10         PC11      PC12
## Variance(eigenvalue)   5840.8562662 5759.2298241 5294.0835444 0.0000000
## Proportion of Variance    0.0795974    0.0784850    0.0721461 0.0000000
## Cumulative Proportion     0.8493689    0.9278539    1.0000000 1.0000000
## Broken-stick value        0.3853535    0.2742424    0.1742424 0.0833333

screeplot(PCA.10x, bstick=TRUE) #that's weird- PC's 1-5 aren't sign. but the higher PCs are.

# # check sign. of PCs - can't get this to work, though.
# ordi.monte(PCA.10x, ord="pca", dim=2, perm = 1000, scale = F, plot = F)

anova(meth.bdp<-betadisper(perc.meth.d,plotCustomization$high_or_low_co2)) #no sign. difference in dispersion, which is good

## Analysis of Variance Table
## 
## Response: Distances
##           Df Sum Sq Mean Sq F value Pr(>F)
## Groups     1  84611   84611  1.5642 0.2395
## Residuals 10 540928   54093

boxplot(meth.bdp)

# # perform PCoA just for fun on distance matrix 
# meth.pcoa<-cmdscale(perc.meth.d, eig=TRUE, add=T)
# ordiplot(meth.pcoa, choices = c(1, 2), type="text", display="sites", xlab="PC 1", ylab="PC 2")

Reshape filtered data to create a new column with sample info, and calculate %methylation for each locus/sample

# Get sample ID vector from methylBase object (created from unite() function)
samples <- getSampleID(meth_unite_10x)

meth_unite_10x_reshaped <- melt(data=meth_unite_10x, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
 tidyr::separate(col = "variable" , into = c("variable", "sample_temp"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
  dcast(chr+start+end+strand+sample_temp ~  variable) %>%

  #Correct sample IDs extracted from unite() function 
  mutate(sample=case_when(
    sample_temp== 1 ~ samples[1],
    sample_temp== 2 ~ samples[2],
    sample_temp== 3 ~ samples[3],
    sample_temp== 4 ~ samples[4],
    sample_temp== 5 ~ samples[5],
    sample_temp== 6 ~ samples[6],
    sample_temp== 7 ~ samples[7],
    sample_temp== 8 ~ samples[8],
    sample_temp== 9 ~ samples[9],
    sample_temp== 10 ~ samples[10],
    sample_temp== 11 ~ samples[11],
    sample_temp== 12 ~ samples[12],
    sample_temp== 13 ~ samples[13])) %>%
  dplyr::select(-sample_temp) %>%
  
  mutate(percMeth = 100*(numCs/coverage)) %>% 
  left_join(sample.info[c("sample_seq", "high_or_low_co2")] %>% 
              mutate(sample_seq=as.character(sample_seq)), by=c("sample"="sample_seq")) %>%
  dplyr::rename("treatment"="high_or_low_co2")

## Using count as value column: use value.var to override.

save(meth_unite_10x_reshaped, file="../results/methylkit/meth_unite_10x_reshaped")
#load(file="../results/methylkit/meth_unite_10x_reshaped")

# save to tab file if needed
#readr::write_delim(meth_unite_10x_reshaped, "../results/methylkit/meth_unite_10x_reshaped.tab"),  delim = '\t', col_names = FALSE)

Check out overall distribution of filtered methylation data by treatment

Any trends? No, similar distribution of % CpG methyation

meth_unite_10x_reshaped %>% 
  ggplot(aes(x= treatment, y=percMeth, fill= treatment)) +
  geom_violin(width=1, alpha=0.8) + theme_minimal() +
  ylab("% CpG Methylation") + theme(axis.title.x = element_blank(), legend.position = "none",
                                    text = element_text(size=16)) +
  scale_x_discrete(labels=c("H" = "High pCO2", "L" = "Control pCO2")) + 
  scale_fill_manual(values=c("firebrick3", "dodgerblue3"))

View distributions in density plot

Looks as though there is slightly more loci with higher methylation rates in the High pCO2 treatment (red distribution is shifted to the right a bit)

#ggplotly(
meth_unite_10x_reshaped %>% 
  ggplot(aes(x=percMeth, fill= treatment)) +
  geom_density(alpha=0.5) + 
  scale_fill_manual(values=c("firebrick3", "dodgerblue3"), 
                    labels=c("H" = "High pCO2", "L" = "Control pCO2"), name=NULL) +
  #xlim(90,100) + #use to find location on x axis of second highest peak
  theme_minimal() +
  ylab("Density") + xlab("% CpG Methylation") +
  theme(text = element_text(size=18), legend.position = "top") #)

Find location of peaks in each treatment in the above figure. Used this as a reference: http://ianmadd.github.io/pages/PeakDensityDistribution.html

Answer:
- Largest peak for high pCO2 treatment is at 99.98% - Largest peak for low pCO2 treatment (control) is 94.56%

## Find largest peak 

# Find largest peak for High pCO2
density(subset(meth_unite_10x_reshaped,  treatment=="H" & percMeth!="NA")$percMeth) # Y maximum is 4.780e-02

## 
## Call:
##  density.default(x = subset(meth_unite_10x_reshaped, treatment ==     "H" & percMeth != "NA")$percMeth)
## 
## Data: subset(meth_unite_10x_reshaped, treatment == "H" & percMeth != "NA")$percMeth (880566 obs.);   Bandwidth 'bw' = 1.003
## 
##        x                y            
##  Min.   : -3.01   Min.   :8.356e-05  
##  1st Qu.: 23.49   1st Qu.:1.981e-03  
##  Median : 50.00   Median :4.248e-03  
##  Mean   : 50.00   Mean   :9.423e-03  
##  3rd Qu.: 76.51   3rd Qu.:9.761e-03  
##  Max.   :103.01   Max.   :4.780e-02

# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_unite_10x_reshaped,  treatment=="H" & percMeth!="NA")$percMeth)$y) #answer = index #497

## [1] 497

# Find the x value where the largest Y value is
density(subset(meth_unite_10x_reshaped,  treatment=="H" & percMeth!="NA")$percMeth)$x[497] #H max (4.895e-02) is @ 99.90%

## [1] 99.89823

##--------------

# Find largest peak for Low pCO2, i.e. control
density(subset(meth_unite_10x_reshaped,  treatment=="L" & percMeth!="NA")$percMeth) # Y maximum is 4.663e-02

## 
## Call:
##  density.default(x = subset(meth_unite_10x_reshaped, treatment ==     "L" & percMeth != "NA")$percMeth)
## 
## Data: subset(meth_unite_10x_reshaped, treatment == "L" & percMeth != "NA")$percMeth (880566 obs.);   Bandwidth 'bw' = 0.9293
## 
##        x                 y            
##  Min.   : -2.788   Min.   :8.979e-05  
##  1st Qu.: 23.606   1st Qu.:1.869e-03  
##  Median : 50.000   Median :3.985e-03  
##  Mean   : 50.000   Mean   :9.463e-03  
##  3rd Qu.: 76.394   3rd Qu.:9.571e-03  
##  Max.   :102.788   Max.   :4.663e-02

which.max(density(subset(meth_unite_10x_reshaped,  treatment=="L" & percMeth!="NA")$percMeth)$y) #answer = 472

## [1] 472

density(subset(meth_unite_10x_reshaped,  treatment=="L" & percMeth!="NA")$percMeth)$x[472] #L max (0.0469560  ) is 94.52%

## [1] 94.52361

## --------------

## Find 2nd largest peak. NOTE: I looked at the ggplotly density figure above and noted that the 2nd peaks end before ~98%, so I subset the percMeth data to retain only those <98%, then find the peak 

# Find 2nd largest peak for High pCO2
density(subset(meth_unite_10x_reshaped,  treatment=="H" & percMeth!="NA" & percMeth<98)$percMeth) # Y maximum is .04997

## 
## Call:
##  density.default(x = subset(meth_unite_10x_reshaped, treatment ==     "H" & percMeth != "NA" & percMeth < 98)$percMeth)
## 
## Data: subset(meth_unite_10x_reshaped, treatment == "H" & percMeth != "NA" & percMeth < 98)$percMeth (766028 obs.);   Bandwidth 'bw' = 1.129
## 
##        x                 y            
##  Min.   : -3.388   Min.   :0.0000502  
##  1st Qu.: 22.803   1st Qu.:0.0021764  
##  Median : 48.993   Median :0.0046571  
##  Mean   : 48.993   Mean   :0.0095358  
##  3rd Qu.: 75.184   3rd Qu.:0.0097195  
##  Max.   :101.375   Max.   :0.0499662

# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_unite_10x_reshaped,  treatment=="H" & percMeth!="NA" & percMeth<98)$percMeth)$y) #answer = index #478

## [1] 478

# Find the x value where the largest Y value is
density(subset(meth_unite_10x_reshaped,  treatment=="H" & percMeth!="NA" & percMeth<98)$percMeth)$x[478] #H max (5.281e-02) is @ 94.40%

## [1] 94.40423

#---
# Find 2nd largest peak for Low pCO2
density(subset(meth_unite_10x_reshaped,  treatment=="L" & percMeth!="NA" & percMeth>95)$percMeth) # Y maximum is 1.1686

## 
## Call:
##  density.default(x = subset(meth_unite_10x_reshaped, treatment ==     "L" & percMeth != "NA" & percMeth > 95)$percMeth)
## 
## Data: subset(meth_unite_10x_reshaped, treatment == "L" & percMeth != "NA" & percMeth > 95)$percMeth (213494 obs.);   Bandwidth 'bw' = 0.1402
## 
##        x                y            
##  Min.   : 94.61   Min.   :0.0001824  
##  1st Qu.: 96.06   1st Qu.:0.0633958  
##  Median : 97.51   Median :0.1470459  
##  Mean   : 97.51   Mean   :0.1717644  
##  3rd Qu.: 98.97   3rd Qu.:0.1829494  
##  Max.   :100.42   Max.   :1.1686438

# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_unite_10x_reshaped,  treatment=="L" & percMeth!="NA" & percMeth>95)$percMeth)$y) #answer = index #475

## [1] 475

# Find the x value where the largest Y value is
density(subset(meth_unite_10x_reshaped,  treatment=="L" & percMeth!="NA" & percMeth>95)$percMeth)$x[475] #L max (4.953e-02) is @ 100.00%

## [1] 99.99955

# Very similar 2nd peak CpG methylation - High pCO2 = 94.24%, and Low pCO2 = 94.61% (only slightly higher in low pCO2 treatment)

#----- Double check # of loci 
meth_unite_10x_reshaped %>%
  filter( treatment=="H" & percMeth!="NA") %>%
  dplyr::select(chr, start) %>% unique() %>% nrow() #146,761

## [1] 146761

meth_unite_10x_reshaped %>%
  filter( treatment=="L" & percMeth!="NA") %>%
  dplyr::select(chr, start) %>% unique() %>% nrow() #146,761 (good they are the same)

## [1] 146761

#----- average % meth by pop

# High pCO2 mean = 78.92%, median = 88.27%
meth_unite_10x_reshaped %>%
  filter( treatment=="H" & percMeth!="NA") %>%
  dplyr::select(percMeth) %>% summary()

##     percMeth     
##  Min.   :  0.00  
##  1st Qu.: 71.83  
##  Median : 88.27  
##  Mean   : 78.92  
##  3rd Qu.: 94.92  
##  Max.   :100.00

# Low (control) pCO2 mean = 79.54%, median = 88.71%
meth_unite_10x_reshaped %>%
  filter( treatment=="L" & percMeth!="NA") %>%
  dplyr::select(percMeth) %>% summary()

##     percMeth     
##  Min.   :  0.00  
##  1st Qu.: 73.49  
##  Median : 88.71  
##  Mean   : 79.54  
##  3rd Qu.: 94.87  
##  Max.   :100.00

Takeaway = high and low (control) pCO2 have similar %CpG distributions on average, but the distribution of %meth varies a bit - both treatments’ % methylation peak around 94% and 100%, but in the high pCO2 treatment the 100% peak is higher, and the 94% peak is lower. So, high pCO2 has more instances of 100% methylation.

slightly higher methylation rates in the high pCO2 reared crab.

Calculations of % methylation by treatment

# mean % methylation across samples  
dml25_counts_unique <- unique(getData(dml25_counts)[,c("chr", "start")]) %>%
  mutate(chr_start=paste(chr, start, sep="_"))

#Calculate ratio between mean % methylation in High pCO2 : Low pCO2 
DML.ratios <- meth_unite_10x_reshaped %>% 
  mutate(chr_start=paste(chr, start, sep="_")) %>%
  filter(chr_start %in% dml25_counts_unique$chr_start) %>% 
  group_by( treatment, chr, start) %>%
  summarise(median_percentMeth = median(percMeth, na.rm = TRUE)) %>% #mean_percentMeth = mean(percMeth, na.rm = TRUE), 
  dcast(chr + start ~  treatment) %>% 
  mutate(ratio_H.L = H/L,  #in this ratio column, values <1 = High pCO2 hypomethylated, values >1 = Low pCO2 hypomethylated 
         chr_start=paste(chr, start, sep="_"),
         diff_H.L = H-L)   #this is the difference between High pCO2 & Low pCO2 (H - L)

## `summarise()` has grouped output by 'treatment', 'chr'. You can override using
## the `.groups` argument.
## Using median_percentMeth as value column: use value.var to override.

nrow(DML.ratios)==nrow(dml25_counts_unique) #confirm same # loci; should=TRUE

## [1] TRUE

save(DML.ratios, file="../results/methylkit/DML.ratios")
#load(file="../analyses/methykit/DML.ratios")

# For all loci, calculate mean and sd percent methylation across samples by  treatment 
DML.calcs <- meth_unite_10x_reshaped %>% 
  group_by( treatment, chr, start) %>% 
  dplyr::summarise(
    mean_percMeth = mean(percMeth, na.rm=TRUE),
    sd_percMeth=sd(percMeth, na.rm=TRUE),
    n())  %>% 
  filter(chr %in% c(DML.ratios$chr) & 
           start %in% c(DML.ratios$start))

## `summarise()` has grouped output by 'treatment', 'chr'. You can override using
## the `.groups` argument.

Interesting! DMLs on average have lower % methylation in the high pCO2 treatment

DML.ratios %>% 
pivot_longer(cols = c(H, L),
   names_to = "treatment",
   values_to = "mean_meth") %>%
  mutate(treatment=as.factor(treatment)) %>%
  ggplot(aes(x=treatment, y=mean_meth, fill=treatment)) +
  geom_violin(alpha=0.2) + geom_boxplot(alpha=0.4) +
  scale_fill_manual(values=c("firebrick3", "dodgerblue3"), 
                    labels=c("H" = "High pCO2", "L" = "Control pCO2"), name=NULL) + 
theme_minimal() + ggtitle("Methylation rates of DMLs") + 
  ylab("Methylation (mean within treatment)") + xlab(NULL)

DML.ratios %>% 
pivot_longer(cols = c(H, L),
   names_to = "treatment",
   values_to = "mean_meth") %>%
  mutate(treatment=as.factor(treatment)) %>%
  group_by(treatment) %>%
  summarise(mean=mean(mean_meth), 
             median=median(mean_meth),)

## # A tibble: 2 × 3
##   treatment  mean median
##   <fct>     <dbl>  <dbl>
## 1 H          57.4   57.5
## 2 L          62.3   67.1

For ALL loci, calculate difference between mean % methylation in H : L for divergent bar plot

#Calculate ratio between mean % methylation in High pCO2 : Low pCO2 (i.e. control) 
all.loci.ratios <- meth_unite_10x_reshaped %>% 
  mutate(chr_start=paste(chr, start, sep="_")) %>%
  group_by( treatment, chr, start) %>%
  summarise(mean_percentMeth = mean(percMeth, na.rm = TRUE)) %>% #could instead use ... allCs_percent = 100*(sum(numCs)/sum(coverage)), 
  dcast(chr + start ~  treatment) %>% 
  mutate(ratio_H.L = H/L,  #in this ratio column, values <1 = HC hypomethylated, values >1 = SS hypomethylated 
         chr_start=paste(chr, start, sep="_"),
         diff_H.L = H-L)   #this is the difference between H & L (H - L)

## `summarise()` has grouped output by 'treatment', 'chr'. You can override using
## the `.groups` argument.
## Using mean_percentMeth as value column: use value.var to override.

# Any majuor differences in global methylatin among the  treatments? Here are summarys tats of % methylation across all shared loci: 
summary(all.loci.ratios$H)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   72.48   87.04   78.92   93.24  100.00

summary(all.loci.ratios$L)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   74.06   87.70   79.54   93.25  100.00

# As determined in previous code chunks % meth is slightly higher in the high pCO2 treatment group, while % meth is slightly lower in DMLs in high pCO2 group 

# Do %methylation (all loci) distributions differ among  treatment using all loci? 
#ggplotly(
all.loci.ratios %>% 
pivot_longer(cols = c(H, L),
   names_to = "treatment",
   values_to = "mean_meth") %>%
  ggplot(aes(x=mean_meth, fill= treatment)) +
  geom_density(alpha=0.2) +
    scale_fill_manual(values=c("firebrick3", "dodgerblue3"), 
                    labels=c("H" = "High pCO2", "L" = "Control pCO2"), name=NULL) + 
theme_minimal() #)

# Using DMLs
DML.ratios %>% 
pivot_longer(cols = c(H, L),
   names_to = "treatment",
   values_to = "mean_meth") %>%
  ggplot(aes(x=mean_meth, fill= treatment)) +
  geom_density(alpha=0.2) +
    scale_fill_manual(values=c("firebrick3", "dodgerblue3"), 
                    labels=c("H" = "High pCO2", "L" = "Control pCO2"), name=NULL) + 
theme_minimal() + xlab("Mean methylation (within treatment)") + 
  ggtitle("% Methylation distribution for DMLs by treatment")

Heatmap

#load("../analyses/methylation-filtered/R-objects/meth_unite_10x_reshaped")
meth_unite_10x_reshaped %>% dplyr::select(sample, treatment) %>% distinct()

##    sample treatment
## 1       1         L
## 2       2         L
## 3       3         L
## 4       4         L
## 5       7         L
## 6      10         H
## 7      11         H
## 8      12         L
## 9      15         H
## 10     17         H
## 11     18         H
## 12     20         H

DMLs_heatmap <- 
  meth_unite_10x_reshaped %>% 
    left_join(sample.info[c("sample_seq", "tank")] %>%  mutate(sample_seq=as.character(sample_seq)), by=c("sample"="sample_seq")) %>% 
    arrange(treatment, tank) %>%
  mutate(sample=factor(sample, ordered = T, levels = c(3,4,7,1,2,12,20,17,8,10,18,11,15)), 
         chr_start=paste(chr, start, sep="_")) %>%
  filter(
           start %in% dml25_counts_unique$start & 
           chr %in% DML.ratios$chr)

#ggplotly(
ggplot(DMLs_heatmap, aes(sample, chr_start, fill= percMeth)) + xlab("Sample") + geom_tile(na.rm = T) +
  scale_y_discrete(limits=(DML.ratios[order(DML.ratios$ratio_H.L),]$chr_start)) + 
  #scale_fill_viridis(discrete=FALSE) 
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) + 
  scale_fill_distiller(palette = "YlGnBu", direction = 1) #)

  #scale_fill_gradient(low="gray5", high="white")

sample.info[c("high_or_low_co2", "sample_seq", "tank")]

## # A tibble: 20 × 3
##    high_or_low_co2 sample_seq tank 
##    <fct>                <dbl> <fct>
##  1 H                       11 4    
##  2 H                       15 4    
##  3 H                        8 2    
##  4 H                       10 2    
##  5 H                       19 6    
##  6 H                       20 6    
##  7 H                        9 6    
##  8 L                        3 1    
##  9 L                        4 1    
## 10 L                        5 1    
## 11 L                        6 1    
## 12 L                        7 1    
## 13 L                       14 1    
## 14 L                        1 3    
## 15 L                        2 3    
## 16 L                       13 3    
## 17 L                       12 3    
## 18 H                       16 6    
## 19 H                       18 2    
## 20 H                       17 6

Alignment to mitochondrial DNA to estimate conversion rate

# Join sample info with bismark summary report 

(sample.info.mito <- sample.info %>% 
   dplyr::select(sample, tank, treatment, high_or_low_co2, sample_seq, developmental_stage, 
                 dna_siolation_batch, perc_totalread_unique, CpGs_Meth, CHGs_Meth, CHHs_Meth) %>% 
  
  right_join(

  # Read in bismark summary report
    read_delim(file="../results/bismark/mito/bismark_summary_report.txt", delim="\t") %>% mutate(File = str_replace_all(File, '_R1_bismark_bt2_pe.bam', "")) %>%
  clean_names() %>% 
   separate(file, into = c("a", "b", "sample_seq"), remove = F, sep = "_") %>%
   mutate(sample_prep=paste(a, b, sep = "-")) %>%
    mutate(sample_seq=as.numeric(str_replace_all(sample_seq, "S", ""))) %>%
   #dplyr::select(file, sample_prep, sample_seq, everything(), -a, -b) %>%
  mutate(perc_totalread_unique_mito=unique_reads_remaining/total_reads, 
         CpGs_Meth_mito=100*methylated_cp_gs/(methylated_cp_gs+unmethylated_cp_gs), 
         CHGs_Meth_mito=100*methylated_chgs/(methylated_chgs+unmethylated_chgs), 
         CHHs_Meth_mito=100*methylated_ch_hs/(methylated_ch_hs+unmethylated_ch_hs)) %>% 
    dplyr::select(sample_prep, perc_totalread_unique_mito, CpGs_Meth_mito, CHGs_Meth_mito, CHHs_Meth_mito),

  by = c("sample"="sample_prep")) %>%
  mutate(included=case_when(
    sample_seq== 1 ~ "yes",
    sample_seq== 2 ~ "yes",
    sample_seq== 3 ~ "yes",     
    sample_seq== 4 ~ "yes",
    sample_seq== 7 ~ "yes",
    sample_seq== 12 ~ "yes",
    sample_seq== 10 ~ "yes",
    sample_seq== 11 ~ "yes",
    sample_seq== 15 ~ "yes",
    sample_seq== 17 ~ "yes",
    sample_seq== 18 ~ "yes",
    sample_seq== 20 ~ "yes",
    TRUE ~ "no")))

## Rows: 20 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): File
## dbl (14): Total Reads, Aligned Reads, Unaligned Reads, Ambiguously Aligned R...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 20 × 16
##    sample  tank  treat…¹ high_…² sampl…³ devel…⁴ dna_s…⁵ perc_…⁶ CpGs_…⁷ CHGs_…⁸
##    <chr>   <fct> <fct>   <fct>     <dbl> <fct>   <fct>     <dbl>   <dbl>   <dbl>
##  1 CH05-24 4     HC      H            11 J6      1a        0.315   71.6     1.59
##  2 CH09-02 4     HC      H            15 J6      3b        0.297   68.4     1.81
##  3 CH05-01 2     HB      H             8 J7      1a        0.282   51.8     2.05
##  4 CH05-21 2     HB      H            10 J7      3a        0.311   64.1     1.28
##  5 CH10-08 6     HA      H            19 J6      2b        0.279   31.2     1.82
##  6 CH10-11 6     HA      H            20 J6      3b        0.319   67.7     1.52
##  7 CH05-06 6     HA      H             9 J7      3a        0.160    7.01    1.40
##  8 CH01-22 1     LB      L             3 J6      2b        0.300   66.2     1.17
##  9 CH01-38 1     LB      L             4 J6      1b        0.304   68.1     1.63
## 10 CH03-04 1     LB      L             5 J7      1b        0.235   11.4     2.48
## 11 CH03-15 1     LB      L             6 J6      2a        0.191   19.4     1.53
## 12 CH03-33 1     LB      L             7 J6      3b        0.308   72.5     1.15
## 13 CH07-24 1     LB      L            14 J7      2a        0.181    7.00    1.58
## 14 CH01-06 3     LC      L             1 J7      1a        0.311   73.4     1.25
## 15 CH01-14 3     LC      L             2 J6      3a        0.265   67.7     1.67
## 16 CH07-11 3     LC      L            13 J6      3a        0.181    7.80    1.57
## 17 CH07-06 3     LC      L            12 J7      2a        0.307   64.1     1.34
## 18 CH09-13 6     HA      H            16 J7      2b        0.120   45.8     1.91
## 19 CH10-01 2     HB      H            18 J6      4a        0.353   70.8     1.44
## 20 CH09-28 6     HA      H            17 J6      4a        0.317   67.6     1.73
## # … with 6 more variables: CHHs_Meth <dbl>, perc_totalread_unique_mito <dbl>,
## #   CpGs_Meth_mito <dbl>, CHGs_Meth_mito <dbl>, CHHs_Meth_mito <dbl>,
## #   included <chr>, and abbreviated variable names ¹treatment,
## #   ²high_or_low_co2, ³sample_seq, ⁴developmental_stage, ⁵dna_siolation_batch,
## #   ⁶perc_totalread_unique, ⁷CpGs_Meth, ⁸CHGs_Meth

# **Good samples:** all have mean % methylation >= 70% and nloci meeting 10x > 3e5
# control pCO2: 1, 2, 3, 4, 7, 12
# high pCO2:    10, 11, 15, 17, 18, 20

ggplot(sample.info.mito, aes(x=CpGs_Meth_mito, y=CpGs_Meth)) + 
  geom_point(aes(shape=high_or_low_co2, color=included), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           #geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  #annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) + 
ggtitle("% CpG Methylation full genome ~ mitochondria") + 
  xlab("Mitochindria CpG meth %") + ylab("Full Genome CpG % meth")

## `geom_smooth()` using formula = 'y ~ x'

ggplot(sample.info.mito, aes(x=perc_totalread_unique_mito, y=CpGs_Meth_mito)) + 
  geom_point(aes(shape=high_or_low_co2, color=included), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           #geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  #annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) + 
ggtitle("mitochondria % CpG methylation ~ alignment rate") + 
  xlab("Mitochindria alignment rate") + ylab("Mitochindria CpG meth %")

## `geom_smooth()` using formula = 'y ~ x'

ggplot(sample.info.mito, aes(x=included, y=CpGs_Meth_mito, color=included)) + 
  geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.15, aes(shape=high_or_low_co2)) + 
  theme_minimal() + scale_color_manual(values=c("red", "blue")) #+

           #geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  #theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  #annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) + 
#ggtitle("% CpG Methylation full genome ~ mitochondria") + 
#  xlab("Mitochindria CpG meth %") + ylab("Full Genome CpG % meth")

summary(sample.info.mito$CpGs_Meth_mito)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.828   6.274  13.773  13.900  20.622  27.509

BONEYARD

Divergent Bar plot of DMLs

# DML.ratios %>% 
#            mutate(hypermethylated=ifelse(diff_H.L>0, "H", "L")) %>%
#   ggplot(aes(x = chr_start, y = diff_H.L, fill=hypermethylated)) +
#   geom_bar(stat = "identity", width=0.41) +
#     scale_x_discrete(limits=(DML.ratios[order(DML.ratios$diff_H.L),]$chr_start)) + 
#   scale_y_continuous(limits=c(-80,80), breaks=c(-75, -50, -25, 0),
#                        labels=c("-75"="75%", "-50"="50%", "-25"="25%", "0"="0%"),
#                      sec.axis = sec_axis(trans=~.*1, breaks=c(75, 25, 50, 0),
#                        labels=c("-75"="75%", "-50"="50%", "-25"="25%", "0"="0%")))  +
#     theme(axis.text.y=element_blank(), axis.ticks.y = element_blank(), 
#           legend.position = "none", text = element_text(size=14, color="gray30"),
#           panel.background = element_blank(),
#           panel.border = element_rect(colour = "gray30", fill=NA, size=.4)) +
#     #ggtitle("DMLs, showing % methylation difference among  treatments") +
#     scale_fill_manual(values=c("firebrick3", "dodgerblue3")) +
#     #annotate(geom="text", x=-20, y=-65, label="Loci hypermethylated in\nSouth Sound", color="dodgerblue3") +
#     #annotate(geom="text", x=260, y=62, label="Loci hypermethylated in\nHood Canal", color="firebrick3") +
#   ylab("% methylation difference among  treatments") + xlab("Locus") + coord_flip()
# ggsave(filename = "../analyses/DMLs/DMLs-diff-barplot.pdf", width=7, height=5)

Extract correlation matrix (pearson) and distance matrix (manhattan) from the % methylated matrix

FYI I checked that the correlation matrix in getCorrelation is derived from the percent methylated matrix. It is!

# cor.pears <- cor(perc.meth, method="pearson") # create pearson correlation matrix. 
# dist.manhat <- dist(t(perc.meth), method = "manhattan", diag = T, upper = F)
# dist.manhat.df <- reshape2::melt(as.matrix(dist.manhat), varnames = c("row", "col"))
# 
# cor.pears.DMLs <- cor(perc.methDMLs, method="pearson") 
# dist.manhat.DMLs <- dist(t(perc.methDMLs), method = "manhattan", diag = T, upper = F) 
# dist.manhat.DMLs.df <- reshape2::melt(as.matrix(dist.manhat.DMLs), varnames = c("row", "col"))

Read in sample number key, merge with distance matrix dataframes

#key <- read.csv(here::here("data", "sample-key.csv"))

# dist.manhat.df <- merge(x=merge(x=dist.manhat.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
# colnames(dist.manhat.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
# write.csv(dist.manhat.df, file=here::here("analyses", "methylation-filtered", "dist.manhat.csv"), row.names=FALSE) 
# 
# dist.manhat.DMLs.df <- merge(x=merge(x=dist.manhat.DMLs.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
# colnames(dist.manhat.DMLs.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
# write.csv(dist.manhat.DMLs.df, file=here::here("analyses", "methylation-filtered", "dist.manhat.DMLs.csv"), row.names=FALSE)

Principal Component Analyses, DMLs only

# load("../analyses/DMLs/R-objects/dml25_counts")
# 
# PCA.DMLs <- PCASamples(dml25_counts, obj.return = TRUE, sd.filter = T) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
# nrow(PCA.DMLs$rotation)
# 
# summary(PCA.DMLs) #Look at summary statistics. The first PC explains 36.3% of variation, the second PC explains 8.5% of variation.  # new analysis/filtering AND >5x replace with NA: PC1=36.5% PC2=9.3%

Save DML PC scores and R object for integration with other data

# save(PCA.DMLs, file="../analyses/DMLs/R-objects/PCA.DMLs")
# write_delim(data.frame(PCA.DMLs$x) %>% tibble::rownames_to_column("sample"), "../analyses/DMLs/PC-scores-DMLs.tab",  delim = '\t', col_names = TRUE)

PCA Figure, DMLs only

# par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
# PCA.figure <- ordiplot(PCA.DMLs, choices = c(1, 2), type = "none", display = "sites", cex = 0.5, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
# points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 2.5) #Add each sample. Darker samples are ambient, lighter samples are elevated pCO2
# #Add multiple white boxes on top of the default black box to manually change the color
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# box(col = "white")
# #ordiellipse(PCA.DMLs, plotCustomization$ treatment, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
# #ordiellipse(PCA.DMLs, plotCustomization$ treatment, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
# axis(side =  1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
# mtext(side = 1, text = "PC 1 (36.5%)", line = 3, cex = 1.5) #Add x-axis label
# axis(side =  2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
# mtext(side = 2, text = "PC 2 (9.3%)", line = 3, cex = 1.5) #Add y-axis label
# legend("topright", 
#        pch = c(16, 17), 
#        legend = c("Hood Canal", "South Sound"), 
#        col = c(c("firebrick3", "dodgerblue3")), 
#        cex = 1.4, bty = "n") #Add a legend with information about ambient and elevated samples
# 
# save(PCA.figure, file="../analyses/DMLs/R-objects/PCA.figure")

OTHER POSSIBLE ANALYSIS OPTIONS - randomize which samples are “Treatment” and run 10 permutations to see now many loci are DMLs then - Try to pull SNPs from RNASeq data to identify pop structure - Research cool software for integrating meth+rnaseq. AI options for integration?