Load libraries

list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra", "PerformanceAnalytics", "corrplot", "janitor", "googlesheets4", "ggrepel") #add new libraries here 

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

# Load MethylKit
if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("methylKit")
require(methylKit)

Obtain session information

sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] BiocManager_1.30.19        ggrepel_0.9.2             
##  [3] googlesheets4_1.0.1        janitor_2.1.0             
##  [5] corrplot_0.92              PerformanceAnalytics_2.0.4
##  [7] xts_0.12.2                 zoo_1.8-11                
##  [9] factoextra_1.0.7           vegan_2.6-4               
## [11] lattice_0.20-45            permute_0.9-7             
## [13] plotly_4.10.1              viridis_0.6.2             
## [15] viridisLite_0.4.1          Pstat_1.2                 
## [17] matrixStats_0.62.0         ggforce_0.4.1             
## [19] methylKit_1.24.0           GenomicRanges_1.49.0      
## [21] GenomeInfoDb_1.34.2        IRanges_2.32.0            
## [23] S4Vectors_0.36.0           BiocGenerics_0.44.0       
## [25] here_1.0.1                 reshape2_1.4.4            
## [27] forcats_0.5.2              stringr_1.4.1             
## [29] dplyr_1.0.10               purrr_0.3.5               
## [31] readr_2.1.3                tidyr_1.2.1               
## [33] tibble_3.1.8               ggplot2_3.4.0             
## [35] tidyverse_1.3.2           
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1                backports_1.4.1            
##   [3] plyr_1.8.7                  lazyeval_0.2.2             
##   [5] splines_4.2.2               BiocParallel_1.32.1        
##   [7] digest_0.6.30               htmltools_0.5.3            
##   [9] fansi_1.0.3                 magrittr_2.0.3             
##  [11] cluster_2.1.4               tzdb_0.3.0                 
##  [13] limma_3.54.0                Biostrings_2.66.0          
##  [15] modelr_0.1.9                R.utils_2.12.2             
##  [17] bdsmatrix_1.3-6             timechange_0.1.1           
##  [19] colorspace_2.0-3            rvest_1.0.3                
##  [21] haven_2.5.1                 xfun_0.34                  
##  [23] crayon_1.5.2                RCurl_1.98-1.9             
##  [25] jsonlite_1.8.3              glue_1.6.2                 
##  [27] polyclip_1.10-4             gtable_0.3.1               
##  [29] gargle_1.2.1                zlibbioc_1.44.0            
##  [31] XVector_0.38.0              DelayedArray_0.23.2        
##  [33] scales_1.2.1                mvtnorm_1.1-3              
##  [35] DBI_1.1.3                   Rcpp_1.0.9                 
##  [37] emdbook_1.3.12              mclust_6.0.0               
##  [39] htmlwidgets_1.5.4           httr_1.4.4                 
##  [41] ellipsis_0.3.2              pkgconfig_2.0.3            
##  [43] XML_3.99-0.12               R.methodsS3_1.8.2          
##  [45] farver_2.1.1                sass_0.4.2                 
##  [47] dbplyr_2.2.1                utf8_1.2.2                 
##  [49] tidyselect_1.2.0            rlang_1.0.6                
##  [51] munsell_0.5.0               cellranger_1.1.0           
##  [53] tools_4.2.2                 cachem_1.0.6               
##  [55] cli_3.4.1                   generics_0.1.3             
##  [57] fastseg_1.44.0              broom_1.0.1                
##  [59] evaluate_0.18               fastmap_1.1.0              
##  [61] yaml_2.3.6                  knitr_1.40                 
##  [63] fs_1.5.2                    nlme_3.1-160               
##  [65] R.oo_1.25.0                 xml2_1.3.3                 
##  [67] compiler_4.2.2              rstudioapi_0.14            
##  [69] reprex_2.0.2                tweenr_2.0.2               
##  [71] bslib_0.4.1                 stringi_1.7.8              
##  [73] Matrix_1.5-1                vctrs_0.5.0                
##  [75] pillar_1.8.1                lifecycle_1.0.3            
##  [77] jquerylib_0.1.4             data.table_1.14.4          
##  [79] bitops_1.0-7                rtracklayer_1.58.0         
##  [81] qvalue_2.30.0               R6_2.5.1                   
##  [83] BiocIO_1.8.0                gridExtra_2.3              
##  [85] codetools_0.2-18            MASS_7.3-58.1              
##  [87] gtools_3.9.3                assertthat_0.2.1           
##  [89] SummarizedExperiment_1.28.0 rprojroot_2.0.3            
##  [91] rjson_0.2.21                withr_2.5.0                
##  [93] GenomicAlignments_1.34.0    Rsamtools_2.14.0           
##  [95] GenomeInfoDbData_1.2.9      mgcv_1.8-41                
##  [97] parallel_4.2.2              hms_1.1.2                  
##  [99] quadprog_1.5-8              grid_4.2.2                 
## [101] coda_0.19-4                 snakecase_0.11.0           
## [103] rmarkdown_2.17              MatrixGenerics_1.10.0      
## [105] googledrive_2.0.0           bbmle_1.0.25               
## [107] numDeriv_2016.8-1.1         Biobase_2.58.0             
## [109] lubridate_1.9.0             restfulr_0.0.15

Import bismark alignment .bams into MethylKit. NOTE: This was done on Sedna!

I ran this code interactively on Sedna srun --pty /bin/bash # First, I entered an interactive Sedna node module load R # Then, load R R #start R interactively Now proceed with the following code

# ## AGAIN- this code was run interactively on Sedna! 
# # Install and load programs
# 
# # Load MethylKit
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("methylKit")
# 
# require(methylKit)
# require(tidyverse)
# 
# # Load sample info dataframe 
# load(file="/home/lspencer/DuMOAR/R/sample.info")
# 
# # Triple check that treatments are in the order of sequence sample number (1-20)
# (sample.info %>% arrange(sample_seq))$sample # List out library prep sample number 
# (sample.info %>% arrange(sample_seq))$sample_seq # List out sequence sample 
# (sample.info %>% arrange(sample_seq))$high_or_low_co2 # List out treatment
# 
# # Read .bam files into Methylkit on Sedna 
# (file.list=list("/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_06_S1_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_14_S2_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_22_S3_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_38_S4_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_04_S5_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_15_S6_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_33_S7_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_01_S8_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_06_S9_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_21_S10_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_24_S11_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_06_S12_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_11_S13_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_24_S14_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_02_S15_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_13_S16_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_28_S17_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_01_S18_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_08_S19_R1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_11_S20_R1_bismark_bt2_pe.deduplicated.sorted.bam"))
# 
# #### The following command reads sorted BAM files and calls methylation percentage per base, and creates a methylRaw object for CpG methylation. It also assigns minimum coverage of 2x and the treatments (in this case, the CO2 treatment)
# 
# meth_obj = processBismarkAln(location = file.list, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20"), 
#                           assembly = "PGA_assembly.fasta", read.context="CpG", mincov=2, 
#                           treatment = as.numeric((sample.info %>% arrange(sample_seq))$high_or_low_co2))
# 
# #### Save the MethylKit object; re-doing the previous step is memory/time intensive, so best to use the saved object moving forward. 
# 
# save(meth_obj, file = "/home/lspencer/DuMOAR/R/meth_obj")

Read in Bismark summary stats and some library prep stats

# Read in data from GoogleSheet 
sample.info <- read_sheet("https://docs.google.com/spreadsheets/d/1ym0XnYVts98tIUCn0kIaU6VuvqxzV7LoSx9RHwLdiIs/edit?usp=sharing", "DNAisolations_only") %>% 
  clean_names() %>% 
  mutate_at(vars(tank, treatment, high_or_low_co2, developmental_stage, dna_siolation_batch), as.factor) %>%
  
  # Join with bismark summary report 
  right_join(

  # Read in bismark summary report
    read_delim(file="../results/bismark/bismark_summary_report.txt", delim="\t") %>% mutate(File = str_replace_all(File, '_R1_bismark_bt2_pe.bam', "")) %>%
  clean_names() %>% 
   separate(file, into = c("a", "b", "sample_seq"), remove = F, sep = "_") %>%
   mutate(sample_prep=paste(a, b, sep = "-")) %>%
    mutate(sample_seq=as.numeric(str_replace_all(sample_seq, "S", ""))) %>%
   select(file, sample_prep, sample_seq, everything(), -a, -b) %>%
  mutate(perc_totalread_unique=unique_reads_remaining/total_reads, 
         CpGs_Meth=100*methylated_cp_gs/(methylated_cp_gs+unmethylated_cp_gs), 
         CHGs_Meth=100*methylated_chgs/(methylated_chgs+unmethylated_chgs), 
         CHHs_Meth=100*methylated_ch_hs/(methylated_ch_hs+unmethylated_ch_hs)),

  by = c("sample"="sample_prep"))
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <]8;;https://gargle.r-lib.org/articles/non-interactive-auth.htmlhttps://gargle.r-lib.org/articles/non-interactive-auth.html]8;;>
## ℹ The googlesheets4 package is using a cached token for
##   ']8;;mailto:laura.spencer@noaa.govlaura.spencer@noaa.gov]8;;'.
## ✔ Reading from "OA Crab Sample Collection 071119".
## ✔ Range ''DNAisolations_only''.
## Rows: 20 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): File
## dbl (14): Total Reads, Aligned Reads, Unaligned Reads, Ambiguously Aligned R...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
save(sample.info, file="../data/sample.info")
load(file="../data/sample.info")

Investigate possible factors influencing low %CpGmeth

Correlation bubble plot

cor(sample.info %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=0.65) 

Look closer at factors that appear to correlate with CpGs_Meth to get correlation statistics

#trace("chart.Correlation", edit=T) #to edit function
# In line 17, change the cex value for what you want, i changed it to 2

chart.Correlation(sample.info %>% 
                    select(CpGs_Meth, a260_280, mean_mw_highest_peak, aligned_reads, 
                           unaligned_reads, ambiguously_aligned_reads, no_genomic_sequence, 
                           unique_reads_remaining, total_cs, perc_totalread_unique, CHGs_Meth, CHHs_Meth), 
                  histogram=F, pch=19) 

Correlations between CpGs_Meth and the following:
- a260_280: 0.52*
- mean_mw_highest_peak: 0.51*
- aligned_reads: 0.66**
- unaligned_reads: -0.77***
- ambiguously_aligned_reads: -0.60**
- no_genomic_sequence: -0.93***
- unique_reads_remaining: 0.73***
- total_cs: 0.72***
- perc_totalread_unique: 0.82***
- CHGs_Meth: -0.38
- CHHs_Meth: -0.59**

ggplot(sample.info, aes(x=a260_280, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 1.5, y = 75, label = "r = 0.52", size=6) + 
ggtitle("% CpG Methylation ~ a260_280")

ggplot(sample.info, aes(x=mean_mw_highest_peak, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 10000, y = 100, label = "r = 0.51", size=6) + 
ggtitle("% CpG Methylation ~ Mean MW (highest peak)")

ggplot(sample.info, aes(x=aligned_reads, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 6e6, y = 85, label = "r = 0.66", size=6) + 
ggtitle("% CpG Methylation ~ # of aligned reads")

ggplot(sample.info, aes(x=unaligned_reads, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 1e7, y = 85, label = "r = -0.77", size=6) + 
ggtitle("% CpG Methylation ~ # unaligned reads")

ggplot(sample.info, aes(x=ambiguously_aligned_reads, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 6e6, y = 85, label = "r = -0.60", size=6) + 
ggtitle("% CpG Methylation ~ # ambiguously aligned reads")

ggplot(sample.info, aes(x=no_genomic_sequence, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 285, y = 75, label = "r = -0.93", size=6) + 
ggtitle("% CpG Methylation ~ # unaligned- no genomic sequence")

No. of reads not aligning due to no genomic sequence corrlates very well with % methylation, but the actual # of reads is super low across all samples (<300!). What does this mean?!

ggplot(sample.info, aes(x=unique_reads_remaining, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 3.5e6, y = 95, label = "r = 0.73", size=6) + 
ggtitle("% CpG Methylation ~ # unique aligned reads")

ggplot(sample.info, aes(x=total_cs, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 6e7, y = 90, label = "r = 0.72", size=6) + 
ggtitle("% CpG Methylation ~ total # Cs")

ggplot(sample.info, aes(x=perc_totalread_unique, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = .15, y = 85, label = "r = 0.82", size=6) + 
ggtitle("% CpG Methylation ~ % total reads uniquely aligned")

ggplot(sample.info, aes(x=CHGs_Meth, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) + 
ggtitle("% CpG Methylation ~ % CHG Methylation")

ggplot(sample.info, aes(x=CHHs_Meth, y=CpGs_Meth)) + 
  geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) + 
  scale_color_manual(values=c("red", "blue")) +
           geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) + 
  theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
  annotate("text", x = 30, y = 75, label = "r = -0.59", size=6) + 
ggtitle("% CpG Methylation ~ % CHH Methylation")

Load methylKit object that was created on Sedna

load("../results/methylkit/meth_obj")

What does the resulting methylkit object look like?

# Check out data for sample #1 
head(meth_obj[[1]])
##                            chr start  end strand coverage numCs numTs
## 1 contig_1_pilon__unscaffolded  3076 3076      +        2     0     2
## 2 contig_1_pilon__unscaffolded  3120 3120      +        2     0     2
## 3 contig_1_pilon__unscaffolded  3129 3129      +        2     0     2
## 4 contig_1_pilon__unscaffolded  3138 3138      +        2     0     2
## 5 contig_1_pilon__unscaffolded  3168 3168      +        2     0     2
## 6 contig_1_pilon__unscaffolded  3183 3183      +        2     0     2

Check the basic stats about the methylation data - coverage and percent methylation.

First look at % CpG methylation for each sampe

for (i in 1:20) {
getMethylationStats(meth_obj[[i]],plot=T,both.strands=TRUE)
     mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

#### Now look at coverage for each sample

for (i in 1:20) {
 getCoverageStats(meth_obj[[i]],plot=TRUE,both.strands=TRUE) 
   mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Generate CpG methylation histograms after filtering for coverage

Filter out loci where coverage is less than 5x or greater than 100x.

meth_5x=filterByCoverage(meth_obj,lo.count=5,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)
save(meth_5x, file="../results/methylkit/meth_5x")

Now look at CpG methylation after filtering for 5x coverage

for (i in 1:20) {
getMethylationStats(meth_5x[[i]],plot=T,both.strands=TRUE)
  mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Filter out loci where coverage is less than 10x or greater than 100x.

meth_10x=filterByCoverage(meth_obj,lo.count=10,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)
save(meth_10x, file="../results/methylkit/meth_10x")

Now look at CpG methylation after filtering for 10x coverage

for (i in 1:20) {
getMethylationStats(meth_10x[[i]],plot=T,both.strands=TRUE)
  mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Unite methylation data into one object for methylkit functions

column bind all samples, and destrand

# meth_unite=methylKit::unite(meth_obj, destrand=TRUE, min.per.group=1L)
# save(meth_unite, file = "../results/methylkit/meth_unite") #save object to file
load(file = "../results/methylkit/meth_unite") #save object to file

Cluster analysis

clusterSamples(meth_unite, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"

## 
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
## 
## Cluster method   : ward.D 
## Distance         : pearson 
## Number of objects: 20
PCASamples(meth_unite, screeplot = T)

PCASamples(meth_unite)

Generate PCA without weird samples (those with very low meth rates)

meth_unite_nolows <- reorganize(methylObj = meth_unite, sample.ids = c("1", "2", "3", "4", "7", "8", "10", "11", "12", "15", "16", "17", "18", "20"),
                                treatment = as.numeric((sample.info %>% arrange(sample_seq))$high_or_low_co2)[c(1:4, 7, 8, 10:12, 15:18, 20)])

PCASamples(meth_unite_nolows)

Reshape methylation to inspect % methylation at various filtering thresholds

# Here - use all methylation data
#load(file = "../results/methylkit/meth_unite") #save object to file

meth_unite_reshaped <- meth_unite %>%
  melt(id=c("chr", "start", "end", "strand"), value.name = "count") %>%
  drop_na(count) %>%
  mutate(stat=gsub('[0-9]+', '', variable)) %>%
  mutate(sample=gsub('[a-zA-Z]+', '', variable)) %>%
  select(-variable) %>%
  pivot_wider(names_from = stat, values_from = count) %>%
  mutate(percMeth = 100*(numCs/coverage)) %>%
  mutate(sample=as.numeric(sample))

save(meth_unite_reshaped, file = "../results/methylkit/meth_unite_reshaped")
# load(file = "../results/methylkit/meth_unite_reshaped") 
# # Here - use a smaller data set, already filtered for min coverage 5x 
# meth_unite_5x <- methylKit::unite(meth_5x, destrand=TRUE, min.per.group=1L)
# save(meth_unite_5x, file = "../results/methylkit/meth_unite_5x") #save object to file
# load(file = "../results/methylkit/meth_unite_5x") #save object to file
# 
# meth_unite_5x_reshaped <- meth_unite_5x %>%
#   melt(id=c("chr", "start", "end", "strand"), value.name = "count") %>%
#   drop_na(count) %>%
#   mutate(stat=gsub('[0-9]+', '', variable)) %>%
#   mutate(sample=gsub('[a-zA-Z]+', '', variable)) %>%
#   select(-variable) %>%
#   pivot_wider(names_from = stat, values_from = count) %>%
#   mutate(percMeth = 100*(numCs/coverage)) %>%
#   mutate(sample=as.numeric(sample))
# 
# save(meth_unite_5x_reshaped, file = "meth_unite_5x_reshaped")
# save(meth_unite_5x_reshaped, file = "../results/methylkit/meth_unite_5x_reshaped")

No coverage threshold, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>%
  left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
  ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) + 
  geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
  ggtitle("% methylation ~ # loci, no coverage minimum") +
  scale_color_manual(values = c("red", "blue")) + 
  geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'

5x coverage, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  filter(coverage>=5) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>%
  left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
  ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) + 
  geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
  ggtitle("% methylation ~ # loci, 5x coverage minimum") +
  scale_color_manual(values = c("red", "blue")) + 
  geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'

10x coverage, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  filter(coverage>=10) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>%
  left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
  ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) + 
  geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
  ggtitle("% methylation ~ # loci, 10x coverage minimum") +
  scale_color_manual(values = c("red", "blue")) + 
  geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'

What is mean % methylated with 15x coverage?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  filter(coverage>=15) %>%
  summarize(mean = mean(percMeth), nloci = n()) %>%
  left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
  ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) + 
  geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
  ggtitle("% methylation ~ # loci, 15x coverage minimum") +
  scale_color_manual(values = c("red", "blue")) + 
  geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'

Is % methylation a function of coverage within a sample?

The following samples have low CpG methylation: 5, 6, 9, 13, 14, 19. Check out each sample individually

meth_unite_reshaped %>%
  filter(coverage>=5) %>%
  ggplot() + geom_point(aes(x=coverage, y=percMeth), size=0.05) +
  theme_minimal() + 
  facet_wrap(~sample)