list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra", "PerformanceAnalytics", "corrplot", "janitor", "googlesheets4", "ggrepel") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
# Load MethylKit
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("methylKit")
require(methylKit)
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] BiocManager_1.30.19 ggrepel_0.9.2
## [3] googlesheets4_1.0.1 janitor_2.1.0
## [5] corrplot_0.92 PerformanceAnalytics_2.0.4
## [7] xts_0.12.2 zoo_1.8-11
## [9] factoextra_1.0.7 vegan_2.6-4
## [11] lattice_0.20-45 permute_0.9-7
## [13] plotly_4.10.1 viridis_0.6.2
## [15] viridisLite_0.4.1 Pstat_1.2
## [17] matrixStats_0.62.0 ggforce_0.4.1
## [19] methylKit_1.24.0 GenomicRanges_1.49.0
## [21] GenomeInfoDb_1.34.2 IRanges_2.32.0
## [23] S4Vectors_0.36.0 BiocGenerics_0.44.0
## [25] here_1.0.1 reshape2_1.4.4
## [27] forcats_0.5.2 stringr_1.4.1
## [29] dplyr_1.0.10 purrr_0.3.5
## [31] readr_2.1.3 tidyr_1.2.1
## [33] tibble_3.1.8 ggplot2_3.4.0
## [35] tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1
## [3] plyr_1.8.7 lazyeval_0.2.2
## [5] splines_4.2.2 BiocParallel_1.32.1
## [7] digest_0.6.30 htmltools_0.5.3
## [9] fansi_1.0.3 magrittr_2.0.3
## [11] cluster_2.1.4 tzdb_0.3.0
## [13] limma_3.54.0 Biostrings_2.66.0
## [15] modelr_0.1.9 R.utils_2.12.2
## [17] bdsmatrix_1.3-6 timechange_0.1.1
## [19] colorspace_2.0-3 rvest_1.0.3
## [21] haven_2.5.1 xfun_0.34
## [23] crayon_1.5.2 RCurl_1.98-1.9
## [25] jsonlite_1.8.3 glue_1.6.2
## [27] polyclip_1.10-4 gtable_0.3.1
## [29] gargle_1.2.1 zlibbioc_1.44.0
## [31] XVector_0.38.0 DelayedArray_0.23.2
## [33] scales_1.2.1 mvtnorm_1.1-3
## [35] DBI_1.1.3 Rcpp_1.0.9
## [37] emdbook_1.3.12 mclust_6.0.0
## [39] htmlwidgets_1.5.4 httr_1.4.4
## [41] ellipsis_0.3.2 pkgconfig_2.0.3
## [43] XML_3.99-0.12 R.methodsS3_1.8.2
## [45] farver_2.1.1 sass_0.4.2
## [47] dbplyr_2.2.1 utf8_1.2.2
## [49] tidyselect_1.2.0 rlang_1.0.6
## [51] munsell_0.5.0 cellranger_1.1.0
## [53] tools_4.2.2 cachem_1.0.6
## [55] cli_3.4.1 generics_0.1.3
## [57] fastseg_1.44.0 broom_1.0.1
## [59] evaluate_0.18 fastmap_1.1.0
## [61] yaml_2.3.6 knitr_1.40
## [63] fs_1.5.2 nlme_3.1-160
## [65] R.oo_1.25.0 xml2_1.3.3
## [67] compiler_4.2.2 rstudioapi_0.14
## [69] reprex_2.0.2 tweenr_2.0.2
## [71] bslib_0.4.1 stringi_1.7.8
## [73] Matrix_1.5-1 vctrs_0.5.0
## [75] pillar_1.8.1 lifecycle_1.0.3
## [77] jquerylib_0.1.4 data.table_1.14.4
## [79] bitops_1.0-7 rtracklayer_1.58.0
## [81] qvalue_2.30.0 R6_2.5.1
## [83] BiocIO_1.8.0 gridExtra_2.3
## [85] codetools_0.2-18 MASS_7.3-58.1
## [87] gtools_3.9.3 assertthat_0.2.1
## [89] SummarizedExperiment_1.28.0 rprojroot_2.0.3
## [91] rjson_0.2.21 withr_2.5.0
## [93] GenomicAlignments_1.34.0 Rsamtools_2.14.0
## [95] GenomeInfoDbData_1.2.9 mgcv_1.8-41
## [97] parallel_4.2.2 hms_1.1.2
## [99] quadprog_1.5-8 grid_4.2.2
## [101] coda_0.19-4 snakecase_0.11.0
## [103] rmarkdown_2.17 MatrixGenerics_1.10.0
## [105] googledrive_2.0.0 bbmle_1.0.25
## [107] numDeriv_2016.8-1.1 Biobase_2.58.0
## [109] lubridate_1.9.0 restfulr_0.0.15
I ran this code interactively on Sedna
srun --pty /bin/bash
# First, I entered an interactive
Sedna node module load R
# Then, load R R
#start R interactively Now proceed with the following code
# ## AGAIN- this code was run interactively on Sedna!
# # Install and load programs
#
# # Load MethylKit
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("methylKit")
#
# require(methylKit)
# require(tidyverse)
#
# # Load sample info dataframe
# load(file="/home/lspencer/DuMOAR/R/sample.info")
#
# # Triple check that treatments are in the order of sequence sample number (1-20)
# (sample.info %>% arrange(sample_seq))$sample # List out library prep sample number
# (sample.info %>% arrange(sample_seq))$sample_seq # List out sequence sample
# (sample.info %>% arrange(sample_seq))$high_or_low_co2 # List out treatment
#
# # Read .bam files into Methylkit on Sedna
# (file.list=list("/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_06_S1_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_14_S2_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_22_S3_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH01_38_S4_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_04_S5_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_15_S6_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH03_33_S7_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_01_S8_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_06_S9_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_21_S10_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH05_24_S11_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_06_S12_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_11_S13_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH07_24_S14_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_02_S15_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_13_S16_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH09_28_S17_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_01_S18_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_08_S19_R1_bismark_bt2_pe.deduplicated.sorted.bam",
# "/scratch/lspencer/DuMOAR/aligned/trim-before-concat/CH10_11_S20_R1_bismark_bt2_pe.deduplicated.sorted.bam"))
#
# #### The following command reads sorted BAM files and calls methylation percentage per base, and creates a methylRaw object for CpG methylation. It also assigns minimum coverage of 2x and the treatments (in this case, the CO2 treatment)
#
# meth_obj = processBismarkAln(location = file.list, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20"),
# assembly = "PGA_assembly.fasta", read.context="CpG", mincov=2,
# treatment = as.numeric((sample.info %>% arrange(sample_seq))$high_or_low_co2))
#
# #### Save the MethylKit object; re-doing the previous step is memory/time intensive, so best to use the saved object moving forward.
#
# save(meth_obj, file = "/home/lspencer/DuMOAR/R/meth_obj")
# Read in data from GoogleSheet
sample.info <- read_sheet("https://docs.google.com/spreadsheets/d/1ym0XnYVts98tIUCn0kIaU6VuvqxzV7LoSx9RHwLdiIs/edit?usp=sharing", "DNAisolations_only") %>%
clean_names() %>%
mutate_at(vars(tank, treatment, high_or_low_co2, developmental_stage, dna_siolation_batch), as.factor) %>%
# Join with bismark summary report
right_join(
# Read in bismark summary report
read_delim(file="../results/bismark/bismark_summary_report.txt", delim="\t") %>% mutate(File = str_replace_all(File, '_R1_bismark_bt2_pe.bam', "")) %>%
clean_names() %>%
separate(file, into = c("a", "b", "sample_seq"), remove = F, sep = "_") %>%
mutate(sample_prep=paste(a, b, sep = "-")) %>%
mutate(sample_seq=as.numeric(str_replace_all(sample_seq, "S", ""))) %>%
select(file, sample_prep, sample_seq, everything(), -a, -b) %>%
mutate(perc_totalread_unique=unique_reads_remaining/total_reads,
CpGs_Meth=100*methylated_cp_gs/(methylated_cp_gs+unmethylated_cp_gs),
CHGs_Meth=100*methylated_chgs/(methylated_chgs+unmethylated_chgs),
CHHs_Meth=100*methylated_ch_hs/(methylated_ch_hs+unmethylated_ch_hs)),
by = c("sample"="sample_prep"))
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <]8;;https://gargle.r-lib.org/articles/non-interactive-auth.htmlhttps://gargle.r-lib.org/articles/non-interactive-auth.html]8;;>
## ℹ The googlesheets4 package is using a cached token for
## ']8;;mailto:laura.spencer@noaa.govlaura.spencer@noaa.gov]8;;'.
## ✔ Reading from "OA Crab Sample Collection 071119".
## ✔ Range ''DNAisolations_only''.
## Rows: 20 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): File
## dbl (14): Total Reads, Aligned Reads, Unaligned Reads, Ambiguously Aligned R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
save(sample.info, file="../data/sample.info")
load(file="../data/sample.info")
cor(sample.info %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=0.65)
#trace("chart.Correlation", edit=T) #to edit function
# In line 17, change the cex value for what you want, i changed it to 2
chart.Correlation(sample.info %>%
select(CpGs_Meth, a260_280, mean_mw_highest_peak, aligned_reads,
unaligned_reads, ambiguously_aligned_reads, no_genomic_sequence,
unique_reads_remaining, total_cs, perc_totalread_unique, CHGs_Meth, CHHs_Meth),
histogram=F, pch=19)
Correlations between CpGs_Meth and the following:
- a260_280: 0.52*
- mean_mw_highest_peak: 0.51*
- aligned_reads: 0.66**
- unaligned_reads: -0.77***
- ambiguously_aligned_reads: -0.60**
- no_genomic_sequence: -0.93***
- unique_reads_remaining: 0.73***
- total_cs: 0.72***
- perc_totalread_unique: 0.82***
- CHGs_Meth: -0.38
- CHHs_Meth: -0.59**
ggplot(sample.info, aes(x=a260_280, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 1.5, y = 75, label = "r = 0.52", size=6) +
ggtitle("% CpG Methylation ~ a260_280")
ggplot(sample.info, aes(x=mean_mw_highest_peak, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 10000, y = 100, label = "r = 0.51", size=6) +
ggtitle("% CpG Methylation ~ Mean MW (highest peak)")
ggplot(sample.info, aes(x=aligned_reads, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 6e6, y = 85, label = "r = 0.66", size=6) +
ggtitle("% CpG Methylation ~ # of aligned reads")
ggplot(sample.info, aes(x=unaligned_reads, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 1e7, y = 85, label = "r = -0.77", size=6) +
ggtitle("% CpG Methylation ~ # unaligned reads")
ggplot(sample.info, aes(x=ambiguously_aligned_reads, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 6e6, y = 85, label = "r = -0.60", size=6) +
ggtitle("% CpG Methylation ~ # ambiguously aligned reads")
ggplot(sample.info, aes(x=no_genomic_sequence, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 285, y = 75, label = "r = -0.93", size=6) +
ggtitle("% CpG Methylation ~ # unaligned- no genomic sequence")
No. of reads not aligning due to no genomic sequence corrlates very well with % methylation, but the actual # of reads is super low across all samples (<300!). What does this mean?!
ggplot(sample.info, aes(x=unique_reads_remaining, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 3.5e6, y = 95, label = "r = 0.73", size=6) +
ggtitle("% CpG Methylation ~ # unique aligned reads")
ggplot(sample.info, aes(x=total_cs, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 6e7, y = 90, label = "r = 0.72", size=6) +
ggtitle("% CpG Methylation ~ total # Cs")
ggplot(sample.info, aes(x=perc_totalread_unique, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = .15, y = 85, label = "r = 0.82", size=6) +
ggtitle("% CpG Methylation ~ % total reads uniquely aligned")
ggplot(sample.info, aes(x=CHGs_Meth, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 2.25, y = 70, label = "r = -0.38", size=6) +
ggtitle("% CpG Methylation ~ % CHG Methylation")
ggplot(sample.info, aes(x=CHHs_Meth, y=CpGs_Meth)) +
geom_point(aes(label=sample_seq, color=high_or_low_co2), size=3) +
scale_color_manual(values=c("red", "blue")) +
geom_text(aes(label=sample_seq, color=high_or_low_co2), vjust = -0.7) +
theme_minimal() + geom_smooth(method = "lm", se=T, color="black", linetype="dotted", fill="gray80") +
annotate("text", x = 30, y = 75, label = "r = -0.59", size=6) +
ggtitle("% CpG Methylation ~ % CHH Methylation")
load("../results/methylkit/meth_obj")
# Check out data for sample #1
head(meth_obj[[1]])
## chr start end strand coverage numCs numTs
## 1 contig_1_pilon__unscaffolded 3076 3076 + 2 0 2
## 2 contig_1_pilon__unscaffolded 3120 3120 + 2 0 2
## 3 contig_1_pilon__unscaffolded 3129 3129 + 2 0 2
## 4 contig_1_pilon__unscaffolded 3138 3138 + 2 0 2
## 5 contig_1_pilon__unscaffolded 3168 3168 + 2 0 2
## 6 contig_1_pilon__unscaffolded 3183 3183 + 2 0 2
for (i in 1:20) {
getMethylationStats(meth_obj[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
#### Now look at coverage for each sample
for (i in 1:20) {
getCoverageStats(meth_obj[[i]],plot=TRUE,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
meth_5x=filterByCoverage(meth_obj,lo.count=5,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
save(meth_5x, file="../results/methylkit/meth_5x")
for (i in 1:20) {
getMethylationStats(meth_5x[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
meth_10x=filterByCoverage(meth_obj,lo.count=10,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
save(meth_10x, file="../results/methylkit/meth_10x")
for (i in 1:20) {
getMethylationStats(meth_10x[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
# meth_unite=methylKit::unite(meth_obj, destrand=TRUE, min.per.group=1L)
# save(meth_unite, file = "../results/methylkit/meth_unite") #save object to file
load(file = "../results/methylkit/meth_unite") #save object to file
clusterSamples(meth_unite, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 20
PCASamples(meth_unite, screeplot = T)
PCASamples(meth_unite)
meth_unite_nolows <- reorganize(methylObj = meth_unite, sample.ids = c("1", "2", "3", "4", "7", "8", "10", "11", "12", "15", "16", "17", "18", "20"),
treatment = as.numeric((sample.info %>% arrange(sample_seq))$high_or_low_co2)[c(1:4, 7, 8, 10:12, 15:18, 20)])
PCASamples(meth_unite_nolows)
# Here - use all methylation data
#load(file = "../results/methylkit/meth_unite") #save object to file
meth_unite_reshaped <- meth_unite %>%
melt(id=c("chr", "start", "end", "strand"), value.name = "count") %>%
drop_na(count) %>%
mutate(stat=gsub('[0-9]+', '', variable)) %>%
mutate(sample=gsub('[a-zA-Z]+', '', variable)) %>%
select(-variable) %>%
pivot_wider(names_from = stat, values_from = count) %>%
mutate(percMeth = 100*(numCs/coverage)) %>%
mutate(sample=as.numeric(sample))
save(meth_unite_reshaped, file = "../results/methylkit/meth_unite_reshaped")
# load(file = "../results/methylkit/meth_unite_reshaped")
# # Here - use a smaller data set, already filtered for min coverage 5x
# meth_unite_5x <- methylKit::unite(meth_5x, destrand=TRUE, min.per.group=1L)
# save(meth_unite_5x, file = "../results/methylkit/meth_unite_5x") #save object to file
# load(file = "../results/methylkit/meth_unite_5x") #save object to file
#
# meth_unite_5x_reshaped <- meth_unite_5x %>%
# melt(id=c("chr", "start", "end", "strand"), value.name = "count") %>%
# drop_na(count) %>%
# mutate(stat=gsub('[0-9]+', '', variable)) %>%
# mutate(sample=gsub('[a-zA-Z]+', '', variable)) %>%
# select(-variable) %>%
# pivot_wider(names_from = stat, values_from = count) %>%
# mutate(percMeth = 100*(numCs/coverage)) %>%
# mutate(sample=as.numeric(sample))
#
# save(meth_unite_5x_reshaped, file = "meth_unite_5x_reshaped")
# save(meth_unite_5x_reshaped, file = "../results/methylkit/meth_unite_5x_reshaped")
meth_unite_reshaped %>%
group_by(sample) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) +
geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
ggtitle("% methylation ~ # loci, no coverage minimum") +
scale_color_manual(values = c("red", "blue")) +
geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'
meth_unite_reshaped %>%
group_by(sample) %>%
filter(coverage>=5) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) +
geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
ggtitle("% methylation ~ # loci, 5x coverage minimum") +
scale_color_manual(values = c("red", "blue")) +
geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'
meth_unite_reshaped %>%
group_by(sample) %>%
filter(coverage>=10) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) +
geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
ggtitle("% methylation ~ # loci, 10x coverage minimum") +
scale_color_manual(values = c("red", "blue")) +
geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'
meth_unite_reshaped %>%
group_by(sample) %>%
filter(coverage>=15) %>%
summarize(mean = mean(percMeth), nloci = n()) %>%
left_join(sample.info[c("treatment", "sample_seq", "high_or_low_co2")], by=c("sample"="sample_seq")) %>%
ggplot(aes(x=nloci, y=mean, label=sample)) + geom_point(aes(colour=high_or_low_co2)) +
geom_label_repel(min.segment.length=0.25,aes(colour=high_or_low_co2)) + theme_minimal() +
ggtitle("% methylation ~ # loci, 15x coverage minimum") +
scale_color_manual(values = c("red", "blue")) +
geom_smooth(method = "loess", se=T, color="black", linetype="dotted", fill="gray80")
## `geom_smooth()` using formula = 'y ~ x'
The following samples have low CpG methylation: 5, 6, 9, 13, 14, 19. Check out each sample individually
meth_unite_reshaped %>%
filter(coverage>=5) %>%
ggplot() + geom_point(aes(x=coverage, y=percMeth), size=0.05) +
theme_minimal() +
facet_wrap(~sample)