--- title: "13-DSS" output: html_document date: "2025-08-09" --- ```{r} # Install once if needed: if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("DSS") #install.packages(c("data.table","dplyr")) library(DSS) library(data.table) library(dplyr) ``` ```{r} # Load packages suppressPackageStartupMessages({ library(data.table) }) # 1A) Set the folder that holds your .cov/.cov.gz files cov_dir <- "../output/11-nextflow/coverage" # <- change if needed # 1B) List files cov_files <- list.files(cov_dir, pattern="\\.cov(\\.gz)?$", full.names=TRUE) length(cov_files); head(cov_files) # 1C) Show sizes (look for non-zero) sizes <- file.info(cov_files)$size data.frame(file=basename(cov_files), size=sizes)[order(sizes), ][1:10, ] # 1D) Peek inside the first file (works for .gz and plain .cov) f <- cov_files[1] if (grepl("\\.gz$", f)) { con <- gzfile(f, "rt") } else { con <- file(f, "rt") } peek <- try(readLines(con, n=5), silent=TRUE) close(con) peek ``` ```{r} library(data.table) # keep cov_files from Step 1; make sure it's sorted and only real files cov_files <- sort(list.files("../output/11-nextflow/coverage", pattern="\\.cov(\\.gz)?$", full.names=TRUE)) read_cov <- function(f) { dt <- fread(cmd = paste("gzip -dc", shQuote(f)), header = FALSE, showProgress = FALSE) if (ncol(dt) == 6) { # chr pos STRAND/END meth_pct meth unmeth third <- dt[[3]] if (is.character(third) || any(third %in% c("+","-"))) { setnames(dt, c("chr","pos","strand","meth_pct","meth","unmeth")) dt <- dt[, .(meth = sum(meth), unmeth = sum(unmeth)), by = .(chr, pos)] } else { setnames(dt, c("chr","pos","end","meth_pct","meth","unmeth")) dt <- dt[, .(chr, pos, meth, unmeth)] } } else if (ncol(dt) == 5) { setnames(dt, c("chr","pos","meth_pct","meth","unmeth")) dt <- dt[, .(chr, pos, meth, unmeth)] } else { stop("Unexpected column count (", ncol(dt), ") in: ", f) } dt[, N := meth + unmeth] dt[, X := meth] dt[, .(chr, pos, N, X)] } # Test on first file test_dt <- read_cov(cov_files[1]) str(test_dt); head(test_dt) ``` ```{r} # Keep cov_files from earlier clean_name <- function(x) { x <- basename(x) x <- sub("\\.CpG_report\\.merged_CpG_evidence\\.cov(\\.gz)?$", "", x, perl = TRUE) x <- sub("\\.CX_report\\.CpG_report\\.cov(\\.gz)?$", "", x, perl = TRUE) x <- sub("\\.bismark\\.cov(\\.gz)?$", "", x, perl = TRUE) x } sample_names <- vapply(cov_files, clean_name, character(1)) sample_names # Prefix before first _ or - prefix <- sub("[_-].*$", "", sample_names) table(prefix) # Map filename prefixes to group labels (edit if needed) group_map <- c(LCo = "popA", LYa = "popB") stopifnot(all(prefix %in% names(group_map))) group <- factor(unname(group_map[prefix]), levels = c("popA","popB")) names(group) <- sample_names # Expect 3 vs 3 table(group) ``` ```{r} # Uses the robust reader we already tested lst <- lapply(cov_files, read_cov) names(lst) <- sample_names # Quick QC: coverage summaries sapply(lst, function(x) quantile(x$N, c(0, .25, .5, .75, .9, .99), na.rm=TRUE)) # CpGs present in ALL 6 samples common_keys <- Reduce(function(a,b) merge(a,b,by=c("chr","pos"))[,.(chr,pos)], lapply(lst, function(x) x[,.(chr,pos)])) n_common <- nrow(common_keys); n_common ```