---
title: "13-DSS"
output: html_document
date: "2025-08-09"
---
```{r}
# Install once if needed:
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("DSS")
#install.packages(c("data.table","dplyr"))
library(DSS)
library(data.table)
library(dplyr)
```
```{r}
# Load packages
suppressPackageStartupMessages({
library(data.table)
})
# 1A) Set the folder that holds your .cov/.cov.gz files
cov_dir <- "../output/11-nextflow/coverage" # <- change if needed
# 1B) List files
cov_files <- list.files(cov_dir, pattern="\\.cov(\\.gz)?$", full.names=TRUE)
length(cov_files); head(cov_files)
# 1C) Show sizes (look for non-zero)
sizes <- file.info(cov_files)$size
data.frame(file=basename(cov_files), size=sizes)[order(sizes), ][1:10, ]
# 1D) Peek inside the first file (works for .gz and plain .cov)
f <- cov_files[1]
if (grepl("\\.gz$", f)) {
con <- gzfile(f, "rt")
} else {
con <- file(f, "rt")
}
peek <- try(readLines(con, n=5), silent=TRUE)
close(con)
peek
```
```{r}
library(data.table)
# keep cov_files from Step 1; make sure it's sorted and only real files
cov_files <- sort(list.files("../output/11-nextflow/coverage", pattern="\\.cov(\\.gz)?$", full.names=TRUE))
read_cov <- function(f) {
dt <- fread(cmd = paste("gzip -dc", shQuote(f)), header = FALSE, showProgress = FALSE)
if (ncol(dt) == 6) {
# chr pos STRAND/END meth_pct meth unmeth
third <- dt[[3]]
if (is.character(third) || any(third %in% c("+","-"))) {
setnames(dt, c("chr","pos","strand","meth_pct","meth","unmeth"))
dt <- dt[, .(meth = sum(meth), unmeth = sum(unmeth)), by = .(chr, pos)]
} else {
setnames(dt, c("chr","pos","end","meth_pct","meth","unmeth"))
dt <- dt[, .(chr, pos, meth, unmeth)]
}
} else if (ncol(dt) == 5) {
setnames(dt, c("chr","pos","meth_pct","meth","unmeth"))
dt <- dt[, .(chr, pos, meth, unmeth)]
} else {
stop("Unexpected column count (", ncol(dt), ") in: ", f)
}
dt[, N := meth + unmeth]
dt[, X := meth]
dt[, .(chr, pos, N, X)]
}
# Test on first file
test_dt <- read_cov(cov_files[1])
str(test_dt); head(test_dt)
```
```{r}
# Keep cov_files from earlier
clean_name <- function(x) {
x <- basename(x)
x <- sub("\\.CpG_report\\.merged_CpG_evidence\\.cov(\\.gz)?$", "", x, perl = TRUE)
x <- sub("\\.CX_report\\.CpG_report\\.cov(\\.gz)?$", "", x, perl = TRUE)
x <- sub("\\.bismark\\.cov(\\.gz)?$", "", x, perl = TRUE)
x
}
sample_names <- vapply(cov_files, clean_name, character(1))
sample_names
# Prefix before first _ or -
prefix <- sub("[_-].*$", "", sample_names)
table(prefix)
# Map filename prefixes to group labels (edit if needed)
group_map <- c(LCo = "popA", LYa = "popB")
stopifnot(all(prefix %in% names(group_map)))
group <- factor(unname(group_map[prefix]), levels = c("popA","popB"))
names(group) <- sample_names
# Expect 3 vs 3
table(group)
```
```{r}
# Uses the robust reader we already tested
lst <- lapply(cov_files, read_cov)
names(lst) <- sample_names
# Quick QC: coverage summaries
sapply(lst, function(x) quantile(x$N, c(0, .25, .5, .75, .9, .99), na.rm=TRUE))
# CpGs present in ALL 6 samples
common_keys <- Reduce(function(a,b) merge(a,b,by=c("chr","pos"))[,.(chr,pos)],
lapply(lst, function(x) x[,.(chr,pos)]))
n_common <- nrow(common_keys); n_common
```