--- title: "03-expression-matrix-correlations" format: html editor: visual --- lncRNA–mRNA Correlation Pipeline Description: Complete, reproducible workflow that normalises lncRNA and mRNA count matrices with DESeq2 and calculates pair‑wise Pearson correlations between every lncRNA–mRNA pair. Expected input files (tab‑ or comma‑separated): • lncrna_counts.tsv – genes/lncRNAs in rows, samples in columns • mrna_counts.tsv – mRNAs in rows, samples in identical columns/order Output files: • correlations_full.tsv – matrix of r values (lncRNAs × mRNAs) • correlations_padj.tsv – matrix of Benjamini‑Hochberg adjusted p‑values • significant_pairs.tsv – table of pairs passing |r| >= 0.7 & padj <= 0.05 The code uses block processing so memory stays < ~4 GB even for large datasets. **Update:** This version auto‑handles the specific formats you just uploaded: * `lncRNA_counts.txt` – featureCounts output (6 metadata cols + raw counts) * `mRNA_counts.txt` – Kallisto counts table (transcript IDs + numeric counts) We strip metadata columns, harmonise sample names (e.g. *sample140*), round the Kallisto numeric counts to integers for DESeq2, then run the same DESeq2 → VST  → pair‑wise Pearson correlation workflow as before. # Apul ## 1 Libraries ```{r} library(data.table) library(DESeq2) library(WGCNA) # p-values + block-wise tools if (!requireNamespace("bigmemory", quietly = TRUE)) install.packages("bigmemory") library(bigmemory) # RAM-friendly matrices library(dplyr) ``` ## 2 Read count matrices & tidy formatting ```{r} # ---------- 2.0 Load the raw tables --------------------------------------- lnc_raw <- fread(cmd = "grep -v '^#' ~/github/deep-dive-expression/M-multi-species/data/03-expression-matrix-correlations/Apul/lncRNA_counts.txt") mrna_raw <- fread("~/github/deep-dive-expression/M-multi-species/data/03-expression-matrix-correlations/Apul/mRNA_counts.txt") # ---------- 2.1 Quick sanity counts *before* you touch rownames() ---------- cat("\nlnc_raw rows :", nrow(lnc_raw), "| unique Geneid :", length(unique(lnc_raw$Geneid)), "| blanks :", sum(lnc_raw$Geneid == ""), "\n") cat("mrna_raw rows:", nrow(mrna_raw), "| unique IDs :", length(unique(mrna_raw[[1]])), "| blanks :", sum(mrna_raw[[1]] == ""), "\n\n") # ---------- 2.2 Drop blank IDs (if any) and make duplicates unique --------- lnc_raw <- lnc_raw [Geneid != ""] mrna_raw <- mrna_raw[mrna_raw[[1]] != ""] lnc_ids <- make.unique(lnc_raw$Geneid) mrna_ids <- make.unique(mrna_raw[[1]]) # ---------- 2.3 Build matrices with row-names that now *stick* ------------- lnc_mat <- as.matrix(as.data.frame(lnc_raw[, -(1:6)], row.names = lnc_ids)) mrna_mat <- as.matrix(as.data.frame(mrna_raw[, -1], row.names = mrna_ids)) # ---------- 2.4 Harmonise column (sample) names ---------------------------- clean_lnc <- function(x) sub(".*RNA-ACR-([0-9]+).*", "sample\\1", x) colnames(lnc_mat) <- clean_lnc(colnames(lnc_mat)) colnames(mrna_mat) <- sub("^kallisto_quant_", "", colnames(mrna_mat)) colnames(mrna_mat) <- sub("\\.\\d+$", "", colnames(mrna_mat)) # strip ".1" # ---------- 2.5 Round Kallisto counts to integers ------------------------- mrna_mat <- round(mrna_mat) # ---------- 2.6 Assign back to the workflow objects ------------------------ lnc_counts <- lnc_mat mrna_counts <- mrna_mat head(lnc_counts) head(mrna_counts) ``` ### 2.1 Match samples across matrices ```{r} common <- intersect(colnames(lnc_counts), colnames(mrna_counts)) stopifnot(length(common) >= 2) lnc_counts <- lnc_counts [, common] mrna_counts <- mrna_counts[, common] ``` ## 3 Filter low-expressed genes ```{r} min_samples <- 3 min_counts <- 10 keep_lnc <- rowSums(lnc_counts >= min_counts) >= min_samples keep_mrna <- rowSums(mrna_counts >= min_counts) >= min_samples lnc_counts <- lnc_counts [keep_lnc , ] mrna_counts <- mrna_counts[keep_mrna, ] ``` ## 4 Joint DESeq2 normalisation & variance-stabilising transform ```{r} stopifnot(nrow(lnc_counts) == length(rownames(lnc_counts))) stopifnot(nrow(mrna_counts) == length(rownames(mrna_counts))) combined <- rbind(lnc_counts, mrna_counts) dds <- DESeqDataSetFromMatrix(countData = combined, colData = data.frame(cond = factor(rep("all", ncol(combined))), row.names = colnames(combined)), design = ~ 1) dds <- estimateSizeFactors(dds) vst_mat <- assay(vst(dds, blind = TRUE)) lnc_vst <- vst_mat[rownames(lnc_counts) , ] mrna_vst <- vst_mat[rownames(mrna_counts), ] ``` ## 5 Block-wise Pearson correlations ```{r} block_size <- 2000 # tweak for RAM nr_lnc <- nrow(lnc_vst) nr_mrna <- nrow(mrna_vst) cor_mat <- big.matrix(nrow = nr_lnc, ncol = nr_mrna, type = "double") cor_padj <- big.matrix(nrow = nr_lnc, ncol = nr_mrna, type = "double") for (start in seq(1, nr_lnc, by = block_size)) { end <- min(start + block_size - 1, nr_lnc) rblk <- cor(t(lnc_vst[start:end, ]), t(mrna_vst), method = "pearson") pblk <- corPvalueStudent(rblk, nSamples = ncol(lnc_vst)) cor_mat [start:end, ] <- rblk cor_padj[start:end, ] <- p.adjust(pblk, method = "BH") cat(sprintf("processed %d–%d\n", start, end)) } options(bigmemory.allow.dimnames = TRUE) dimnames(cor_mat) <- list(rownames(lnc_vst), rownames(mrna_vst)) dimnames(cor_padj) <- dimnames(cor_mat) ``` ## 6 Export results ```{r} # full matrices (beware large size!) fwrite(as.data.table(as.matrix(cor_mat), keep.rownames = "lncRNA"), "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/correlations_full.tsv") fwrite(as.data.table(as.matrix(cor_padj), keep.rownames = "lncRNA"), "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/correlations_padj.tsv") # significant pairs (|r| ≥ 0.7 & padj ≤ 0.05) sig <- which(abs(cor_mat[]) >= 0.7 & cor_padj[] <= 0.05, arr.ind = TRUE) |> as.data.frame() colnames(sig) <- c("lnc_idx", "mrna_idx") sig <- sig |> mutate(lncRNA = rownames(cor_mat)[lnc_idx], mRNA = colnames(cor_mat)[mrna_idx], r = cor_mat [cbind(lnc_idx, mrna_idx)], padj = cor_padj[cbind(lnc_idx, mrna_idx)]) |> select(lncRNA, mRNA, r, padj) fwrite(sig, "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/significant_pairs.tsv") ``` ## 7 Session info ```{r} sessionInfo() ``` ## 8 Visualization ```{r} library(ggplot2) library(reshape2) library(tidyr) # 1. Take top 20 by lowest padj top_sig <- sig |> arrange(padj) |> head(20) # 2. Create correlation matrix (wide form) wide_df <- top_sig |> arrange(padj) |> head(20) |> select(lncRNA, mRNA, r) |> pivot_wider(names_from = mRNA, values_from = r) heat_df <- as.data.frame(wide_df) rownames(heat_df) <- heat_df$lncRNA heat_df <- heat_df[, -1] # 3. Melt into long format heat_long <- melt(as.matrix(heat_df), varnames = c("lncRNA", "mRNA"), value.name = "correlation") # 4. Plot ggplot(heat_long, aes(x = mRNA, y = lncRNA, fill = correlation)) + geom_tile(color = "white") + scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, limits = c(-1, 1), name = "Pearson r") + theme_minimal(base_size = 12) + theme(axis.text.x = element_text(angle = 45, hjust = 1), panel.grid = element_blank()) + labs(title = "Top 20 lncRNA–mRNA Correlations", x = "mRNA", y = "lncRNA") ``` # Peve ## 1 Libraries ```{r} library(data.table) library(DESeq2) library(WGCNA) # p-values + block-wise tools if (!requireNamespace("bigmemory", quietly = TRUE)) install.packages("bigmemory") library(bigmemory) # RAM-friendly matrices library(dplyr) ``` ## 2 Read count matrices & tidy formatting ```{bash} #download mRNA matrix curl -o ~/github/deep-dive-expression/M-multi-species/data/03-expression-matrix-correlations/Peve/mRNA_counts.txt https://raw.githubusercontent.com/urol-e5/deep-dive/main/E-Peve/output/12-Peve-RNAseq-kallisto/kallisto/kallisto.isoform.counts.matrix ``` ```{r} # ---------- 2.0 Load the raw tables --------------------------------------- lnc_raw <- fread(cmd = "grep -v '^#' ~/github/deep-dive-expression/M-multi-species/data/03-expression-matrix-correlations/Peve/lncRNA_counts.txt") mrna_raw <- fread("~/github/deep-dive-expression/M-multi-species/data/03-expression-matrix-correlations/Peve/mRNA_counts.txt") # ---------- 2.1 Quick sanity counts *before* you touch rownames() ---------- cat("\nlnc_raw rows :", nrow(lnc_raw), "| unique Geneid :", length(unique(lnc_raw$Geneid)), "| blanks :", sum(lnc_raw$Geneid == ""), "\n") cat("mrna_raw rows:", nrow(mrna_raw), "| unique IDs :", length(unique(mrna_raw[[1]])), "| blanks :", sum(mrna_raw[[1]] == ""), "\n\n") # ---------- 2.2 Drop blank IDs (if any) and make duplicates unique --------- lnc_raw <- lnc_raw [Geneid != ""] mrna_raw <- mrna_raw[mrna_raw[[1]] != ""] lnc_ids <- make.unique(lnc_raw$Geneid) mrna_ids <- make.unique(mrna_raw[[1]]) # ---------- 2.3 Build matrices with row-names that now *stick* ------------- lnc_mat <- as.matrix(as.data.frame(lnc_raw[, -(1:6)], row.names = lnc_ids)) mrna_mat <- as.matrix(as.data.frame(mrna_raw[, -1], row.names = mrna_ids)) # ---------- 2.4 Harmonise column (sample) names ---------------------------- # This matches RNA-POR-71-S1-TP2.sorted.bam and extracts "71" clean_lnc <- function(x) sub(".*RNA-POR-([0-9]+)-.*", "sample\\1", x) colnames(lnc_mat) <- clean_lnc(colnames(lnc_mat)) colnames(mrna_mat) <- sub("^kallisto_quant_", "", colnames(mrna_mat)) colnames(mrna_mat) <- sub("\\.\\d+$", "", colnames(mrna_mat)) common <- intersect(colnames(lnc_mat), colnames(mrna_mat)) length(common) # Should now be >= 2 # ---------- 2.5 Round Kallisto counts to integers ------------------------- mrna_mat <- round(mrna_mat) # ---------- 2.6 Assign back to the workflow objects ------------------------ lnc_counts <- lnc_mat mrna_counts <- mrna_mat head(lnc_counts) head(mrna_counts) ``` ### 2.1 Match samples across matrices ```{r} common <- intersect(colnames(lnc_counts), colnames(mrna_counts)) stopifnot(length(common) >= 2) lnc_counts <- lnc_counts [, common] mrna_counts <- mrna_counts[, common] ``` ## 3 Filter low-expressed genes ```{r} min_samples <- 3 min_counts <- 10 keep_lnc <- rowSums(lnc_counts >= min_counts) >= min_samples keep_mrna <- rowSums(mrna_counts >= min_counts) >= min_samples lnc_counts <- lnc_counts [keep_lnc , ] mrna_counts <- mrna_counts[keep_mrna, ] ``` ## 4 Joint DESeq2 normalisation & variance-stabilising transform ```{r} stopifnot(nrow(lnc_counts) == length(rownames(lnc_counts))) stopifnot(nrow(mrna_counts) == length(rownames(mrna_counts))) combined <- rbind(lnc_counts, mrna_counts) dds <- DESeqDataSetFromMatrix(countData = combined, colData = data.frame(cond = factor(rep("all", ncol(combined))), row.names = colnames(combined)), design = ~ 1) dds <- estimateSizeFactors(dds) vst_mat <- assay(vst(dds, blind = TRUE)) lnc_vst <- vst_mat[rownames(lnc_counts) , ] mrna_vst <- vst_mat[rownames(mrna_counts), ] ``` ## 5 Block-wise Pearson correlations ```{r} block_size <- 2000 # tweak for RAM nr_lnc <- nrow(lnc_vst) nr_mrna <- nrow(mrna_vst) cor_mat <- big.matrix(nrow = nr_lnc, ncol = nr_mrna, type = "double") cor_padj <- big.matrix(nrow = nr_lnc, ncol = nr_mrna, type = "double") for (start in seq(1, nr_lnc, by = block_size)) { end <- min(start + block_size - 1, nr_lnc) rblk <- cor(t(lnc_vst[start:end, ]), t(mrna_vst), method = "pearson") pblk <- corPvalueStudent(rblk, nSamples = ncol(lnc_vst)) cor_mat [start:end, ] <- rblk cor_padj[start:end, ] <- p.adjust(pblk, method = "BH") cat(sprintf("processed %d–%d\n", start, end)) } options(bigmemory.allow.dimnames = TRUE) dimnames(cor_mat) <- list(rownames(lnc_vst), rownames(mrna_vst)) dimnames(cor_padj) <- dimnames(cor_mat) ``` ## 6 Export results ```{r} # full matrices (beware large size!) fwrite(as.data.table(as.matrix(cor_mat), keep.rownames = "lncRNA"), "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/Peve/correlations_full.tsv") fwrite(as.data.table(as.matrix(cor_padj), keep.rownames = "lncRNA"), "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/Peve/correlations_padj.tsv") # significant pairs (|r| ≥ 0.7 & padj ≤ 0.05) sig <- which(abs(cor_mat[]) >= 0.7 & cor_padj[] <= 0.05, arr.ind = TRUE) |> as.data.frame() colnames(sig) <- c("lnc_idx", "mrna_idx") sig <- sig |> mutate(lncRNA = rownames(cor_mat)[lnc_idx], mRNA = colnames(cor_mat)[mrna_idx], r = cor_mat [cbind(lnc_idx, mrna_idx)], padj = cor_padj[cbind(lnc_idx, mrna_idx)]) |> select(lncRNA, mRNA, r, padj) fwrite(sig, "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/Peve/significant_pairs.tsv") ``` ## 7 Session info ```{r} sessionInfo() ``` ## 8 Visualization ```{r} library(ggplot2) library(reshape2) library(tidyr) # 1. Take top 20 by lowest padj top_sig <- sig |> arrange(padj) |> head(20) # 2. Create correlation matrix (wide form) wide_df <- top_sig |> arrange(padj) |> head(20) |> select(lncRNA, mRNA, r) |> pivot_wider(names_from = mRNA, values_from = r) heat_df <- as.data.frame(wide_df) rownames(heat_df) <- heat_df$lncRNA heat_df <- heat_df[, -1] # 3. Melt into long format heat_long <- melt(as.matrix(heat_df), varnames = c("lncRNA", "mRNA"), value.name = "correlation") # 4. Plot ggplot(heat_long, aes(x = mRNA, y = lncRNA, fill = correlation)) + geom_tile(color = "white") + scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, limits = c(-1, 1), name = "Pearson r") + theme_minimal(base_size = 12) + theme(axis.text.x = element_text(angle = 45, hjust = 1), panel.grid = element_blank()) + labs(title = "Top lncRNA–mRNA Correlations", x = "mRNA", y = "lncRNA") ``` # Ptuh ## 1 Libraries ```{r} library(data.table) library(DESeq2) library(WGCNA) # p-values + block-wise tools if (!requireNamespace("bigmemory", quietly = TRUE)) install.packages("bigmemory") library(bigmemory) # RAM-friendly matrices library(dplyr) ``` ## 2 Read count matrices & tidy formatting ```{bash} #download mRNA matrix curl -o ~/github/deep-dive-expression/M-multi-species/data/03-expression-matrix-correlations/Ptuh/mRNA_counts.txt https://raw.githubusercontent.com/urol-e5/deep-dive/main/F-Pmea/output/14-Pmea-RNAseq-kallisto/kallisto/kallisto.isoform.counts.matrix ``` ```{r} # ---------- 2.0 Load the raw tables --------------------------------------- lnc_raw <- fread(cmd = "grep -v '^#' ~/github/deep-dive-expression/M-multi-species/data/03-expression-matrix-correlations/Ptuh/lncRNA_counts.txt") mrna_raw <- fread("~/github/deep-dive-expression/M-multi-species/data/03-expression-matrix-correlations/Ptuh/mRNA_counts.txt") # ---------- 2.1 Quick sanity counts *before* you touch rownames() ---------- cat("\nlnc_raw rows :", nrow(lnc_raw), "| unique Geneid :", length(unique(lnc_raw$Geneid)), "| blanks :", sum(lnc_raw$Geneid == ""), "\n") cat("mrna_raw rows:", nrow(mrna_raw), "| unique IDs :", length(unique(mrna_raw[[1]])), "| blanks :", sum(mrna_raw[[1]] == ""), "\n\n") # ---------- 2.2 Drop blank IDs (if any) and make duplicates unique --------- lnc_raw <- lnc_raw [Geneid != ""] mrna_raw <- mrna_raw[mrna_raw[[1]] != ""] lnc_ids <- make.unique(lnc_raw$Geneid) mrna_ids <- make.unique(mrna_raw[[1]]) # ---------- 2.3 Build matrices with row-names that now *stick* ------------- lnc_mat <- as.matrix(as.data.frame(lnc_raw[, -(1:6)], row.names = lnc_ids)) mrna_mat <- as.matrix(as.data.frame(mrna_raw[, -1], row.names = mrna_ids)) # ---------- 2.4 Harmonise column (sample) names ---------------------------- clean_lnc <- function(x) sub(".*RNA-POC-([0-9]+)-.*", "sample\\1", x) colnames(lnc_mat) <- clean_lnc(colnames(lnc_mat)) colnames(mrna_mat) <- sub("^kallisto_quant_", "", colnames(mrna_mat)) colnames(mrna_mat) <- sub("\\.\\d+$", "", colnames(mrna_mat)) common <- intersect(colnames(lnc_mat), colnames(mrna_mat)) length(common) # Should now be >= 2 # ---------- 2.5 Round Kallisto counts to integers ------------------------- mrna_mat <- round(mrna_mat) # ---------- 2.6 Assign back to the workflow objects ------------------------ lnc_counts <- lnc_mat mrna_counts <- mrna_mat head(lnc_counts) head(mrna_counts) ``` ### 2.1 Match samples across matrices ```{r} common <- intersect(colnames(lnc_counts), colnames(mrna_counts)) stopifnot(length(common) >= 2) lnc_counts <- lnc_counts [, common] mrna_counts <- mrna_counts[, common] ``` ## 3 Filter low-expressed genes ```{r} min_samples <- 3 min_counts <- 10 keep_lnc <- rowSums(lnc_counts >= min_counts) >= min_samples keep_mrna <- rowSums(mrna_counts >= min_counts) >= min_samples lnc_counts <- lnc_counts [keep_lnc , ] mrna_counts <- mrna_counts[keep_mrna, ] ``` ## 4 Joint DESeq2 normalisation & variance-stabilising transform ```{r} stopifnot(nrow(lnc_counts) == length(rownames(lnc_counts))) stopifnot(nrow(mrna_counts) == length(rownames(mrna_counts))) combined <- rbind(lnc_counts, mrna_counts) dds <- DESeqDataSetFromMatrix(countData = combined, colData = data.frame(cond = factor(rep("all", ncol(combined))), row.names = colnames(combined)), design = ~ 1) dds <- estimateSizeFactors(dds) vst_mat <- assay(vst(dds, blind = TRUE)) lnc_vst <- vst_mat[rownames(lnc_counts) , ] mrna_vst <- vst_mat[rownames(mrna_counts), ] ``` ## 5 Block-wise Pearson correlations ```{r} block_size <- 2000 # tweak for RAM nr_lnc <- nrow(lnc_vst) nr_mrna <- nrow(mrna_vst) cor_mat <- big.matrix(nrow = nr_lnc, ncol = nr_mrna, type = "double") cor_padj <- big.matrix(nrow = nr_lnc, ncol = nr_mrna, type = "double") for (start in seq(1, nr_lnc, by = block_size)) { end <- min(start + block_size - 1, nr_lnc) rblk <- cor(t(lnc_vst[start:end, ]), t(mrna_vst), method = "pearson") pblk <- corPvalueStudent(rblk, nSamples = ncol(lnc_vst)) cor_mat [start:end, ] <- rblk cor_padj[start:end, ] <- p.adjust(pblk, method = "BH") cat(sprintf("processed %d–%d\n", start, end)) } options(bigmemory.allow.dimnames = TRUE) dimnames(cor_mat) <- list(rownames(lnc_vst), rownames(mrna_vst)) dimnames(cor_padj) <- dimnames(cor_mat) ``` ## 6 Export results ```{r} # full matrices (beware large size!) fwrite(as.data.table(as.matrix(cor_mat), keep.rownames = "lncRNA"), "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/Ptuh/correlations_full.tsv") fwrite(as.data.table(as.matrix(cor_padj), keep.rownames = "lncRNA"), "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/Ptuh/correlations_padj.tsv") # significant pairs (|r| ≥ 0.7 & padj ≤ 0.05) sig <- which(abs(cor_mat[]) >= 0.7 & cor_padj[] <= 0.05, arr.ind = TRUE) |> as.data.frame() colnames(sig) <- c("lnc_idx", "mrna_idx") sig <- sig |> mutate(lncRNA = rownames(cor_mat)[lnc_idx], mRNA = colnames(cor_mat)[mrna_idx], r = cor_mat [cbind(lnc_idx, mrna_idx)], padj = cor_padj[cbind(lnc_idx, mrna_idx)]) |> select(lncRNA, mRNA, r, padj) fwrite(sig, "~/github/deep-dive-expression/M-multi-species/output/03-expression-matrix-correlations/Ptuh/significant_pairs.tsv") ``` ## 7 Session info ```{r} sessionInfo() ``` ## 8 Visualization ```{r} library(ggplot2) library(reshape2) library(tidyr) # 1. Take top 20 by lowest padj top_sig <- sig |> arrange(padj) |> head(20) # 2. Create correlation matrix (wide form) wide_df <- top_sig |> arrange(padj) |> head(20) |> select(lncRNA, mRNA, r) |> pivot_wider(names_from = mRNA, values_from = r) heat_df <- as.data.frame(wide_df) rownames(heat_df) <- heat_df$lncRNA heat_df <- heat_df[, -1] # 3. Melt into long format heat_long <- melt(as.matrix(heat_df), varnames = c("lncRNA", "mRNA"), value.name = "correlation") # 4. Plot ggplot(heat_long, aes(x = mRNA, y = lncRNA, fill = correlation)) + geom_tile(color = "white") + scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, limits = c(-1, 1), name = "Pearson r") + theme_minimal(base_size = 12) + theme(axis.text.x = element_text(angle = 45, hjust = 1), panel.grid = element_blank()) + labs(title = "Top lncRNA–mRNA Correlations", x = "mRNA", y = "lncRNA") ```