--- title: "14-lncRNA-classification" author: "Zach Bengtsson" date: "2026-01-30" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = TRUE, warning = TRUE) ``` ### Libraries ```{r setup, include=FALSE} library(tidyverse) library(purrr) ``` ### Setup ```{r} # Species to process species <- c("Apul", "Peve", "Ptuh") # Base directory for GTFs BASE_GTF_DIR <- "~/github/deep-dive-expression/M-multi-species/data/14-lncRNA-classification/GTFs" # Base directory for classification outputs (BED + annotation tables) OUT_BASE <- "~/github/deep-dive-expression/M-multi-species/output/14-lncRNA-classification/coral_lnc_classification" # Base directory for expression matrices # Expecting files: # _lnc_expr.tsv # _gene_expr.tsv EXPR_BASE <- "~/github/deep-dive-expression/M-multi-species/data/14-lncRNA-classification/expression_matrices" # Tier 1 parameters cis_window_bp <- 10000 # e.g. ±10 kb around nearest gene cor_threshold <- 0.6 # |r| >= 0.6 to call cis ``` ### Download GTFs and expression matrices to the correct places ```{bash} # ========================= # GTF downloads # ========================= # These must match the R paths above BASE_GTF_DIR=$HOME/github/deep-dive-expression/M-multi-species/data/14-lncRNA-classification/GTFs mkdir -p "${BASE_GTF_DIR}" # ---- Apul ---- curl -L \ https://gannet.fish.washington.edu/seashell/bu-github/deep-dive-expression/D-Apul/data/Apulchra-genome.gtf \ -o "${BASE_GTF_DIR}/Apul_genes.gtf" curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/refs/heads/main/D-Apul/output/31-Apul-lncRNA/Apul-lncRNA.gtf \ -o "${BASE_GTF_DIR}/Apul-lncRNA.gtf" # ---- Peve ---- curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/b0290e08af4eaeed30d74a758965debef6111801/E-Peve/data/Porites_evermanni_validated.gtf \ -o "${BASE_GTF_DIR}/Peve_genes.gtf" curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/refs/heads/main/E-Peve/output/17-Peve-lncRNA/Peve-lncRNA.gtf \ -o "${BASE_GTF_DIR}/Peve-lncRNA.gtf" # ---- Ptuh ---- curl -L \ https://github.com/urol-e5/deep-dive-expression/raw/f62c6d01e04ef0007f2f53af84181481d64d29c1/F-Ptuh/data/Pocillopora_meandrina_HIv1.genes-validated.gtf \ -o "${BASE_GTF_DIR}/Ptuh_genes.gtf" curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/refs/heads/main/F-Ptuh/output/17-Ptuh-lncRNA/Ptuh-lncRNA.gtf \ -o "${BASE_GTF_DIR}/Ptuh-lncRNA.gtf" echo "GTFs downloaded to ${BASE_GTF_DIR}" ``` lncRNA expression matrices are txt and gene matrices are CSV, kept file names as TSV for references later in the code, but read them in using correct functions. ```{bash} # ========================= # Expression matrices # ========================= EXPR_BASE=$HOME/github/deep-dive-expression/M-multi-species/data/14-lncRNA-classification/expression_matrices # ---- Apul ---- # lncRNA counts: tab-delimited .txt curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/main/M-multi-species/output/01.6-lncRNA-pipeline/Apul-lncRNA-counts-filtered.txt \ -o "${EXPR_BASE}/Apul_lnc_expr.tsv" # gene counts: CSV curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/main/D-Apul/output/07-Apul-Hisat/Apul-gene_count_matrix.csv \ -o "${EXPR_BASE}/Apul_gene_expr.tsv" # ---- Peve ---- curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/main/M-multi-species/output/01.6-lncRNA-pipeline/Peve-lncRNA-counts-filtered.txt \ -o "${EXPR_BASE}/Peve_lnc_expr.tsv" curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/main/E-Peve/output/06.2-Peve-Hisat/Peve-gene_count_matrix.csv \ -o "${EXPR_BASE}/Peve_gene_expr.tsv" # ---- Ptuh ---- curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/main/M-multi-species/output/01.6-lncRNA-pipeline/Ptuh-lncRNA-counts-filtered.txt \ -o "${EXPR_BASE}/Ptuh_lnc_expr.tsv" curl -L \ https://raw.githubusercontent.com/urol-e5/deep-dive-expression/main/F-Ptuh/output/06.2-Ptuh-Hisat/Ptuh-gene_count_matrix.csv \ -o "${EXPR_BASE}/Ptuh_gene_expr.tsv" echo "Expression matrices downloaded to ${EXPR_BASE}" ``` ```{bash} set -e BASE_GTF_DIR=$HOME/github/deep-dive-expression/M-multi-species/data/14-lncRNA-classification/GTFs echo "BASE_GTF_DIR = ${BASE_GTF_DIR}" echo echo "Files in BASE_GTF_DIR:" ls -l "${BASE_GTF_DIR}" echo echo "=== Apul_genes.gtf feature counts ===" awk 'BEGIN{ex=0; tx=0; gene=0} !/^#/ { if ($3=="exon") ex++ else if ($3=="transcript") tx++ else if ($3=="gene") gene++ } END{ print "exon:", ex; print "transcript:", tx; print "gene:", gene; }' "${BASE_GTF_DIR}/Apul_genes.gtf" echo echo "=== Apul-lncRNA.gtf feature counts ===" awk 'BEGIN{lnc=0} !/^#/ { if ($3=="lncRNA") lnc++ } END{ print "lncRNA:", lnc; }' "${BASE_GTF_DIR}/Apul-lncRNA.gtf" echo echo "=== Peve_genes.gtf feature counts ===" awk 'BEGIN{ex=0; tx=0; gene=0} !/^#/ { if ($3=="exon") ex++ else if ($3=="transcript") tx++ else if ($3=="gene") gene++ } END{ print "exon:", ex; print "transcript:", tx; print "gene:", gene; }' "${BASE_GTF_DIR}/Peve_genes.gtf" echo echo "=== Peve-lncRNA.gtf feature counts ===" awk 'BEGIN{lnc=0} !/^#/ { if ($3=="lncRNA") lnc++ } END{ print "lncRNA:", lnc; }' "${BASE_GTF_DIR}/Peve-lncRNA.gtf" echo echo "=== Ptuh_genes.gtf feature counts ===" awk 'BEGIN{ex=0; tx=0; gene=0} !/^#/ { if ($3=="exon") ex++ else if ($3=="transcript") tx++ else if ($3=="gene") gene++ } END{ print "exon:", ex; print "transcript:", tx; print "gene:", gene; }' "${BASE_GTF_DIR}/Ptuh_genes.gtf" echo echo "=== Ptuh-lncRNA.gtf feature counts ===" awk 'BEGIN{lnc=0} !/^#/ { if ($3=="lncRNA") lnc++ } END{ print "lncRNA:", lnc; }' "${BASE_GTF_DIR}/Ptuh-lncRNA.gtf" ``` ### Generate Bed and overlap files ```{bash} set -euo pipefail species=("Apul" "Peve" "Ptuh") BASE_GTF_DIR=$HOME/github/deep-dive-expression/M-multi-species/data/14-lncRNA-classification/GTFs OUT_BASE=$HOME/github/deep-dive-expression/M-multi-species/output/14-lncRNA-classification/coral_lnc_classification # Explicit bedtools path BEDTOOLS="/home/shared/bedtools-v2.30.0/bin/bedtools" if [[ ! -x "${BEDTOOLS}" ]]; then echo "ERROR: bedtools not executable at ${BEDTOOLS}" >&2 exit 1 fi echo "Using bedtools at: ${BEDTOOLS}" "${BEDTOOLS}" --version echo make_beds_for_species () { local sp="$1" echo ">>> Processing species (bedtools step): ${sp}" local GENE_GTF="${BASE_GTF_DIR}/${sp}_genes.gtf" local LNC_GTF="${BASE_GTF_DIR}/${sp}-lncRNA.gtf" if [[ ! -f "${GENE_GTF}" ]]; then echo "ERROR: Missing gene GTF for ${sp}: ${GENE_GTF}" >&2 exit 1 fi if [[ ! -f "${LNC_GTF}" ]]; then echo "ERROR: Missing lncRNA GTF for ${sp}: ${LNC_GTF}" >&2 exit 1 fi local OUTDIR="${OUT_BASE}/${sp}" mkdir -p "${OUTDIR}" cd "${OUTDIR}" echo " - Creating BED files for ${sp}" # 1) gene_exons.bed (all species have exon features) awk 'BEGIN{FS=OFS="\t"} !/^#/ && $3=="exon" { attr = $9 id = attr sub(/.*gene_id "/, "", id) sub(/".*/, "", id) if (id != "") { print $1, $4-1, $5, id, ".", $7 } }' "${GENE_GTF}" \ | sort -k1,1 -k2,2n -k3,3n -k6,6 \ > gene_exons.bed echo " gene_exons.bed lines: $(wc -l < gene_exons.bed)" # 2) gene_body.bed (Apul & Ptuh: transcripts only; Peve: genes + transcripts) awk 'BEGIN{FS=OFS="\t"} !/^#/ && ($3=="gene" || $3=="transcript") { attr = $9 id = "" if (index(attr, "gene_id ") > 0) { id = attr sub(/.*gene_id "/, "", id) } else if (index(attr, "transcript_id ") > 0) { id = attr sub(/.*transcript_id "/, "", id) } sub(/".*/, "", id) if (id != "") { print $1, $4-1, $5, id, ".", $7 } }' "${GENE_GTF}" \ | sort -k1,1 -k2,2n -k3,3n -k6,6 \ > gene_body.bed echo " gene_body.bed lines: $(wc -l < gene_body.bed)" # 3) lnc_tx.bed from lncRNA GTF (all three use feature 'lncRNA' w/ gene_id) awk 'BEGIN{FS=OFS="\t"} !/^#/ && $3=="lncRNA" { attr = $9 id = attr sub(/.*gene_id "/, "", id) sub(/".*/, "", id) if (id != "") { print $1, $4-1, $5, id, ".", $7 } }' "${LNC_GTF}" \ | sort -k1,1 -k2,2n -k3,3n -k6,6 \ > lnc_tx.bed echo " lnc_tx.bed lines: $(wc -l < lnc_tx.bed)" echo " - Running bedtools for ${sp}" "${BEDTOOLS}" intersect \ -s -wa -wb \ -a lnc_tx.bed -b gene_exons.bed \ > lnc_vs_geneExons_sense.bed "${BEDTOOLS}" intersect \ -S -wa -wb \ -a lnc_tx.bed -b gene_exons.bed \ > lnc_vs_geneExons_antisense.bed "${BEDTOOLS}" intersect \ -s -wa -wb \ -f 1.0 \ -a lnc_tx.bed -b gene_body.bed \ > lnc_vs_geneBody_intronicCandidates.bed "${BEDTOOLS}" closest \ -d \ -a lnc_tx.bed -b gene_body.bed \ > lnc_nearestGene.bed echo " sense exonic: $(wc -l < lnc_vs_geneExons_sense.bed)" echo " antisense exonic: $(wc -l < lnc_vs_geneExons_antisense.bed)" echo " intronic: $(wc -l < lnc_vs_geneBody_intronicCandidates.bed)" echo " nearestGene: $(wc -l < lnc_nearestGene.bed)" echo ">>> Finished bedtools step for ${sp}" cd - >/dev/null } for sp in "${species[@]}"; do make_beds_for_species "${sp}" done echo "All species processed with bedtools. Outputs in ${OUT_BASE}/{Apul,Peve,Ptuh}" ``` ```{r} for (sp in species) { sp_dir <- file.path(OUT_BASE, sp) cat("\n====", sp, "====\n") files <- c("lnc_tx.bed", "gene_exons.bed", "gene_body.bed", "lnc_vs_geneExons_sense.bed", "lnc_vs_geneExons_antisense.bed", "lnc_vs_geneBody_intronicCandidates.bed", "lnc_nearestGene.bed") for (f in files) { path <- file.path(sp_dir, f) if (!file.exists(path)) { cat(" ", f, " -> MISSING\n") } else { n_lines <- length(readr::read_lines(path)) cat(" ", f, " ->", n_lines, "lines\n") } } } ``` ### Tier 1 classifications (genomic class and cis_trans_class) ```{r} safe_cor <- function(x, y) { if (all(is.na(x)) | all(is.na(y))) return(NA_real_) suppressWarnings(cor(x, y, use = "pairwise.complete.obs")) } ``` ```{r} process_species <- function(sp, bed_base = OUT_BASE, expr_base = EXPR_BASE) { message(">>> Tier 1 classification for species: ", sp) sp_dir <- file.path(bed_base, sp) # BED / overlap files (from bash chunk) sense_file <- file.path(sp_dir, "lnc_vs_geneExons_sense.bed") antisense_file <- file.path(sp_dir, "lnc_vs_geneExons_antisense.bed") intronic_file <- file.path(sp_dir, "lnc_vs_geneBody_intronicCandidates.bed") nearest_file <- file.path(sp_dir, "lnc_nearestGene.bed") lnc_tx_file <- file.path(sp_dir, "lnc_tx.bed") # Expression matrices lnc_expr_file <- file.path(expr_base, paste0(sp, "_lnc_expr.tsv")) gene_expr_file <- file.path(expr_base, paste0(sp, "_gene_expr.tsv")) # Sanity check files_to_check <- c(sense_file, antisense_file, intronic_file, nearest_file, lnc_tx_file, lnc_expr_file, gene_expr_file) missing <- files_to_check[!file.exists(files_to_check)] if (length(missing) > 0) { stop("Missing files for species ", sp, ":\n", paste(missing, collapse = "\n")) } #--------------------------- # Load BED / overlap files #--------------------------- sense <- readr::read_tsv( sense_file, col_names = c("lnc_chr","lnc_start","lnc_end","lnc_id","lnc_score","lnc_strand", "gene_chr","gene_start","gene_end","gene_id","gene_score","gene_strand"), show_col_types = FALSE ) antisense <- readr::read_tsv( antisense_file, col_names = c("lnc_chr","lnc_start","lnc_end","lnc_id","lnc_score","lnc_strand", "gene_chr","gene_start","gene_end","gene_id","gene_score","gene_strand"), show_col_types = FALSE ) intronic <- readr::read_tsv( intronic_file, col_names = c("lnc_chr","lnc_start","lnc_end","lnc_id","lnc_score","lnc_strand", "gene_chr","gene_start","gene_end","gene_id","gene_score","gene_strand"), show_col_types = FALSE ) nearest <- readr::read_tsv( nearest_file, col_names = c("lnc_chr","lnc_start","lnc_end","lnc_id","lnc_score","lnc_strand", "gene_chr","gene_start","gene_end","gene_id","gene_score","gene_strand", "distance_bp"), show_col_types = FALSE ) # All lncRNAs lnc_all <- readr::read_tsv( lnc_tx_file, col_names = c("lnc_chr","lnc_start","lnc_end","lnc_id","lnc_score","lnc_strand"), show_col_types = FALSE ) %>% dplyr::distinct(lnc_id, .keep_all = TRUE) #--------------------------- # Genomic class #--------------------------- sense_ids <- unique(sense$lnc_id) antisense_ids <- unique(antisense$lnc_id) intronic_ids <- unique(intronic$lnc_id) genomic_class_df <- lnc_all %>% dplyr::mutate( genomic_class = dplyr::case_when( lnc_id %in% sense_ids ~ "sense_exonic", lnc_id %in% antisense_ids ~ "antisense_exonic", lnc_id %in% intronic_ids & !lnc_id %in% c(sense_ids, antisense_ids) ~ "intronic", TRUE ~ "intergenic" ) ) #--------------------------- # Nearest gene info #--------------------------- nearest_slim <- nearest %>% dplyr::select(lnc_id, nearest_gene_id = gene_id, distance_bp) lnc_annot <- genomic_class_df %>% dplyr::left_join(nearest_slim, by = "lnc_id") #--------------------------- # Expression matrices #--------------------------- # lncRNA: tab-delimited (filtered lncRNA counts) lnc_expr <- readr::read_tsv(lnc_expr_file, show_col_types = FALSE) # gene: comma-delimited (featureCounts gene_count_matrix.csv) gene_expr <- readr::read_csv(gene_expr_file, show_col_types = FALSE) # IDs in first column lnc_ids <- lnc_expr[[1]] gene_ids <- gene_expr[[1]] # Clean up lnc sample columns lnc_data <- lnc_expr[, -1] # everything except ID meta_cols <- c("Chr", "Start", "End", "Strand", "Length") lnc_data <- dplyr::select(lnc_data, -dplyr::any_of(meta_cols)) lnc_sample_names <- colnames(lnc_data) lnc_sample_names_clean <- vapply( lnc_sample_names, function(n) { # Extract patterns like RNA.ACR.140 or RNA.POR.71 extracted <- sub(".*(RNA[._-][A-Z]{3}[._-][0-9]+).*", "\\1", n) if (identical(extracted, n)) { # Pattern not found; leave as-is return(n) } else { # Normalize to RNA-XXX-NNN return(gsub("[._]", "-", extracted)) } }, character(1) ) colnames(lnc_data) <- lnc_sample_names_clean # Clean up gene sample columns gene_data <- gene_expr[, -1] gene_sample_names <- colnames(gene_data) gene_sample_names_clean <- gene_sample_names gene_sample_names_clean <- sub("^transcript_counts\\.", "", gene_sample_names_clean) gene_sample_names_clean <- sub("^transcript.counts\\.", "", gene_sample_names_clean) gene_sample_names_clean <- sub("^counts\\.", "", gene_sample_names_clean) colnames(gene_data) <- gene_sample_names_clean # Align sample sets lnc_samples <- colnames(lnc_data) gene_samples <- colnames(gene_data) common_samples <- intersect(lnc_samples, gene_samples) if (length(common_samples) < 2) { stop( "Too few overlapping samples between lnc and gene matrices for ", sp, ".\n", "lnc samples (cleaned): ", paste(lnc_samples, collapse = ", "), "\n", "gene samples (cleaned): ", paste(gene_samples, collapse = ", ") ) } # Subset matrices to common samples in the same order lnc_mat <- as.matrix(lnc_data[, common_samples]) rownames(lnc_mat) <- lnc_ids gene_mat <- as.matrix(gene_data[, common_samples]) rownames(gene_mat) <- gene_ids # Sanity check: column names now must match stopifnot(identical(colnames(lnc_mat), colnames(gene_mat))) #--------------------------- # cis vs non-cis #--------------------------- lnc_cis_df <- lnc_annot %>% dplyr::mutate( in_cis_window = !is.na(distance_bp) & distance_bp <= cis_window_bp, nearest_gene_cor = purrr::pmap_dbl( list(lnc_id, nearest_gene_id, in_cis_window), function(lnc_id_i, gene_id_i, in_window) { if (is.na(gene_id_i) || !lnc_id_i %in% rownames(lnc_mat) || !gene_id_i %in% rownames(gene_mat)) { return(NA_real_) } safe_cor(lnc_mat[lnc_id_i, ], gene_mat[gene_id_i, ]) } ), cis_flag = in_cis_window & !is.na(nearest_gene_cor) & abs(nearest_gene_cor) >= cor_threshold, cis_trans_class = dplyr::case_when( cis_flag ~ "cis", TRUE ~ "non_cis_or_unknown" ) ) #--------------------------- # Final table for this species #--------------------------- lnc_final <- lnc_cis_df %>% dplyr::mutate(species = sp) %>% dplyr::select( species, lnc_id, lnc_chr, lnc_start, lnc_end, lnc_strand, genomic_class, cis_trans_class, nearest_gene_id, distance_bp, nearest_gene_cor ) # Write to disk out_file <- file.path(sp_dir, paste0(sp, "_lnc_Tier1_annotation.tsv")) readr::write_tsv(lnc_final, out_file) message(" - Written: ", out_file) lnc_final } ``` Test just Apul ```{r} tmp_Apul <- process_species("Apul") head(tmp_Apul) ``` ### Summary table and sanity check ```{r} all_results <- purrr::map_df(species, process_species) combined_out <- file.path(OUT_BASE, "AllSpecies_lnc_Tier1_annotation.tsv") readr::write_tsv(all_results, combined_out) message(">>> Combined Tier1 table written: ", combined_out) ``` ```{r} # Reload the combined table from disk (works even in a fresh session) tier1_path <- file.path( OUT_BASE, "AllSpecies_lnc_Tier1_annotation.tsv" ) all_results <- readr::read_tsv(tier1_path, show_col_types = FALSE) # Peek at the structure and first few rows print(dim(all_results)) print(head(all_results)) ``` ## Summary Figures Setup ```{r} library(tidyverse) tier1_path <- "~/github/deep-dive-expression/M-multi-species/output/14-lncRNA-classification/coral_lnc_classification/AllSpecies_lnc_Tier1_annotation.tsv" lnc_tier1 <- readr::read_tsv(tier1_path, show_col_types = FALSE) %>% mutate( species = factor(species, levels = c("Apul", "Peve", "Ptuh")), genomic_class = factor( genomic_class, levels = c("intergenic", "intronic", "antisense_exonic", "sense_exonic") ), cis_trans_class = factor( cis_trans_class, levels = c("cis", "non_cis_or_unknown") ) ) ``` ### Figure 1A – Counts by genomic class and species How many lncRNAs fall into each genomic class per species? ```{r} fig1a_counts <- lnc_tier1 %>% count(species, genomic_class) %>% ggplot(aes(x = species, y = n, fill = genomic_class)) + geom_col(color = "black", linewidth = 0.2) + labs( x = "Species", y = "Number of lncRNAs", fill = "Genomic class" ) + scale_fill_brewer(palette = "Set2") + theme_classic(base_size = 12) + theme( legend.position = "right", axis.text.x = element_text(angle = 0, hjust = 0.5) ) fig1a_counts # ggsave("fig1a_genomic_class_counts.png", fig1a_counts, width = 5, height = 4, dpi = 300) ``` ### Figure 1B – Proportions by genomic class and species What fraction of lncRNAs in each species are intergenic/intronic/exonic? ```{r} fig1b_props <- lnc_tier1 %>% count(species, genomic_class) %>% group_by(species) %>% mutate(prop = n / sum(n)) %>% ggplot(aes(x = species, y = prop, fill = genomic_class)) + geom_col(color = "black", linewidth = 0.2) + labs( x = "Species", y = "Proportion of lncRNAs", fill = "Genomic class" ) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + scale_fill_brewer(palette = "Set2") + theme_classic(base_size = 12) + theme( legend.position = "right", axis.text.x = element_text(angle = 0, hjust = 0.5) ) fig1b_props # ggsave("fig1b_genomic_class_proportions.png", fig1b_props, width = 5, height = 4, dpi = 300) ``` ### Figure 2 – Cis vs non-cis by genomic class (faceted) Within each genomic class, how many lncRNAs are cis vs non-cis? ```{r} fig2_cis_by_class <- lnc_tier1 %>% count(species, genomic_class, cis_trans_class) %>% group_by(species, genomic_class) %>% mutate(prop = n / sum(n)) %>% ungroup() %>% ggplot(aes(x = species, y = prop, fill = cis_trans_class)) + geom_col(color = "black", linewidth = 0.2, position = "fill") + facet_wrap(~ genomic_class, nrow = 1) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + labs( x = "Species", y = "Proportion within genomic class", fill = "Regulatory class" ) + scale_fill_brewer(palette = "Pastel1", labels = c("cis", "non-cis or unknown")) + theme_classic(base_size = 12) + theme( strip.background = element_rect(fill = "grey90", color = NA), strip.text = element_text(face = "bold"), axis.text.x = element_text(angle = 0, hjust = 0.5) ) fig2_cis_by_class # ggsave("fig2_cis_by_genomic_class.png", fig2_cis_by_class, width = 9, height = 4, dpi = 300) ``` ### Figure 3 – Distance to nearest gene (log scale) How far are lncRNAs from their nearest gene, and does that differ by class and species? ```{r} fig3_dist_violin <- lnc_dist %>% ggplot(aes(x = genomic_class, y = log10_distance, fill = genomic_class)) + geom_violin(trim = TRUE) + facet_wrap(~ species, nrow = 1) + labs( x = "Genomic class", y = expression(log[10]~"(distance to nearest gene + 1, bp)"), fill = "Genomic class" ) + scale_fill_brewer(palette = "Set2") + theme_classic(base_size = 12) + theme( strip.background = element_rect(fill = "grey90", color = NA), strip.text = element_text(face = "bold"), axis.text.x = element_text(angle = 30, hjust = 1) ) fig3_dist_violin # ggsave("fig3_distance_violin.png", fig3_dist_violin, width = 9, height = 4, dpi = 300) ``` ### Figure 4 – Correlation strength for cis lncRNAs When we call something cis, how strong is its correlation with the nearest gene? ```{r} lnc_cis_only <- lnc_tier1 %>% filter(cis_trans_class == "cis") %>% filter(!is.na(nearest_gene_cor)) fig4_cis_cor <- lnc_cis_only %>% ggplot(aes(x = nearest_gene_cor, fill = species)) + geom_density(alpha = 0.4) + geom_vline(xintercept = c(-0.6, 0.6), linetype = "dashed", linewidth = 0.4) + labs( x = "Pearson correlation with nearest gene", y = "Density", fill = "Species" ) + scale_fill_brewer(palette = "Dark2") + theme_classic(base_size = 12) fig4_cis_cor # ggsave("fig4_cis_correlation_density.png", fig4_cis_cor, width = 6, height = 4, dpi = 300) ``` By genomic class ```{r} fig4_cis_cor_facet <- lnc_cis_only %>% ggplot(aes(x = nearest_gene_cor, fill = species)) + geom_density(alpha = 0.4) + geom_vline(xintercept = c(-0.6, 0.6), linetype = "dashed", linewidth = 0.4) + facet_wrap(~ genomic_class, nrow = 2) + labs( x = "Pearson correlation with nearest gene", y = "Density", fill = "Species" ) + scale_fill_brewer(palette = "Dark2") + theme_classic(base_size = 12) + theme( strip.background = element_rect(fill = "grey90", color = NA), strip.text = element_text(face = "bold") ) fig4_cis_cor_facet # ggsave("fig4_cis_correlation_by_class.png", fig4_cis_cor_facet, width = 8, height = 6, dpi = 300) ``` ### Summary Tables Table 1 ```{r} table_genomic_class <- lnc_tier1 %>% count(species, genomic_class, name = "n_lnc") %>% group_by(species) %>% mutate( total_lnc = sum(n_lnc), prop = n_lnc / total_lnc ) %>% ungroup() table_genomic_class ``` Table 2 ```{r} table_cis <- lnc_tier1 %>% count(species, genomic_class, cis_trans_class, name = "n_lnc") %>% group_by(species, genomic_class) %>% mutate( total_in_class = sum(n_lnc), prop_in_class = n_lnc / total_in_class ) %>% ungroup() table_cis ```