Gene expression summary for Acropora pulchra sRNA-seq data.
trimmed reads generated in deep-dive project
Reads aligned to Acropora pulchra genome
library(tidyverse)## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errorslibrary(ggplot2)
library(reshape2)## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smithslibrary(pheatmap)
library(RColorBrewer)
library(DESeq2)## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedianslibrary(ComplexHeatmap)## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.14.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
##    in the original pheatmap() are identically supported in the new function. You 
##    can still use the original function by explicitly calling pheatmap::pheatmap().
## 
## 
## Attaching package: 'ComplexHeatmap'
## 
## The following object is masked from 'package:pheatmap':
## 
##     pheatmapLoad in the sRNA count matrix generated using ShortStack 4.1.0. Keep in mind this data includes counts of all sRNAs, not just miRNAs
Counts generated in 04-Apul-sRNA-discovery-ShortStack. Coldata generated in 03.00-D-Apul-RNAseq-gene-expression-DESeq2
# Read in sRNA counts data
Apul_counts_sRNA_data_OG <- read_delim("../output/04-Apul-sRNA-discovery-ShortStack/ShortStack_out/Counts.txt", delim="\t") ## Rows: 18885 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Coords, Name, MIRNA
## dbl (40): 1A10-fastp-adapters-polyG-31bp-merged_condensed, 1A12-fastp-adapte...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.head(Apul_counts_sRNA_data_OG)## # A tibble: 6 × 43
##   Coords               Name  MIRNA 1A10-fastp-adapters-…¹ 1A12-fastp-adapters-…²
##   <chr>                <chr> <chr>                  <dbl>                  <dbl>
## 1 ntLink_7:3054-3472   Clus… N                          6                     19
## 2 ntLink_7:9758-10311  Clus… N                         12                     47
## 3 ntLink_7:22562-22980 Clus… N                         16                     19
## 4 ntLink_7:29267-29820 Clus… N                         10                     32
## 5 ntLink_7:42050-42468 Clus… N                         18                     19
## 6 ntLink_7:43122-43556 Clus… N                          0                     46
## # ℹ abbreviated names: ¹`1A10-fastp-adapters-polyG-31bp-merged_condensed`,
## #   ²`1A12-fastp-adapters-polyG-31bp-merged_condensed`
## # ℹ 38 more variables: `1A1-fastp-adapters-polyG-31bp-merged_condensed` <dbl>,
## #   `1A2-fastp-adapters-polyG-31bp-merged_condensed` <dbl>,
## #   `1A8-fastp-adapters-polyG-31bp-merged_condensed` <dbl>,
## #   `1A9-fastp-adapters-polyG-31bp-merged_condensed` <dbl>,
## #   `1B10-fastp-adapters-polyG-31bp-merged_condensed` <dbl>, …# Read in coldata 
coldata_OG <- read.csv(file = "../output/03.00-D-Apul-RNAseq-gene-expression-DESeq2/DESeq2-coldata.tab", row.names=1, sep = "\t")
coldata_OG$time.point <- factor(coldata_OG$time.point)Apul_counts_sRNA <- Apul_counts_sRNA_data_OG
coldata <- coldata_OG
# Remove excess portions of sample column names to just "sample###"
colnames(Apul_counts_sRNA) <- sub("-fastp-adapters-polyG-31bp-merged_condensed", "", colnames(Apul_counts_sRNA))
# Keep just the counts and cluster names
Apul_counts_sRNA <- Apul_counts_sRNA %>% select(-Coords, -MIRNA)
# I'm not going to be doing any removal of low-count sRNAs for now
# Make the cluster names our new row names
Apul_counts_sRNA <- Apul_counts_sRNA %>% column_to_rownames(var = "Name")
# Append colony and timepoint info to sample names
colnames(Apul_counts_sRNA) <- paste(colnames(Apul_counts_sRNA), coldata[colnames(Apul_counts_sRNA), "colony.id"], coldata[colnames(Apul_counts_sRNA), "time.point"], sep = "_")
# Make sure coldata metadata has matching rownames (for DEseq2 formatting)
rownames(coldata) <- paste(rownames(coldata), coldata$colony.id, coldata$time.point, sep = "_")
write.table(Apul_counts_sRNA, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_sRNA_ShortStack_counts_formatted.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)
head(Apul_counts_sRNA)##           1A10_ACR-145_TP4 1A12_ACR-237_TP3 1A1_ACR-173_TP1 1A2_ACR-244_TP4
## Cluster_1                6               19              39               6
## Cluster_2               12               47             150              21
## Cluster_3               16               19              48               5
## Cluster_4               10               32             157              28
## Cluster_5               18               19              20               6
## Cluster_6                0               46              53              12
##           1A8_ACR-186_TP2 1A9_ACR-244_TP2 1B10_ACR-150_TP4 1B1_ACR-225_TP3
## Cluster_1               6               9               16              15
## Cluster_2              13              93               58              36
## Cluster_3               6               5                9              14
## Cluster_4              10             105               68              34
## Cluster_5               5               6               14               8
## Cluster_6              40               1               31              44
##           1B2_ACR-173_TP3 1B5_ACR-229_TP1 1B9_ACR-265_TP4 1C10_ACR-173_TP4
## Cluster_1              31              10               3                1
## Cluster_2             109              13               9               27
## Cluster_3              30               6               6                3
## Cluster_4              83              11               9               38
## Cluster_5              31              11               0                5
## Cluster_6              99               9               2               68
##           1C4_ACR-139_TP4 1D10_ACR-265_TP2 1D3_ACR-225_TP4 1D4_ACR-237_TP4
## Cluster_1               2               30              36              51
## Cluster_2               9               63             290             141
## Cluster_3               8               26              44              46
## Cluster_4              12               54             392             131
## Cluster_5               0               33             121              35
## Cluster_6              47                4             273              98
##           1D6_ACR-229_TP2 1D8_ACR-237_TP2 1D9_ACR-229_TP4 1E1_ACR-265_TP3
## Cluster_1               8              26               3               1
## Cluster_2              13             315               6               7
## Cluster_3               9              27               4               3
## Cluster_4              23             202               6              10
## Cluster_5               7              12               7               0
## Cluster_6               2              46              11               6
##           1E3_ACR-150_TP2 1E5_ACR-139_TP3 1E9_ACR-237_TP1 1F11_ACR-173_TP2
## Cluster_1              33               3              12               17
## Cluster_2             107               6              62               36
## Cluster_3              45               1              11                5
## Cluster_4             112               2              72               69
## Cluster_5              44               0               8               13
## Cluster_6               1               0              51                2
##           1F4_ACR-150_TP3 1F8_ACR-145_TP3 1G5_ACR-244_TP3 1H11_ACR-225_TP1
## Cluster_1              30               2               4                9
## Cluster_2             105               8              63               34
## Cluster_3              30               1               3               12
## Cluster_4             117              18              64               31
## Cluster_5              42               1               0                7
## Cluster_6              59               1              30               51
##           1H12_ACR-186_TP3 1H6_ACR-225_TP2 1H7_ACR-229_TP3 1H8_ACR-186_TP4
## Cluster_1                7              48               4               6
## Cluster_2               11             235               7               2
## Cluster_3                8              34              10               8
## Cluster_4               11             299               8               8
## Cluster_5                6              33               3               4
## Cluster_6                4              26              16              53
##           2B2_ACR-145_TP1 2B3_ACR-139_TP2 2C1_ACR-244_TP1 2C2_ACR-139_TP1
## Cluster_1               7              30               1              36
## Cluster_2              13              32              24             156
## Cluster_3              13              27               3              25
## Cluster_4              11              37              26              77
## Cluster_5              24               6               0              61
## Cluster_6               1             293               2             262
##           2D2_ACR-150_TP1 2E2_ACR-186_TP1 2F1_ACR-265_TP1 2G1_ACR-145_TP2
## Cluster_1               7               0               5              46
## Cluster_2              55               4               5             400
## Cluster_3              22               3               3              36
## Cluster_4              85               4               7             408
## Cluster_5              21               5               5              48
## Cluster_6              24               2               0               4head(coldata)##                  time.point colony.id
## 1A1_ACR-173_TP1         TP1   ACR-173
## 1A10_ACR-145_TP4        TP4   ACR-145
## 1A12_ACR-237_TP3        TP3   ACR-237
## 1A2_ACR-244_TP4         TP4   ACR-244
## 1A8_ACR-186_TP2         TP2   ACR-186
## 1A9_ACR-244_TP2         TP2   ACR-244Plot histograms of the expression levels in each sample
# Melt the count matrix into long format
Apul_counts_sRNA_melted <- melt(Apul_counts_sRNA, variable.name = "sample", value.name = "counts")## No id variables; using all as measure variables# Plot the expression level histograms for each sample
ggplot(Apul_counts_sRNA_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "Gene Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()## Warning in scale_x_log10(): log-10 transformation introduced infinite values.## Warning: Removed 99729 rows containing non-finite outside the scale range
## (`stat_bin()`).First let’s check the total number of transcripts in each sample – keep in mind this expression data has not been normalized yet, so there may be different totals for each sample
# Calculate the total number of transcripts for each sample
total_transcripts <- colSums(Apul_counts_sRNA)
# Create a data frame for plotting
total_transcripts_df <- data.frame(sample = names(total_transcripts),
                                   totals = total_transcripts)
# Plot the total number of transcripts for each sample
ggplot(total_transcripts_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of Transcripts per Sample",
       x = "Sample",
       y = "Total Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
No glaring discrepancies/patterns
Now let’s check the number of unique transcripts in each sample – that is, how many unique sRNAs are expressed in each sample? This should be pretty much the same across samples, even without normalization.
# Calculate the number of unique transcripts (non-zero counts) for each sample
unique_transcripts <- colSums(Apul_counts_sRNA > 0)
# Create a data frame for plotting
unique_transcripts_df <- data.frame(sample = names(unique_transcripts),
                                    uniques = unique_transcripts)
# Plot the total number of unique transcripts for each sample
ggplot(unique_transcripts_df, aes(x = reorder(sample, uniques), y = uniques)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = uniques), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of Unique Expressed Transcripts per Sample",
       x = "Sample",
       y = "Unique Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readabilityThe ShortStack output Results.txt includes all clusters of sRNA reads, including those not annotated as valid miRNAs. Now that we’ve looked at all the sRNAs a bit, let’s focus in on those classified as miRNAs.
Apul_counts_miRNA <- Apul_counts_sRNA_data_OG
coldata <- coldata_OG
# Remove excess portions of sample column names to just "sample###"
colnames(Apul_counts_miRNA) <- sub("-fastp-adapters-polyG-31bp-merged_condensed", "", colnames(Apul_counts_miRNA))
# Keep only the sRNAs ID'd as valid miRNAs
Apul_counts_miRNA <- Apul_counts_miRNA %>% filter(MIRNA == "Y")
# Keep just the counts and cluster names
Apul_counts_miRNA <- Apul_counts_miRNA %>% select(-Coords, -MIRNA)
# Make the cluster names our new row names
Apul_counts_miRNA <- Apul_counts_miRNA %>% column_to_rownames(var = "Name")
# Append colony and timepoint info to sample names
colnames(Apul_counts_miRNA) <- paste(colnames(Apul_counts_miRNA), coldata[colnames(Apul_counts_miRNA), "colony.id"], coldata[colnames(Apul_counts_miRNA), "time.point"], sep = "_")
# Make sure coldata metadata has matching rownames (for DEseq2 formatting)
rownames(coldata) <- paste(rownames(coldata), coldata$colony.id, coldata$time.point, sep = "_")
write.table(Apul_counts_miRNA, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_miRNA_ShortStack_counts_formatted.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)
head(Apul_counts_miRNA)##              1A10_ACR-145_TP4 1A12_ACR-237_TP3 1A1_ACR-173_TP1 1A2_ACR-244_TP4
## Cluster_1819               37               35             102              77
## Cluster_1832              325              435            1826             947
## Cluster_1833               16               52              40              30
## Cluster_1836             1071             2048            2334            6240
## Cluster_1865              103              299             203             491
## Cluster_1950              121              179             413             421
##              1A8_ACR-186_TP2 1A9_ACR-244_TP2 1B10_ACR-150_TP4 1B1_ACR-225_TP3
## Cluster_1819              84             340               14              19
## Cluster_1832            1238            2174              219             580
## Cluster_1833               6               0                6              27
## Cluster_1836            8001           11364             1614            2024
## Cluster_1865             108            1203               86             119
## Cluster_1950             224             689              201             297
##              1B2_ACR-173_TP3 1B5_ACR-229_TP1 1B9_ACR-265_TP4 1C10_ACR-173_TP4
## Cluster_1819              36             204              25               18
## Cluster_1832             547            3544             223              235
## Cluster_1833              30             145               6                3
## Cluster_1836            1577            5523            1993              885
## Cluster_1865             137             753             114               56
## Cluster_1950             249            1345             130               79
##              1C4_ACR-139_TP4 1D10_ACR-265_TP2 1D3_ACR-225_TP4 1D4_ACR-237_TP4
## Cluster_1819              10               96              66              41
## Cluster_1832             211              618            2897             978
## Cluster_1833              13                0              82              94
## Cluster_1836            1073             2737           14727           10196
## Cluster_1865              74              503             652             337
## Cluster_1950              51              400            1100             269
##              1D6_ACR-229_TP2 1D8_ACR-237_TP2 1D9_ACR-229_TP4 1E1_ACR-265_TP3
## Cluster_1819             195              81              66              19
## Cluster_1832            2836             645            1098             265
## Cluster_1833              30              57              12              21
## Cluster_1836            8094            4832            4571            1210
## Cluster_1865             536             345             253             178
## Cluster_1950            1043             206             460             102
##              1E3_ACR-150_TP2 1E5_ACR-139_TP3 1E9_ACR-237_TP1 1F11_ACR-173_TP2
## Cluster_1819             177               2              19               88
## Cluster_1832            1326              45             288             1527
## Cluster_1833               0               0              19               24
## Cluster_1836            5111             747            1425             4717
## Cluster_1865             669              16             130              174
## Cluster_1950             646              18             223              247
##              1F4_ACR-150_TP3 1F8_ACR-145_TP3 1G5_ACR-244_TP3 1H11_ACR-225_TP1
## Cluster_1819              56              23              95               18
## Cluster_1832             757             214            2749              610
## Cluster_1833              12               7               0               20
## Cluster_1836            3174            1026            4302             1059
## Cluster_1865             454              53            1140              157
## Cluster_1950             280             101             707              384
##              1H12_ACR-186_TP3 1H6_ACR-225_TP2 1H7_ACR-229_TP3 1H8_ACR-186_TP4
## Cluster_1819               33              47              50              28
## Cluster_1832              439            1714             805             873
## Cluster_1833               11              38              11              42
## Cluster_1836             1636            6727            2155            1625
## Cluster_1865               38             602             217              87
## Cluster_1950              147             666             360             421
##              2B2_ACR-145_TP1 2B3_ACR-139_TP2 2C1_ACR-244_TP1 2C2_ACR-139_TP1
## Cluster_1819              44              68              57              50
## Cluster_1832             232            1769             788            1115
## Cluster_1833               0              18              53              24
## Cluster_1836            1266            5844            2033            2926
## Cluster_1865             139             204             344             380
## Cluster_1950             290             657             370             450
##              2D2_ACR-150_TP1 2E2_ACR-186_TP1 2F1_ACR-265_TP1 2G1_ACR-145_TP2
## Cluster_1819              22              24              16              62
## Cluster_1832             144             211             171            1032
## Cluster_1833               4               0               0              27
## Cluster_1836            1267            1729             656            8817
## Cluster_1865              94               7              34             324
## Cluster_1950             127              63              67             255Plot histograms of the expression levels in each sample
# Melt the count matrix into long format
Apul_counts_miRNA_melted <- melt(Apul_counts_miRNA, variable.name = "sample", value.name = "counts")## No id variables; using all as measure variables# Plot the expression level histograms for each sample
ggplot(Apul_counts_miRNA_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "miRNA Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()## Warning in scale_x_log10(): log-10 transformation introduced infinite values.## Warning: Removed 159 rows containing non-finite outside the scale range
## (`stat_bin()`).First let’s check the total number of miRNAs in each sample – keep in mind this expression data has not been normalized yet, so there may be different totals for each sample
# Calculate the total number of transcripts for each sample
total_miRNA <- colSums(Apul_counts_miRNA)
# Create a data frame for plotting
total_miRNA_df <- data.frame(sample = names(total_miRNA),
                                   totals = total_miRNA)
# Plot the total number of transcripts for each sample
ggplot(total_miRNA_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of miRNAs per Sample",
       x = "Sample",
       y = "Total miRNAs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readabilityNow let’s check the number of unique miRNAs in each sample – This should be pretty much the same across samples, even without normalization.
# Calculate the number of unique transcripts (non-zero counts) for each sample
unique_miRNA <- colSums(Apul_counts_miRNA > 0)
# Create a data frame for plotting
unique_miRNA_df <- data.frame(sample = names(unique_miRNA),
                                    uniques = unique_miRNA)
# Plot the total number of unique transcripts for each sample
ggplot(unique_miRNA_df, aes(x = reorder(sample, uniques), y = uniques)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = uniques), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of Unique Expressed miRNAs per Sample",
       x = "Sample",
       y = "Unique miRNA") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readabilitypheatmap(Apul_counts_miRNA,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = colorRampPalette(c("blue", "white", "red"))(50),
         fontsize_row = 8,
         fontsize_col = 8)## Warning: The input is a data frame, convert it to the matrix.
Well… there’s like 2 miRNAs with much higher expression than the others, which is making visualizing relative differences difficult. Let’s redo the heatmap, normalizing each row to view relative difference in expression between samples (
scale='row')
pheatmap(Apul_counts_miRNA,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         scale = 'row',
         color = colorRampPalette(c("blue", "white", "red"))(50),
         fontsize_row = 8,
         fontsize_col = 8)## Warning: The input is a data frame, convert it to the matrix.ShortStack’s primary purpose is to identify miRNAs from sRNA-seq data, but it also automatically annotates siRNA loci! Since siRNA potentially play an important role in transposon silencing in invertebrates, we should generate count matrices for siRNAs as well.
We can see clusters annotated as siRNAs in the Results.gff3 output file of ShortStack (sRNA ID shown in the 3rd column)
Apul_Resultsgff <- read.table("../output/04-Apul-sRNA-discovery-ShortStack/ShortStack_out/Results.gff3")
# Separate last column info into multiple columns for filtering
Apul_Resultsgff <- Apul_Resultsgff %>%
  separate(V9, into = c("Name", "DicerCall", "MIRNA"), sep = ";") %>%
  mutate(Name = sub("ID=", "", Name),
         DicerCall = sub("DicerCall=", "", DicerCall),
         MIRNA = sub("MIRNA=", "", MIRNA))## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 102 rows [1820, 1821,
## 1835, 1836, 1838, 1839, 1843, 1844, 1874, 1875, 1961, 1962, 1964, 1965, 2387,
## 2388, 2517, 2518, 2765, 2766, ...].head(Apul_Resultsgff)##         V1         V2                 V3    V4    V5   V6 V7 V8      Name
## 1 ntLink_7 ShortStack Unknown_sRNA_locus  3054  3472  625  -  . Cluster_1
## 2 ntLink_7 ShortStack Unknown_sRNA_locus  9758 10311 2797  +  . Cluster_2
## 3 ntLink_7 ShortStack Unknown_sRNA_locus 22562 22980  634  -  . Cluster_3
## 4 ntLink_7 ShortStack Unknown_sRNA_locus 29267 29820 2881  +  . Cluster_4
## 5 ntLink_7 ShortStack Unknown_sRNA_locus 42050 42468  689  -  . Cluster_5
## 6 ntLink_7 ShortStack Unknown_sRNA_locus 43122 43556 1774  -  . Cluster_6
##   DicerCall MIRNA
## 1         N     N
## 2         N     N
## 3         N     N
## 4         N     N
## 5         N     N
## 6         N     N# keep just the sRNA category column (V3), and the cluster names (Name)
# filter to only keep clusters ID'd as siRNAs
Apul_siRNA_clusters <- Apul_Resultsgff %>%
  select(V3, Name) %>%
  filter(str_detect(V3, regex("siRNA")))
head(Apul_siRNA_clusters)##              V3         Name
## 1 siRNA23_locus   Cluster_74
## 2 siRNA23_locus  Cluster_232
## 3 siRNA23_locus  Cluster_238
## 4 siRNA23_locus  Cluster_938
## 5 siRNA24_locus  Cluster_967
## 6 siRNA21_locus Cluster_1810# Now use this list of clusters ID'd as siRNAs to filter our sRNA count matrix
# keep only the sample counts and cluster names
Apul_counts_sRNA <- rownames_to_column(Apul_counts_sRNA, var = "Name")
Apul_counts_siRNA <- left_join(Apul_siRNA_clusters, Apul_counts_sRNA, by = c("Name" = "Name")) %>%
  select(-V3)
# convert the column of cluster names into the df row names
Apul_counts_sRNA <- Apul_counts_sRNA %>% column_to_rownames(var="Name")
Apul_counts_siRNA <- Apul_counts_siRNA %>% column_to_rownames(var="Name")
head(Apul_counts_siRNA)##              1A10_ACR-145_TP4 1A12_ACR-237_TP3 1A1_ACR-173_TP1 1A2_ACR-244_TP4
## Cluster_74                  8                3              29              10
## Cluster_232                 0                3              11              15
## Cluster_238                 0                5              11              16
## Cluster_938                 2                3               1               1
## Cluster_967                14               15              22              18
## Cluster_1810               10               20              37              14
##              1A8_ACR-186_TP2 1A9_ACR-244_TP2 1B10_ACR-150_TP4 1B1_ACR-225_TP3
## Cluster_74                 0              19                1              28
## Cluster_232                6              62                0               0
## Cluster_238                5              68                2               0
## Cluster_938               60              29                0               0
## Cluster_967               29              26               10               6
## Cluster_1810              56             131               25              10
##              1B2_ACR-173_TP3 1B5_ACR-229_TP1 1B9_ACR-265_TP4 1C10_ACR-173_TP4
## Cluster_74                 8              65               9                4
## Cluster_232               11               1               0                3
## Cluster_238               12               1               0                3
## Cluster_938                1              17               0                2
## Cluster_967                9              57              13               10
## Cluster_1810              11              91               8                4
##              1C4_ACR-139_TP4 1D10_ACR-265_TP2 1D3_ACR-225_TP4 1D4_ACR-237_TP4
## Cluster_74                 2                7             272              22
## Cluster_232               30                1               0               6
## Cluster_238               31                0               1               7
## Cluster_938                1               19              13               5
## Cluster_967                7               24              62              17
## Cluster_1810               8               72              43              30
##              1D6_ACR-229_TP2 1D8_ACR-237_TP2 1D9_ACR-229_TP4 1E1_ACR-265_TP3
## Cluster_74                33               7              27               2
## Cluster_232                1               8               1               0
## Cluster_238                1               9               1               0
## Cluster_938              116               8               2               2
## Cluster_967               40              17              13              14
## Cluster_1810              84              54              21               8
##              1E3_ACR-150_TP2 1E5_ACR-139_TP3 1E9_ACR-237_TP1 1F11_ACR-173_TP2
## Cluster_74                12               1               4               10
## Cluster_232                0              10               1               14
## Cluster_238                0               9               1               11
## Cluster_938              207               0               3                3
## Cluster_967               59               4               8               17
## Cluster_1810             111               3              27               53
##              1F4_ACR-150_TP3 1F8_ACR-145_TP3 1G5_ACR-244_TP3 1H11_ACR-225_TP1
## Cluster_74                 8               2              19               37
## Cluster_232                4               0              26                0
## Cluster_238                4               0              27                0
## Cluster_938               11               2               8                2
## Cluster_967               11               8              39               20
## Cluster_1810              42              15              30               22
##              1H12_ACR-186_TP3 1H6_ACR-225_TP2 1H7_ACR-229_TP3 1H8_ACR-186_TP4
## Cluster_74                  7              66              12              10
## Cluster_232                 1               0               3               2
## Cluster_238                 1               0               2               3
## Cluster_938                 1              21               5              13
## Cluster_967                 8              27               8              26
## Cluster_1810                5              66              32              18
##              2B2_ACR-145_TP1 2B3_ACR-139_TP2 2C1_ACR-244_TP1 2C2_ACR-139_TP1
## Cluster_74                10              23              29              21
## Cluster_232                1             126              15             205
## Cluster_238                0             124              16             203
## Cluster_938                2              13               4               5
## Cluster_967               38              45              24              41
## Cluster_1810              27              98              28              42
##              2D2_ACR-150_TP1 2E2_ACR-186_TP1 2F1_ACR-265_TP1 2G1_ACR-145_TP2
## Cluster_74                 4               8               3               5
## Cluster_232                6               1               0               4
## Cluster_238                6               1               0               5
## Cluster_938                1               2               1              22
## Cluster_967                5               9               5              21
## Cluster_1810               9               8               6              49write.table(Apul_counts_siRNA, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_siRNA_ShortStack_counts_formatted.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)Plot histograms of the expression levels in each sample
# Melt the count matrix into long format
Apul_counts_siRNA_melted <- melt(Apul_counts_siRNA, variable.name = "sample", value.name = "counts")## No id variables; using all as measure variables# Plot the expression level histograms for each sample
ggplot(Apul_counts_siRNA_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "siRNA Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()## Warning in scale_x_log10(): log-10 transformation introduced infinite values.## Warning: Removed 372 rows containing non-finite outside the scale range
## (`stat_bin()`).First let’s check the total number of siRNAs in each sample – keep in mind this expression data has not been normalized yet, so there may be different totals for each sample
# Calculate the total number of transcripts for each sample
total_siRNA <- colSums(Apul_counts_siRNA)
# Create a data frame for plotting
total_siRNA_df <- data.frame(sample = names(total_siRNA),
                                   totals = total_siRNA)
# Plot the total number of transcripts for each sample
ggplot(total_siRNA_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of siRNAs per Sample",
       x = "Sample",
       y = "Total siRNAs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
All of the TP2s seem to have higher #s, but since this isn’t normalized I don’t want to put too much stock in that
Now let’s check the number of unique siRNAs in each sample – This should be pretty much the same across samples, even without normalization.
# Calculate the number of unique transcripts (non-zero counts) for each sample
unique_siRNA <- colSums(Apul_counts_siRNA > 0)
# Create a data frame for plotting
unique_siRNA_df <- data.frame(sample = names(unique_siRNA),
                                    uniques = unique_siRNA)
# Plot the total number of unique transcripts for each sample
ggplot(unique_siRNA_df, aes(x = reorder(sample, uniques), y = uniques)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = uniques), vjust = -0.3, size = 3.5) + 
  labs(title = "Total Number of Unique Expressed siRNAs per Sample",
       x = "Sample",
       y = "Unique siRNA") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readabilitypheatmap(Apul_counts_siRNA,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = colorRampPalette(c("blue", "white", "red"))(50),
         fontsize_row = 8,
         fontsize_col = 8)## Warning: The input is a data frame, convert it to the matrix.pheatmap(Apul_counts_siRNA,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         scale = 'row',
         color = colorRampPalette(c("blue", "white", "red"))(50),
         fontsize_row = 8,
         fontsize_col = 8)## Warning: The input is a data frame, convert it to the matrix.DESeq2 requires a metadata data frame as input – we’ll use the coldata we’ve already formatted
head(coldata)##                  time.point colony.id
## 1A1_ACR-173_TP1         TP1   ACR-173
## 1A10_ACR-145_TP4        TP4   ACR-145
## 1A12_ACR-237_TP3        TP3   ACR-237
## 1A2_ACR-244_TP4         TP4   ACR-244
## 1A8_ACR-186_TP2         TP2   ACR-186
## 1A9_ACR-244_TP2         TP2   ACR-244# Alphabetize rownames of coldata and colnames of Apul_counts_sRNA
coldata <- coldata[order(rownames(coldata)), ]
Apul_counts_sRNA <- Apul_counts_sRNA[, order(colnames(Apul_counts_sRNA))]
all(rownames(coldata) == colnames(Apul_counts_sRNA))## [1] TRUEdds_Apul_sRNA <- DESeqDataSetFromMatrix(countData = Apul_counts_sRNA,
                              colData = coldata,
                              design = ~ time.point + colony.id)## converting counts to integer mode## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]dds_Apul_sRNA## class: DESeqDataSet 
## dim: 18885 40 
## metadata(1): version
## assays(1): counts
## rownames(18885): Cluster_1 Cluster_2 ... Cluster_18884 Cluster_18885
## rowData names(0):
## colnames(40): 1A1_ACR-173_TP1 1A10_ACR-145_TP4 ... 2F1_ACR-265_TP1
##   2G1_ACR-145_TP2
## colData names(2): time.point colony.iddds_Apul_sRNA$time.point <- factor(dds_Apul_sRNA$time.point, levels = c("TP1","TP2", "TP3", "TP4"))
dds_Apul_sRNA <- DESeq(dds_Apul_sRNA)## estimating size factors##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]## estimating dispersions## gene-wise dispersion estimates## mean-dispersion relationship##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]## final dispersion estimates##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]## fitting model and testing# Define the output directory path
output_dir <- "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/"
# Set desired false discovery rate threshold (i.e. adjusted p-value, padj)
fdr <- 0.05
# Set log2 fold change threshold (a value of '1' is equal to a fold change of '2')
log2fc <- 1
sRNA_tp1.v.tp2.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP1","TP2"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp1.v.tp3.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP1","TP3"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp1.v.tp4.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP1","TP4"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp2.v.tp3.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP2","TP3"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp2.v.tp4.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP2","TP4"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp3.v.tp4.results <- results(dds_Apul_sRNA, contrast=c("time.point","TP3","TP4"), alpha = fdr, lfcThreshold = log2fc)
sRNA_tp2.v.tp4.results## log2 fold change (MLE): time.point TP2 vs TP4 
## Wald test p-value: time.point TP2 vs TP4 
## DataFrame with 18885 rows and 6 columns
##                baseMean log2FoldChange     lfcSE      stat    pvalue      padj
##               <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
## Cluster_1       13.0128      0.7818140  0.432541  0.000000  1.000000         1
## Cluster_2       55.8529      1.3487948  0.466752  0.747280  0.454894         1
## Cluster_3       13.5530      0.1447121  0.381801  0.000000  1.000000         1
## Cluster_4       57.1718      1.0440902  0.416307  0.105908  0.915655         1
## Cluster_5       13.4828      0.0805211  0.634008  0.000000  1.000000         1
## ...                 ...            ...       ...       ...       ...       ...
## Cluster_18881   20.5241      0.0824002  0.302947 0.0000000  1.000000         1
## Cluster_18882   13.3871      0.6283925  0.673831 0.0000000  1.000000         1
## Cluster_18883  170.0788      1.0252319  0.567332 0.0444746  0.964526         1
## Cluster_18884  205.3327     -0.7523888  0.550953 0.0000000  1.000000         1
## Cluster_18885  195.5333     -0.8148397  0.540692 0.0000000  1.000000         1summary(sRNA_tp2.v.tp4.results)## 
## out of 18885 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 1.00 (up)    : 16, 0.085%
## LFC < -1.00 (down) : 199, 1.1%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?resultstable(sRNA_tp2.v.tp4.results$padj < 0.05)## 
## FALSE  TRUE 
## 18670   215Write DDS results tables to CSVs
# Create a named list of the data frames
results_list <- list(
  sRNA_tp1.v.tp2.results = sRNA_tp1.v.tp2.results,
  sRNA_tp1.v.tp3.results = sRNA_tp1.v.tp3.results,
  sRNA_tp1.v.tp4.results = sRNA_tp1.v.tp4.results,
  sRNA_tp2.v.tp3.results = sRNA_tp2.v.tp3.results,
  sRNA_tp2.v.tp4.results = sRNA_tp2.v.tp4.results,
  sRNA_tp3.v.tp4.results = sRNA_tp3.v.tp4.results
)
# Loop through the list and write each data frame to a CSV file in the specified directory
for (df_name in names(results_list)) {
  write.csv(results_list[[df_name]], file = paste0(output_dir, df_name, ".table.csv"), row.names = TRUE, quote = FALSE)
}It’s worth noting here that I’m actually going to be doing two different types of transformation on the counts data, which serve different purposes.
First is normalizing the transcript counts, which adjusts for differences in library size or sequencing depth, but retains count-like properties. Normalized counts are most useful for things like visualizing expression levels and differential expression analysis.
Second is variance stabilizing the counts data, which aims to make the variance of the transformed data approximately independent of the mean, reducing heteroscedasticity (the relationship between variance and mean) and “smoothing” out the variance at low counts. Notably, the transformed data is no longer on the original count scale. The transformation makes the variance roughly constant across the range of counts, which makes it easier to interpret patterns in the data visually. Variance stabilized data is most useful for exploratory data analysis, like PCA, clustering, and heatmaps, and is also the transformation we’ll want to use before WGCNA.
# extract normalized counts
# (normalization is automatically performed by deseq2)
Apul_counts_sRNA_norm <- counts(dds_Apul_sRNA, normalized=TRUE) %>% data.frame()
write.table(Apul_counts_sRNA_norm, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_sRNA_normalized.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)
# variance stabilized data
vsd_Apul_sRNA <- varianceStabilizingTransformation(dds_Apul_sRNA, blind=TRUE)
wpn_vsd_Apul_sRNA <- getVarianceStabilizedData(dds_Apul_sRNA)
rv_wpn_Apul_sRNA <- rowVars(wpn_vsd_Apul_sRNA, useNames=TRUE)
Apul_counts_sRNA_vsd <- data.frame(wpn_vsd_Apul_sRNA)
write.table(Apul_counts_sRNA_vsd, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_sRNA_variancestabilized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)
q75_wpn_Apul_sRNA <- quantile(rowVars(wpn_vsd_Apul_sRNA, useNames=TRUE), .75)  # 75th quantile variability
Apul_counts_sRNA_vsd_q75 <- wpn_vsd_Apul_sRNA[ rv_wpn_Apul_sRNA > q75_wpn_Apul_sRNA, ] %>% data.frame # filter to retain only the most variable genes
write.table(Apul_counts_sRNA_vsd_q75, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_sRNA_variancestabilized_q75.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)
q95_wpn_Apul_sRNA <- quantile(rowVars(wpn_vsd_Apul_sRNA, useNames=TRUE), .95)  # 95th quantile variability
Apul_counts_sRNA_vsd_q95 <- wpn_vsd_Apul_sRNA[ rv_wpn_Apul_sRNA > q95_wpn_Apul_sRNA, ] %>% data.frame # filter to retain only the most variable genes
write.table(Apul_counts_sRNA_vsd_q95, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_sRNA_variancestabilized_q95.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)Apul_counts_sRNA_norm_long <- Apul_counts_sRNA_norm %>%
  mutate(
    Gene_id = row.names(Apul_counts_sRNA_norm)
  ) %>%
  pivot_longer(-Gene_id)
Apul_counts_sRNA_norm_long %>%
  ggplot(., aes(x = name, y = value)) +
  geom_violin() +
  geom_point() +
  theme_bw() +
  theme(
    axis.text.x = element_text( angle = 90)
  ) +
  ylim(0, NA) +
  labs(
    title = "Normalized Expression",
    x = "Sample",
    y = "Normalized counts"
  )Apul_counts_sRNA_vsd_long <- Apul_counts_sRNA_vsd %>%
  mutate(
    Gene_id = row.names(Apul_counts_sRNA_vsd)
  ) %>%
  pivot_longer(-Gene_id)
Apul_counts_sRNA_vsd_long %>%
  ggplot(., aes(x = name, y = value)) +
  geom_violin() +
  geom_point() +
  theme_bw() +
  theme(
    axis.text.x = element_text( angle = 90)
  ) +
  ylim(0, NA) +
  labs(
    title = "Variance Stabilized Expression",
    x = "Sample",
    y = "Variance stabilized data"
  )Plot histograms of the normalized expression levels in each sample
# Melt the count matrix into long format
Apul_counts_norm_melted <- melt(Apul_counts_sRNA_norm, variable.name = "sample", value.name = "counts")## No id variables; using all as measure variables# Plot the expression level histograms for each sample
ggplot(Apul_counts_norm_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "Gene Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()## Warning in scale_x_log10(): log-10 transformation introduced infinite values.## Warning: Removed 99729 rows containing non-finite outside the scale range
## (`stat_bin()`).Check the total number of transcripts in each sample – now that we’ve normalized the data these totals should be similar
# Calculate the total number of transcripts for each sample
total_transcripts_norm <- colSums(Apul_counts_sRNA_norm)
# Create a data frame for plotting
total_transcripts_norm_df <- data.frame(sample = names(total_transcripts_norm),
                                   totals = total_transcripts_norm)
# Plot the total number of transcripts for each sample
ggplot(total_transcripts_norm_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) +
  labs(title = "Total Number of Transcripts per Sample",
       x = "Sample",
       y = "Total Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readabilityplotPCA(vsd_Apul_sRNA, intgroup="time.point")plotPCA(vsd_Apul_sRNA, intgroup="colony.id")
Samples are strongly clustering by colony. Interestingly time point doesn’t appear to influence clustering.
sample_dists <- dist(t(assay(vsd_Apul_sRNA)))
pheatmap(as.matrix(sample_dists), 
         clustering_distance_rows = "euclidean", 
         clustering_distance_cols = "euclidean", 
         main="Sample Clustering")
Samples are strongly clustering by colony.
Of most variable variance stabilized sRNA transcripts
# 75th quantile
heat_colors <- rev(brewer.pal(12, "RdYlBu"))## Warning in brewer.pal(12, "RdYlBu"): n too large, allowed maximum for palette RdYlBu is 11
## Returning the palette you asked for with that many colorspheatmap(Apul_counts_sRNA_vsd_q75, 
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = heat_colors,
         scale="row")## Warning: The input is a data frame, convert it to the matrix.## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.# 95th quantile
pheatmap(Apul_counts_sRNA_vsd_q95, 
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = heat_colors,
         scale="row")## Warning: The input is a data frame, convert it to the matrix.Apul_counts_sRNA_norm$Name <- rownames(Apul_counts_sRNA_norm)
Apul_counts_sRNA_vsd$Name <- rownames(Apul_counts_sRNA_vsd)
Apul_counts_miRNA_namesdf <- data.frame(Name = rownames(Apul_counts_miRNA)) 
Apul_counts_miRNA_norm <- left_join(Apul_counts_miRNA_namesdf, Apul_counts_sRNA_norm, by = c("Name" = "Name")) %>%
  column_to_rownames(var = "Name")
colnames(Apul_counts_miRNA_norm) <- gsub("X", "", colnames(Apul_counts_miRNA_norm))
colnames(Apul_counts_miRNA_norm) <- gsub("\\.", "-", colnames(Apul_counts_miRNA_norm))
write.table(Apul_counts_miRNA_norm, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_miRNA_normalized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)
Apul_counts_miRNA_vsd <- left_join(Apul_counts_miRNA_namesdf, Apul_counts_sRNA_vsd, by = c("Name" = "Name")) %>%
  column_to_rownames(var = "Name")
write.table(Apul_counts_miRNA_vsd, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_miRNA_variancestabilized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)Plot histograms of the normalized expression levels in each sample
# Melt the count matrix into long format
Apul_counts_miRNA_norm_melted <- melt(Apul_counts_miRNA_norm, variable.name = "sample", value.name = "counts")## No id variables; using all as measure variables# Plot the expression level histograms for each sample
ggplot(Apul_counts_miRNA_norm_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "Gene Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()## Warning in scale_x_log10(): log-10 transformation introduced infinite values.## Warning: Removed 159 rows containing non-finite outside the scale range
## (`stat_bin()`).Check the total number of transcripts in each sample – now that we’ve normalized the data these totals should be similar
# Calculate the total number of transcripts for each sample
total_transcripts_miRNA_norm <- colSums(Apul_counts_miRNA_norm)
# Create a data frame for plotting
total_transcripts_miRNA_norm_df <- data.frame(sample = names(total_transcripts_miRNA_norm),
                                   totals = total_transcripts_miRNA_norm)
# Plot the total number of transcripts for each sample
ggplot(total_transcripts_miRNA_norm_df, aes(x = reorder(sample, totals), totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) +
  labs(title = "Total Number of miRNA Transcripts per Sample",
       x = "Sample",
       y = "Total Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
## PCA of variance stabilized miRNAs
Note that I had to convert the counts to a data frame (instead of a DESeqTransform object) to select only miRNAs, so I can’t use the plotPCA() function.
# Perform PCA
pca_res <- prcomp(t(as.matrix(Apul_counts_miRNA_vsd)))
# Calculate percent variance explained
percentVar <- round(100 * (pca_res$sdev^2 / sum(pca_res$sdev^2)))
## Color by colony ID ##
# Convert PCA results to data frame
pca_df <- as.data.frame(pca_res$x)
pca_df$colony.id <- dds_Apul_sRNA$colony.id
# Create PCA plot with percent variance
ggplot(pca_df, aes(x = PC1, y = PC2, color = colony.id)) +
  geom_point(size = 3) +
  labs(
    x = paste0("PC1: ", percentVar[1], "% variance"),
    y = paste0("PC2: ", percentVar[2], "% variance"),
    title = "PCA of VSD miRNA Counts"
  )## Color by colony ID ##
# Convert PCA results to data frame
pca_df <- as.data.frame(pca_res$x)
pca_df$time.point <- dds_Apul_sRNA$time.point
# Create PCA plot with percent variance
ggplot(pca_df, aes(x = PC1, y = PC2, color = time.point)) +
  geom_point(size = 3) +
  labs(
    x = paste0("PC1: ", percentVar[1], "% variance"),
    y = paste0("PC2: ", percentVar[2], "% variance"),
    title = "PCA of VSD miRNA Counts"
  )
Interesting! When considering all sRNAs, they strongly clustered by colony and not time point. However, miRNAs alone show clear clustering by time point (though some influence of colony is still apparent).
# Calculate the Euclidean distance matrix
sample_dists <- dist(t(Apul_counts_miRNA_vsd))
# Create the heatmap
pheatmap(
  as.matrix(sample_dists), 
  clustering_distance_rows = "euclidean", 
  clustering_distance_cols = "euclidean", 
  main = "Sample Clustering"
)Of all miRNAs
heat_colors <- rev(brewer.pal(12, "RdYlBu"))## Warning in brewer.pal(12, "RdYlBu"): n too large, allowed maximum for palette RdYlBu is 11
## Returning the palette you asked for with that many colorspheatmap(as.matrix(Apul_counts_miRNA_vsd[apply(Apul_counts_miRNA_vsd, 1, var) > 0, ]), 
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = heat_colors,
         scale="row")# 
# top_20_counts_all <- order(rowMeans(counts(dds,normalized=TRUE)),
#                 decreasing=TRUE)[1:200]
# 
# timepoint_annotation = colData(dds) %>% as.data.frame() %>% select(time.point)
# 
# 
# pheatmap(assay(vsd)[top_20_counts_all,], 
#          cluster_rows=FALSE, 
#          show_rownames=FALSE,
#          cluster_cols=TRUE, 
#          annotation_col = timepoint_annotation)# Perform clustering based on normalized expression
row_clustering <- hclust(dist(as.matrix(Apul_counts_miRNA_norm)))
col_clustering <- hclust(dist(t(as.matrix(Apul_counts_miRNA_norm))))
# Convert normalized counts to binary (presence/absence)
binary_counts <- Apul_counts_miRNA_norm > 0
binary_counts <- as.matrix(binary_counts) * 1  # Convert logical to numeric (1/0)
# Generate heatmap using predefined dendrograms
Heatmap(
  binary_counts,
  cluster_rows = as.dendrogram(row_clustering),  # Row clustering
  cluster_columns = as.dendrogram(col_clustering),  # Column clustering
  col = c("white", "black"),  # White for 0 (absence), black for 1 (presence)
  name = "Presence/Absence",  # Legend title
  show_row_names = TRUE,
  show_column_names = TRUE
)# Perform clustering based on presence/absence (binary data)
row_clustering <- hclust(dist(binary_counts))  # Use binary_counts for row clustering
col_clustering <- hclust(dist(t(binary_counts)))  # Use binary_counts for column clustering
# Generate heatmap using predefined dendrograms
Heatmap(
  binary_counts,
  cluster_rows = as.dendrogram(row_clustering),  # Row clustering based on presence/absence
  cluster_columns = as.dendrogram(col_clustering),  # Column clustering based on presence/absence
  col = c("lightgray", "#408EC6"),  # White for 0 (absence), black for 1 (presence)
  name = "Presence/Absence",  # Legend title
  show_row_names = TRUE,
  show_column_names = TRUE,
  row_names_gp = gpar(fontsize = 6),  # Adjust row names font size
  column_names_gp = gpar(fontsize = 8)  # Adjust column names font size
)It looks like, of the 51 total miRNAs identified, most are consistently present in all A.pulchra colonies and time points. For 10-15 of the miRNAs, presence/absence is more variable accross colonies and timepoints.
results_df <- as.data.frame(sRNA_tp2.v.tp4.results)
results_df$Name <- rownames(results_df)
# Assuming your metadata frame also has a 'Name' column
# Perform the left join to select only the genes listed in metadata
filtered_results <- dplyr::left_join(Apul_counts_miRNA_namesdf, results_df, by = "Name")
filter(filtered_results, padj < 0.4)##            Name   baseMean log2FoldChange     lfcSE     stat       pvalue
## 1 Cluster_15097   337.3828       2.074086 0.4198997 2.557958 1.052889e-02
## 2 Cluster_16354 23815.2355       2.725480 0.3413368 5.055067 4.302405e-07
##           padj
## 1 0.3452048627
## 2 0.0002620998Apul_counts_sRNA_norm$Name <- rownames(Apul_counts_sRNA_norm)
Apul_counts_sRNA_vsd$Name <- rownames(Apul_counts_sRNA_vsd)
Apul_counts_siRNA_namesdf <- data.frame(Name = rownames(Apul_counts_siRNA)) 
Apul_counts_siRNA_norm <- left_join(Apul_counts_siRNA_namesdf, Apul_counts_sRNA_norm, by = c("Name" = "Name")) %>%
  column_to_rownames(var = "Name")
write.table(Apul_counts_siRNA_norm, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_siRNA_normalized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)
Apul_counts_siRNA_vsd <- left_join(Apul_counts_siRNA_namesdf, Apul_counts_sRNA_vsd, by = c("Name" = "Name")) %>%
  column_to_rownames(var = "Name")
write.table(Apul_counts_siRNA_vsd, file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_siRNA_variancestabilized.txt", sep = "\t", row.names = TRUE, col.names = TRUE,quote = FALSE)Plot histograms of the normalized expression levels in each sample
# Melt the count matrix into long format
Apul_counts_siRNA_norm_melted <- melt(Apul_counts_siRNA_norm, variable.name = "sample", value.name = "counts")## No id variables; using all as measure variables# Plot the expression level histograms for each sample
ggplot(Apul_counts_siRNA_norm_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() +  # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(title = "Gene Expression Level Histogram for Each Sample",
       x = "Expression Level (Counts)",
       y = "Frequency") +
  theme_minimal()## Warning in scale_x_log10(): log-10 transformation introduced infinite values.## Warning: Removed 372 rows containing non-finite outside the scale range
## (`stat_bin()`).Check the total number of transcripts in each sample – now that we’ve normalized the data these totals should be similar
# Calculate the total number of transcripts for each sample
total_transcripts_siRNA_norm <- colSums(Apul_counts_siRNA_norm)
# Create a data frame for plotting
total_transcripts_siRNA_norm_df <- data.frame(sample = names(total_transcripts_siRNA_norm),
                                   totals = total_transcripts_siRNA_norm)
# Plot the total number of transcripts for each sample
ggplot(total_transcripts_siRNA_norm_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) +
  labs(title = "Total Number of siRNA Transcripts per Sample",
       x = "Sample",
       y = "Total Transcripts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readabilityOf all siRNAs
heat_colors <- rev(brewer.pal(12, "RdYlBu"))## Warning in brewer.pal(12, "RdYlBu"): n too large, allowed maximum for palette RdYlBu is 11
## Returning the palette you asked for with that many colorspheatmap(as.matrix(Apul_counts_siRNA_vsd[apply(Apul_counts_siRNA_vsd, 1, var) > 0, ]), 
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         show_colnames = TRUE,
         color = heat_colors,
         scale="row")