## current date and time is  April 21, 2025 10:08:10

1 RNA seq

/home/shared/kallisto/kallisto

kallisto installed on raven?

cd data
curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna

##   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
##                                  Dload  Upload   Total   Spent    Left  Speed
##   0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  1  209M    1 4083k    0     0  43.8M      0  0:00:04 --:--:--  0:00:04 43.3M 54  209M   54  114M    0     0   104M      0  0:00:02  0:00:01  0:00:01  104M100  209M  100  209M    0     0   107M      0  0:00:01  0:00:01 --:--:--  107M

2 Downloading reference genome

/home/shared/kallisto/kallisto \
index -i \
data/cgigas_roslin_rna.index \
data/rna.fna

3 Downloading sequence reads

cd data 
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/

…make an output directory and populate output directory with sequence reads.

mkdir output
mkdir output/kallisto_01

find data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i data/cgigas_roslin_rna.index \
-o output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 data/{}_L001_R1_001.fastq.gz

4 Create gene expression matrix

perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
    --gene_trans_map none \
    --out_prefix output/kallisto_01 \
    --name_sample_by_basedir \
    output/kallisto_01/D54_S145/abundance.tsv \
    output/kallisto_01/D56_S136/abundance.tsv \
    output/kallisto_01/D58_S144/abundance.tsv \
    output/kallisto_01/M45_S140/abundance.tsv \
    output/kallisto_01/M48_S137/abundance.tsv \
    output/kallisto_01/M89_S138/abundance.tsv \
    output/kallisto_01/D55_S146/abundance.tsv \
    output/kallisto_01/D57_S143/abundance.tsv \
    output/kallisto_01/D59_S142/abundance.tsv \
    output/kallisto_01/M46_S141/abundance.tsv \
    output/kallisto_01/M49_S139/abundance.tsv \
    output/kallisto_01/M90_S147/abundance.tsv \
    output/kallisto_01/N48_S194/abundance.tsv \
    output/kallisto_01/N50_S187/abundance.tsv \
    output/kallisto_01/N52_S184/abundance.tsv \
    output/kallisto_01/N54_S193/abundance.tsv \
    output/kallisto_01/N56_S192/abundance.tsv \
    output/kallisto_01/N58_S195/abundance.tsv \
    output/kallisto_01/N49_S185/abundance.tsv \
    output/kallisto_01/N51_S186/abundance.tsv \
    output/kallisto_01/N53_S188/abundance.tsv \
    output/kallisto_01/N55_S190/abundance.tsv \
    output/kallisto_01/N57_S191/abundance.tsv \
    output/kallisto_01/N59_S189/abundance.tsv

5 Run DESeq2

#install.packages('BioManager')
#BiocManager::install("DESeq2")
library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(pheatmap)
library(RColorBrewer)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following object is masked from 'package:SummarizedExperiment':
## 
##     shift
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     shift
## 
## The following object is masked from 'package:IRanges':
## 
##     shift
## 
## The following objects are masked from 'package:S4Vectors':
## 
##     first, second

6 Read in the count matrix

countmatrix <- read.delim("output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)

##                   D54_S145   D56_S136   D58_S144 M45_S140  M48_S137  M89_S138
## XM_034447539.1 2.700980000  0.0000000 15.4800000  4.90442 2.5252900  7.089490
## XM_034446339.1 0.019292200  0.0159715  0.0132301 40.89660 0.0195631  0.012115
## XM_011456326.3 0.000000000  0.0000000  0.0000000  0.00000 0.0000000  0.000000
## XM_011430931.3 0.000275498 10.0820000 18.2415000  0.00000 0.0000000 24.886700
## XM_034455815.1 0.000000000  0.0000000  0.0000000  0.00000 0.0000000  0.000000
## XM_034444672.1 0.000000000  1.0000000  0.0000000  0.00000 1.0000000  0.000000
##                   D55_S146   D57_S143  D59_S142   M46_S141  M49_S139
## XM_034447539.1  0.00000000  0.0000000 5.7772600  0.0000000 2.5252900
## XM_034446339.1  0.00788343  0.0132074 0.0100441  0.0121685 0.0210626
## XM_011456326.3  0.00000000  0.0000000 0.0000000  0.0000000 0.0000000
## XM_011430931.3 22.39050000 29.0000000 0.0000000 18.3797000 0.0205952
## XM_034455815.1  0.00000000  0.0000000 0.0000000  0.0000000 0.0000000
## XM_034444672.1  1.00000000  0.0000000 0.0000000  3.0000000 1.0000000
##                   M90_S147  N48_S194 N50_S187    N52_S184    N54_S193
## XM_034447539.1 3.95073e+00  1.691930  0.00000  0.00000000   0.0000000
## XM_034446339.1 1.55112e-02  0.013728 80.82270  0.00946109   0.0390535
## XM_011456326.3 0.00000e+00  0.000000  0.00000  0.00000000   0.0000000
## XM_011430931.3 4.60514e-07 22.414900  9.34299 15.48500000 196.6450000
## XM_034455815.1 0.00000e+00  0.000000  0.00000  0.00000000   0.0000000
## XM_034444672.1 0.00000e+00  0.000000  0.00000  0.00000000   1.0000000
##                  N56_S192   N58_S195   N49_S185   N51_S186  N53_S188   N55_S190
## XM_034447539.1 0.00000000  5.7296600  0.0000000  0.0000000 9.4164900  0.0000000
## XM_034446339.1 0.00678549  0.0216473  0.0139614  0.0358411 0.0251142  0.0114829
## XM_011456326.3 0.00000000  0.0000000  0.0000000  0.0000000 0.0000000  0.0000000
## XM_011430931.3 1.73641000 34.8223000 14.6284000 18.1326000 0.0000000 23.6451000
## XM_034455815.1 0.00000000  0.0000000  0.0000000  0.0000000 0.0000000  0.0000000
## XM_034444672.1 0.00000000  0.0000000  0.0000000  0.0000000 0.0000000  0.0000000
##                  N57_S191 N59_S189
## XM_034447539.1 0.00000000  5.05059
## XM_034446339.1 0.00328544 30.00000
## XM_011456326.3 0.00000000  0.00000
## XM_011430931.3 2.84974000 51.00000
## XM_034455815.1 0.00000000  0.00000
## XM_034444672.1 4.00000000  1.00000

…rounding integers to whole numbers

countmatrix <- round(countmatrix, 0)
str(countmatrix)

## 'data.frame':    73307 obs. of  24 variables:
##  $ D54_S145: num  3 0 0 0 0 0 0 1 0 0 ...
##  $ D56_S136: num  0 0 0 10 0 1 0 1 0 0 ...
##  $ D58_S144: num  15 0 0 18 0 0 0 3 0 0 ...
##  $ M45_S140: num  5 41 0 0 0 0 0 0 0 0 ...
##  $ M48_S137: num  3 0 0 0 0 1 0 1 0 0 ...
##  $ M89_S138: num  7 0 0 25 0 0 0 0 0 0 ...
##  $ D55_S146: num  0 0 0 22 0 1 0 0 0 0 ...
##  $ D57_S143: num  0 0 0 29 0 0 0 0 0 0 ...
##  $ D59_S142: num  6 0 0 0 0 0 0 3 0 0 ...
##  $ M46_S141: num  0 0 0 18 0 3 0 3 0 0 ...
##  $ M49_S139: num  3 0 0 0 0 1 0 1 0 0 ...
##  $ M90_S147: num  4 0 0 0 0 0 13 1 0 0 ...
##  $ N48_S194: num  2 0 0 22 0 0 0 1 0 0 ...
##  $ N50_S187: num  0 81 0 9 0 0 0 2 0 0 ...
##  $ N52_S184: num  0 0 0 15 0 0 0 0 0 0 ...
##  $ N54_S193: num  0 0 0 197 0 1 0 3 0 12 ...
##  $ N56_S192: num  0 0 0 2 0 0 0 0 0 0 ...
##  $ N58_S195: num  6 0 0 35 0 0 0 0 0 0 ...
##  $ N49_S185: num  0 0 0 15 0 0 0 0 0 0 ...
##  $ N51_S186: num  0 0 0 18 0 0 0 4 0 0 ...
##  $ N53_S188: num  9 0 0 0 0 0 0 0 0 0 ...
##  $ N55_S190: num  0 0 0 24 0 0 0 1 0 0 ...
##  $ N57_S191: num  0 0 0 3 0 4 0 0 0 0 ...
##  $ N59_S189: num  5 30 0 51 0 1 0 0 0 0 ...

7 Get DEGs based on Desication

deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), 
                             type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)

## converting counts to integer mode

deseq2.dds <- DESeq(deseq2.dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 5677 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]

head(deseq2.res)

## log2 fold change (MLE): condition desicated vs control 
## Wald test p-value: condition desicated vs control 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## NM_001305288.1   0.181270      1.0453698  3.002647  0.348149 7.27728e-01
## NM_001305289.1   0.881457     -2.8119577  1.068276 -2.632239 8.48240e-03
## NM_001305290.1 145.913728      0.4580323  0.116185  3.942251 8.07203e-05
## NM_001305291.1   0.261701      0.5618449  1.587076  0.354013 7.23329e-01
## NM_001305292.1   2.902430     -1.2181330  0.763421 -1.595624 1.10573e-01
## NM_001305293.1 234.342117      0.0663449  0.131969  0.502731 6.15154e-01
##                      padj
##                 <numeric>
## NM_001305288.1         NA
## NM_001305289.1         NA
## NM_001305290.1 0.00956401
## NM_001305291.1         NA
## NM_001305292.1 0.59541971
## NM_001305293.1 0.95562321

vsd <- vst(deseq2.dds, blind = FALSE)
plotPCA(vsd, intgroup = "condition")

# Select top 50 differentially expressed genes
res <- results(deseq2.dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]

# Extract counts and normalize
counts <- counts(deseq2.dds, normalized = TRUE)
counts_top <- counts[top_genes, ]

# Log-transform counts
log_counts_top <- log2(counts_top + 1)

# Generate heatmap
pheatmap(log_counts_top, scale = "row")

# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])

## [1] 607   6

tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")

## Warning in xy.coords(x, y, xlabel, ylabel, log): 16484 x values <= 0 omitted
## from logarithmic plot

# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")

# Prepare the data for plotting
res_df <- as.data.frame(deseq2.res)
res_df$gene <- row.names(res_df)

# Create volcano plot
volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("grey", "red")) +
  labs(title = "Volcano Plot",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-value",
       color = "Significantly\nDifferentially Expressed") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "top")

print(volcano_plot)

## Warning: Removed 41435 rows containing missing values or values outside the scale range
## (`geom_point()`).

write.table(tmp.sig, "output/DEGlist.tab", row.names = T)

library(data.table)
deglist <- fread("output/DEGlist.tab")

deglist$RowName <- rownames(deglist)
deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns

library(DT)
datatable(deglist)

8 Gene Enrichment Analysis

gene_deg_status <- res_df %>%
  mutate(degstaus = ifelse(padj < 0.05, 1, 0))

library(Biostrings)

## Loading required package: XVector

## 
## Attaching package: 'XVector'

## The following object is masked from 'package:purrr':
## 
##     compact

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

# Read the FASTA file
fasta_data <- readDNAStringSet("data/rna.fna")

# Calculate gene lengths
gene_lengths <- width(fasta_data)


# Extract gene names/IDs from sequence IDs
gene_names <- sapply(names(fasta_data), function(x) strsplit(x, " ")[[1]][1])

# Create a data frame with gene IDs and lengths
gene_lengths_df <- data.frame(geneID = gene_names, length = gene_lengths)

9 Need GO Mappings

package goseq (with function nullp()) is not available for my version of R. Skipping this step.

03.1-BLAST

Lara Breitkreutz

April 21, 2025