## current date and time is April 21, 2025 10:08:10
/home/shared/kallisto/kallisto
kallisto installed on raven?
cd data
curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna
## % Total % Received % Xferd Average Speed Time Time Time Current
## Dload Upload Total Spent Left Speed
## 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 1 209M 1 4083k 0 0 43.8M 0 0:00:04 --:--:-- 0:00:04 43.3M 54 209M 54 114M 0 0 104M 0 0:00:02 0:00:01 0:00:01 104M100 209M 100 209M 0 0 107M 0 0:00:01 0:00:01 --:--:-- 107M
/home/shared/kallisto/kallisto \
index -i \
data/cgigas_roslin_rna.index \
data/rna.fna
cd data
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
…make an output directory and populate output directory with sequence reads.
mkdir output
mkdir output/kallisto_01
find data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i data/cgigas_roslin_rna.index \
-o output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 data/{}_L001_R1_001.fastq.gz
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
--gene_trans_map none \
--out_prefix output/kallisto_01 \
--name_sample_by_basedir \
output/kallisto_01/D54_S145/abundance.tsv \
output/kallisto_01/D56_S136/abundance.tsv \
output/kallisto_01/D58_S144/abundance.tsv \
output/kallisto_01/M45_S140/abundance.tsv \
output/kallisto_01/M48_S137/abundance.tsv \
output/kallisto_01/M89_S138/abundance.tsv \
output/kallisto_01/D55_S146/abundance.tsv \
output/kallisto_01/D57_S143/abundance.tsv \
output/kallisto_01/D59_S142/abundance.tsv \
output/kallisto_01/M46_S141/abundance.tsv \
output/kallisto_01/M49_S139/abundance.tsv \
output/kallisto_01/M90_S147/abundance.tsv \
output/kallisto_01/N48_S194/abundance.tsv \
output/kallisto_01/N50_S187/abundance.tsv \
output/kallisto_01/N52_S184/abundance.tsv \
output/kallisto_01/N54_S193/abundance.tsv \
output/kallisto_01/N56_S192/abundance.tsv \
output/kallisto_01/N58_S195/abundance.tsv \
output/kallisto_01/N49_S185/abundance.tsv \
output/kallisto_01/N51_S186/abundance.tsv \
output/kallisto_01/N53_S188/abundance.tsv \
output/kallisto_01/N55_S190/abundance.tsv \
output/kallisto_01/N57_S191/abundance.tsv \
output/kallisto_01/N59_S189/abundance.tsv
#install.packages('BioManager')
#BiocManager::install("DESeq2")
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pheatmap)
library(RColorBrewer)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following object is masked from 'package:SummarizedExperiment':
##
## shift
##
## The following object is masked from 'package:GenomicRanges':
##
## shift
##
## The following object is masked from 'package:IRanges':
##
## shift
##
## The following objects are masked from 'package:S4Vectors':
##
## first, second
countmatrix <- read.delim("output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)
## D54_S145 D56_S136 D58_S144 M45_S140 M48_S137 M89_S138
## XM_034447539.1 2.700980000 0.0000000 15.4800000 4.90442 2.5252900 7.089490
## XM_034446339.1 0.019292200 0.0159715 0.0132301 40.89660 0.0195631 0.012115
## XM_011456326.3 0.000000000 0.0000000 0.0000000 0.00000 0.0000000 0.000000
## XM_011430931.3 0.000275498 10.0820000 18.2415000 0.00000 0.0000000 24.886700
## XM_034455815.1 0.000000000 0.0000000 0.0000000 0.00000 0.0000000 0.000000
## XM_034444672.1 0.000000000 1.0000000 0.0000000 0.00000 1.0000000 0.000000
## D55_S146 D57_S143 D59_S142 M46_S141 M49_S139
## XM_034447539.1 0.00000000 0.0000000 5.7772600 0.0000000 2.5252900
## XM_034446339.1 0.00788343 0.0132074 0.0100441 0.0121685 0.0210626
## XM_011456326.3 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000
## XM_011430931.3 22.39050000 29.0000000 0.0000000 18.3797000 0.0205952
## XM_034455815.1 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000
## XM_034444672.1 1.00000000 0.0000000 0.0000000 3.0000000 1.0000000
## M90_S147 N48_S194 N50_S187 N52_S184 N54_S193
## XM_034447539.1 3.95073e+00 1.691930 0.00000 0.00000000 0.0000000
## XM_034446339.1 1.55112e-02 0.013728 80.82270 0.00946109 0.0390535
## XM_011456326.3 0.00000e+00 0.000000 0.00000 0.00000000 0.0000000
## XM_011430931.3 4.60514e-07 22.414900 9.34299 15.48500000 196.6450000
## XM_034455815.1 0.00000e+00 0.000000 0.00000 0.00000000 0.0000000
## XM_034444672.1 0.00000e+00 0.000000 0.00000 0.00000000 1.0000000
## N56_S192 N58_S195 N49_S185 N51_S186 N53_S188 N55_S190
## XM_034447539.1 0.00000000 5.7296600 0.0000000 0.0000000 9.4164900 0.0000000
## XM_034446339.1 0.00678549 0.0216473 0.0139614 0.0358411 0.0251142 0.0114829
## XM_011456326.3 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## XM_011430931.3 1.73641000 34.8223000 14.6284000 18.1326000 0.0000000 23.6451000
## XM_034455815.1 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## XM_034444672.1 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## N57_S191 N59_S189
## XM_034447539.1 0.00000000 5.05059
## XM_034446339.1 0.00328544 30.00000
## XM_011456326.3 0.00000000 0.00000
## XM_011430931.3 2.84974000 51.00000
## XM_034455815.1 0.00000000 0.00000
## XM_034444672.1 4.00000000 1.00000
…rounding integers to whole numbers
countmatrix <- round(countmatrix, 0)
str(countmatrix)
## 'data.frame': 73307 obs. of 24 variables:
## $ D54_S145: num 3 0 0 0 0 0 0 1 0 0 ...
## $ D56_S136: num 0 0 0 10 0 1 0 1 0 0 ...
## $ D58_S144: num 15 0 0 18 0 0 0 3 0 0 ...
## $ M45_S140: num 5 41 0 0 0 0 0 0 0 0 ...
## $ M48_S137: num 3 0 0 0 0 1 0 1 0 0 ...
## $ M89_S138: num 7 0 0 25 0 0 0 0 0 0 ...
## $ D55_S146: num 0 0 0 22 0 1 0 0 0 0 ...
## $ D57_S143: num 0 0 0 29 0 0 0 0 0 0 ...
## $ D59_S142: num 6 0 0 0 0 0 0 3 0 0 ...
## $ M46_S141: num 0 0 0 18 0 3 0 3 0 0 ...
## $ M49_S139: num 3 0 0 0 0 1 0 1 0 0 ...
## $ M90_S147: num 4 0 0 0 0 0 13 1 0 0 ...
## $ N48_S194: num 2 0 0 22 0 0 0 1 0 0 ...
## $ N50_S187: num 0 81 0 9 0 0 0 2 0 0 ...
## $ N52_S184: num 0 0 0 15 0 0 0 0 0 0 ...
## $ N54_S193: num 0 0 0 197 0 1 0 3 0 12 ...
## $ N56_S192: num 0 0 0 2 0 0 0 0 0 0 ...
## $ N58_S195: num 6 0 0 35 0 0 0 0 0 0 ...
## $ N49_S185: num 0 0 0 15 0 0 0 0 0 0 ...
## $ N51_S186: num 0 0 0 18 0 0 0 4 0 0 ...
## $ N53_S188: num 9 0 0 0 0 0 0 0 0 0 ...
## $ N55_S190: num 0 0 0 24 0 0 0 1 0 0 ...
## $ N57_S191: num 0 0 0 3 0 4 0 0 0 0 ...
## $ N59_S189: num 5 30 0 51 0 1 0 0 0 0 ...
deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))),
type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
colData = deseq2.colData,
design = ~ condition)
## converting counts to integer mode
deseq2.dds <- DESeq(deseq2.dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5677 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
head(deseq2.res)
## log2 fold change (MLE): condition desicated vs control
## Wald test p-value: condition desicated vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## NM_001305288.1 0.181270 1.0453698 3.002647 0.348149 7.27728e-01
## NM_001305289.1 0.881457 -2.8119577 1.068276 -2.632239 8.48240e-03
## NM_001305290.1 145.913728 0.4580323 0.116185 3.942251 8.07203e-05
## NM_001305291.1 0.261701 0.5618449 1.587076 0.354013 7.23329e-01
## NM_001305292.1 2.902430 -1.2181330 0.763421 -1.595624 1.10573e-01
## NM_001305293.1 234.342117 0.0663449 0.131969 0.502731 6.15154e-01
## padj
## <numeric>
## NM_001305288.1 NA
## NM_001305289.1 NA
## NM_001305290.1 0.00956401
## NM_001305291.1 NA
## NM_001305292.1 0.59541971
## NM_001305293.1 0.95562321
vsd <- vst(deseq2.dds, blind = FALSE)
plotPCA(vsd, intgroup = "condition")
# Select top 50 differentially expressed genes
res <- results(deseq2.dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]
# Extract counts and normalize
counts <- counts(deseq2.dds, normalized = TRUE)
counts_top <- counts[top_genes, ]
# Log-transform counts
log_counts_top <- log2(counts_top + 1)
# Generate heatmap
pheatmap(log_counts_top, scale = "row")
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
## [1] 607 6
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG Dessication (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 16484 x values <= 0 omitted
## from logarithmic plot
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
# Prepare the data for plotting
res_df <- as.data.frame(deseq2.res)
res_df$gene <- row.names(res_df)
# Create volcano plot
volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("grey", "red")) +
labs(title = "Volcano Plot",
x = "Log2 Fold Change",
y = "-Log10 Adjusted P-value",
color = "Significantly\nDifferentially Expressed") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "top")
print(volcano_plot)
## Warning: Removed 41435 rows containing missing values or values outside the scale range
## (`geom_point()`).
write.table(tmp.sig, "output/DEGlist.tab", row.names = T)
library(data.table)
deglist <- fread("output/DEGlist.tab")
deglist$RowName <- rownames(deglist)
deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns
library(DT)
datatable(deglist)
gene_deg_status <- res_df %>%
mutate(degstaus = ifelse(padj < 0.05, 1, 0))
library(Biostrings)
## Loading required package: XVector
##
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
##
## compact
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
# Read the FASTA file
fasta_data <- readDNAStringSet("data/rna.fna")
# Calculate gene lengths
gene_lengths <- width(fasta_data)
# Extract gene names/IDs from sequence IDs
gene_names <- sapply(names(fasta_data), function(x) strsplit(x, " ")[[1]][1])
# Create a data frame with gene IDs and lengths
gene_lengths_df <- data.frame(geneID = gene_names, length = gene_lengths)
package goseq (with function nullp()) is not available for my version of R. Skipping this step.