This code uses RNA-Seq reads to examine differential gene expression for two treatments of an experiment.

Load packages

Load R packages

library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)

Kallisto alignment

make sure kallisto works

#kallisto is already downloaded in Raven
/home/shared/kallisto/kallisto -h

download reference

#move to preferred directory
cd ../data

#download from url
curl -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna

create index and rename

/home/shared/kallisto/kallisto \
index -i \
../data/cgigas_roslin_rna.index \
../data/rna.fna

Download data ending in “.fastq.gz”. wget downloads multiple files recursively.

wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/

Create subdirectory and align reads with kallisto

#create subdirectory
mkdir ../output/kallisto_01

#run kallisto
find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz

Create gene expression matrix

perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
    --gene_trans_map none \
    --out_prefix ../output/kallisto_01 \
    --name_sample_by_basedir \
    ../output/kallisto_01/D54_S145/abundance.tsv \
    ../output/kallisto_01/D56_S136/abundance.tsv \
    ../output/kallisto_01/D58_S144/abundance.tsv \
    ../output/kallisto_01/M45_S140/abundance.tsv \
    ../output/kallisto_01/M48_S137/abundance.tsv \
    ../output/kallisto_01/M89_S138/abundance.tsv \
    ../output/kallisto_01/D55_S146/abundance.tsv \
    ../output/kallisto_01/D57_S143/abundance.tsv \
    ../output/kallisto_01/D59_S142/abundance.tsv \
    ../output/kallisto_01/M46_S141/abundance.tsv \
    ../output/kallisto_01/M49_S139/abundance.tsv \
    ../output/kallisto_01/M90_S147/abundance.tsv \
    ../output/kallisto_01/N48_S194/abundance.tsv \
    ../output/kallisto_01/N50_S187/abundance.tsv \
    ../output/kallisto_01/N52_S184/abundance.tsv \
    ../output/kallisto_01/N54_S193/abundance.tsv \
    ../output/kallisto_01/N56_S192/abundance.tsv \
    ../output/kallisto_01/N58_S195/abundance.tsv \
    ../output/kallisto_01/N49_S185/abundance.tsv \
    ../output/kallisto_01/N51_S186/abundance.tsv \
    ../output/kallisto_01/N53_S188/abundance.tsv \
    ../output/kallisto_01/N55_S190/abundance.tsv \
    ../output/kallisto_01/N57_S191/abundance.tsv \
    ../output/kallisto_01/N59_S189/abundance.tsv

Run DESeq2

Now that the reads have been downloaded and aligned, we can conduct differential gene expression analysis.

Read in count matrix

#read matrix into r
countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]

#explore table
head(countmatrix)
##                   D54_S145    D56_S136    D58_S144    M45_S140  M48_S137
## XM_034462753.1 30.00000000 2.90000e+01 1.21536e-05 4.14533e-01 34.000000
## XM_020072414.2  0.00481712 7.44551e-04 2.81572e-03 4.49804e-05  0.000000
## XM_011449807.3  0.00000000 2.00000e+00 2.00000e+00 0.00000e+00  4.000000
## XM_011450187.3  0.00000000 0.00000e+00 0.00000e+00 0.00000e+00  0.000000
## XM_034478413.1  0.00000000 0.00000e+00 0.00000e+00 0.00000e+00  0.000000
## XM_034452906.1  0.00000000 0.00000e+00 0.00000e+00 0.00000e+00  0.631953
##                   M89_S138    D55_S146    D57_S143    D59_S142    M46_S141
## XM_034462753.1 24.00000000 4.59454e+01 34.95200000 83.26640000 6.36312e+01
## XM_020072414.2  0.00186705 6.22574e-05  0.00127031  0.00400573 6.14065e-05
## XM_011449807.3  3.00000000 2.00000e+00  1.00000000  0.00000000 0.00000e+00
## XM_011450187.3  0.00000000 0.00000e+00  0.00000000  0.00000000 0.00000e+00
## XM_034478413.1  0.00000000 0.00000e+00  0.00000000  0.00000000 0.00000e+00
## XM_034452906.1  0.00000000 0.00000e+00  0.00000000  0.26390500 1.00000e+00
##                   M49_S139 M90_S147    N48_S194    N50_S187    N52_S184
## XM_034462753.1 42.50120000  75.3478 3.11802e+01 22.39470000 7.85190e+01
## XM_020072414.2  0.00610825   0.0000 4.23498e-05  0.00014354 1.95711e-04
## XM_011449807.3  0.00000000   1.0000 1.44246e+01  1.00000000 3.00000e+00
## XM_011450187.3  0.00000000   0.0000 0.00000e+00  0.00000000 0.00000e+00
## XM_034478413.1  0.00000000   0.0000 0.00000e+00  0.00000000 0.00000e+00
## XM_034452906.1  0.00000000   0.0000 1.00000e+00  0.00000000 8.00000e+00
##                   N54_S193 N56_S192    N58_S195   N49_S185    N51_S186
## XM_034462753.1 1.30000e+01 0.999993 4.95211e+01 13.0000000 33.08290000
## XM_020072414.2 2.98891e-04 0.000000 6.96225e-04  0.0153261  0.00113443
## XM_011449807.3 4.00000e+00 0.000000 1.00000e+00  0.0000000  0.00000000
## XM_011450187.3 0.00000e+00 0.000000 0.00000e+00  0.0000000  0.00000000
## XM_034478413.1 0.00000e+00 0.000000 0.00000e+00  0.0000000  0.00000000
## XM_034452906.1 1.34310e+01 0.000000 0.00000e+00  3.0000000  0.00000000
##                   N53_S188   N55_S190    N57_S191    N59_S189
## XM_034462753.1 36.49610000 3.40688000 1.88544e+01 4.86806e+01
## XM_020072414.2  0.00013002 0.00129055 4.30867e-05 7.61030e-05
## XM_011449807.3  8.00000000 3.00000000 5.00000e+00 0.00000e+00
## XM_011450187.3  0.00000000 0.00000000 0.00000e+00 0.00000e+00
## XM_034478413.1  0.00000000 0.00000000 0.00000e+00 0.00000e+00
## XM_034452906.1  0.00000000 0.00000000 6.34540e+00 0.00000e+00

round integers up

countmatrix <- round(countmatrix, 0)
str(countmatrix)
## 'data.frame':    73307 obs. of  24 variables:
##  $ D54_S145: num  30 0 0 0 0 0 18 0 41 1 ...
##  $ D56_S136: num  29 0 2 0 0 0 21 0 35 9 ...
##  $ D58_S144: num  0 0 2 0 0 0 7 0 41 7 ...
##  $ M45_S140: num  0 0 0 0 0 0 6 0 34 4 ...
##  $ M48_S137: num  34 0 4 0 0 1 9 0 0 1 ...
##  $ M89_S138: num  24 0 3 0 0 0 15 0 38 16 ...
##  $ D55_S146: num  46 0 2 0 0 0 15 0 40 12 ...
##  $ D57_S143: num  35 0 1 0 0 0 6 0 28 2 ...
##  $ D59_S142: num  83 0 0 0 0 0 24 0 31 14 ...
##  $ M46_S141: num  64 0 0 0 0 1 7 0 0 3 ...
##  $ M49_S139: num  43 0 0 0 0 0 14 0 37 11 ...
##  $ M90_S147: num  75 0 1 0 0 0 13 0 35 1 ...
##  $ N48_S194: num  31 0 14 0 0 1 8 0 28 8 ...
##  $ N50_S187: num  22 0 1 0 0 0 17 0 43 5 ...
##  $ N52_S184: num  79 0 3 0 0 8 7 0 18 0 ...
##  $ N54_S193: num  13 0 4 0 0 13 2 0 39 12 ...
##  $ N56_S192: num  1 0 0 0 0 0 0 0 4 1 ...
##  $ N58_S195: num  50 0 1 0 0 0 11 0 25 3 ...
##  $ N49_S185: num  13 0 0 0 0 3 3 0 18 12 ...
##  $ N51_S186: num  33 0 0 0 0 0 6 0 26 0 ...
##  $ N53_S188: num  36 0 8 0 0 0 24 0 0 0 ...
##  $ N55_S190: num  3 0 3 0 0 0 3 0 20 0 ...
##  $ N57_S191: num  19 0 5 0 0 6 8 0 0 18 ...
##  $ N59_S189: num  49 0 0 0 0 0 11 0 7 26 ...

Get DEGs based on Desication

deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), 
                             type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)

Use DESeq

deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]

Explore results

head(deseq2.res)
## log2 fold change (MLE): condition desicated vs control 
## Wald test p-value: condition desicated vs control 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## NM_001305288.1   0.181270      1.0453698  3.002647  0.348149 7.27728e-01
## NM_001305289.1   0.881457     -2.8119577  1.068276 -2.632239 8.48240e-03
## NM_001305290.1 145.913728      0.4580323  0.116185  3.942251 8.07203e-05
## NM_001305291.1   0.261701      0.5618449  1.587076  0.354013 7.23329e-01
## NM_001305292.1   2.902430     -1.2181330  0.763421 -1.595624 1.10573e-01
## NM_001305293.1 234.342117      0.0663449  0.131969  0.502731 6.15154e-01
##                      padj
##                 <numeric>
## NM_001305288.1         NA
## NM_001305289.1         NA
## NM_001305290.1 0.00956401
## NM_001305291.1         NA
## NM_001305292.1 0.59541971
## NM_001305293.1 0.95562321

Visualize Data

Count number of hits with adjusted p-value less then 0.05

dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
## [1] 607   6

plot significant points

tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")

make table

write.table(tmp.sig, "../output/DEGlist.tab", sep = '\t', row.names = T)
deglist <- read.csv("../output/DEGlist.tab", sep = '\t', header = TRUE)
deglist$RowName <- rownames(deglist)
datatable(deglist)

Look at top 9 some genes of interest

#examine data
colData(deseq2.dds)
## DataFrame with 24 rows and 4 columns
##          condition        type sizeFactor replaceable
##           <factor>    <factor>  <numeric>   <logical>
## D54_S145   control single-read   0.964999        TRUE
## D56_S136   control single-read   1.253452        TRUE
## D58_S144   control single-read   1.164245        TRUE
## M45_S140   control single-read   1.240188        TRUE
## M48_S137   control single-read   1.048000        TRUE
## ...            ...         ...        ...         ...
## N51_S186 desicated single-read   1.286115        TRUE
## N53_S188 desicated single-read   1.182073        TRUE
## N55_S190 desicated single-read   0.889093        TRUE
## N57_S191 desicated single-read   0.707224        TRUE
## N59_S189 desicated single-read   1.126471        TRUE
# get results and arrange by pvalue
res <- results(deseq2.dds, tidy=TRUE, contrast=c("condition", "control", "desicated")) %>%
  arrange(padj, pvalue) %>%
  as_tibble()

# Define the top 10 genes of interest
goi <- res$row[1:9]
stopifnot(all(goi %in% names(deseq2.dds)))
goi
## [1] "XM_011456111.3" "XM_011456107.3" "XR_004597797.1" "XM_011437990.3"
## [5] "NM_001305333.1" "XM_011457656.3" "NM_001308913.1" "XM_034482887.1"
## [9] "XM_034443058.1"

make count matrix

#make matrix
tcounts <- t(log2((counts(deseq2.dds[goi, ], normalized=TRUE, replaced=FALSE)+.5))) %>%
  merge(colData(deseq2.dds), ., by="row.names") %>%
  gather(gene, expression, (ncol(.)-length(goi)+1):ncol(.))

#take a look at matrix
tcounts %>% 
  select(Row.names, condition, gene, expression) %>% 
  head %>% 
  knitr::kable()
Row.names condition gene expression
D54_S145 control XM_011456111.3 8.339032
D55_S146 control XM_011456111.3 8.366786
D56_S136 control XM_011456111.3 8.305731
D57_S143 control XM_011456111.3 8.171762
D58_S144 control XM_011456111.3 8.068697
D59_S142 control XM_011456111.3 8.395216

create box plot

ggplot(tcounts, aes(condition, expression)) + 
  geom_boxplot() + 
  facet_wrap(~gene, scales="free_y") + 
  labs(x="treatment", 
       y="Expression (log normalized counts)", 
       title="Top Nine Gene Results")+
  theme_bw()