This code uses RNA-Seq reads to examine differential gene expression for two treatments of an experiment.
Load R packages
library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
make sure kallisto works
#kallisto is already downloaded in Raven
/home/shared/kallisto/kallisto -h
download reference
#move to preferred directory
cd ../data
#download from url
curl -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna
create index and rename
/home/shared/kallisto/kallisto \
index -i \
../data/cgigas_roslin_rna.index \
../data/rna.fna
Download data ending in “.fastq.gz”. wget downloads multiple files recursively.
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
Create subdirectory and align reads with kallisto
#create subdirectory
mkdir ../output/kallisto_01
#run kallisto
find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz
Create gene expression matrix
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
--gene_trans_map none \
--out_prefix ../output/kallisto_01 \
--name_sample_by_basedir \
../output/kallisto_01/D54_S145/abundance.tsv \
../output/kallisto_01/D56_S136/abundance.tsv \
../output/kallisto_01/D58_S144/abundance.tsv \
../output/kallisto_01/M45_S140/abundance.tsv \
../output/kallisto_01/M48_S137/abundance.tsv \
../output/kallisto_01/M89_S138/abundance.tsv \
../output/kallisto_01/D55_S146/abundance.tsv \
../output/kallisto_01/D57_S143/abundance.tsv \
../output/kallisto_01/D59_S142/abundance.tsv \
../output/kallisto_01/M46_S141/abundance.tsv \
../output/kallisto_01/M49_S139/abundance.tsv \
../output/kallisto_01/M90_S147/abundance.tsv \
../output/kallisto_01/N48_S194/abundance.tsv \
../output/kallisto_01/N50_S187/abundance.tsv \
../output/kallisto_01/N52_S184/abundance.tsv \
../output/kallisto_01/N54_S193/abundance.tsv \
../output/kallisto_01/N56_S192/abundance.tsv \
../output/kallisto_01/N58_S195/abundance.tsv \
../output/kallisto_01/N49_S185/abundance.tsv \
../output/kallisto_01/N51_S186/abundance.tsv \
../output/kallisto_01/N53_S188/abundance.tsv \
../output/kallisto_01/N55_S190/abundance.tsv \
../output/kallisto_01/N57_S191/abundance.tsv \
../output/kallisto_01/N59_S189/abundance.tsv
Now that the reads have been downloaded and aligned, we can conduct differential gene expression analysis.
Read in count matrix
#read matrix into r
countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
#explore table
head(countmatrix)
## D54_S145 D56_S136 D58_S144 M45_S140 M48_S137
## XM_034462753.1 30.00000000 2.90000e+01 1.21536e-05 4.14533e-01 34.000000
## XM_020072414.2 0.00481712 7.44551e-04 2.81572e-03 4.49804e-05 0.000000
## XM_011449807.3 0.00000000 2.00000e+00 2.00000e+00 0.00000e+00 4.000000
## XM_011450187.3 0.00000000 0.00000e+00 0.00000e+00 0.00000e+00 0.000000
## XM_034478413.1 0.00000000 0.00000e+00 0.00000e+00 0.00000e+00 0.000000
## XM_034452906.1 0.00000000 0.00000e+00 0.00000e+00 0.00000e+00 0.631953
## M89_S138 D55_S146 D57_S143 D59_S142 M46_S141
## XM_034462753.1 24.00000000 4.59454e+01 34.95200000 83.26640000 6.36312e+01
## XM_020072414.2 0.00186705 6.22574e-05 0.00127031 0.00400573 6.14065e-05
## XM_011449807.3 3.00000000 2.00000e+00 1.00000000 0.00000000 0.00000e+00
## XM_011450187.3 0.00000000 0.00000e+00 0.00000000 0.00000000 0.00000e+00
## XM_034478413.1 0.00000000 0.00000e+00 0.00000000 0.00000000 0.00000e+00
## XM_034452906.1 0.00000000 0.00000e+00 0.00000000 0.26390500 1.00000e+00
## M49_S139 M90_S147 N48_S194 N50_S187 N52_S184
## XM_034462753.1 42.50120000 75.3478 3.11802e+01 22.39470000 7.85190e+01
## XM_020072414.2 0.00610825 0.0000 4.23498e-05 0.00014354 1.95711e-04
## XM_011449807.3 0.00000000 1.0000 1.44246e+01 1.00000000 3.00000e+00
## XM_011450187.3 0.00000000 0.0000 0.00000e+00 0.00000000 0.00000e+00
## XM_034478413.1 0.00000000 0.0000 0.00000e+00 0.00000000 0.00000e+00
## XM_034452906.1 0.00000000 0.0000 1.00000e+00 0.00000000 8.00000e+00
## N54_S193 N56_S192 N58_S195 N49_S185 N51_S186
## XM_034462753.1 1.30000e+01 0.999993 4.95211e+01 13.0000000 33.08290000
## XM_020072414.2 2.98891e-04 0.000000 6.96225e-04 0.0153261 0.00113443
## XM_011449807.3 4.00000e+00 0.000000 1.00000e+00 0.0000000 0.00000000
## XM_011450187.3 0.00000e+00 0.000000 0.00000e+00 0.0000000 0.00000000
## XM_034478413.1 0.00000e+00 0.000000 0.00000e+00 0.0000000 0.00000000
## XM_034452906.1 1.34310e+01 0.000000 0.00000e+00 3.0000000 0.00000000
## N53_S188 N55_S190 N57_S191 N59_S189
## XM_034462753.1 36.49610000 3.40688000 1.88544e+01 4.86806e+01
## XM_020072414.2 0.00013002 0.00129055 4.30867e-05 7.61030e-05
## XM_011449807.3 8.00000000 3.00000000 5.00000e+00 0.00000e+00
## XM_011450187.3 0.00000000 0.00000000 0.00000e+00 0.00000e+00
## XM_034478413.1 0.00000000 0.00000000 0.00000e+00 0.00000e+00
## XM_034452906.1 0.00000000 0.00000000 6.34540e+00 0.00000e+00
round integers up
countmatrix <- round(countmatrix, 0)
str(countmatrix)
## 'data.frame': 73307 obs. of 24 variables:
## $ D54_S145: num 30 0 0 0 0 0 18 0 41 1 ...
## $ D56_S136: num 29 0 2 0 0 0 21 0 35 9 ...
## $ D58_S144: num 0 0 2 0 0 0 7 0 41 7 ...
## $ M45_S140: num 0 0 0 0 0 0 6 0 34 4 ...
## $ M48_S137: num 34 0 4 0 0 1 9 0 0 1 ...
## $ M89_S138: num 24 0 3 0 0 0 15 0 38 16 ...
## $ D55_S146: num 46 0 2 0 0 0 15 0 40 12 ...
## $ D57_S143: num 35 0 1 0 0 0 6 0 28 2 ...
## $ D59_S142: num 83 0 0 0 0 0 24 0 31 14 ...
## $ M46_S141: num 64 0 0 0 0 1 7 0 0 3 ...
## $ M49_S139: num 43 0 0 0 0 0 14 0 37 11 ...
## $ M90_S147: num 75 0 1 0 0 0 13 0 35 1 ...
## $ N48_S194: num 31 0 14 0 0 1 8 0 28 8 ...
## $ N50_S187: num 22 0 1 0 0 0 17 0 43 5 ...
## $ N52_S184: num 79 0 3 0 0 8 7 0 18 0 ...
## $ N54_S193: num 13 0 4 0 0 13 2 0 39 12 ...
## $ N56_S192: num 1 0 0 0 0 0 0 0 4 1 ...
## $ N58_S195: num 50 0 1 0 0 0 11 0 25 3 ...
## $ N49_S185: num 13 0 0 0 0 3 3 0 18 12 ...
## $ N51_S186: num 33 0 0 0 0 0 6 0 26 0 ...
## $ N53_S188: num 36 0 8 0 0 0 24 0 0 0 ...
## $ N55_S190: num 3 0 3 0 0 0 3 0 20 0 ...
## $ N57_S191: num 19 0 5 0 0 6 8 0 0 18 ...
## $ N59_S189: num 49 0 0 0 0 0 11 0 7 26 ...
Get DEGs based on Desication
deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))),
type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
colData = deseq2.colData,
design = ~ condition)
Use DESeq
deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
Explore results
head(deseq2.res)
## log2 fold change (MLE): condition desicated vs control
## Wald test p-value: condition desicated vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## NM_001305288.1 0.181270 1.0453698 3.002647 0.348149 7.27728e-01
## NM_001305289.1 0.881457 -2.8119577 1.068276 -2.632239 8.48240e-03
## NM_001305290.1 145.913728 0.4580323 0.116185 3.942251 8.07203e-05
## NM_001305291.1 0.261701 0.5618449 1.587076 0.354013 7.23329e-01
## NM_001305292.1 2.902430 -1.2181330 0.763421 -1.595624 1.10573e-01
## NM_001305293.1 234.342117 0.0663449 0.131969 0.502731 6.15154e-01
## padj
## <numeric>
## NM_001305288.1 NA
## NM_001305289.1 NA
## NM_001305290.1 0.00956401
## NM_001305291.1 NA
## NM_001305292.1 0.59541971
## NM_001305293.1 0.95562321
Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
## [1] 607 6
plot significant points
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG Dessication (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
make table
write.table(tmp.sig, "../output/DEGlist.tab", sep = '\t', row.names = T)
deglist <- read.csv("../output/DEGlist.tab", sep = '\t', header = TRUE)
deglist$RowName <- rownames(deglist)
datatable(deglist)
Look at top 9 some genes of interest
#examine data
colData(deseq2.dds)
## DataFrame with 24 rows and 4 columns
## condition type sizeFactor replaceable
## <factor> <factor> <numeric> <logical>
## D54_S145 control single-read 0.964999 TRUE
## D56_S136 control single-read 1.253452 TRUE
## D58_S144 control single-read 1.164245 TRUE
## M45_S140 control single-read 1.240188 TRUE
## M48_S137 control single-read 1.048000 TRUE
## ... ... ... ... ...
## N51_S186 desicated single-read 1.286115 TRUE
## N53_S188 desicated single-read 1.182073 TRUE
## N55_S190 desicated single-read 0.889093 TRUE
## N57_S191 desicated single-read 0.707224 TRUE
## N59_S189 desicated single-read 1.126471 TRUE
# get results and arrange by pvalue
res <- results(deseq2.dds, tidy=TRUE, contrast=c("condition", "control", "desicated")) %>%
arrange(padj, pvalue) %>%
as_tibble()
# Define the top 10 genes of interest
goi <- res$row[1:9]
stopifnot(all(goi %in% names(deseq2.dds)))
goi
## [1] "XM_011456111.3" "XM_011456107.3" "XR_004597797.1" "XM_011437990.3"
## [5] "NM_001305333.1" "XM_011457656.3" "NM_001308913.1" "XM_034482887.1"
## [9] "XM_034443058.1"
make count matrix
#make matrix
tcounts <- t(log2((counts(deseq2.dds[goi, ], normalized=TRUE, replaced=FALSE)+.5))) %>%
merge(colData(deseq2.dds), ., by="row.names") %>%
gather(gene, expression, (ncol(.)-length(goi)+1):ncol(.))
#take a look at matrix
tcounts %>%
select(Row.names, condition, gene, expression) %>%
head %>%
knitr::kable()
Row.names | condition | gene | expression |
---|---|---|---|
D54_S145 | control | XM_011456111.3 | 8.339032 |
D55_S146 | control | XM_011456111.3 | 8.366786 |
D56_S136 | control | XM_011456111.3 | 8.305731 |
D57_S143 | control | XM_011456111.3 | 8.171762 |
D58_S144 | control | XM_011456111.3 | 8.068697 |
D59_S142 | control | XM_011456111.3 | 8.395216 |
create box plot
ggplot(tcounts, aes(condition, expression)) +
geom_boxplot() +
facet_wrap(~gene, scales="free_y") +
labs(x="treatment",
y="Expression (log normalized counts)",
title="Top Nine Gene Results")+
theme_bw()