library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
First, we need to download and index a reference genome, as well as download the RNA transcripts from the experiments.
cd ../data
curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna
/home/shared/kallisto/kallisto \
index -i \
../data/cgigas_roslin_rna.index \
../data/rna.fna
cd ../data
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
Next, we need to use kallisto to quantify the transcript data - linking the rna transcripts to the specific genes they originated from.
mkdir ../output/kallisto_01
find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz
Now that each gene has been quantified, we’ll compile the data into a gene expression matrix for each sample, once again using Kallisto.
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
--gene_trans_map none \
--out_prefix ../output/kallisto_01 \
--name_sample_by_basedir \
../output/kallisto_01/D54_S145/abundance.tsv \
../output/kallisto_01/D56_S136/abundance.tsv \
../output/kallisto_01/D58_S144/abundance.tsv \
../output/kallisto_01/M45_S140/abundance.tsv \
../output/kallisto_01/M48_S137/abundance.tsv \
../output/kallisto_01/M89_S138/abundance.tsv \
../output/kallisto_01/D55_S146/abundance.tsv \
../output/kallisto_01/D57_S143/abundance.tsv \
../output/kallisto_01/D59_S142/abundance.tsv \
../output/kallisto_01/M46_S141/abundance.tsv \
../output/kallisto_01/M49_S139/abundance.tsv \
../output/kallisto_01/M90_S147/abundance.tsv \
../output/kallisto_01/N48_S194/abundance.tsv \
../output/kallisto_01/N50_S187/abundance.tsv \
../output/kallisto_01/N52_S184/abundance.tsv \
../output/kallisto_01/N54_S193/abundance.tsv \
../output/kallisto_01/N56_S192/abundance.tsv \
../output/kallisto_01/N58_S195/abundance.tsv \
../output/kallisto_01/N49_S185/abundance.tsv \
../output/kallisto_01/N51_S186/abundance.tsv \
../output/kallisto_01/N53_S188/abundance.tsv \
../output/kallisto_01/N55_S190/abundance.tsv \
../output/kallisto_01/N57_S191/abundance.tsv \
../output/kallisto_01/N59_S189/abundance.tsv
Now we want to make sure the matrix actually is showing the counts and that it’s in the appropriate data format to move to DESeq2, so the following code runs that conversion.
countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)
## D54_S145 D56_S136 D58_S144 M45_S140 M48_S137 M89_S138 D55_S146
## XM_011422361.3 7 4 2 4 4 3 2
## XR_902772.3 0 0 0 0 0 0 0
## XM_034483157.1 0 0 0 0 0 0 0
## XM_011445347.3 2 17 0 0 0 0 6
## XM_034443835.1 127 106 87 117 104 76 104
## XM_034446705.1 0 0 0 0 0 0 0
## D57_S143 D59_S142 M46_S141 M49_S139 M90_S147 N48_S194 N50_S187
## XM_011422361.3 3 4.0000 2 1 0 1 4.0000
## XR_902772.3 0 0.0000 0 0 0 0 0.0000
## XM_034483157.1 0 9.4133 0 0 0 0 28.0453
## XM_011445347.3 3 15.0000 0 9 10 0 10.0000
## XM_034443835.1 130 116.0000 129 161 90 68 114.0000
## XM_034446705.1 0 0.0000 0 0 0 0 0.0000
## N52_S184 N54_S193 N56_S192 N58_S195 N49_S185 N51_S186 N53_S188
## XM_011422361.3 0 0 0 5 4 2 1
## XR_902772.3 0 0 0 0 0 2 0
## XM_034483157.1 0 0 0 0 0 0 0
## XM_011445347.3 0 2 0 0 9 3 0
## XM_034443835.1 73 72 7 148 86 93 134
## XM_034446705.1 0 0 0 0 0 0 0
## N55_S190 N57_S191 N59_S189
## XM_011422361.3 1 0 0
## XR_902772.3 0 0 0
## XM_034483157.1 0 0 0
## XM_011445347.3 5 37 2
## XM_034443835.1 85 50 83
## XM_034446705.1 0 0 0
countmatrix <- round(countmatrix, 0)
str(countmatrix)
## 'data.frame': 73307 obs. of 24 variables:
## $ D54_S145: num 7 0 0 2 127 0 0 28 0 0 ...
## $ D56_S136: num 4 0 0 17 106 0 0 32 2 0 ...
## $ D58_S144: num 2 0 0 0 87 0 0 26 2 0 ...
## $ M45_S140: num 4 0 0 0 117 0 0 31 0 0 ...
## $ M48_S137: num 4 0 0 0 104 0 1 27 2 0 ...
## $ M89_S138: num 3 0 0 0 76 0 0 52 1 0 ...
## $ D55_S146: num 2 0 0 6 104 0 0 40 0 0 ...
## $ D57_S143: num 3 0 0 3 130 0 0 48 0 0 ...
## $ D59_S142: num 4 0 9 15 116 0 3 37 2 0 ...
## $ M46_S141: num 2 0 0 0 129 0 0 31 0 0 ...
## $ M49_S139: num 1 0 0 9 161 0 0 36 0 0 ...
## $ M90_S147: num 0 0 0 10 90 0 1 39 0 0 ...
## $ N48_S194: num 1 0 0 0 68 0 0 40 0 0 ...
## $ N50_S187: num 4 0 28 10 114 0 0 45 0 0 ...
## $ N52_S184: num 0 0 0 0 73 0 0 37 0 0 ...
## $ N54_S193: num 0 0 0 2 72 0 0 41 0 0 ...
## $ N56_S192: num 0 0 0 0 7 0 0 0 0 0 ...
## $ N58_S195: num 5 0 0 0 148 0 1 35 0 0 ...
## $ N49_S185: num 4 0 0 9 86 0 1 33 0 0 ...
## $ N51_S186: num 2 2 0 3 93 0 0 42 0 0 ...
## $ N53_S188: num 1 0 0 0 134 0 0 29 0 0 ...
## $ N55_S190: num 1 0 0 5 85 0 0 29 0 0 ...
## $ N57_S191: num 0 0 0 37 50 0 0 19 0 0 ...
## $ N59_S189: num 0 0 0 2 83 0 1 22 0 0 ...
Now that we have the counts, we need to separate out the variable we’re analyzing, which in this case is whether they are desicated or not. The following code will separate out into a control and desicated condition and then find which genes are differentially expressed based on that condition.
deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))),
type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
colData = deseq2.colData,
design = ~ condition)
Now we’ll sort the resulting genes, and then count how many have a p-value greater than 0.05, which are the ones expressed.
deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
Red dots are the significantly different samples, plotted against the relative change.
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG Dessication (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
And finally, we can save the resulting list to a separate document to further research the most expressed genes.
write.table(tmp.sig, "../output/DEGlist.tab", row.names = T)