library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)

1 Initial RNA Alignment using Kallisto

1.1 Collecting Data

First, we need to download and index a reference genome, as well as download the RNA transcripts from the experiments.

cd ../data
curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna

/home/shared/kallisto/kallisto \
index -i \
../data/cgigas_roslin_rna.index \
../data/rna.fna

cd ../data 
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/

1.2 Kallisto

Next, we need to use kallisto to quantify the transcript data - linking the rna transcripts to the specific genes they originated from.

mkdir ../output/kallisto_01

find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz

Now that each gene has been quantified, we’ll compile the data into a gene expression matrix for each sample, once again using Kallisto.

perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
    --gene_trans_map none \
    --out_prefix ../output/kallisto_01 \
    --name_sample_by_basedir \
    ../output/kallisto_01/D54_S145/abundance.tsv \
    ../output/kallisto_01/D56_S136/abundance.tsv \
    ../output/kallisto_01/D58_S144/abundance.tsv \
    ../output/kallisto_01/M45_S140/abundance.tsv \
    ../output/kallisto_01/M48_S137/abundance.tsv \
    ../output/kallisto_01/M89_S138/abundance.tsv \
    ../output/kallisto_01/D55_S146/abundance.tsv \
    ../output/kallisto_01/D57_S143/abundance.tsv \
    ../output/kallisto_01/D59_S142/abundance.tsv \
    ../output/kallisto_01/M46_S141/abundance.tsv \
    ../output/kallisto_01/M49_S139/abundance.tsv \
    ../output/kallisto_01/M90_S147/abundance.tsv \
    ../output/kallisto_01/N48_S194/abundance.tsv \
    ../output/kallisto_01/N50_S187/abundance.tsv \
    ../output/kallisto_01/N52_S184/abundance.tsv \
    ../output/kallisto_01/N54_S193/abundance.tsv \
    ../output/kallisto_01/N56_S192/abundance.tsv \
    ../output/kallisto_01/N58_S195/abundance.tsv \
    ../output/kallisto_01/N49_S185/abundance.tsv \
    ../output/kallisto_01/N51_S186/abundance.tsv \
    ../output/kallisto_01/N53_S188/abundance.tsv \
    ../output/kallisto_01/N55_S190/abundance.tsv \
    ../output/kallisto_01/N57_S191/abundance.tsv \
    ../output/kallisto_01/N59_S189/abundance.tsv

Now we want to make sure the matrix actually is showing the counts and that it’s in the appropriate data format to move to DESeq2, so the following code runs that conversion.

countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)

##                D54_S145 D56_S136 D58_S144 M45_S140 M48_S137 M89_S138 D55_S146
## XM_011422361.3        7        4        2        4        4        3        2
## XR_902772.3           0        0        0        0        0        0        0
## XM_034483157.1        0        0        0        0        0        0        0
## XM_011445347.3        2       17        0        0        0        0        6
## XM_034443835.1      127      106       87      117      104       76      104
## XM_034446705.1        0        0        0        0        0        0        0
##                D57_S143 D59_S142 M46_S141 M49_S139 M90_S147 N48_S194 N50_S187
## XM_011422361.3        3   4.0000        2        1        0        1   4.0000
## XR_902772.3           0   0.0000        0        0        0        0   0.0000
## XM_034483157.1        0   9.4133        0        0        0        0  28.0453
## XM_011445347.3        3  15.0000        0        9       10        0  10.0000
## XM_034443835.1      130 116.0000      129      161       90       68 114.0000
## XM_034446705.1        0   0.0000        0        0        0        0   0.0000
##                N52_S184 N54_S193 N56_S192 N58_S195 N49_S185 N51_S186 N53_S188
## XM_011422361.3        0        0        0        5        4        2        1
## XR_902772.3           0        0        0        0        0        2        0
## XM_034483157.1        0        0        0        0        0        0        0
## XM_011445347.3        0        2        0        0        9        3        0
## XM_034443835.1       73       72        7      148       86       93      134
## XM_034446705.1        0        0        0        0        0        0        0
##                N55_S190 N57_S191 N59_S189
## XM_011422361.3        1        0        0
## XR_902772.3           0        0        0
## XM_034483157.1        0        0        0
## XM_011445347.3        5       37        2
## XM_034443835.1       85       50       83
## XM_034446705.1        0        0        0

countmatrix <- round(countmatrix, 0)
str(countmatrix)

## 'data.frame':    73307 obs. of  24 variables:
##  $ D54_S145: num  7 0 0 2 127 0 0 28 0 0 ...
##  $ D56_S136: num  4 0 0 17 106 0 0 32 2 0 ...
##  $ D58_S144: num  2 0 0 0 87 0 0 26 2 0 ...
##  $ M45_S140: num  4 0 0 0 117 0 0 31 0 0 ...
##  $ M48_S137: num  4 0 0 0 104 0 1 27 2 0 ...
##  $ M89_S138: num  3 0 0 0 76 0 0 52 1 0 ...
##  $ D55_S146: num  2 0 0 6 104 0 0 40 0 0 ...
##  $ D57_S143: num  3 0 0 3 130 0 0 48 0 0 ...
##  $ D59_S142: num  4 0 9 15 116 0 3 37 2 0 ...
##  $ M46_S141: num  2 0 0 0 129 0 0 31 0 0 ...
##  $ M49_S139: num  1 0 0 9 161 0 0 36 0 0 ...
##  $ M90_S147: num  0 0 0 10 90 0 1 39 0 0 ...
##  $ N48_S194: num  1 0 0 0 68 0 0 40 0 0 ...
##  $ N50_S187: num  4 0 28 10 114 0 0 45 0 0 ...
##  $ N52_S184: num  0 0 0 0 73 0 0 37 0 0 ...
##  $ N54_S193: num  0 0 0 2 72 0 0 41 0 0 ...
##  $ N56_S192: num  0 0 0 0 7 0 0 0 0 0 ...
##  $ N58_S195: num  5 0 0 0 148 0 1 35 0 0 ...
##  $ N49_S185: num  4 0 0 9 86 0 1 33 0 0 ...
##  $ N51_S186: num  2 2 0 3 93 0 0 42 0 0 ...
##  $ N53_S188: num  1 0 0 0 134 0 0 29 0 0 ...
##  $ N55_S190: num  1 0 0 5 85 0 0 29 0 0 ...
##  $ N57_S191: num  0 0 0 37 50 0 0 19 0 0 ...
##  $ N59_S189: num  0 0 0 2 83 0 1 22 0 0 ...

2 DESeq2 Analysis

2.1 DESeq2

Now that we have the counts, we need to separate out the variable we’re analyzing, which in this case is whether they are desicated or not. The following code will separate out into a control and desicated condition and then find which genes are differentially expressed based on that condition.

deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), 
                             type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)

Now we’ll sort the resulting genes, and then count how many have a p-value greater than 0.05, which are the ones expressed.

deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]

# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])

2.2 Data Analysis

Red dots are the significantly different samples, plotted against the relative change.

tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")

And finally, we can save the resulting list to a separate document to further research the most expressed genes.

write.table(tmp.sig, "../output/DEGlist.tab", row.names = T)

FISH546 Week 2: RNA Seq Assignment

Sam Cryan