With wget’s recursive feature we access all 24 fastq files for this data set.
cd ../data
wget --no-check-certificate --recursive --no-parent --no-directories \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
This code indexes an rna.fna file and renames it cgigas_roslin_rna.index and puts it in the data directory.
cd ../data
curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna
/home/shared/kallisto/kallisto \
index -i \
../data/cgigas_roslin_rna.index \
../data/rna.fna
Running the kallisto quant command on each input fastq file (that is found with the command find with using the wildcard ’*’ in front of the extension fastq.gz). It then writes output files to the sub-directory kallisto_01.
mkdir ../output/kallisto_01
find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz
Down below we will be creating a gene expression matrix from the kallisto output files using the command abundance_estimates_to_matrix.pl from the Trinity RNA-seq assembly software package.
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
--gene_trans_map none \
--out_prefix ../output/kallisto_01 \
--name_sample_by_basedir \
../output/kallisto_01/D54_S145/abundance.tsv \
../output/kallisto_01/D56_S136/abundance.tsv \
../output/kallisto_01/D58_S144/abundance.tsv \
../output/kallisto_01/M45_S140/abundance.tsv \
../output/kallisto_01/M48_S137/abundance.tsv \
../output/kallisto_01/M89_S138/abundance.tsv \
../output/kallisto_01/D55_S146/abundance.tsv \
../output/kallisto_01/D57_S143/abundance.tsv \
../output/kallisto_01/D59_S142/abundance.tsv \
../output/kallisto_01/M46_S141/abundance.tsv \
../output/kallisto_01/M49_S139/abundance.tsv \
../output/kallisto_01/M90_S147/abundance.tsv \
../output/kallisto_01/N48_S194/abundance.tsv \
../output/kallisto_01/N50_S187/abundance.tsv \
../output/kallisto_01/N52_S184/abundance.tsv \
../output/kallisto_01/N54_S193/abundance.tsv \
../output/kallisto_01/N56_S192/abundance.tsv \
../output/kallisto_01/N58_S195/abundance.tsv \
../output/kallisto_01/N49_S185/abundance.tsv \
../output/kallisto_01/N51_S186/abundance.tsv \
../output/kallisto_01/N53_S188/abundance.tsv \
../output/kallisto_01/N55_S190/abundance.tsv \
../output/kallisto_01/N57_S191/abundance.tsv \
../output/kallisto_01/N59_S189/abundance.tsv
Next we will install the R package BioManager which will allow us to install DEseq2 for our dge analysis. If you already have this package you can skip this step.
Below are the required packages needed to conduct a dge in R.
#requirements
library(DESeq2) #running several functions from this package to conduct a differential expression analysis between our two treatments
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
We will be reading in our count matrix and formatting it.
countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix) # let's take a peak at the count matrix we just read in
Next we wantt to round up the values in the count matrix
countmatrix <- round(countmatrix, 0)
str(countmatrix)
Next we will conduct a dge based on the treatments of the data we are evaluating. Here it’s control vs desiccation in C. gigas.
deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))),
type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
colData = deseq2.colData,
design = ~ condition)
deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
head(deseq2.res)
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
the following code chunk will take the resutls we produced using DESeq2 to make a plot of differentially expressed genes
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG Dessication (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
write.table(tmp.sig, "../output/DEGlist.tab", row.names = T)