Here are all the packages you will need for this code
::install("DESeq2")
BiocManager
library(knitr)
library(tidyverse)
library(kableExtra)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
library(Biostrings)
::opts_chunk$set(
knitrecho = TRUE, # Display code chunks
eval = FALSE, # Evaluate code chunks
warning = FALSE, # Hide warnings
message = FALSE, # Hide messages
fig.width = 6, # Set plot width in inches
fig.height = 4, # Set plot height in inches
fig.align = "center" # Align plots to the center
)
File Location https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
This chunk uses wget to download files from the file location that matches a given pattern. In this instance, ’*001.fastq.gz’ dictates the file type and the last part of the file name.
cd ../data
wget --recursive --no-parent --no-directories \
'*001.fastq.gz' \
--accept https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
We need to get a reference to which we can compare our sequence data
cd ../data
curl -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna
This code is indexing the file rna.fna while also renaming it as cgigas_roslin_rna.index
/home/shared/kallisto/kallisto \
-i \
index \
../data/cgigas_roslin_rna.index ../data/rna.fna
This code alligns the sequences
mkdir ../output/kallisto_01
find ../data/*_L001_R1_001.fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
-i ../data/cgigas_roslin_rna.index \
quant \
-o ../output/kallisto_01/{} \
-t 40 -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz --single
This command runs the abundance_estimates_to_matrix.pl script from the Trinity RNA-seq assembly software package to create a gene expression matrix from kallisto output files.
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
\
--est_method kallisto --gene_trans_map none \
--out_prefix ../output/kallisto_01 \
--name_sample_by_basedir \
\
../output/kallisto_01/D54_S145/abundance.tsv \
../output/kallisto_01/D56_S136/abundance.tsv \
../output/kallisto_01/D58_S144/abundance.tsv \
../output/kallisto_01/M45_S140/abundance.tsv \
../output/kallisto_01/M48_S137/abundance.tsv \
../output/kallisto_01/M89_S138/abundance.tsv \
../output/kallisto_01/D55_S146/abundance.tsv \
../output/kallisto_01/D57_S143/abundance.tsv \
../output/kallisto_01/D59_S142/abundance.tsv \
../output/kallisto_01/M46_S141/abundance.tsv \
../output/kallisto_01/M49_S139/abundance.tsv \
../output/kallisto_01/M90_S147/abundance.tsv \
../output/kallisto_01/N48_S194/abundance.tsv \
../output/kallisto_01/N50_S187/abundance.tsv \
../output/kallisto_01/N52_S184/abundance.tsv \
../output/kallisto_01/N54_S193/abundance.tsv \
../output/kallisto_01/N56_S192/abundance.tsv \
../output/kallisto_01/N58_S195/abundance.tsv \
../output/kallisto_01/N49_S185/abundance.tsv \
../output/kallisto_01/N51_S186/abundance.tsv \
../output/kallisto_01/N53_S188/abundance.tsv \
../output/kallisto_01/N55_S190/abundance.tsv \
../output/kallisto_01/N57_S191/abundance.tsv ../output/kallisto_01/N59_S189/abundance.tsv
Reads in a count matrix of isoform counts generated by Kallisto, with row names set to the gene/transcript IDs and the first column removed.
<- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
countmatrix rownames(countmatrix) <- countmatrix$X
<- countmatrix[,-1]
countmatrix head(countmatrix)
## D54_S145 D56_S136 D58_S144 M45_S140 M48_S137 M89_S138
## XR_002198472.2 0.00000e+00 0.0000 0.00000 0.0000 0.00000e+00 0.00000000
## XM_011439767.3 3.17402e-08 0.0000 0.00000 0.0000 0.00000e+00 0.00000000
## XR_004595876.1 3.33657e+00 0.0000 6.54202 0.0000 0.00000e+00 0.00000000
## XM_011432597.3 0.00000e+00 0.0000 0.00000 0.0000 0.00000e+00 0.00000000
## XM_034467064.1 0.00000e+00 0.0000 0.00000 0.0000 0.00000e+00 0.00000000
## XM_034443175.1 2.59759e+01 33.2338 10.25260 52.7153 5.23614e-06 0.00825646
## D55_S146 D57_S143 D59_S142 M46_S141 M49_S139 M90_S147 N48_S194
## XR_002198472.2 0 0.0000 0 0.0000000 0 0.0000 0
## XM_011439767.3 0 0.0000 0 0.4783700 0 0.0000 0
## XR_004595876.1 0 0.0000 0 0.0000000 0 0.0000 0
## XM_011432597.3 0 0.0000 0 0.0000000 0 0.0000 0
## XM_034467064.1 0 0.0000 1 0.0000000 0 0.0000 0
## XM_034443175.1 0 27.4283 0 0.0335607 0 31.4732 0
## N50_S187 N52_S184 N54_S193 N56_S192 N58_S195 N49_S185 N51_S186
## XR_002198472.2 0 0.0000 0 0 0 0 0
## XM_011439767.3 0 0.0000 0 0 0 0 0
## XR_004595876.1 0 0.0000 0 0 0 0 0
## XM_011432597.3 0 0.0000 0 0 0 0 0
## XM_034467064.1 0 0.0000 0 0 0 0 0
## XM_034443175.1 0 29.6025 0 0 0 0 0
## N53_S188 N55_S190 N57_S191 N59_S189
## XR_002198472.2 0.000 0.00000 0.00000e+00 0
## XM_011439767.3 0.000 0.00000 0.00000e+00 0
## XR_004595876.1 18.345 0.00000 0.00000e+00 0
## XM_011432597.3 0.000 0.00000 0.00000e+00 0
## XM_034467064.1 0.000 0.00000 0.00000e+00 0
## XM_034443175.1 0.000 9.87152 8.47129e-06 0
Rounds the counts to whole numbers for further analysis
<- round(countmatrix, 0)
countmatrix str(countmatrix)
## 'data.frame': 73307 obs. of 24 variables:
## $ D54_S145: num 0 0 3 0 0 26 0 0 15 15 ...
## $ D56_S136: num 0 0 0 0 0 33 0 0 12 16 ...
## $ D58_S144: num 0 0 7 0 0 10 0 0 19 23 ...
## $ M45_S140: num 0 0 0 0 0 53 0 0 6 15 ...
## $ M48_S137: num 0 0 0 0 0 0 0 0 20 16 ...
## $ M89_S138: num 0 0 0 0 0 0 1 0 8 30 ...
## $ D55_S146: num 0 0 0 0 0 0 0 0 35 4 ...
## $ D57_S143: num 0 0 0 0 0 27 0 0 5 20 ...
## $ D59_S142: num 0 0 0 0 1 0 1 0 6 16 ...
## $ M46_S141: num 0 0 0 0 0 0 2 0 18 13 ...
## $ M49_S139: num 0 0 0 0 0 0 1 0 17 15 ...
## $ M90_S147: num 0 0 0 0 0 31 0 0 18 14 ...
## $ N48_S194: num 0 0 0 0 0 0 0 0 23 2 ...
## $ N50_S187: num 0 0 0 0 0 0 1 0 19 9 ...
## $ N52_S184: num 0 0 0 0 0 30 7 0 0 11 ...
## $ N54_S193: num 0 0 0 0 0 0 0 0 62 12 ...
## $ N56_S192: num 0 0 0 0 0 0 0 0 0 0 ...
## $ N58_S195: num 0 0 0 0 0 0 0 0 24 4 ...
## $ N49_S185: num 0 0 0 0 0 0 0 0 9 5 ...
## $ N51_S186: num 0 0 0 0 0 0 0 0 22 13 ...
## $ N53_S188: num 0 0 18 0 0 0 2 0 15 2 ...
## $ N55_S190: num 0 0 0 0 0 10 0 0 14 7 ...
## $ N57_S191: num 0 0 0 0 0 0 1 0 0 5 ...
## $ N59_S189: num 0 0 0 0 0 0 0 0 0 6 ...
This code performs differential expression analysis to identify differentially expressed genes (DEGs) between a control condition and a desiccated condition.
<- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))),
deseq2.colData type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
<- DESeqDataSetFromMatrix(countData = countmatrix,
deseq2.dds colData = deseq2.colData,
design = ~ condition)
<- DESeq(deseq2.dds)
deseq2.dds <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] deseq2.res
head(deseq2.res)
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
This code makes a plot where statistally significant values will be highlighted red
<- deseq2.res
tmp # The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG Dessication (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change") #This changes the x label, y label, and main title
# Getting the significant points and plotting them again so they're a different color
<- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
tmp.sig points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
write.table(tmp.sig, "../output/DEGlist.tab", row.names = T)