1 Kallisto Alignment

Download reference fasta file.

# Set working directory
cd ../data

# Download data using curl
curl  --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna

Build Kallisto index. Note that Kallisto is installed on the Raven server.

/home/shared/kallisto/kallisto \
index -i \
../data/cgigas_roslin_rna.index \
../data/rna.fna

Download sequence reads from server.

# Set up working directory
cd ../data 

# Use wget to download fastq files
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '[DMN]*001.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/

Run Kallisto on fastq files to align the reads. Make sure to create kallisto_01 folder before running.

find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz

Combine output files for easy visualization in R. Make sure folder structure in repo matches folder structure listed here.

perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
    --gene_trans_map none \
    --out_prefix ../output/kallisto_01 \
    --name_sample_by_basedir \
    ../output/kallisto_01/control/D54_S145/abundance.tsv \
    ../output/kallisto_01/control/D56_S136/abundance.tsv \
    ../output/kallisto_01/control/D58_S144/abundance.tsv \
    ../output/kallisto_01/control/M45_S140/abundance.tsv \
    ../output/kallisto_01/control/M48_S137/abundance.tsv \
    ../output/kallisto_01/control/M89_S138/abundance.tsv \
    ../output/kallisto_01/control/D55_S146/abundance.tsv \
    ../output/kallisto_01/control/D57_S143/abundance.tsv \
    ../output/kallisto_01/control/D59_S142/abundance.tsv \
    ../output/kallisto_01/control/M46_S141/abundance.tsv \
    ../output/kallisto_01/control/M49_S139/abundance.tsv \
    ../output/kallisto_01/control/M90_S147/abundance.tsv \
    ../output/kallisto_01/dessication/N48_S194/abundance.tsv \
    ../output/kallisto_01/dessication/N50_S187/abundance.tsv \
    ../output/kallisto_01/dessication/N52_S184/abundance.tsv \
    ../output/kallisto_01/dessication/N54_S193/abundance.tsv \
    ../output/kallisto_01/dessication/N56_S192/abundance.tsv \
    ../output/kallisto_01/dessication/N58_S195/abundance.tsv \
    ../output/kallisto_01/dessication/N49_S185/abundance.tsv \
    ../output/kallisto_01/dessication/N51_S186/abundance.tsv \
    ../output/kallisto_01/dessication/N53_S188/abundance.tsv \
    ../output/kallisto_01/dessication/N55_S190/abundance.tsv \
    ../output/kallisto_01/dessication/N57_S191/abundance.tsv \
    ../output/kallisto_01/dessication/N59_S189/abundance.tsv

2 Running DESeq2

Read in count matrix.

countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]

# Verify count matrix imported correctly
head(countmatrix)

##                D54_S145 D56_S136 D58_S144 M45_S140 M48_S137 M89_S138 D55_S146
## XM_034476778.1    0.000    0.000    0.000    0.000    0.000    0.000    0.000
## XM_011433170.3    9.000   12.000    7.000   21.000   16.000   18.000    8.000
## XM_011439399.3    0.000    0.000    0.000    0.000    0.000    0.000    0.000
## XM_011437415.3    0.000    0.000    0.000    0.000    0.000    0.000    0.000
## XM_011440470.3  181.889  379.374  508.439  183.413  131.844  454.325  322.846
## XM_034460313.1    0.000    0.000    0.000    0.000    0.000    0.000    0.000
##                 D57_S143 D59_S142 M46_S141 M49_S139 M90_S147 N48_S194 N50_S187
## XM_034476778.1   3.07253    0.000    0.000   0.0000    0.000    0.000    3.000
## XM_011433170.3  15.00000   14.000   12.000  60.0000   10.000    1.000    1.000
## XM_011439399.3   0.00000    0.000    0.000   0.0000    0.000    0.000    0.000
## XM_011437415.3   0.00000    0.000    0.000   0.0000    0.000    0.000    0.000
## XM_011440470.3 175.32000  352.994  271.005  28.1127  325.097  273.872  288.338
## XM_034460313.1   0.00000    0.000    0.000   0.0000    0.000    0.000    0.000
##                N52_S184 N54_S193 N56_S192 N58_S195   N49_S185 N51_S186 N53_S188
## XM_034476778.1    0.000    0.000        0     0.00   0.554591    0.000    0.000
## XM_011433170.3    2.000    6.000        3     3.00   3.000000    3.000    6.000
## XM_011439399.3    0.000    0.000        0     0.00   0.000000    0.000    0.000
## XM_011437415.3    0.000    0.000        0     0.00   0.000000    0.000    0.000
## XM_011440470.3  373.885  308.851       41   167.83 229.582000  199.504   76.833
## XM_034460313.1    0.000    0.000        0     0.00   0.000000    0.000    0.000
##                N55_S190 N57_S191 N59_S189
## XM_034476778.1   0.0000     0.00    0.000
## XM_011433170.3   3.0000     3.00   16.000
## XM_011439399.3   0.0000     0.00    0.000
## XM_011437415.3   0.0000     0.00    0.000
## XM_011440470.3  91.1147   136.46  295.425
## XM_034460313.1   0.0000     0.00    0.000

Round integers to whole numbers and check dataframe.

countmatrix <- round(countmatrix, 0)
str(countmatrix)

## 'data.frame':    73307 obs. of  24 variables:
##  $ D54_S145: num  0 9 0 0 182 0 8 0 0 0 ...
##  $ D56_S136: num  0 12 0 0 379 0 8 0 0 0 ...
##  $ D58_S144: num  0 7 0 0 508 0 11 0 0 0 ...
##  $ M45_S140: num  0 21 0 0 183 0 13 0 0 0 ...
##  $ M48_S137: num  0 16 0 0 132 0 12 0 0 0 ...
##  $ M89_S138: num  0 18 0 0 454 0 0 0 0 0 ...
##  $ D55_S146: num  0 8 0 0 323 0 11 0 0 0 ...
##  $ D57_S143: num  3 15 0 0 175 0 8 0 0 0 ...
##  $ D59_S142: num  0 14 0 0 353 0 11 11 0 0 ...
##  $ M46_S141: num  0 12 0 0 271 0 22 0 0 0 ...
##  $ M49_S139: num  0 60 0 0 28 0 22 11 0 0 ...
##  $ M90_S147: num  0 10 0 0 325 0 9 0 0 0 ...
##  $ N48_S194: num  0 1 0 0 274 0 6 0 0 0 ...
##  $ N50_S187: num  3 1 0 0 288 0 23 2 0 0 ...
##  $ N52_S184: num  0 2 0 0 374 0 6 0 0 0 ...
##  $ N54_S193: num  0 6 0 0 309 0 4 0 0 0 ...
##  $ N56_S192: num  0 3 0 0 41 0 0 0 0 0 ...
##  $ N58_S195: num  0 3 0 0 168 0 14 0 0 0 ...
##  $ N49_S185: num  1 3 0 0 230 0 15 2 0 0 ...
##  $ N51_S186: num  0 3 0 0 200 0 17 0 0 0 ...
##  $ N53_S188: num  0 6 0 0 77 0 14 0 0 0 ...
##  $ N55_S190: num  0 3 0 0 91 0 9 0 0 0 ...
##  $ N57_S191: num  0 3 0 0 136 0 5 0 0 0 ...
##  $ N59_S189: num  0 16 0 0 295 0 7 0 0 0 ...

Get DEGs based on control vs desiccation

# Divide dataframe based on control vs desiccation
deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desiccated", 12))), 
                             type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)

# Create DESeq dataframe
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)

# Run DESeq
deseq2.dds <- DESeq(deseq2.dds)

# Create results dataframe
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]

Check DESeq results.

dtbl <- as.data.frame(deseq2.res)

datatable(head(dtbl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))

Get significant results (p-value less than 0.05).

# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])

## [1] 607   6

3 Visualize Results

PCA of results. Control and desiccated tend to group in different areas of the plot.

vsd <- vst(deseq2.dds, blind = FALSE)
plotPCA(vsd, intgroup = "condition") + theme_bw() + scale_color_manual(values = c("deeppink2", "darkgoldenrod1"))

Plot log change of transcripts.

tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")

Create heatmap of gene expression. The two groups, control and desiccated, have a different change in gene expression.

# Select top 20 differentially expressed genes
res <- results(deseq2.dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:20]

# Extract counts and normalize
counts <- counts(deseq2.dds, normalized = TRUE)
counts_top <- counts[top_genes, ]

# Log-transform counts
log_counts_top <- log2(counts_top + 1)

# Generate heatmap
pheatmap(log_counts_top, scale = "row")

Plot counts of most differentially expressed genes.

log_counts_top <- as.data.frame(log_counts_top)
log_counts_top$gene <- rownames(log_counts_top)

log_counts_top <- log_counts_top %>% pivot_longer(cols=c("D54_S145","D56_S136", "D58_S144", "M45_S140", "M48_S137", "M89_S138", "D55_S146", "D57_S143", "D59_S142", "M46_S141", "M49_S139", "M90_S147", "N48_S194", "N50_S187", "N52_S184", "N54_S193", "N56_S192", "N58_S195", "N49_S185", "N51_S186", "N53_S188", "N55_S190", "N57_S191", "N59_S189"),
                    names_to='sample',
                    values_to='log_counts')

log_counts_top$sampletype <- "control"
desiccated <- which(substr(log_counts_top$sample, 1, 1) == "N")
log_counts_top$sampletype[desiccated] <- "desiccated"

ggplot(log_counts_top) +
        geom_point(aes(x = gene, y = log_counts, color = sampletype)) +
        xlab("Genes") +
        ylab("log10 Normalized Counts") +
        ggtitle("Top 20 Significant DE Genes") +
        theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme(plot.title = element_text(hjust = 0.5))

Write table to output folder.

write.table(tmp.sig, "../output/DEGlist.tab", row.names = T)

02-dge

2023-04-05

1 Kallisto Alignment

2 Running DESeq2

3 Visualize Results