--- title: "02-dge" date: "2023-04-05" output: html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show --- ```{r setup, include=F} library(tidyverse) library(DESeq2) library(pheatmap) library(RColorBrewer) library(data.table) library(knitr) library(tidyr) library(DT) opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Don't evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` # Kallisto Alignment Download reference fasta file. ```{bash} # Set working directory cd ../data # Download data using curl curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna ``` Build Kallisto index. Note that Kallisto is installed on the Raven server. ```{bash} /home/shared/kallisto/kallisto \ index -i \ ../data/cgigas_roslin_rna.index \ ../data/rna.fna ``` Download sequence reads from server. ```{bash} # Set up working directory cd ../data # Use wget to download fastq files wget --recursive --no-parent --no-directories \ --no-check-certificate \ --accept '[DMN]*001.fastq.gz' \ https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/ ``` Run Kallisto on fastq files to align the reads. Make sure to create kallisto_01 folder before running. ```{bash} find ../data/*fastq.gz \ | xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \ quant -i ../data/cgigas_roslin_rna.index \ -o ../output/kallisto_01/{} \ -t 40 \ --single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz ``` Combine output files for easy visualization in R. Make sure folder structure in repo matches folder structure listed here. ```{bash} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../output/kallisto_01 \ --name_sample_by_basedir \ ../output/kallisto_01/control/D54_S145/abundance.tsv \ ../output/kallisto_01/control/D56_S136/abundance.tsv \ ../output/kallisto_01/control/D58_S144/abundance.tsv \ ../output/kallisto_01/control/M45_S140/abundance.tsv \ ../output/kallisto_01/control/M48_S137/abundance.tsv \ ../output/kallisto_01/control/M89_S138/abundance.tsv \ ../output/kallisto_01/control/D55_S146/abundance.tsv \ ../output/kallisto_01/control/D57_S143/abundance.tsv \ ../output/kallisto_01/control/D59_S142/abundance.tsv \ ../output/kallisto_01/control/M46_S141/abundance.tsv \ ../output/kallisto_01/control/M49_S139/abundance.tsv \ ../output/kallisto_01/control/M90_S147/abundance.tsv \ ../output/kallisto_01/dessication/N48_S194/abundance.tsv \ ../output/kallisto_01/dessication/N50_S187/abundance.tsv \ ../output/kallisto_01/dessication/N52_S184/abundance.tsv \ ../output/kallisto_01/dessication/N54_S193/abundance.tsv \ ../output/kallisto_01/dessication/N56_S192/abundance.tsv \ ../output/kallisto_01/dessication/N58_S195/abundance.tsv \ ../output/kallisto_01/dessication/N49_S185/abundance.tsv \ ../output/kallisto_01/dessication/N51_S186/abundance.tsv \ ../output/kallisto_01/dessication/N53_S188/abundance.tsv \ ../output/kallisto_01/dessication/N55_S190/abundance.tsv \ ../output/kallisto_01/dessication/N57_S191/abundance.tsv \ ../output/kallisto_01/dessication/N59_S189/abundance.tsv ``` # Running DESeq2 Read in count matrix. ```{r, eval=T} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] # Verify count matrix imported correctly head(countmatrix) ``` Round integers to whole numbers and check dataframe. ```{r, eval=T} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` Get DEGs based on control vs desiccation ```{r, eval=T} # Divide dataframe based on control vs desiccation deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desiccated", 12))), type=factor(rep("single-read", 24))) rownames(deseq2.colData) <- colnames(data) # Create DESeq dataframe deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ```{r, eval=T} # Run DESeq deseq2.dds <- DESeq(deseq2.dds) # Create results dataframe deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] ``` Check DESeq results. ```{r, eval=T} dtbl <- as.data.frame(deseq2.res) datatable(head(dtbl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE)) ``` Get significant results (p-value less than 0.05). ```{r, eval=T} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` # Visualize Results PCA of results. Control and desiccated tend to group in different areas of the plot. ```{r, eval=T} vsd <- vst(deseq2.dds, blind = FALSE) plotPCA(vsd, intgroup = "condition") + theme_bw() + scale_color_manual(values = c("deeppink2", "darkgoldenrod1")) ``` Plot log change of transcripts. ```{r, eval=T} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Dessication (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` Create heatmap of gene expression. The two groups, control and desiccated, have a different change in gene expression. ```{r, eval=T} # Select top 20 differentially expressed genes res <- results(deseq2.dds) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:20] # Extract counts and normalize counts <- counts(deseq2.dds, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # Generate heatmap pheatmap(log_counts_top, scale = "row") ``` Plot counts of most differentially expressed genes. ```{r, eval=T} log_counts_top <- as.data.frame(log_counts_top) log_counts_top$gene <- rownames(log_counts_top) log_counts_top <- log_counts_top %>% pivot_longer(cols=c("D54_S145","D56_S136", "D58_S144", "M45_S140", "M48_S137", "M89_S138", "D55_S146", "D57_S143", "D59_S142", "M46_S141", "M49_S139", "M90_S147", "N48_S194", "N50_S187", "N52_S184", "N54_S193", "N56_S192", "N58_S195", "N49_S185", "N51_S186", "N53_S188", "N55_S190", "N57_S191", "N59_S189"), names_to='sample', values_to='log_counts') log_counts_top$sampletype <- "control" desiccated <- which(substr(log_counts_top$sample, 1, 1) == "N") log_counts_top$sampletype[desiccated] <- "desiccated" ggplot(log_counts_top) + geom_point(aes(x = gene, y = log_counts, color = sampletype)) + xlab("Genes") + ylab("log10 Normalized Counts") + ggtitle("Top 20 Significant DE Genes") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(plot.title = element_text(hjust = 0.5)) ``` Write table to output folder. ```{r} write.table(tmp.sig, "../output/DEGlist.tab", row.names = T) ```