--- title: "FISH546 Week 2: RNA Seq Assignment" author: "Sam Cryan" date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: journal toc: true toc_float: true number_sections: true code_folding: show --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("DESeq2") ``` This is a program to coduct RNASeq analysis on transcript data from *C.gigas* in desciated environments. The following libraries were used: ```{r libraries, eval=TRUE, include=TRUE} library(DESeq2) library(tidyverse) library(pheatmap) library(RColorBrewer) library(data.table) ``` # Initial RNA Alignment using Kallisto ## Collecting Data First, we need to download and index a reference genome, as well as download the RNA transcripts from the experiments. ```{bash} cd ../data curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna ``` ```{bash} /home/shared/kallisto/kallisto \ index -i \ ../data/cgigas_roslin_rna.index \ ../data/rna.fna ``` ``` {bash} cd ../data wget --recursive --no-parent --no-directories \ --no-check-certificate \ --accept '*.fastq.gz' \ https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/ ``` ## Kallisto Next, we need to use kallisto to quantify the transcript data - linking the rna transcripts to the specific genes they originated from. ```{bash} mkdir ../output/kallisto_01 find ../data/*fastq.gz \ | xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \ quant -i ../data/cgigas_roslin_rna.index \ -o ../output/kallisto_01/{} \ -t 40 \ --single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz ``` Now that each gene has been quantified, we'll compile the data into a gene expression matrix for each sample, once again using Kallisto. ```{bash} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../output/kallisto_01 \ --name_sample_by_basedir \ ../output/kallisto_01/D54_S145/abundance.tsv \ ../output/kallisto_01/D56_S136/abundance.tsv \ ../output/kallisto_01/D58_S144/abundance.tsv \ ../output/kallisto_01/M45_S140/abundance.tsv \ ../output/kallisto_01/M48_S137/abundance.tsv \ ../output/kallisto_01/M89_S138/abundance.tsv \ ../output/kallisto_01/D55_S146/abundance.tsv \ ../output/kallisto_01/D57_S143/abundance.tsv \ ../output/kallisto_01/D59_S142/abundance.tsv \ ../output/kallisto_01/M46_S141/abundance.tsv \ ../output/kallisto_01/M49_S139/abundance.tsv \ ../output/kallisto_01/M90_S147/abundance.tsv \ ../output/kallisto_01/N48_S194/abundance.tsv \ ../output/kallisto_01/N50_S187/abundance.tsv \ ../output/kallisto_01/N52_S184/abundance.tsv \ ../output/kallisto_01/N54_S193/abundance.tsv \ ../output/kallisto_01/N56_S192/abundance.tsv \ ../output/kallisto_01/N58_S195/abundance.tsv \ ../output/kallisto_01/N49_S185/abundance.tsv \ ../output/kallisto_01/N51_S186/abundance.tsv \ ../output/kallisto_01/N53_S188/abundance.tsv \ ../output/kallisto_01/N55_S190/abundance.tsv \ ../output/kallisto_01/N57_S191/abundance.tsv \ ../output/kallisto_01/N59_S189/abundance.tsv ``` Now we want to make sure the matrix actually is showing the counts and that it's in the appropriate data format to move to DESeq2, so the following code runs that conversion. ```{r, eval=TRUE} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) ``` ```{r, eval = TRUE} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` # DESeq2 Analysis ## DESeq2 Now that we have the counts, we need to separate out the variable we're analyzing, which in this case is whether they are desicated or not. The following code will separate out into a control and desicated condition and then find which genes are differentially expressed based on that condition. ```{r, eval=TRUE} deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), type=factor(rep("single-read", 24))) rownames(deseq2.colData) <- colnames(data) deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` Now we'll sort the resulting genes, and then count how many have a p-value greater than 0.05, which are the ones expressed. ```{r, eval=TRUE} deseq2.dds <- DESeq(deseq2.dds) deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] ``` ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` ## Data Analysis Red dots are the significantly different samples, plotted against the relative change. ```{r, eval=TRUE} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Dessication (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` And finally, we can save the resulting list to a separate document to further research the most expressed genes. ```{r} write.table(tmp.sig, "../output/DEGlist.tab", row.names = T) ```