--- title: "Week 02: Differential Gene Expression (dge)" output: html_document date: "2023-04-04" --- # Downloading sequence reads With wget's recursive feature we access all 24 fastq files for this data set. ```{bash downloading sequence reads} cd ../data wget --no-check-certificate --recursive --no-parent --no-directories \ --accept '*.fastq.gz' \ https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/ ``` # Downloading the reference This code indexes an rna.fna file and renames it cgigas_roslin_rna.index and puts it in my data directory. ```{bash} cd ../data curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna ``` ```{bash downloading reference} /home/shared/kallisto/kallisto \ index -i \ ../data/cgigas_roslin_rna.index \ ../data/rna.fna ``` #Using kallisto's quant command to create abundance estimates Running the kallisto quant command on each input fastq file (that is found with the command find with the file name fastq.gz). It then writes output files to the subdirectory kallisto_01. ```{bash output abundance and sequence run info per sequence read} mkdir ../output/kallisto_01 find ../data/*fastq.gz \ | xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \ quant -i ../data/cgigas_roslin_rna.index \ -o ../output/kallisto_01/{} \ -t 40 \ --single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz ``` Down below we will be creating a gene expression matrix from the kallisto output files using the command abundance_estimates_to_matrix.pl from the Trinity RNA-seq assembly software package. ```{bash quantify abundance} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../output/kallisto_01 \ --name_sample_by_basedir \ ../output/kallisto_01/D54_S145/abundance.tsv \ ../output/kallisto_01/D56_S136/abundance.tsv \ ../output/kallisto_01/D58_S144/abundance.tsv \ ../output/kallisto_01/M45_S140/abundance.tsv \ ../output/kallisto_01/M48_S137/abundance.tsv \ ../output/kallisto_01/M89_S138/abundance.tsv \ ../output/kallisto_01/D55_S146/abundance.tsv \ ../output/kallisto_01/D57_S143/abundance.tsv \ ../output/kallisto_01/D59_S142/abundance.tsv \ ../output/kallisto_01/M46_S141/abundance.tsv \ ../output/kallisto_01/M49_S139/abundance.tsv \ ../output/kallisto_01/M90_S147/abundance.tsv \ ../output/kallisto_01/N48_S194/abundance.tsv \ ../output/kallisto_01/N50_S187/abundance.tsv \ ../output/kallisto_01/N52_S184/abundance.tsv \ ../output/kallisto_01/N54_S193/abundance.tsv \ ../output/kallisto_01/N56_S192/abundance.tsv \ ../output/kallisto_01/N58_S195/abundance.tsv \ ../output/kallisto_01/N49_S185/abundance.tsv \ ../output/kallisto_01/N51_S186/abundance.tsv \ ../output/kallisto_01/N53_S188/abundance.tsv \ ../output/kallisto_01/N55_S190/abundance.tsv \ ../output/kallisto_01/N57_S191/abundance.tsv \ ../output/kallisto_01/N59_S189/abundance.tsv ``` ```{r Installing BiooManager which will then be able to install DESeq2, echo=FALSE} if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("DESeq2") ``` # Using DESeq2 to conduct a differential expression analysis ```{r requirements} #requirements library(DESeq2) #running several functions from this package to conduct a differential expression analysis between our two treatments library(tidyverse) library(pheatmap) library(RColorBrewer) library(data.table) ``` ```{r read in count matrix} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) ``` ```{r rounding up values} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` ```{r DEGs based on dessecation} deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), type=factor(rep("single-read", 24))) rownames(deseq2.colData) <- colnames(data) deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ```{r} deseq2.dds <- DESeq(deseq2.dds) deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] head(deseq2.res) ``` ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ```