--- title: "differential gene expression" output: html_document created: "2023-04-07" last edited: "2023-04-20" --- # Introduction This is code for examining the effect of ocean acidification on gene expression of snow crabs. ### Look at all RNASeq files ```{bash} ls /home/shared/8TB_HDD_01/snow_crab/5010 ``` ### Download C. bairdi reference, save it to "data" directory ```{bash} cd ../data curl -O https://owl.fish.washington.edu/halfshell/genomic-databank/cbai_genome_v1.0.fasta ``` ### Set index ```{bash} #index "cbai_genome_v1.0.fasta" and name it as "cbai_genome_v1.0.index" /home/shared/kallisto/kallisto \ index -i ../data/cbai_genome_v1.0.index \ ../data/cbai_genome_v1.0.fasta ``` ### Look at files within exp01 ```{bash} #list file names ls /home/shared/8TB_HDD_01/snow_crab/5010/exp01 ``` ### Create abundance estimate files for desired sequences ```{bash} find /home/shared/8TB_HDD_01/snow_crab/5010/exp01/*fastq.gz ``` ```{bash} #make a new directory in output mkdir ../output/kallisto_04_paired #find specific files and create abundance estimates find /home/shared/8TB_HDD_01/snow_crab/5010/exp01/*fastq.gz \ | xargs basename -s _L003_R1_001.fastq.gz | xargs -I{} \ /home/shared/kallisto/kallisto \ quant -i ../data/cbai_genome_v1.0.index \ -o ../output/kallisto_04_paired/{} \ -t 40 \ /home/shared/8TB_HDD_01/snow_crab/5010/exp01/{}_L003_R1_001.fastq.gz \ /home/shared/8TB_HDD_01/snow_crab/5010/exp01/{}_L003_R2_001.fastq.gz ``` ### Create a gene expression matrix from kallisto output files #### Run abundance_estimates_to_matrix.pl script from the Trinity RNA-seq assembly software package to ```{bash} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../output/kallisto_01 \ --name_sample_by_basedir \ ../output/kallisto_01/5010_1_S1_L003_R1/abundance.tsv \ ../output/kallisto_01/5010_1_S1_L003_R2/abundance.tsv \ ../output/kallisto_01/5010_2_S2_L003_R1/abundance.tsv \ ../output/kallisto_01/5010_2_S2_L003_R2/abundance.tsv \ ../output/kallisto_01/5010_3_S3_L003_R1/abundance.tsv \ ../output/kallisto_01/5010_3_S3_L003_R2/abundance.tsv \ ../output/kallisto_01/5010_4_S4_L003_R1/abundance.tsv \ ../output/kallisto_01/5010_4_S4_L003_R2/abundance.tsv \ ../output/kallisto_01/5010_39_S39_L003_R1/abundance.tsv \ ../output/kallisto_01/5010_39_S39_L003_R2/abundance.tsv \ ../output/kallisto_01/5010_40_S40_L003_R1/abundance.tsv \ ../output/kallisto_01/5010_40_S40_L003_R2/abundance.tsv \ ../output/kallisto_01/5010_41_S41_L003_R1/abundance.tsv \ ../output/kallisto_01/5010_41_S41_L003_R2/abundance.tsv ``` ### Get packages ```{r} library(DESeq2) library(tidyverse) library(pheatmap) library(RColorBrewer) library(data.table) ``` ### Read in count matrix ```{r} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) ``` #### Round up to whole numbers ```{r} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` ### Get differential gene expression based on pH exposure ```{r} deseq2.colData <- data.frame(condition=factor(c(rep("control", 8), rep("low pH", 6))), type=factor(rep("single-read", 14))) rownames(deseq2.colData) <- colnames(data) deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) dim(countmatrix) dim(deseq2.colData) deseq2.colData deseq2.dds ``` ```{r} deseq2.dds <- DESeq(deseq2.dds) deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] ``` ### Look at DESeq matrix ```{r} head(deseq2.res) ``` ### Look for signficant values ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` ```{r} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG pH 7.8 (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` ```{r} write.table(tmp.sig, "../output/DEGlist.tab", row.names = T) ``` ## still missing sample 42 (5010_42_S42_L003_R2_001.fastq.gz) ```{bash} head -10 /home/shared/8TB_HDD_01/snow_crab/5010/exp01/5010_1_S1_L003_R1_001.fastq.gz ``` ```{bash} #list file names ls /home/shared/8TB_HDD_01/snow_crab/5010/exp01 ```