---
title: "differential gene expression"
output: html_document
created: "2023-04-07"
last edited: "2023-04-20"
---

# Introduction

This is code for examining the effect of ocean acidification on gene expression of snow crabs.

### Look at all RNASeq files

```{bash}
ls /home/shared/8TB_HDD_01/snow_crab/5010
```

### Download C. bairdi reference, save it to "data" directory

```{bash}
cd ../data
curl -O https://owl.fish.washington.edu/halfshell/genomic-databank/cbai_genome_v1.0.fasta
```

### Set index

```{bash}
#index "cbai_genome_v1.0.fasta" and name it as "cbai_genome_v1.0.index"
/home/shared/kallisto/kallisto \
index -i ../data/cbai_genome_v1.0.index \  
../data/cbai_genome_v1.0.fasta
```

### Look at files within exp01

```{bash}
#list file names
ls /home/shared/8TB_HDD_01/snow_crab/5010/exp01
```

### Create abundance estimate files for desired sequences


```{bash}
find /home/shared/8TB_HDD_01/snow_crab/5010/exp01/*fastq.gz
```


```{bash}
#make a new directory in output
mkdir ../output/kallisto_04_paired

#find specific files and create abundance estimates
find /home/shared/8TB_HDD_01/snow_crab/5010/exp01/*fastq.gz \
| xargs basename -s _L003_R1_001.fastq.gz | xargs -I{} \
/home/shared/kallisto/kallisto \
quant -i ../data/cbai_genome_v1.0.index \
-o ../output/kallisto_04_paired/{} \
-t 40 \
/home/shared/8TB_HDD_01/snow_crab/5010/exp01/{}_L003_R1_001.fastq.gz \
/home/shared/8TB_HDD_01/snow_crab/5010/exp01/{}_L003_R2_001.fastq.gz
```

### Create a gene expression matrix from kallisto output files

#### Run abundance_estimates_to_matrix.pl script from the Trinity RNA-seq assembly software package to

```{bash}
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
    --gene_trans_map none \
    --out_prefix ../output/kallisto_01 \
    --name_sample_by_basedir \
    ../output/kallisto_01/5010_1_S1_L003_R1/abundance.tsv \
    ../output/kallisto_01/5010_1_S1_L003_R2/abundance.tsv \
    ../output/kallisto_01/5010_2_S2_L003_R1/abundance.tsv \
    ../output/kallisto_01/5010_2_S2_L003_R2/abundance.tsv \
    ../output/kallisto_01/5010_3_S3_L003_R1/abundance.tsv \
    ../output/kallisto_01/5010_3_S3_L003_R2/abundance.tsv \
    ../output/kallisto_01/5010_4_S4_L003_R1/abundance.tsv \
    ../output/kallisto_01/5010_4_S4_L003_R2/abundance.tsv \
    ../output/kallisto_01/5010_39_S39_L003_R1/abundance.tsv \
    ../output/kallisto_01/5010_39_S39_L003_R2/abundance.tsv \
    ../output/kallisto_01/5010_40_S40_L003_R1/abundance.tsv \
    ../output/kallisto_01/5010_40_S40_L003_R2/abundance.tsv \
    ../output/kallisto_01/5010_41_S41_L003_R1/abundance.tsv \
    ../output/kallisto_01/5010_41_S41_L003_R2/abundance.tsv
```

### Get packages

```{r}
library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
```

### Read in count matrix

```{r}
countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)
```

#### Round up to whole numbers

```{r}
countmatrix <- round(countmatrix, 0)
str(countmatrix)
```

### Get differential gene expression based on pH exposure

```{r}
deseq2.colData <- data.frame(condition=factor(c(rep("control", 8), rep("low pH", 6))), 
                             type=factor(rep("single-read", 14)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
                                     colData = deseq2.colData, 
                                     design = ~ condition)
dim(countmatrix)
dim(deseq2.colData)
deseq2.colData
deseq2.dds
```

```{r}
deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
```

### Look at DESeq matrix

```{r}
head(deseq2.res)
```

### Look for signficant values

```{r}
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```

```{r}
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG pH 7.8  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
```

```{r}
write.table(tmp.sig, "../output/DEGlist.tab", row.names = T)
```

## still missing sample 42 (5010_42_S42_L003_R2_001.fastq.gz)


```{bash}
head -10 /home/shared/8TB_HDD_01/snow_crab/5010/exp01/5010_1_S1_L003_R1_001.fastq.gz
```

```{bash}
#list file names
ls /home/shared/8TB_HDD_01/snow_crab/5010/exp01
```