---
title: "10-Hisat"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"  
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
editor_options: 
  markdown: 
    wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
library(pheatmap)
library(DESeq2)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```

# Differentially Expressed Genes
 
Alt splice test run

# Reads

```{r, engine='bash', eval=TRUE}
ls ../data/reads/*
```

Need to find treatment file...


# Genome

<https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_031168955.1/>

![genome](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Gadus_macrocephalus_genome_assembly_ASM3116895v1_-_NCBI_-_NLM_2024-05-15_09-20-59.png)

```{r, engine='bash'}
cd ../data

/home/shared/datasets download genome accession GCF_031168955.1 --include gff3,gtf,rna,cds,protein,genome,seq-report
```

```{r, engine='bash'}
cd ../data 
unzip ncbi_dataset.zip
```

```{r, engine='bash', eval=TRUE}
ls /home/shared/8TB_HDD_03/sr320/github/project-cod-temperature/data/ncbi_dataset/data/GCF_031168955.1

```

# Hisat

```{r, engine='bash'}
/home/shared/hisat2-2.2.1/hisat2_extract_exons.py \
../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \
> ../output/10-hisat-deseq2/m_exon.tab
```

```{r, engine='bash'}
/home/shared/hisat2-2.2.1/hisat2_extract_splice_sites.py \
../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \
> ../output/10-hisat-deseq2/m_spice_sites.tab
```

```{r, engine='bash'}
echo "10-hisat-deseq2/GCF*" >> ../output/.gitignore
```

```{r, engine='bash'}
/home/shared/hisat2-2.2.1/hisat2-build \
../data/ncbi_dataset/data/GCF_031168955.1/GCF_031168955.1_ASM3116895v1_genomic.fna \
../output/10-hisat-deseq2/GCF_031168955.1.index \
--exon ../output/10-hisat-deseq2/m_exon.tab \
--ss ../output/10-hisat-deseq2/m_spice_sites.tab \
-p 20 \
../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \
2> ../output/10-hisat-deseq2/hisat2-build_stats.txt
```

```{r, engine='bash'}
echo "10-hisat-deseq2/*sam" >> ../output/.gitignore
```

```{r, engine='bash'}
find ../data/reads/*.trimmed.R1.fastq.gz \
| xargs basename -s .trimmed.R1.fastq.gz | xargs -I{} \
/home/shared/hisat2-2.2.1/hisat2 \
-x ../output/10-hisat-deseq2/GCF_031168955.1.index \
--dta \
-p 20 \
-1 ../data/reads/{}.trimmed.R1.fastq.gz \
-2 ../data/reads/{}.trimmed.R2.fastq.gz \
-S ../output/10-hisat-deseq2/{}.sam \
2> ../output/10-hisat-deseq2/hisat.out
```

```{r, engine='bash'}
echo "10-hisat-deseq2/*bam" >> ../output/.gitignore
echo "10-hisat-deseq2/*bam*" >> ../output/.gitignore
```

```{r, engine='bash'}
for samfile in ../output/10-hisat-deseq2/*.sam; do
  bamfile="${samfile%.sam}.bam"
  sorted_bamfile="${samfile%.sam}.sorted.bam"
  /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile"
  /home/shared/samtools-1.12/samtools sort -@ 20 "$bamfile" -o "$sorted_bamfile"
  /home/shared/samtools-1.12/samtools index -@ 20 "$sorted_bamfile"
done
```

```{r, engine='bash'}
rm ../output/10-hisat-deseq2/*sam
```


```{r, engine='bash'}
ls ../output/10-hisat-deseq2/*sorted.bam | wc -l
```

```{r, engine='bash', eval=TRUE}
tail ../output/10-hisat-deseq2/hisat.out
```

```{r, engine='bash', eval=TRUE}
cat ../output/10-hisat-deseq2/hisat.out \
| grep "overall alignment rate"
```

# Stringtie

```{r, engine='bash'}
echo "10-hisat-deseq2/*gtf" >> ../output/.gitignore
```

```{r, engine='bash'}
find ../output/10-hisat-deseq2/*sorted.bam \
| xargs basename -s .sorted.bam | xargs -I{} \
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \
-p 36 \
-eB \
-G ../data/ncbi_dataset/data/GCF_031168955.1/genomic.gff \
-o ../output/10-hisat-deseq2/{}.gtf \
../output/10-hisat-deseq2/{}.sorted.bam
```


# Count Matrix

list format
```
RNA-ACR-140 ../output/15-Apul-hisat/RNA-ACR-140.gtf
RNA-ACR-145 ../output/15-Apul-hisat/RNA-ACR-145.gtf
RNA-ACR-173 ../output/15-Apul-hisat/RNA-ACR-173.gtf
RNA-ACR-178 ../output/15-Apul-hisat/RNA-ACR-178.gtf
```

```{r, engine='bash', eval=TRUE}
ls ../output/10-hisat-deseq2/*gtf
```


```{r, engine='bash', eval=TRUE}
head ../data/list01.txt
```


```{r, engine='bash'}
python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \
-i ../data/list01.txt \
-g ../output/10-hisat-deseq2/gene_count_matrix.csv \
-t ../output/10-hisat-deseq2/transcript_count_matrix.csv
```

```{r, engine='bash', eval=TRUE}
head ../output/10-hisat-deseq2/*matrix.csv
```

# DEseq2


need `conditions.txt` file in this format

```
SampleID	Condition
RNA.ACR.140	control
RNA.ACR.145	control
RNA.ACR.173	treated
RNA.ACR.178	treated
```

```{r, engine='bash', eval=TRUE}
head ../data/conditions.txt
```


```{r, eval=TRUE}
library(DESeq2)
```

```{r, eval=TRUE, cache=TRUE}
# Load gene(/transcript) count matrix and labels
countData <- as.matrix(read.csv("../output/10-hisat-deseq2/gene_count_matrix.csv", row.names="gene_id"))
colData <- read.csv("../data/conditions.txt", sep="\t", row.names = 1)

# Note: The PHENO_DATA file contains information on each sample, e.g., sex or population.
# The exact way to import this depends on the format of the file.

# Check all sample IDs in colData are also in CountData and match their orders
all(rownames(colData) %in% colnames(countData)) # This should return TRUE

countData <- countData[, rownames(colData)]
all(rownames(colData) == colnames(countData)) # This should also return TRUE

# Create a DESeqDataSet from count matrix and labels
dds <- DESeqDataSetFromMatrix(countData = countData,
                              colData = colData, design = ~ Condition)


# Run the default analysis for DESeq2 and generate results table
dds <- DESeq(dds)
deseq2.res <- results(dds)

# Sort by adjusted p-value and display
resOrdered <- deseq2.res[order(deseq2.res$padj), ]
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup = "Condition")
```


```{r, eval=TRUE}
# Select top 50 differentially expressed genes
res <- results(dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]

# Extract counts and normalize
counts <- counts(dds, normalized = TRUE)
counts_top <- counts[top_genes, ]

# Log-transform counts
log_counts_top <- log2(counts_top + 1)

# Generate heatmap
pheatmap(log_counts_top, scale = "row")
```

```{r, eval=TRUE}
# Count number of hits with adjusted p-value less then 0.05
dim(res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```

```{r, eval=TRUE}
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
```

```{r}
write.table(tmp.sig, "../output/10-hisat-deseq2/DEGlist.tab", sep = '\t', row.names = T)
```

```{r, eval=TRUE}
deglist <- read.csv("../output/10-hisat-deseq2/DEGlist.tab", sep = '\t', header = TRUE)
deglist$RowName <- rownames(deglist)
deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns
```

```{r,eval=TRUE}
head(deglist)
```