---
title: "10-Hisat"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"  
analyses: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
editor_options: 
  markdown: 
    wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
library(pheatmap)
library(DESeq2)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code analyses
)
```

# Differentially Expressed Genes
 

# Reads

```{r, engine='bash', eval=TRUE}
ls /home/shared/8TB_HDD_02/graceac9/data/pycno2021/*
```

```{r, engine='bash}
/home/shared/FastQC-0.12.1/fastqc \
/home/shared/8TB_HDD_02/graceac9/data/pycno2021/*fq.gz \
-t 36 \
-o ../analyses/10-hisat-deseq2/
```


```{r, engine='bash'}
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate
which multiqc


multiqc ../analyses/10-hisat-deseq2/ \
-o ../analyses/10-hisat-deseq2/
```


# Genome
https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_032158295.1/

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Pycnopodia_helianthoides_genome_assembly_ASM3215829v1_-_NCBI_-_NLM_2024-05-25_15-32-13.png)

```{r, engine='bash'}
cd ../data

/home/shared/datasets download genome accession GCA_032158295.1 --include gff3,rna,cds,protein,genome,seq-report
```

```{r, engine='bash'}
cd ../data 
unzip ncbi_dataset.zip
```

```{r, engine='bash', eval=TRUE}
ls ../data/ncbi_dataset/data/GCA_032158295.1

```

## Annotation files


```{r, engine='bash', eval=TRUE}
head ../data/augustus.hints.gtf

```


# Hisat

```{r, engine='bash'}
/home/shared/hisat2-2.2.1/hisat2_extract_exons.py \
../data/augustus.hints.gtf \
> ../analyses/10-hisat-deseq2/m_exon.tab
```

```{r, engine='bash'}
/home/shared/hisat2-2.2.1/hisat2_extract_splice_sites.py \
../data/augustus.hints.gtf \
> ../analyses/10-hisat-deseq2/m_spice_sites.tab
```

```{r, engine='bash'}
echo "10-hisat-deseq2/GCF*" >> ../analyses/.gitignore
```

```{r, engine='bash'}
/home/shared/hisat2-2.2.1/hisat2-build \
../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna \
../analyses/10-hisat-deseq2/GCA_032158295.index \
--exon ../analyses/10-hisat-deseq2/m_exon.tab \
--ss ../analyses/10-hisat-deseq2/m_spice_sites.tab \
-p 20 \
../data/augustus.hints.gtf \
2> ../analyses/10-hisat-deseq2/hisat2-build_stats.txt
```

```{r, engine='bash'}
echo "10-hisat-deseq2/*sam" >> ../analyses/.gitignore
```


```{r, engine='bash', eval=TRUE}
find /home/shared/8TB_HDD_02/graceac9/data/pycno2021/*_R1_001.fastq.gz.fastp-trim.20220810.fq.gz | xargs basename -s _R1_001.fastq.gz.fastp-trim.20220810.fq.gz | xargs -I{} echo {}
```


```{r, engine='bash'}
find /home/shared/8TB_HDD_02/graceac9/data/pycno2021/*_R1_001.fastq.gz.fastp-trim.20220810.fq.gz \
| xargs basename -s _R1_001.fastq.gz.fastp-trim.20220810.fq.gz | xargs -I{} \
/home/shared/hisat2-2.2.1/hisat2 \
-x ../analyses/10-hisat-deseq2/GCA_032158295.index \
--dta \
-p 36 \
-1 /home/shared/8TB_HDD_02/graceac9/data/pycno2021/{}_R1_001.fastq.gz.fastp-trim.20220810.fq.gz \
-2 /home/shared/8TB_HDD_02/graceac9/data/pycno2021/{}_R2_001.fastq.gz.fastp-trim.20220810.fq.gz \
-S ../analyses/10-hisat-deseq2/{}.sam \
2> ../analyses/10-hisat-deseq2/hisat.out
```


mod

```{r, engine='bash'}
find /home/shared/8TB_HDD_02/graceac9/data/pycno2021/*_R1_001.fastq.gz.fastp-trim.20220810.fq.gz \
| xargs -I{} basename -s _R1_001.fastq.gz.fastp-trim.20220810.fq.gz {} \
| xargs -I{} sh -c '/home/shared/hisat2-2.2.1/hisat2 \
-x ../analyses/10-hisat-deseq2/GCA_032158295.index \
--dta \
-p 20 \
-1 /home/shared/8TB_HDD_02/graceac9/data/pycno2021/{}_R1_001.fastq.gz.fastp-trim.20220810.fq.gz \
-2 /home/shared/8TB_HDD_02/graceac9/data/pycno2021/{}_R2_001.fastq.gz.fastp-trim.20220810.fq.gz \
-S ../analyses/10-hisat-deseq2/{}_02.sam \
> ../analyses/10-hisat-deseq2/{}_hisat.stdout 2> ../analyses/10-hisat-deseq2/{}_hisat.stderr'
```


keeping unmapped reads 
```{r, engine='bash'}
find /home/shared/8TB_HDD_02/graceac9/data/pycno2021/*_R1_001.fastq.gz.fastp-trim.20220810.fq.gz \
| xargs -I{} basename -s _R1_001.fastq.gz.fastp-trim.20220810.fq.gz {} \
| xargs -I{} sh -c '/home/shared/hisat2-2.2.1/hisat2 \
-x ../analyses/10-hisat-deseq2/GCA_032158295.index \
--dta \
-p 20 \
-1 /home/shared/8TB_HDD_02/graceac9/data/pycno2021/{}_R1_001.fastq.gz.fastp-trim.20220810.fq.gz \
-2 /home/shared/8TB_HDD_02/graceac9/data/pycno2021/{}_R2_001.fastq.gz.fastp-trim.20220810.fq.gz \
-S ../analyses/10-hisat-deseq2/{}_03.sam \
--un-conc ../analyses/10-hisat-deseq2/{}_unmapped_reads.fastq \
> ../analyses/10-hisat-deseq2/{}_hisat03.stdout 2> ../analyses/10-hisat-deseq2/{}_hisat03.stderr'
```

--un unmapped_reads.fastq

# Perform assembly
${trinity_dir}/Trinity \
--seqType fq \
--SS_lib_type RF \
--max_memory 100G \
--CPU ${threads} \
--samples_file ${samples}

```{bash}

export PATH=${PATH}:/home/shared/bowtie2-2.4.4-linux-x86_64/
export PATH=${PATH}:/home/shared/jellyfish-2.3.0/bin/
export PATH=${PATH}:/home/shared/salmon-1.4.0_linux_x86_64/bin/


/home/shared/trinityrnaseq-v2.12.0/Trinity \
--seqType fq \
--max_memory 300G \
--CPU 24 \
--left ../analyses/10-hisat-deseq2/PSC-35_unmapped_reads.1.fastq \
--right ../analyses/10-hisat-deseq2/PSC-35_unmapped_reads.2.fastq \
--output ../analyses/10-hisat-deseq2/trinity
```

```{bash}
/home/shared/trinityrnaseq-v2.12.0/Trinity --help

```

/home/shared/trinityrnaseq-v2.12.0


Explanation
xargs -I{}: This option allows you to replace {} in the command with the output from the previous command (i.e., basename). It's used twice: first, to strip the suffix from the filenames, and second, to construct and execute the hisat2 command.

sh -c: This is used to execute a complex command within xargs. It's necessary because the output redirection (>, 2>) is shell functionality, and without sh -c, xargs wouldn't handle it correctly.

Output Redirection:

> ../analyses/10-hisat-deseq2/{}_hisat.stdout: Redirects the standard output to a unique file for each sample.
2> ../analyses/10-hisat-deseq2/{}_hisat.stderr: Redirects the standard error to a different unique file for each sample.
This setup ensures that the output from each sample's alignment process is neatly organized into separate files, making it easier to manage and debug individual runs.


```{r, engine='bash'}
echo "10-hisat-deseq2/*bam" >> ../analyses/.gitignore
echo "10-hisat-deseq2/*bam*" >> ../analyses/.gitignore
```

```{r, engine='bash'}
for samfile in ../analyses/10-hisat-deseq2/*.sam; do
  bamfile="${samfile%.sam}.bam"
  sorted_bamfile="${samfile%.sam}.sorted.bam"
  /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile"
  /home/shared/samtools-1.12/samtools sort -@ 20 "$bamfile" -o "$sorted_bamfile"
  /home/shared/samtools-1.12/samtools index -@ 20 "$sorted_bamfile"
done
```

```{r, engine='bash'}
rm ../analyses/10-hisat-deseq2/*sam
```


```{r, engine='bash'}
ls ../analyses/10-hisat-deseq2/*sorted.bam | wc -l
```

```{r, engine='bash', eval=TRUE}
head ../analyses/10-hisat-deseq2/hisat.out
```

```{r, engine='bash', eval=TRUE}
cat ../analyses/10-hisat-deseq2/hisat.out \
| grep "overall alignment rate"
```

# Stringtie

```{r, engine='bash'}
echo "10-hisat-deseq2/*gtf" >> ../analyses/.gitignore
```


```{bash}
/home/shared/gffread-0.12.7.Linux_x86_64/gffread \
../data/augustus.hints.gtf \
-T \
-o ../analyses/10-hisat-deseq2/augustus.hint.2.gff

```


```{r, engine='bash'}
find ../analyses/10-hisat-deseq2/*sorted.bam \
| xargs basename -s .sorted.bam | xargs -I{} \
sh -c '/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \
-p 36 \
-eB \
-G ../analyses/10-hisat-deseq2/augustus.hint.2.gff \
-o ../analyses/10-hisat-deseq2/{}.gtf \
../analyses/10-hisat-deseq2/{}.sorted.bam'
```

```{bash}
head ../analyses/10-hisat-deseq2/*gtf

```


# Count Matrix

list format
```
RNA-ACR-140 ../analyses/15-Apul-hisat/RNA-ACR-140.gtf
RNA-ACR-145 ../analyses/15-Apul-hisat/RNA-ACR-145.gtf
RNA-ACR-173 ../analyses/15-Apul-hisat/RNA-ACR-173.gtf
RNA-ACR-178 ../analyses/15-Apul-hisat/RNA-ACR-178.gtf
```

```{r, engine='bash', eval=TRUE}
ls ../analyses/10-hisat-deseq2/*gtf
```


```{r, engine='bash', eval=TRUE}
head ../data/list01.txt
```


```{r, engine='bash'}
python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \
-i ../data/list01.txt \
-g ../analyses/10-hisat-deseq2/gene_count_matrix.csv \
-t ../analyses/10-hisat-deseq2/transcript_count_matrix.csv
```

```{r, engine='bash', eval=TRUE}
head ../analyses/10-hisat-deseq2/*matrix.csv
```

# DEseq2


need `conditions.txt` file in this format

```
SampleID	Condition
RNA.ACR.140	control
RNA.ACR.145	control
RNA.ACR.173	treated
RNA.ACR.178	treated
```

```{r, engine='bash', eval=TRUE}
head ../data/conditions.txt
```


```{r, eval=TRUE}
library(DESeq2)
```

```{r, eval=TRUE, cache=TRUE}
# Load gene(/transcript) count matrix and labels
countData <- as.matrix(read.csv("../analyses/10-hisat-deseq2/gene_count_matrix.csv", row.names="gene_id"))
colData <- read.csv("../data/conditions.txt", sep="\t", row.names = 1)

# Note: The PHENO_DATA file contains information on each sample, e.g., sex or population.
# The exact way to import this depends on the format of the file.

# Check all sample IDs in colData are also in CountData and match their orders
all(rownames(colData) %in% colnames(countData)) # This should return TRUE

countData <- countData[, rownames(colData)]
all(rownames(colData) == colnames(countData)) # This should also return TRUE

# Create a DESeqDataSet from count matrix and labels
dds <- DESeqDataSetFromMatrix(countData = countData,
                              colData = colData, design = ~ Condition)


# Run the default analysis for DESeq2 and generate results table
dds <- DESeq(dds)
deseq2.res <- results(dds)

# Sort by adjusted p-value and display
resOrdered <- deseq2.res[order(deseq2.res$padj), ]
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup = "Condition")
```


```{r, eval=TRUE}
# Select top 50 differentially expressed genes
res <- results(dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]

# Extract counts and normalize
counts <- counts(dds, normalized = TRUE)
counts_top <- counts[top_genes, ]

# Log-transform counts
log_counts_top <- log2(counts_top + 1)

# Generate heatmap
pheatmap(log_counts_top, scale = "row")
```

```{r, eval=TRUE}
# Count number of hits with adjusted p-value less then 0.05
dim(res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```

```{r, eval=TRUE}
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
```

```{r}
write.table(tmp.sig, "../analyses/10-hisat-deseq2/DEGlist.tab", sep = '\t', row.names = T)
```

```{r, eval=TRUE}
deglist <- read.csv("../analyses/10-hisat-deseq2/DEGlist.tab", sep = '\t', header = TRUE)
deglist$RowName <- rownames(deglist)
deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns
```

```{r,eval=TRUE}
head(deglist)
```