---
title: "02.20-D-Apul-RNAseq-alignment-HiSat2"
author: "Sam White"
date: "2024-10-08"
output: 
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---

```{r setup, include=FALSE}
library(knitr)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```

This notebook will align [trimmed *A.pulchra* RNA-seq data](./01.00-D-Apul-RNAseq-trimming-fastp-FastQC-MultiQC.Rmd) to the *A.pulchra* genome using [HISAT2](https://github.com/DaehwanKimLab/hisat2) [@kim2019].

# Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
{
echo "#### Assign Variables ####"
echo ""

echo "# Data directories"
echo 'export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular'
echo 'export genome_dir="${timeseries_dir}/D-Apul/data"'
echo 'export genome_index_dir="${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2"'
echo 'export output_dir_top="${timeseries_dir}/D-Apul/output/02.20-D-Apul-RNAseq-alignment-HiSat2"'
echo 'export trimmed_fastqs_dir="${timeseries_dir}/D-Apul/output/01.00-D-Apul-RNAseq-trimming-fastp-FastQC-MultiQC/trimmed-fastqs"'
echo 'export trimmed_reads_url="https://gannet.fish.washington.edu/Atumefaciens/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/01.00-D-Apul-RNAseq-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/"'
echo ""

echo "# Location of Hisat2 index files"
echo "# Must keep variable name formatting, as it's used by HiSat2"
echo 'export HISAT2_INDEXES="${genome_index_dir}"'


echo "# Input files"
echo 'export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab"'
echo 'export genome_index_name="Apulchra-genome"'
echo 'export genome_gff="${genome_dir}/Apulchra-genome.gff"'
echo 'export genome_fasta="${genome_dir}/Apulchra-genome.fa"'
echo 'export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab"'
echo 'export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf"'

echo "# Output files"
echo 'export gtf_list="${output_dir_top}/gtf_list.txt"'
echo 'export merged_bam="${output_dir_top}/sorted-bams-merged.bam"'
echo ""

echo "# Paths to programs"
echo 'export programs_dir="/home/shared"'
echo 'export hisat2_dir="${programs_dir}/hisat2-2.2.1"'

echo 'export hisat2="${hisat2_dir}/hisat2"'

echo 'export multiqc=/home/sam/programs/mambaforge/bin/multiqc'

echo 'export samtools="${programs_dir}/samtools-1.12/samtools"'

echo 'export prepDE="${programs_dir}/stringtie-2.2.1.Linux_x86_64/prepDE.py3"'
echo 'export stringtie="${programs_dir}/stringtie-2.2.1.Linux_x86_64/stringtie"'

echo ""

echo "# Set FastQ filename patterns"
echo "export R1_fastq_pattern='*_R1_*.fq.gz'"
echo "export R2_fastq_pattern='*_R2_*.fq.gz'"
echo "export trimmed_fastq_pattern='*fastp-trim.fq.gz'"
echo ""

echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""

echo "# Set average read length - for StringTie prepDE.py"
echo 'export read_length=125'
echo ""


echo "## Initialize arrays"
echo 'export fastq_array_R1=()'
echo 'export fastq_array_R2=()'
echo 'export R1_names_array=()'
echo 'export R2_names_array=()'
echo "declare -A sample_timepoint_map"
echo ""

echo "# Programs associative array"
echo "declare -A programs_array"
echo "programs_array=("
echo '[hisat2]="${hisat2}" \'
echo '[multiqc]="${multiqc}" \'
echo '[prepDE]="${prepDE}" \'
echo '[samtools]="${samtools}" \'
echo '[stringtie]="${stringtie}" \'
echo ")"
echo ""

echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars

cat .bashvars
```

If needed, download raw RNA-seq.

Change `eval=FALSE` to `eval=TRUE` to execute the next two chunks to download RNA-seq and then verify MD5 checksums.

```{bash download-raw-reads, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Make output directory if it doesn't exist
mkdir --parents ${trimmed_fastqs_dir}

# Create list of only A.pulchra sample names
sample_list=$(awk -F "," 'NR > 2 {print $5"\t"$6}' ../../M-multi-species/data/ | awk -F"[\t-]" '$2 == "ACR" {print $1}')

echo ""
echo "${line}"
echo ""
echo "Sample list:"
echo ""
echo "${sample_list}"
echo ""
echo "${line}"
echo ""


# Use printf to format each item for use in wget
formatted_list=$(printf "*%s*," ${sample_list})

# Remove the trailing comma and append *.md5
formatted_list="${formatted_list%,},*.md5"

# Output the final wget command
echo ""
echo "${line}"
echo ""
echo "Formatted wget accept list:"
echo ""
echo "wget --accept=\"$formatted_list\""
echo ""
echo "${line}"
echo ""

# Run wget to retrieve FastQs and MD5 files
wget \
--directory-prefix ${trimmed_fastqs_dir} \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 3 \
--no-host-directories \
--no-parent \
--quiet \
--accept=\"$formatted_list\" ${trimmed_reads_url}

ls -lh "${trimmed_fastqs_dir}"
```

Verify raw read checksums

```{bash verify-raw-read-checksums, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

cd "${trimmed_fastqs_dir}"

# Verify checksums
for file in *.md5
do
  md5sum --check "${file}"
done
```

# Align reads using HISAT2

```{bash hisat2-alignments, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Make output directories, if they don't exist
mkdir --parents "${output_dir_top}"

# Change to ouput directory
cd "${output_dir_top}"

# Create associative array with sample and timepoint
metadata="../../../M-multi-species/data/rna_metadata.csv"

# Declare the array
declare -A sample_timepoint_map

# Read the metadata file line by line
while IFS=',' read -r sample_number sample_name plate well_number azenta_sample_name colony_id timepoint sample_type species_strain SampleBuffer; do
    # Check if the species is "Acropora pulchra"
    if [[ "${species_strain}" == "Acropora pulchra" ]]; then
        # Add the Azenta sample name as the key and Timepoint as the value in the associative array
        sample_timepoint_map["${azenta_sample_name}"]="${timepoint}"
    fi
done < <(tail -n +2 "${metadata}") # Skip the header

## Populate trimmed reads arrays
fastq_array_R1=("${trimmed_fastqs_dir}"/${R1_fastq_pattern})
fastq_array_R2=("${trimmed_fastqs_dir}"/${R2_fastq_pattern})

############## BEGIN HISAT2 ALIGNMENTS ##############
for sample in "${!sample_timepoint_map[@]}"
do


  # Create and switch to dedicated sample directory
  mkdir --parents "${sample}" && cd "$_"

  # Create HISAT2 list of fastq R1 files
  # and generated MD5 checksums file.
  for fastq in "${fastq_array_R1[@]}"
  do
    # Parse sample name from FastQ filename
    fastq_sample=$(echo "${fastq##*/}" | awk -F"[_-]" '{print $3}')
    
    # Process matching FastQ file, based on sample name
    if [ "${fastq_sample}" == "${sample}" ]; then
      
      # Generate checksum/list of input files used
      md5sum "${fastq}" >> input_fastqs_checksums.md5
      
      # Create comma-separated lists of FastQs for HISAT2
      printf -v joined_R1 '%s,' "${fastq}"
      fastq_list_R1=$(echo "${joined_R1%,}")
    fi
  done

  # Create HISAT2 list of fastq R1 files
  # and generated MD5 checksums file.
  for fastq in "${fastq_array_R2[@]}"
  do
    # Parse sample name from FastQ filename
    fastq_sample=$(echo "${fastq##*/}" | awk -F"[_-]" '{print $3}')
    
    # Process matching FastQ file, based on sample name
    if [ "${fastq_sample}" == "${sample}" ]; then
      
      # Generate checksum/list of input files used
      md5sum "${fastq}" >> input_fastqs_checksums.md5

      # Create comma-separated lists of FastQs for HISAT2
      printf -v joined_R2 '%s,' "${fastq}"
      fastq_list_R2=$(echo "${joined_R2%,}")
    fi
  done


  # HISAT2 alignments
  # Sets read group info (RG) using samples array
  "${programs_array[hisat2]}" \
  -x "${genome_index_name}" \
  -1 "${fastq_list_R1}" \
  -2 "${fastq_list_R2}" \
  --threads "${threads}" \
  --rg-id "${sample}" \
  --rg "SM:""${sample_timepoint_map[$sample]}" \
  2> "${sample}"_hisat2.stats \
  | tee >(${programs_array[samtools]} flagstat - > "${sample}"-hisat2_output.flagstat) \
  | ${programs_array[samtools]} sort - -@ "${threads}" -O BAM \
  | tee "${sample}".sorted.bam \
  | ${programs_array[samtools]} index - "${sample}".sorted.bam.bai
  
  
  # Run stringtie on alignments
  # Uses "-B" option to output tables intended for use in Ballgown
  # Uses "-e" option; recommended when using "-B" option.
  # Limits analysis to only reads alignments matching reference.
  "${programs_array[stringtie]}" "${sample}".sorted.bam \
  -p "${threads}" \
  -o "${sample}".gtf \
  -G "${genome_gff}" \
  -C "${sample}.cov_refs.gtf" \
  -B \
  -e 
  
  
  # Add GTFs to list file, only if non-empty
  # Identifies GTF files that only have header
  gtf_lines=$(wc -l < "${sample}".gtf )
  if [ "${gtf_lines}" -gt 2 ]; then
    echo "$(pwd)/${sample}.gtf" >> "${gtf_list}"
  fi 

  # Generate checksums
  find ./ -type f -not -name "*.md5" -exec md5sum {} \; > ${sample}_checksums.md5
  # Move up to orig. working directory
  cd ..

done
```

## Review HISAT2 Output

```{r tree-hisat2-output, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Change to ouput directory
cd "${output_dir_top}"

# Display HISAT2 output directory structure
# with directory (--du) and file sizes (-h)
tree --du -h
```

## MultiQC alignment rates

```{r multiqc-alignment-rates, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Change to ouput directory
cd "${output_dir_top}"

${programs_array[multiqc]} .

```

# Merge sorted BAMs

Merge all BAMs to singular BAM for use in transcriptome assembly later, if needed

```{r merge-sorted-BAMS, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Change to ouput directory
cd "${output_dir_top}"


## Create list of sorted BAMs for merging
find . -name "*sorted.bam" > sorted_bams.list

## Merge sorted BAMs
${programs_array[samtools]} merge \
-b sorted_bams.list \
${merged_bam} \
--threads ${threads}

## Index merged BAM
${programs_array[samtools]} index ${merged_bam}
```

# Create combined GTF

```{r combine-GTFs, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Change to ouput directory
cd "${output_dir_top}"


# Create singular transcript file, using GTF list file
"${programs_array[stringtie]}" --merge \
"${gtf_list}" \
-p "${threads}" \
-G "${genome_gff}" \
-o "${genome_index_name}".stringtie.gtf
```

# Create DESeq2 Count Matrices

```{r DESeq2-matrices, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Change to ouput directory
cd "${output_dir_top}"

# Create file list for prepDE.py
while read -r line
do
  sample_no_path=${line##*/}
  sample=${sample_no_path%.*}
  echo ${sample} ${line}
done < gtf_list.txt >> prepDE-sample_list.txt

# Create count matrices for genes and transcripts
# Compatible with import to DESeq2
python3 "${programs_array[prepDE]}" \
--input=prepDE-sample_list.txt \
-g apul-gene_count_matrix.csv \
-t apul-transcript_count_matrix.csv \
--length=${read_length}
```

# Generate checksums

```{r generate-checksums, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Change to ouput directory
cd "${output_dir_top}"

# Uses find command to avoid passing
# directory names to the md5sum command.
find . -maxdepth 1 -type f -exec md5sum {} + \
| tee --append checksums.md5
```