02.10-D-Apul-RNAseq-genome-index-HiSat2
================
Sam White
2024-10-08

- <a href="#1-create-a-bash-variables-file"
  id="toc-1-create-a-bash-variables-file">1 Create a Bash variables
  file</a>
- <a href="#2-identify-exons" id="toc-2-identify-exons">2 Identify
  exons</a>
- <a href="#3-identify-splice-sites" id="toc-3-identify-splice-sites">3
  Identify splice sites</a>
- <a href="#4-build-hisat2-genome-index"
  id="toc-4-build-hisat2-genome-index">4 Build HISAT2 genome index</a>

This notebook will build an index of the *A.pulchra* genome using
[HISAT2](https://github.com/DaehwanKimLab/hisat2) (Kim et al. 2019). It
utilizes the GTF file created in
[`02.00-D-Apul-RNAseq-gff-to-gtf.Rmd`](./02.00-D-Apul-RNAseq-gff-to-gtf.Rmd).

# 1 Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

``` bash
{
echo "#### Assign Variables ####"
echo ""

echo "# Data directories"
echo 'export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular'
echo 'export genome_dir="${timeseries_dir}/D-Apul/data"'
echo 'export output_dir_top=${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2'
echo ""

echo "# Input/output files"
echo 'export genome_index_name="Apulchra-genome"'
echo 'export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab"'
echo 'export genome_gff="${genome_dir}/Apulchra-genome.gff"'
echo 'export genome_fasta="${genome_dir}/Apulchra-genome.fa"'
echo 'export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab"'
echo 'export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf"'

echo "# Paths to programs"
echo 'export programs_dir="/home/shared"'
echo 'export hisat2_dir="${programs_dir}/hisat2-2.2.1"'
echo ""
echo 'export hisat2_build="${hisat2_dir}/hisat2-build"'
echo 'export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py"'
echo 'export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py"'
echo ""

echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""

echo "# Programs associative array"
echo "declare -A programs_array"
echo "programs_array=("
echo '[hisat2]="${hisat2}" \'
echo '[hisat2_build]="${hisat2_build}" \'
echo '[hisat2_exons]="${hisat2_exons}" \'
echo '[hisat2_splice_sites]="${hisat2_splice_sites}" \'
echo ")"
echo ""

echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars

cat .bashvars
```

    #### Assign Variables ####

    # Data directories
    export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular
    export genome_dir="${timeseries_dir}/D-Apul/data"
    export output_dir_top=${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2

    # Input/output files
    export genome_index_name="Apulchra-genome"
    export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab"
    export genome_gff="${genome_dir}/Apulchra-genome.gff"
    export genome_fasta="${genome_dir}/Apulchra-genome.fa"
    export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab"
    export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf"
    # Paths to programs
    export programs_dir="/home/shared"
    export hisat2_dir="${programs_dir}/hisat2-2.2.1"

    export hisat2_build="${hisat2_dir}/hisat2-build"
    export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py"
    export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py"

    # Set number of CPUs to use
    export threads=40

    # Programs associative array
    declare -A programs_array
    programs_array=(
    [hisat2]="${hisat2}" \
    [hisat2_build]="${hisat2_build}" \
    [hisat2_exons]="${hisat2_exons}" \
    [hisat2_splice_sites]="${hisat2_splice_sites}" \
    )

    # Print formatting
    export line="--------------------------------------------------------"

# 2 Identify exons

``` bash
# Load bash variables into memory
source .bashvars

# Make directories, if they don't exist
mkdir --parents "${output_dir_top}"

# Create Hisat2 exons tab file
"${programs_array[hisat2_exons]}" \
"${transcripts_gtf}" \
> "${exons}"

head "${exons}"
```

    ntLink_0    1104    1187    +
    ntLink_0    1860    1940    +
    ntLink_0    2761    2838    +
    ntLink_0    5043    7055    +
    ntLink_0    10214   10412   +
    ntLink_0    10613   10675   +
    ntLink_0    11271   11315   +
    ntLink_0    11517   11590   +
    ntLink_0    12240   12500   +
    ntLink_0    13073   14382   +

# 3 Identify splice sites

``` bash
# Load bash variables into memory
source .bashvars

# Create Hisat2 splice sites tab file
"${programs_array[hisat2_splice_sites]}" \
"${transcripts_gtf}" \
> "${splice_sites}"

head "${splice_sites}"
```

    ntLink_0    1187    1860    +
    ntLink_0    1940    2761    +
    ntLink_0    2838    5043    +
    ntLink_0    10412   10613   +
    ntLink_0    10675   11271   +
    ntLink_0    11315   11517   +
    ntLink_0    11590   12240   +
    ntLink_0    12500   13073   +
    ntLink_0    14382   14721   +
    ntLink_0    14899   15032   +

# 4 Build HISAT2 genome index

``` bash
# Load bash variables into memory
source .bashvars

# Change to working directory
cd "${output_dir_top}"

# Build Hisat2 reference index using splice sites and exons
"${programs_array[hisat2_build]}" \
"${genome_fasta}" \
"${genome_index_name}" \
--exon "${exons}" \
--ss "${splice_sites}" \
-p "${threads}" \
2> "${genome_index_name}"-hisat2_build.err

ls -lh
```

``` bash
# Load bash variables into memory
source .bashvars

for index in "${output_dir_top}"/*.ht2
do
  cp ${index} ${genome_dir}
done

ls -lh "${output_dir_top}"
```

    total 1.1G
    -rw-r--r-- 1 sam sam 312M Oct  8 12:19 Apulchra-genome.1.ht2
    -rw-r--r-- 1 sam sam 125M Oct  8 12:19 Apulchra-genome.2.ht2
    -rw-r--r-- 1 sam sam 1.6K Oct  8 12:09 Apulchra-genome.3.ht2
    -rw-r--r-- 1 sam sam 124M Oct  8 12:09 Apulchra-genome.4.ht2
    -rw-r--r-- 1 sam sam 335M Oct  8 12:21 Apulchra-genome.5.ht2
    -rw-r--r-- 1 sam sam 127M Oct  8 12:21 Apulchra-genome.6.ht2
    -rw-r--r-- 1 sam sam 7.2M Oct  8 12:09 Apulchra-genome.7.ht2
    -rw-r--r-- 1 sam sam 1.5M Oct  8 12:09 Apulchra-genome.8.ht2
    -rw-r--r-- 1 sam sam  21K Oct  8 12:21 Apulchra-genome-hisat2_build.err
    -rw-r--r-- 1 sam sam 5.9M Oct  8 12:35 Apulchra-genome_hisat2_exons.tab
    -rw-r--r-- 1 sam sam 4.7M Oct  8 12:35 Apulchra-genome_hisat2_splice_sites.tab

<div id="refs" class="references csl-bib-body hanging-indent">

<div id="ref-kim2019" class="csl-entry">

Kim, Daehwan, Joseph M. Paggi, Chanhee Park, Christopher Bennett, and
Steven L. Salzberg. 2019. “Graph-Based Genome Alignment and Genotyping
with HISAT2 and HISAT-Genotype.” *Nature Biotechnology* 37 (8): 907–15.
<https://doi.org/10.1038/s41587-019-0201-4>.

</div>

</div>