---
title: "02.10-D-Apul-RNAseq-genome-index-HiSat2"
author: "Sam White"
date: "2024-10-08"
output: 
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---

```{r setup, include=FALSE}
# Global knitr defaults for every chunk in this notebook.
# Individual chunks override these in their own chunk header.
library(knitr)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Do not evaluate chunks by default; chunks opt in with eval=TRUE
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```

This notebook will build an index of the *A. pulchra* genome using [HISAT2](https://github.com/DaehwanKimLab/hisat2) [@kim2019]. It utilizes the GTF file created in [`02.00-D-Apul-RNAseq-gff-to-gtf.Rmd`](./02.00-D-Apul-RNAseq-gff-to-gtf.Rmd).

# Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
# Write every shared setting to .bashvars so later chunks can `source` it.
# Values are echoed in single quotes on purpose: the ${...} references are
# written literally to the file and only expanded when the file is sourced.
{
echo "#### Assign Variables ####"
echo ""

echo "# Data directories"
echo 'export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular'
echo 'export genome_dir="${timeseries_dir}/D-Apul/data"'
echo 'export output_dir_top=${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2'
echo ""

echo "# Input/output files"
echo 'export genome_index_name="Apulchra-genome"'
echo 'export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab"'
echo 'export genome_gff="${genome_dir}/Apulchra-genome.gff"'
echo 'export genome_fasta="${genome_dir}/Apulchra-genome.fa"'
echo 'export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab"'
echo 'export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf"'

echo "# Paths to programs"
echo 'export programs_dir="/home/shared"'
echo 'export hisat2_dir="${programs_dir}/hisat2-2.2.1"'
echo ""
# hisat2 must be defined here: programs_array[hisat2] below reads ${hisat2},
# and without this line that array entry is an empty string.
echo 'export hisat2="${hisat2_dir}/hisat2"'
echo 'export hisat2_build="${hisat2_dir}/hisat2-build"'
echo 'export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py"'
echo 'export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py"'
echo ""

echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""

echo "# Programs associative array"
echo "declare -A programs_array"
echo "programs_array=("
echo '[hisat2]="${hisat2}" \'
echo '[hisat2_build]="${hisat2_build}" \'
echo '[hisat2_exons]="${hisat2_exons}" \'
echo '[hisat2_splice_sites]="${hisat2_splice_sites}" \'
echo ")"
echo ""

echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars

# Show the generated file in the rendered notebook
cat .bashvars
```

# Identify exons

```{r identify-exons, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Make directories, if they don't exist
mkdir --parents "${output_dir_top}"

# Create Hisat2 exons tab file
"${programs_array[hisat2_exons]}" \
"${transcripts_gtf}" \
> "${exons}"

# Preview the first lines of the exons file
head "${exons}"
```

# Identify splice sites

```{r identify-splice-sites, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Create Hisat2 splice sites tab file
"${programs_array[hisat2_splice_sites]}" \
"${transcripts_gtf}" \
> "${splice_sites}"

# Preview the first lines of the splice sites file
head "${splice_sites}"
```

# Build HISAT2 genome index

```{r build-hisat2-index, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Change to working directory.
# Stop if it is missing: the index name and the stderr log below are relative
# paths, so a failed cd would write them into the wrong directory.
cd "${output_dir_top}" || exit 1

# Build Hisat2 reference index using splice sites and exons
# stderr is captured to a log file next to the index files
"${programs_array[hisat2_build]}" \
"${genome_fasta}" \
"${genome_index_name}" \
--exon "${exons}" \
--ss "${splice_sites}" \
-p "${threads}" \
2> "${genome_index_name}"-hisat2_build.err

ls -lh
```

```{r list-hisat2-index-output, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Copy the index files (*.ht2) from the output directory to the genome
# data directory, then list the output directory.
for index in "${output_dir_top}"/*.ht2
do
  # When nothing matches, the glob is left as a literal pattern;
  # report that and skip rather than handing a non-existent path to cp.
  if [ ! -e "${index}" ]; then
    echo "No .ht2 index files found in ${output_dir_top}" >&2
    continue
  fi
  cp "${index}" "${genome_dir}"
done

ls -lh "${output_dir_top}"
```