02.10-D-Apul-RNAseq-genome-index-HiSat2 ================ Sam White 2024-10-08 - 1 Create a Bash variables file - 2 Identify exons - 3 Identify splice sites - 4 Build HISAT2 genome index This notebook will build an index of the *A.pulchra* genome using [HISAT2](https://github.com/DaehwanKimLab/hisat2) (Kim et al. 2019). It utilizes the GTF file created in [`02.00-D-Apul-RNAseq-gff-to-gtf.Rmd`](./02.00-D-Apul-RNAseq-gff-to-gtf.Rmd). # 1 Create a Bash variables file This allows usage of Bash variables across R Markdown chunks. ``` bash { echo "#### Assign Variables ####" echo "" echo "# Data directories" echo 'export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular' echo 'export genome_dir="${timeseries_dir}/D-Apul/data"' echo 'export output_dir_top=${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2' echo "" echo "# Input/output files" echo 'export genome_index_name="Apulchra-genome"' echo 'export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab"' echo 'export genome_gff="${genome_dir}/Apulchra-genome.gff"' echo 'export genome_fasta="${genome_dir}/Apulchra-genome.fa"' echo 'export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab"' echo 'export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf"' echo "# Paths to programs" echo 'export programs_dir="/home/shared"' echo 'export hisat2_dir="${programs_dir}/hisat2-2.2.1"' echo "" echo 'export hisat2_build="${hisat2_dir}/hisat2-build"' echo 'export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py"' echo 'export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py"' echo "" echo "# Set number of CPUs to use" echo 'export threads=40' echo "" echo "# Programs associative array" echo "declare -A programs_array" echo "programs_array=(" echo '[hisat2]="${hisat2}" \' echo '[hisat2_build]="${hisat2_build}" \' echo '[hisat2_exons]="${hisat2_exons}" \' echo '[hisat2_splice_sites]="${hisat2_splice_sites}" \' echo ")" echo "" echo "# Print formatting" echo 'export line="--------------------------------------------------------"' echo "" } > .bashvars cat .bashvars ``` #### Assign Variables #### # Data directories export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular export genome_dir="${timeseries_dir}/D-Apul/data" export output_dir_top=${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2 # Input/output files export genome_index_name="Apulchra-genome" export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab" export genome_gff="${genome_dir}/Apulchra-genome.gff" export genome_fasta="${genome_dir}/Apulchra-genome.fa" export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab" export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf" # Paths to programs export programs_dir="/home/shared" export hisat2_dir="${programs_dir}/hisat2-2.2.1" export hisat2_build="${hisat2_dir}/hisat2-build" export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py" export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py" # Set number of CPUs to use export threads=40 # Programs associative array declare -A programs_array programs_array=( [hisat2]="${hisat2}" \ [hisat2_build]="${hisat2_build}" \ [hisat2_exons]="${hisat2_exons}" \ [hisat2_splice_sites]="${hisat2_splice_sites}" \ ) # Print formatting export line="--------------------------------------------------------" # 2 Identify exons ``` bash # Load bash variables into memory source .bashvars # Make directories, if they don't exist mkdir --parents "${output_dir_top}" # Create Hisat2 exons tab file "${programs_array[hisat2_exons]}" \ "${transcripts_gtf}" \ > "${exons}" head "${exons}" ``` ntLink_0 1104 1187 + ntLink_0 1860 1940 + ntLink_0 2761 2838 + ntLink_0 5043 7055 + ntLink_0 10214 10412 + ntLink_0 10613 10675 + ntLink_0 11271 11315 + ntLink_0 11517 11590 + ntLink_0 12240 12500 + ntLink_0 13073 14382 + # 3 Identify splice sites ``` bash # Load bash variables into memory source .bashvars # Create Hisat2 splice sites tab file "${programs_array[hisat2_splice_sites]}" \ "${transcripts_gtf}" \ > "${splice_sites}" head "${splice_sites}" ``` ntLink_0 1187 1860 + ntLink_0 1940 2761 + ntLink_0 2838 5043 + ntLink_0 10412 10613 + ntLink_0 10675 11271 + ntLink_0 11315 11517 + ntLink_0 11590 12240 + ntLink_0 12500 13073 + ntLink_0 14382 14721 + ntLink_0 14899 15032 + # 4 Build HISAT2 genome index ``` bash # Load bash variables into memory source .bashvars # Change to working directory cd "${output_dir_top}" # Build Hisat2 reference index using splice sites and exons "${programs_array[hisat2_build]}" \ "${genome_fasta}" \ "${genome_index_name}" \ --exon "${exons}" \ --ss "${splice_sites}" \ -p "${threads}" \ 2> "${genome_index_name}"-hisat2_build.err ls -lh ``` ``` bash # Load bash variables into memory source .bashvars for index in "${output_dir_top}"/*.ht2 do cp ${index} ${genome_dir} done ls -lh "${output_dir_top}" ``` total 1.1G -rw-r--r-- 1 sam sam 312M Oct 8 12:19 Apulchra-genome.1.ht2 -rw-r--r-- 1 sam sam 125M Oct 8 12:19 Apulchra-genome.2.ht2 -rw-r--r-- 1 sam sam 1.6K Oct 8 12:09 Apulchra-genome.3.ht2 -rw-r--r-- 1 sam sam 124M Oct 8 12:09 Apulchra-genome.4.ht2 -rw-r--r-- 1 sam sam 335M Oct 8 12:21 Apulchra-genome.5.ht2 -rw-r--r-- 1 sam sam 127M Oct 8 12:21 Apulchra-genome.6.ht2 -rw-r--r-- 1 sam sam 7.2M Oct 8 12:09 Apulchra-genome.7.ht2 -rw-r--r-- 1 sam sam 1.5M Oct 8 12:09 Apulchra-genome.8.ht2 -rw-r--r-- 1 sam sam 21K Oct 8 12:21 Apulchra-genome-hisat2_build.err -rw-r--r-- 1 sam sam 5.9M Oct 8 12:35 Apulchra-genome_hisat2_exons.tab -rw-r--r-- 1 sam sam 4.7M Oct 8 12:35 Apulchra-genome_hisat2_splice_sites.tab