This notebook builds a HISAT2 (Kim et al. 2019) index of the A. pulchra genome. It uses the GTF file created in 02.00-D-Apul-RNAseq-gff-to-gtf.Rmd to extract the exons and splice sites that are supplied to the index build.

1 Create a Bash variables file

The variables are written to a file named .bashvars, which each later chunk sources; this makes the same Bash variables available across R Markdown chunks.

# Write every shared setting to .bashvars so that each later chunk can
# `source` it. The here-document delimiter is quoted, so nothing is expanded
# at write time: the ${...} references are stored literally and are only
# resolved when .bashvars is sourced.
#
# `hisat2` is defined before the associative array because the array's
# [hisat2] entry expands ${hisat2}; without the definition that entry would
# be an empty string.
cat > .bashvars << 'EOF'
#### Assign Variables ####

# Data directories
export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular
export genome_dir="${timeseries_dir}/D-Apul/data"
export output_dir_top="${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2"

# Input/output files
export genome_index_name="Apulchra-genome"
export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab"
export genome_gff="${genome_dir}/Apulchra-genome.gff"
export genome_fasta="${genome_dir}/Apulchra-genome.fa"
export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab"
export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf"

# Paths to programs
export programs_dir="/home/shared"
export hisat2_dir="${programs_dir}/hisat2-2.2.1"

export hisat2="${hisat2_dir}/hisat2"
export hisat2_build="${hisat2_dir}/hisat2-build"
export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py"
export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py"

# Set number of CPUs to use
export threads=40

# Programs associative array
declare -A programs_array
programs_array=(
[hisat2]="${hisat2}"
[hisat2_build]="${hisat2_build}"
[hisat2_exons]="${hisat2_exons}"
[hisat2_splice_sites]="${hisat2_splice_sites}"
)

# Print formatting
export line="--------------------------------------------------------"

EOF

cat .bashvars
#### Assign Variables ####

# Data directories
export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular
export genome_dir="${timeseries_dir}/D-Apul/data"
export output_dir_top="${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2"

# Input/output files
export genome_index_name="Apulchra-genome"
export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab"
export genome_gff="${genome_dir}/Apulchra-genome.gff"
export genome_fasta="${genome_dir}/Apulchra-genome.fa"
export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab"
export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf"

# Paths to programs
export programs_dir="/home/shared"
export hisat2_dir="${programs_dir}/hisat2-2.2.1"

export hisat2="${hisat2_dir}/hisat2"
export hisat2_build="${hisat2_dir}/hisat2-build"
export hisat2_exons="${hisat2_dir}/hisat2_extract_exons.py"
export hisat2_splice_sites="${hisat2_dir}/hisat2_extract_splice_sites.py"

# Set number of CPUs to use
export threads=40

# Programs associative array
declare -A programs_array
programs_array=(
[hisat2]="${hisat2}"
[hisat2_build]="${hisat2_build}"
[hisat2_exons]="${hisat2_exons}"
[hisat2_splice_sites]="${hisat2_splice_sites}"
)

# Print formatting
export line="--------------------------------------------------------"

2 Identify exons

# Pull the shared settings in from the variables file
. .bashvars

# Ensure the output directory is present (no error if it already exists)
mkdir -p "${output_dir_top}"

# Run the HISAT2 exon-extraction script on the transcripts GTF and capture
# its standard output as the exons table
"${programs_array[hisat2_exons]}" "${transcripts_gtf}" > "${exons}"

# Preview the first ten lines of the exons table
head -n 10 "${exons}"
ntLink_0    1104    1187    +
ntLink_0    1860    1940    +
ntLink_0    2761    2838    +
ntLink_0    5043    7055    +
ntLink_0    10214   10412   +
ntLink_0    10613   10675   +
ntLink_0    11271   11315   +
ntLink_0    11517   11590   +
ntLink_0    12240   12500   +
ntLink_0    13073   14382   +

3 Identify splice sites

# Pull the shared settings in from the variables file
. .bashvars

# Run the HISAT2 splice-site extraction script on the transcripts GTF and
# capture its standard output as the splice sites table
"${programs_array[hisat2_splice_sites]}" "${transcripts_gtf}" > "${splice_sites}"

# Preview the first ten lines of the splice sites table
head -n 10 "${splice_sites}"
ntLink_0    1187    1860    +
ntLink_0    1940    2761    +
ntLink_0    2838    5043    +
ntLink_0    10412   10613   +
ntLink_0    10675   11271   +
ntLink_0    11315   11517   +
ntLink_0    11590   12240   +
ntLink_0    12500   13073   +
ntLink_0    14382   14721   +
ntLink_0    14899   15032   +

4 Build HISAT2 genome index

# Load bash variables into memory
source .bashvars

# Change to the output directory. hisat2-build is given a bare index name
# (no directory component), so the index files and the .err log are written
# to the current directory. Stop here if the directory cannot be entered,
# rather than building the index somewhere unintended.
cd "${output_dir_top}" || exit 1

# Build Hisat2 reference index using splice sites and exons.
# Standard error from the build is captured in <index name>-hisat2_build.err.
"${programs_array[hisat2_build]}" \
  "${genome_fasta}" \
  "${genome_index_name}" \
  --exon "${exons}" \
  --ss "${splice_sites}" \
  -p "${threads}" \
  2> "${genome_index_name}-hisat2_build.err"

# List what the build produced
ls -lh
# Load bash variables into memory
source .bashvars

# Copy each HISAT2 index file (*.ht2) from the output directory into the
# genome data directory.
for index in "${output_dir_top}"/*.ht2
do
  # When nothing matches, the glob is left as a literal pattern; report that
  # clearly instead of handing a non-existent path to cp.
  if [[ ! -e "${index}" ]]; then
    echo "No HISAT2 index files (*.ht2) found in ${output_dir_top}" >&2
    break
  fi

  # Quote both paths so whitespace or glob characters in them are passed to
  # cp intact; `--` ends option parsing in case a path begins with a dash.
  cp -- "${index}" "${genome_dir}"
done

# Show the contents of the output directory
ls -lh "${output_dir_top}"
total 1.1G
-rw-r--r-- 1 sam sam 312M Oct  8 12:19 Apulchra-genome.1.ht2
-rw-r--r-- 1 sam sam 125M Oct  8 12:19 Apulchra-genome.2.ht2
-rw-r--r-- 1 sam sam 1.6K Oct  8 12:09 Apulchra-genome.3.ht2
-rw-r--r-- 1 sam sam 124M Oct  8 12:09 Apulchra-genome.4.ht2
-rw-r--r-- 1 sam sam 335M Oct  8 12:21 Apulchra-genome.5.ht2
-rw-r--r-- 1 sam sam 127M Oct  8 12:21 Apulchra-genome.6.ht2
-rw-r--r-- 1 sam sam 7.2M Oct  8 12:09 Apulchra-genome.7.ht2
-rw-r--r-- 1 sam sam 1.5M Oct  8 12:09 Apulchra-genome.8.ht2
-rw-r--r-- 1 sam sam  21K Oct  8 12:21 Apulchra-genome-hisat2_build.err
-rw-r--r-- 1 sam sam 5.9M Oct  8 12:35 Apulchra-genome_hisat2_exons.tab
-rw-r--r-- 1 sam sam 4.7M Oct  8 12:35 Apulchra-genome_hisat2_splice_sites.tab
Kim, Daehwan, Joseph M. Paggi, Chanhee Park, Christopher Bennett, and Steven L. Salzberg. 2019. “Graph-Based Genome Alignment and Genotyping with HISAT2 and HISAT-Genotype.” Nature Biotechnology 37 (8): 907–15. https://doi.org/10.1038/s41587-019-0201-4.
LS0tCnRpdGxlOiAiMDIuMTAtRC1BcHVsLVJOQXNlcS1nZW5vbWUtaW5kZXgtSGlTYXQyIgphdXRob3I6ICJTYW0gV2hpdGUiCmRhdGU6ICIyMDI0LTEwLTA4IgpvdXRwdXQ6IAogIGJvb2tkb3duOjpodG1sX2RvY3VtZW50MjoKICAgIHRoZW1lOiBjb3NtbwogICAgdG9jOiB0cnVlCiAgICB0b2NfZmxvYXQ6IHRydWUKICAgIG51bWJlcl9zZWN0aW9uczogdHJ1ZQogICAgY29kZV9mb2xkaW5nOiBzaG93CiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgZ2l0aHViX2RvY3VtZW50OgogICAgdG9jOiB0cnVlCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICBodG1sX2RvY3VtZW50OgogICAgdGhlbWU6IGNvc21vCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKYmlibGlvZ3JhcGh5OiByZWZlcmVuY2VzLmJpYgotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQpsaWJyYXJ5KGtuaXRyKQprbml0cjo6b3B0c19jaHVuayRzZXQoCiAgZWNobyA9IFRSVUUsICAgICAgICAgIyBEaXNwbGF5IGNvZGUgY2h1bmtzCiAgZXZhbCA9IEZBTFNFLCAgICAgICAgIyBFdmFsdWF0ZSBjb2RlIGNodW5rcwogIHdhcm5pbmcgPSBGQUxTRSwgICAgICMgSGlkZSB3YXJuaW5ncwogIG1lc3NhZ2UgPSBGQUxTRSwgICAgICMgSGlkZSBtZXNzYWdlcwogIGNvbW1lbnQgPSAiIiAgICAgICAgICMgUHJldmVudHMgYXBwZW5kaW5nICcjIycgdG8gYmVnaW5uaW5nIG9mIGxpbmVzIGluIGNvZGUgb3V0cHV0CikKYGBgCgpUaGlzIG5vdGVib29rIHdpbGwgYnVpbGQgYW4gaW5kZXggb2YgdGhlICpBLnB1bGNocmEqIGdlbm9tZSB1c2luZyBbSElTQVQyXShodHRwczovL2dpdGh1Yi5jb20vRGFlaHdhbktpbUxhYi9oaXNhdDIpIFtAa2ltMjAxOV0uIEl0IHV0aWxpemVzIHRoZSBHVEYgZmlsZSBjcmVhdGVkIGluIFtgMDIuMDAtRC1BcHVsLVJOQXNlcS1nZmYtdG8tZ3RmLlJtZGBdKC4vMDIuMDAtRC1BcHVsLVJOQXNlcS1nZmYtdG8tZ3RmLlJtZCkuCgojIENyZWF0ZSBhIEJhc2ggdmFyaWFibGVzIGZpbGUKClRoaXMgYWxsb3dzIHVzYWdlIG9mIEJhc2ggdmFyaWFibGVzIGFjcm9zcyBSIE1hcmtkb3duIGNodW5rcy4KCmBgYHtyIHNhdmUtYmFzaC12YXJpYWJsZXMtdG8tcnZhcnMtZmlsZSwgZW5naW5lPSdiYXNoJywgZXZhbD1UUlVFfQp7CmVjaG8gIiMjIyMgQXNzaWduIFZhcmlhYmxlcyAjIyMjIgplY2hvICIiCgplY2hvICIjIERhdGEgZGlyZWN0b3JpZXMiCmVjaG8gJ2V4cG9ydCB0aW1lc2VyaWVzX2Rpcj0vaG9tZS9zaGFyZWQvOFRCX0hERF8wMS9zYW0vZ2l0cmVwb3MvdXJvbC1lNS90aW1lc2VyaWVzX21vbGVjdWxhcicKZWNobyAnZXhwb3J0IGdlbm9tZV9kaXI9IiR7dGltZXNlcmllc19kaXJ9L0QtQXB1bC9kYXRhIicKZWNobyAnZXhwb3J0IG91dHB1dF9kaXJfdG9wPSR7dGltZXNlcmllc19kaXJ9L0QtQXB1bC9vdXRw
dXQvMDIuMTAtRC1BcHVsLVJOQXNlcS1nZW5vbWUtaW5kZXgtSGlTYXQyJwplY2hvICIiCgplY2hvICIjIElucHV0L291dHB1dCBmaWxlcyIKZWNobyAnZXhwb3J0IGdlbm9tZV9pbmRleF9uYW1lPSJBcHVsY2hyYS1nZW5vbWUiJwplY2hvICdleHBvcnQgZXhvbnM9IiR7b3V0cHV0X2Rpcl90b3B9L0FwdWxjaHJhLWdlbm9tZV9oaXNhdDJfZXhvbnMudGFiIicKZWNobyAnZXhwb3J0IGdlbm9tZV9nZmY9IiR7Z2Vub21lX2Rpcn0vQXB1bGNocmEtZ2Vub21lLmdmZiInCmVjaG8gJ2V4cG9ydCBnZW5vbWVfZmFzdGE9IiR7Z2Vub21lX2Rpcn0vQXB1bGNocmEtZ2Vub21lLmZhIicKZWNobyAnZXhwb3J0IHNwbGljZV9zaXRlcz0iJHtvdXRwdXRfZGlyX3RvcH0vQXB1bGNocmEtZ2Vub21lX2hpc2F0Ml9zcGxpY2Vfc2l0ZXMudGFiIicKZWNobyAnZXhwb3J0IHRyYW5zY3JpcHRzX2d0Zj0iJHtnZW5vbWVfZGlyfS9BcHVsY2hyYS1nZW5vbWUuZ3RmIicKCmVjaG8gIiMgUGF0aHMgdG8gcHJvZ3JhbXMiCmVjaG8gJ2V4cG9ydCBwcm9ncmFtc19kaXI9Ii9ob21lL3NoYXJlZCInCmVjaG8gJ2V4cG9ydCBoaXNhdDJfZGlyPSIke3Byb2dyYW1zX2Rpcn0vaGlzYXQyLTIuMi4xIicKZWNobyAiIgplY2hvICdleHBvcnQgaGlzYXQyX2J1aWxkPSIke2hpc2F0Ml9kaXJ9L2hpc2F0Mi1idWlsZCInCmVjaG8gJ2V4cG9ydCBoaXNhdDJfZXhvbnM9IiR7aGlzYXQyX2Rpcn0vaGlzYXQyX2V4dHJhY3RfZXhvbnMucHkiJwplY2hvICdleHBvcnQgaGlzYXQyX3NwbGljZV9zaXRlcz0iJHtoaXNhdDJfZGlyfS9oaXNhdDJfZXh0cmFjdF9zcGxpY2Vfc2l0ZXMucHkiJwplY2hvICIiCgplY2hvICIjIFNldCBudW1iZXIgb2YgQ1BVcyB0byB1c2UiCmVjaG8gJ2V4cG9ydCB0aHJlYWRzPTQwJwplY2hvICIiCgplY2hvICIjIFByb2dyYW1zIGFzc29jaWF0aXZlIGFycmF5IgplY2hvICJkZWNsYXJlIC1BIHByb2dyYW1zX2FycmF5IgplY2hvICJwcm9ncmFtc19hcnJheT0oIgplY2hvICdbaGlzYXQyXT0iJHtoaXNhdDJ9IiBcJwplY2hvICdbaGlzYXQyX2J1aWxkXT0iJHtoaXNhdDJfYnVpbGR9IiBcJwplY2hvICdbaGlzYXQyX2V4b25zXT0iJHtoaXNhdDJfZXhvbnN9IiBcJwplY2hvICdbaGlzYXQyX3NwbGljZV9zaXRlc109IiR7aGlzYXQyX3NwbGljZV9zaXRlc30iIFwnCmVjaG8gIikiCmVjaG8gIiIKCmVjaG8gIiMgUHJpbnQgZm9ybWF0dGluZyIKZWNobyAnZXhwb3J0IGxpbmU9Ii0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tIicKZWNobyAiIgp9ID4gLmJhc2h2YXJzCgpjYXQgLmJhc2h2YXJzCmBgYAoKIyBJZGVudGlmeSBleG9ucwpgYGB7ciBpZGVudGlmeS1leG9ucywgZW5naW5lPSdiYXNoJywgZXZhbD1UUlVFfQojIExvYWQgYmFzaCB2YXJpYWJsZXMgaW50byBtZW1vcnkKc291cmNlIC5iYXNodmFycwoKIyBNYWtlIGRpcmVjdG9yaWVzLCBpZiB0aGV5IGRvbid0IGV4aXN0Cm1rZGlyIC0tcGFyZW50cyAiJHtv
dXRwdXRfZGlyX3RvcH0iCgojIENyZWF0ZSBIaXNhdDIgZXhvbnMgdGFiIGZpbGUKIiR7cHJvZ3JhbXNfYXJyYXlbaGlzYXQyX2V4b25zXX0iIFwKIiR7dHJhbnNjcmlwdHNfZ3RmfSIgXAo+ICIke2V4b25zfSIKCmhlYWQgIiR7ZXhvbnN9IgoKYGBgCgojIElkZW50aWZ5IHNwbGljZSBzaXRlcwoKYGBge3IgaWRlbnRpZnktc3BsaWNlLXNpdGVzLCBlbmdpbmU9J2Jhc2gnLCBldmFsPVRSVUV9CiMgTG9hZCBiYXNoIHZhcmlhYmxlcyBpbnRvIG1lbW9yeQpzb3VyY2UgLmJhc2h2YXJzCgojIENyZWF0ZSBIaXNhdDIgc3BsaWNlIHNpdGVzIHRhYiBmaWxlCiIke3Byb2dyYW1zX2FycmF5W2hpc2F0Ml9zcGxpY2Vfc2l0ZXNdfSIgXAoiJHt0cmFuc2NyaXB0c19ndGZ9IiBcCj4gIiR7c3BsaWNlX3NpdGVzfSIKCmhlYWQgIiR7c3BsaWNlX3NpdGVzfSIKYGBgCgojIEJ1aWxkIEhJU0FUMiBnZW5vbWUgaW5kZXgKCmBgYHtyIGJ1aWxkLWhpc2F0Mi1pbmRleCwgZW5naW5lPSdiYXNoJywgZXZhbD1GQUxTRX0KIyBMb2FkIGJhc2ggdmFyaWFibGVzIGludG8gbWVtb3J5CnNvdXJjZSAuYmFzaHZhcnMKCiMgQ2hhbmdlIHRvIHdvcmtpbmcgZGlyZWN0b3J5CmNkICIke291dHB1dF9kaXJfdG9wfSIKCiMgQnVpbGQgSGlzYXQyIHJlZmVyZW5jZSBpbmRleCB1c2luZyBzcGxpY2Ugc2l0ZXMgYW5kIGV4b25zCiIke3Byb2dyYW1zX2FycmF5W2hpc2F0Ml9idWlsZF19IiBcCiIke2dlbm9tZV9mYXN0YX0iIFwKIiR7Z2Vub21lX2luZGV4X25hbWV9IiBcCi0tZXhvbiAiJHtleG9uc30iIFwKLS1zcyAiJHtzcGxpY2Vfc2l0ZXN9IiBcCi1wICIke3RocmVhZHN9IiBcCjI+ICIke2dlbm9tZV9pbmRleF9uYW1lfSItaGlzYXQyX2J1aWxkLmVycgoKbHMgLWxoCmBgYApgYGB7ciBsaXN0LWhpc2F0Mi1pbmRleC1vdXRwdXQsIGVuZ2luZT0nYmFzaCcsIGV2YWw9VFJVRX0KIyBMb2FkIGJhc2ggdmFyaWFibGVzIGludG8gbWVtb3J5CnNvdXJjZSAuYmFzaHZhcnMKCmZvciBpbmRleCBpbiAiJHtvdXRwdXRfZGlyX3RvcH0iLyouaHQyCmRvCiAgY3AgJHtpbmRleH0gJHtnZW5vbWVfZGlyfQpkb25lCgpscyAtbGggIiR7b3V0cHV0X2Rpcl90b3B9IgpgYGAKCg==