--- title: "02.20-D-Apul-RNAseq-alignment-HiSat2" author: "Sam White" date: "2024-10-08" output: bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true github_document: toc: true number_sections: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bibliography: references.bib --- ```{r setup, include=FALSE} library(knitr) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` This notebook will align [trimmed *A.pulchra* RNA-seq data](./01.00-D-Apul-RNAseq-trimming-fastp-FastQC-MultiQC.Rmd) to the *A.pulchra* genome using [HISAT2](https://github.com/DaehwanKimLab/hisat2) [@kim2019]. # Create a Bash variables file This allows usage of Bash variables across R Markdown chunks. ```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE} { echo "#### Assign Variables ####" echo "" echo "# Data directories" echo 'export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular' echo 'export genome_dir="${timeseries_dir}/D-Apul/data"' echo 'export genome_index_dir="${timeseries_dir}/D-Apul/output/02.10-D-Apul-RNAseq-genome-index-HiSat2"' echo 'export output_dir_top="${timeseries_dir}/D-Apul/output/02.20-D-Apul-RNAseq-alignment-HiSat2"' echo 'export trimmed_fastqs_dir="${timeseries_dir}/D-Apul/output/01.00-D-Apul-RNAseq-trimming-fastp-FastQC-MultiQC/trimmed-fastqs"' echo 'export trimmed_reads_url="https://gannet.fish.washington.edu/Atumefaciens/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/01.00-D-Apul-RNAseq-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/"' echo "" echo "# Location of Hisat2 index files" echo "# Must keep variable name formatting, as it's used by HiSat2" echo 'export HISAT2_INDEXES="${genome_index_dir}"' echo "# Input files" echo 'export exons="${output_dir_top}/Apulchra-genome_hisat2_exons.tab"' echo 'export genome_index_name="Apulchra-genome"' echo 'export genome_gff="${genome_dir}/Apulchra-genome.gff"' echo 'export genome_fasta="${genome_dir}/Apulchra-genome.fa"' echo 'export splice_sites="${output_dir_top}/Apulchra-genome_hisat2_splice_sites.tab"' echo 'export transcripts_gtf="${genome_dir}/Apulchra-genome.gtf"' echo "# Output files" echo 'export gtf_list="${output_dir_top}/gtf_list.txt"' echo 'export merged_bam="${output_dir_top}/sorted-bams-merged.bam"' echo "" echo "# Paths to programs" echo 'export programs_dir="/home/shared"' echo 'export hisat2_dir="${programs_dir}/hisat2-2.2.1"' echo 'export hisat2="${hisat2_dir}/hisat2"' echo 'export multiqc=/home/sam/programs/mambaforge/bin/multiqc' echo 'export samtools="${programs_dir}/samtools-1.12/samtools"' echo 'export prepDE="${programs_dir}/stringtie-2.2.1.Linux_x86_64/prepDE.py3"' echo 'export stringtie="${programs_dir}/stringtie-2.2.1.Linux_x86_64/stringtie"' echo "" echo "# Set FastQ filename patterns" echo "export R1_fastq_pattern='*_R1_*.fq.gz'" echo "export R2_fastq_pattern='*_R2_*.fq.gz'" echo "export trimmed_fastq_pattern='*fastp-trim.fq.gz'" echo "" echo "# Set number of CPUs to use" echo 'export threads=40' echo "" echo "# Set average read length - for StringTie prepDE.py" echo 'export read_length=125' echo "" echo "## Initialize arrays" echo 'export fastq_array_R1=()' echo 'export fastq_array_R2=()' echo 'export R1_names_array=()' echo 'export R2_names_array=()' echo "declare -A sample_timepoint_map" echo "" echo "# Programs associative array" echo "declare -A programs_array" echo "programs_array=(" echo '[hisat2]="${hisat2}" \' echo '[multiqc]="${multiqc}" \' echo '[prepDE]="${prepDE}" \' echo '[samtools]="${samtools}" \' echo '[stringtie]="${stringtie}" \' echo ")" echo "" echo "# Print formatting" echo 'export line="--------------------------------------------------------"' echo "" } > .bashvars cat .bashvars ``` If needed, download raw RNA-seq. Change `eval=FALSE` to `eval=TRUE` to execute the next two chunks to download RNA-seq and then verify MD5 checksums. ```{bash download-raw-reads, engine='bash', eval=FALSE} # Load bash variables into memory source .bashvars # Make output directory if it doesn't exist mkdir --parents ${trimmed_fastqs_dir} # Create list of only A.pulchra sample names sample_list=$(awk -F "," 'NR > 2 {print $5"\t"$6}' ../../M-multi-species/data/ | awk -F"[\t-]" '$2 == "ACR" {print $1}') echo "" echo "${line}" echo "" echo "Sample list:" echo "" echo "${sample_list}" echo "" echo "${line}" echo "" # Use printf to format each item for use in wget formatted_list=$(printf "*%s*," ${sample_list}) # Remove the trailing comma and append *.md5 formatted_list="${formatted_list%,},*.md5" # Output the final wget command echo "" echo "${line}" echo "" echo "Formatted wget accept list:" echo "" echo "wget --accept=\"$formatted_list\"" echo "" echo "${line}" echo "" # Run wget to retrieve FastQs and MD5 files wget \ --directory-prefix ${trimmed_fastqs_dir} \ --recursive \ --no-check-certificate \ --continue \ --cut-dirs 3 \ --no-host-directories \ --no-parent \ --quiet \ --accept=\"$formatted_list\" ${trimmed_reads_url} ls -lh "${trimmed_fastqs_dir}" ``` Verify raw read checksums ```{bash verify-raw-read-checksums, engine='bash', eval=FALSE} # Load bash variables into memory source .bashvars cd "${trimmed_fastqs_dir}" # Verify checksums for file in *.md5 do md5sum --check "${file}" done ``` # Align reads using HISAT2 ```{bash hisat2-alignments, engine='bash', eval=FALSE} # Load bash variables into memory source .bashvars # Make output directories, if they don't exist mkdir --parents "${output_dir_top}" # Change to ouput directory cd "${output_dir_top}" # Create associative array with sample and timepoint metadata="../../../M-multi-species/data/rna_metadata.csv" # Declare the array declare -A sample_timepoint_map # Read the metadata file line by line while IFS=',' read -r sample_number sample_name plate well_number azenta_sample_name colony_id timepoint sample_type species_strain SampleBuffer; do # Check if the species is "Acropora pulchra" if [[ "${species_strain}" == "Acropora pulchra" ]]; then # Add the Azenta sample name as the key and Timepoint as the value in the associative array sample_timepoint_map["${azenta_sample_name}"]="${timepoint}" fi done < <(tail -n +2 "${metadata}") # Skip the header ## Populate trimmed reads arrays fastq_array_R1=("${trimmed_fastqs_dir}"/${R1_fastq_pattern}) fastq_array_R2=("${trimmed_fastqs_dir}"/${R2_fastq_pattern}) ############## BEGIN HISAT2 ALIGNMENTS ############## for sample in "${!sample_timepoint_map[@]}" do # Create and switch to dedicated sample directory mkdir --parents "${sample}" && cd "$_" # Create HISAT2 list of fastq R1 files # and generated MD5 checksums file. for fastq in "${fastq_array_R1[@]}" do # Parse sample name from FastQ filename fastq_sample=$(echo "${fastq##*/}" | awk -F"[_-]" '{print $3}') # Process matching FastQ file, based on sample name if [ "${fastq_sample}" == "${sample}" ]; then # Generate checksum/list of input files used md5sum "${fastq}" >> input_fastqs_checksums.md5 # Create comma-separated lists of FastQs for HISAT2 printf -v joined_R1 '%s,' "${fastq}" fastq_list_R1=$(echo "${joined_R1%,}") fi done # Create HISAT2 list of fastq R1 files # and generated MD5 checksums file. for fastq in "${fastq_array_R2[@]}" do # Parse sample name from FastQ filename fastq_sample=$(echo "${fastq##*/}" | awk -F"[_-]" '{print $3}') # Process matching FastQ file, based on sample name if [ "${fastq_sample}" == "${sample}" ]; then # Generate checksum/list of input files used md5sum "${fastq}" >> input_fastqs_checksums.md5 # Create comma-separated lists of FastQs for HISAT2 printf -v joined_R2 '%s,' "${fastq}" fastq_list_R2=$(echo "${joined_R2%,}") fi done # HISAT2 alignments # Sets read group info (RG) using samples array "${programs_array[hisat2]}" \ -x "${genome_index_name}" \ -1 "${fastq_list_R1}" \ -2 "${fastq_list_R2}" \ --threads "${threads}" \ --rg-id "${sample}" \ --rg "SM:""${sample_timepoint_map[$sample]}" \ 2> "${sample}"_hisat2.stats \ | tee >(${programs_array[samtools]} flagstat - > "${sample}"-hisat2_output.flagstat) \ | ${programs_array[samtools]} sort - -@ "${threads}" -O BAM \ | tee "${sample}".sorted.bam \ | ${programs_array[samtools]} index - "${sample}".sorted.bam.bai # Run stringtie on alignments # Uses "-B" option to output tables intended for use in Ballgown # Uses "-e" option; recommended when using "-B" option. # Limits analysis to only reads alignments matching reference. "${programs_array[stringtie]}" "${sample}".sorted.bam \ -p "${threads}" \ -o "${sample}".gtf \ -G "${genome_gff}" \ -C "${sample}.cov_refs.gtf" \ -B \ -e # Add GTFs to list file, only if non-empty # Identifies GTF files that only have header gtf_lines=$(wc -l < "${sample}".gtf ) if [ "${gtf_lines}" -gt 2 ]; then echo "$(pwd)/${sample}.gtf" >> "${gtf_list}" fi # Generate checksums find ./ -type f -not -name "*.md5" -exec md5sum {} \; > ${sample}_checksums.md5 # Move up to orig. working directory cd .. done ``` ## Review HISAT2 Output ```{r tree-hisat2-output, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars # Change to ouput directory cd "${output_dir_top}" # Display HISAT2 output directory structure # with directory (--du) and file sizes (-h) tree --du -h ``` ## MultiQC alignment rates ```{r multiqc-alignment-rates, engine='bash', eval=FALSE} # Load bash variables into memory source .bashvars # Change to ouput directory cd "${output_dir_top}" ${programs_array[multiqc]} . ``` # Merge sorted BAMs Merge all BAMs to singular BAM for use in transcriptome assembly later, if needed ```{r merge-sorted-BAMS, engine='bash', eval=FALSE} # Load bash variables into memory source .bashvars # Change to ouput directory cd "${output_dir_top}" ## Create list of sorted BAMs for merging find . -name "*sorted.bam" > sorted_bams.list ## Merge sorted BAMs ${programs_array[samtools]} merge \ -b sorted_bams.list \ ${merged_bam} \ --threads ${threads} ## Index merged BAM ${programs_array[samtools]} index ${merged_bam} ``` # Create combined GTF ```{r combine-GTFs, engine='bash', eval=FALSE} # Load bash variables into memory source .bashvars # Change to ouput directory cd "${output_dir_top}" # Create singular transcript file, using GTF list file "${programs_array[stringtie]}" --merge \ "${gtf_list}" \ -p "${threads}" \ -G "${genome_gff}" \ -o "${genome_index_name}".stringtie.gtf ``` # Create DESeq2 Count Matrices ```{r DESeq2-matrices, engine='bash', eval=FALSE} # Load bash variables into memory source .bashvars # Change to ouput directory cd "${output_dir_top}" # Create file list for prepDE.py while read -r line do sample_no_path=${line##*/} sample=${sample_no_path%.*} echo ${sample} ${line} done < gtf_list.txt >> prepDE-sample_list.txt # Create count matrices for genes and transcripts # Compatible with import to DESeq2 python3 "${programs_array[prepDE]}" \ --input=prepDE-sample_list.txt \ -g apul-gene_count_matrix.csv \ -t apul-transcript_count_matrix.csv \ --length=${read_length} ``` # Generate checksums ```{r generate-checksums, engine='bash', eval=FALSE} # Load bash variables into memory source .bashvars # Change to ouput directory cd "${output_dir_top}" # Uses find command to avoid passing # directory names to the md5sum command. find . -maxdepth 1 -type f -exec md5sum {} + \ | tee --append checksums.md5 ```