---
title: "00.00-heatwave-genetics-raw-FastQC-MultiQC"
author: "Sam White"
date: "2025-07-10"
output: 
  github_document:
    toc: true
    number_sections: true
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---

# Background

This Rmd file will download raw FastQs for the heatwave genetics sub-project and evaluate them using [FastQC](https://github.com/s-andrews/FastQC) and [MultiQC](https://multiqc.info/) [@ewels2016].

```{r setup, include=FALSE}
# Global knitr chunk defaults for this document.
# Individual chunks override these in their own chunk header.
library(knitr)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks in the rendered output
  eval = FALSE,        # Do NOT evaluate chunks by default; chunks that should run set eval=TRUE in their header
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  comment = ""         # Prevents prepending '##' to the beginning of lines in code output
)
```

# Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
# Write all shared settings to .bashvars (in the current working directory).
# Every later Bash chunk runs `source .bashvars` to load them, because each
# chunk is executed in its own shell and variables do not persist otherwise.
#
# The here-document delimiter is quoted ('EOF') so nothing is expanded while
# the file is written: references such as ${repo_dir} are stored literally and
# are only resolved when .bashvars is sourced. Do not add comments inside the
# here-document unless they are meant to appear in .bashvars itself.
cat > .bashvars << 'EOF'
#### Assign Variables ####

# Data directories
export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/RobertsLab/project-cod-temperature
export output_dir_top=${repo_dir}/output/00.00-heatwave-genetics-raw-FastQC-MultiQC
export raw_fastqc_dir=${output_dir_top}
export raw_reads_dir=${repo_dir}/data/raw-fastqs-heatwave-genetics
export raw_reads_url="https://owl.fish.washington.edu/nightingales/G_macrocephalus/30-1149633765/00_fastq/"

# Paths to programs
export fastqc=/home/shared/FastQC-0.12.1/fastqc
export multiqc=/home/sam/programs/mambaforge/bin/multiqc

# Set FastQ filename patterns
export fastq_pattern='*.fastq.gz'
export R1_fastq_pattern='*_R1_*.fastq.gz'
export R2_fastq_pattern='*_R2_*.fastq.gz'

# Set number of CPUs to use
export threads=40

## Inititalize arrays
export fastq_array_R1=()
export fastq_array_R2=()
export raw_fastqs_array=()
export R1_names_array=()
export R2_names_array=()

# Programs associative array
declare -A programs_array
programs_array=(
[fastqc]="${fastqc}" \
[multiqc]="${multiqc}" \
)

# Print formatting
export line="--------------------------------------------------------"

EOF

# Show the generated file so the rendered document records the exact settings
cat .bashvars
```

# Download raw FastQs

Reads are downloaded from <https://owl.fish.washington.edu/nightingales/G_macrocephalus/30-1149633765/00_fastq/>.

The `--cut-dirs` option strips the leading remote directory components so that we just end up with the reads.

## Download raw reads

```{bash download-raw-reads, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Make output directory if it doesn't exist
mkdir --parents "${raw_reads_dir}"

# Run wget to retrieve FastQs and MD5 files
# Note: the --no-clobber option will skip re-downloading any files that are
# already present in the output directory.
# Note: --cut-dirs has to match the number of directory components in the URL
# path. ${raw_reads_url} has four
# (nightingales/G_macrocephalus/30-1149633765/00_fastq); using a smaller
# number leaves the files in a sub-directory of ${raw_reads_dir}, where the
# checksum and FastQC chunks below will not find them.
wget \
--directory-prefix "${raw_reads_dir}" \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 4 \
--no-host-directories \
--no-parent \
--quiet \
--no-clobber \
"${raw_reads_url}"

ls -lh "${raw_reads_dir}"
```

## Verify raw read checksums

```{bash verify-raw-read-checksums, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

cd "${raw_reads_dir}"

# Run md5sum in check mode on every .md5 file in the raw reads directory.
# NOTE(review): the chunk's exit status is that of the last md5sum call only,
# so read the per-file OK/FAILED lines in the output rather than relying on
# the chunk succeeding.
for file in *.md5
do
  md5sum --check "${file}"
done
```

# FastQC/MultiQC on raw reads

```{bash raw-fastqc-multiqc, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Make output directory if it doesn't exist
mkdir --parents "${raw_fastqc_dir}"

############ RUN FASTQC ############

# Create array of raw FastQs.
# The directory is quoted so paths containing whitespace stay intact; the
# pattern is deliberately left unquoted so the shell expands it as a glob.
# nullglob makes an unmatched pattern expand to nothing instead of being
# passed along as the literal string '*.fastq.gz'.
shopt -s nullglob
raw_fastqs_array=("${raw_reads_dir}"/${fastq_pattern})
shopt -u nullglob

# Stop early with a clear message if there is nothing to analyse
if [[ ${#raw_fastqs_array[@]} -eq 0 ]]; then
  echo "ERROR: no files matching ${fastq_pattern} found in ${raw_reads_dir}" >&2
  exit 1
fi

echo "Beginning FastQC on raw reads..."
echo ""

# Run FastQC
# The array is expanded with "${array[@]}" so each file is passed as its own
# argument, without flattening to a string and re-splitting on whitespace.
"${programs_array[fastqc]}" \
--threads "${threads}" \
--outdir "${raw_fastqc_dir}" \
--quiet \
"${raw_fastqs_array[@]}"

echo "FastQC on raw reads complete!"
echo ""

############ END FASTQC ############

############ RUN MULTIQC ############
echo "Beginning MultiQC on raw FastQC..."
echo ""

# MultiQC reads the FastQC results from, and writes its report to, the same directory
"${programs_array[multiqc]}" "${raw_fastqc_dir}" -o "${raw_fastqc_dir}"

echo ""
echo "MultiQC on raw FastQs complete."
echo ""

############ END MULTIQC ############

echo "Removing FastQC zip files."
echo ""

# Runs only after MultiQC above, so the zip archives are no longer needed for the report
rm "${raw_fastqc_dir}"/*.zip

echo "FastQC zip files removed."
echo ""
```

```{bash, eval=TRUE}
# Load bash variables into memory
source .bashvars

# View directory contents
ls -lh "${raw_fastqc_dir}"
```