--- title: "00.20-D-Apul-WGBS-reads-FastQC-MultiQC" author: "Sam White" date: "2025-02-10" output: github_document: toc: true number_sections: true bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bibliography: references.bib --- # Background This Rmd file will download raw WGBS FastQs for *A.pulchra* and evaluate them using [FastQC](https://github.com/s-andrews/FastQC) and [MultiQC](https://multiqc.info/)[@ewels2016]. Additionally, it will rename the files. New naming format will be `-`. This will make the naming consistent across species, improve sample identification, and improve parsing in downstream analyses. ```{r setup, include=FALSE} library(knitr) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` # Create a Bash variables file This allows usage of Bash variables across R Markdown chunks. ```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE} { echo "#### Assign Variables ####" echo "" echo "# Data directories" echo 'export timeseries_dir=/home/shared/8TB_HDD_01/sam/gitrepos/urol-e5/timeseries_molecular' echo 'export output_dir_top=${timeseries_dir}/D-Apul/output/00.20-D-Apul-WGBS-reads-FastQC-MultiQC' echo 'export raw_reads_dir=${timeseries_dir}/D-Apul/data/wgbs-raw-fastqs' echo 'export raw_reads_url="https://owl.fish.washington.edu/nightingales/E5-coral-time-series/30-1067895835/"' echo "" echo "# Input files" echo 'export metadata_file="${timeseries_dir}/M-multi-species/data/e5_DNA_Azenta_metadata.csv"' echo "# Paths to programs" echo 'export fastqc=/home/shared/FastQC-0.12.1/fastqc' echo 'export multiqc=/home/sam/programs/mambaforge/bin/multiqc' echo "" echo "# Set FastQ filename patterns" echo "export fastq_pattern='*.fastq.gz'" echo "export R1_fastq_pattern='*_R1_*.fastq.gz'" echo "export R2_fastq_pattern='*_R2_*.fastq.gz'" echo "" echo "# Set number of CPUs to use" echo 'export threads=40' echo "" echo "## Inititalize arrays" echo 'export fastq_array_R1=()' echo 'export fastq_array_R2=()' echo 'export raw_fastqs_array=()' echo 'export R1_names_array=()' echo 'export R2_names_array=()' echo "" echo "# Programs associative array" echo "declare -A programs_array" echo "programs_array=(" echo '[fastqc]="${fastqc}" \' echo '[multiqc]="${multiqc}" \' echo ")" echo "" echo "# Print formatting" echo 'export line="--------------------------------------------------------"' echo "" } > .bashvars cat .bashvars ``` # Download *A.pulchra* WGBS FastQs ## Inspect metadata file ```{r, inspect-metadta, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars head ${metadata_file} | column -t -s"," ``` ## Download raw WGBS reads Reads are downloaded from Since sequencing included multiple species, the code will also parse only those that are *A.pulchra*. Some filenames are preceeded by `--` for some reason. This formatting is limited to only samples in `E` wells, so the code below uses that aspect to retrieve those samples as well. The `--cut-dirs 3` command cuts the preceding directory structure (i.e. `nightingales/E5-coral-time-series/30-1067895835/`) so that we just end up with the reads. ```{bash download-raw-reads, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars # Make output directory if it doesn't exist mkdir --parents ${raw_reads_dir} # Create list of only A.pulchra sample names # Some samples names are preceded by the Sample Number for some reason, # so this handles that formatting. sample_list=$(awk -F"," '$11 == "Acropora pulchra" { if ($9 ~ /E/) { print $8 "--" $9 } else { print $9 } }' ${timeseries_dir}/M-multi-species/data/30-1067895835-WGBS-sample-submission-form.csv \ | sort) echo "" echo "${line}" echo "" echo "Sample list:" echo "" echo "${sample_list}" echo "" echo "${line}" echo "" # Use printf to format each item for use in wget formatted_list=$(printf "%s_*," ${sample_list}) # Remove the trailing comma formatted_list="${formatted_list%,}" # Output the final wget command echo "" echo "${line}" echo "" echo "Formatted wget accept list:" echo "" echo "wget --accept=\"$formatted_list\"" echo "" echo "${line}" echo "" # Run wget to retrieve FastQs and MD5 files wget \ --directory-prefix ${raw_reads_dir} \ --recursive \ --no-check-certificate \ --continue \ --cut-dirs 3 \ --no-host-directories \ --no-parent \ --quiet \ --accept=\"$formatted_list\" ${raw_reads_url} ls -lh "${raw_reads_dir}" ``` ## Verify raw read checksums ```{bash verify-raw-read-checksums, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars cd "${raw_reads_dir}" for file in *.md5 do md5sum --check "${file}" done ``` # Rename FastQs for Project Consistency New naming format will be `-`. Use of a temp file, instead of working directory from the metadata file can avoid potential issues with character/text formatting from the input CSV. ```{bash rename-fastqs, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars cd "${raw_reads_dir}" # Create an associative array to store the mapping declare -A number_to_colony_timepoint # Create a temporary file to store the mapping temp_mapping_file=$(mktemp) # Read the metadata file and populate the temporary file using awk # Maps Azenta sample name to colony_id and timepoint. awk -F, 'NR > 1 { print $5, $6 "-" $7 }' "$metadata_file" > "$temp_mapping_file" # Read the temporary file and populate the associative array while read -r Number colony_timepoint; do number_to_colony_timepoint["$Number"]="$colony_timepoint" done < "$temp_mapping_file" # Remove the temporary file rm "$temp_mapping_file" # Print the associative array for debugging echo "Number to colony_id-Timepoint mapping:" for key in "${!number_to_colony_timepoint[@]}"; do echo "" echo "$key -> ${number_to_colony_timepoint[$key]}" echo "" done # Iterate over the FastQ files in the current directory for fastq_file in *.fastq.gz; do # Extract the sample number from the filename sample_number=$(echo "$fastq_file" | sed -E 's/^([0-9]+--)?([A-Za-z0-9]+).*/\2/') # Check if the sample number exists in the associative array if [[ -n "${number_to_colony_timepoint[$sample_number]}" ]]; then new_sample_name="${number_to_colony_timepoint[$sample_number]}" new_filename=$(echo "$fastq_file" | sed -E "s/^([0-9]+--)?$sample_number/$new_sample_name/") if [[ "$fastq_file" != "$new_filename" ]]; then mv "$fastq_file" "$new_filename" echo "Renamed $fastq_file to $new_filename" else echo "No renaming needed for $fastq_file" fi else echo "Sample number '$sample_number' not found in metadata." # Debugging: Print all keys in the associative array with quotes echo "Available keys in the associative array:" for key in "${!number_to_colony_timepoint[@]}"; do echo "'$key'" done fi done ``` # FastQC/MultiQC on raw reads ```{bash raw-fastqc-multiqc, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars ############ RUN FASTQC ############ # Create array of trimmed FastQs raw_fastqs_array=(${raw_reads_dir}/${fastq_pattern}) # Pass array contents to new variable as space-delimited list raw_fastqc_list=$(echo "${raw_fastqs_array[*]}") echo "Beginning FastQC on raw reads..." echo "" # Run FastQC ### NOTE: Do NOT quote raw_fastqc_list ${programs_array[fastqc]} \ --threads ${threads} \ --outdir ${raw_reads_dir} \ --quiet \ ${raw_fastqc_list} echo "FastQC on raw reads complete!" echo "" ############ END FASTQC ############ ############ RUN MULTIQC ############ echo "Beginning MultiQC on raw FastQC..." echo "" ${programs_array[multiqc]} \ ${raw_reads_dir} \ --interactive \ -o ${raw_reads_dir} echo "" echo "MultiQC on raw FastQs complete." echo "" ############ END MULTIQC ############ echo "Removing FastQC zip files." echo "" rm ${raw_reads_dir}/*.zip echo "FastQC zip files removed." echo "" # View directory contents ls -lh ${raw_reads_dir} ```