--- title: "00.00-fastqc-concatenation-raw_reads" author: "Sam White" date: "2024-12-05" output: bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true github_document: toc: true number_sections: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bibliography: references.bib --- # Description This notebook will download raw sRNA-seq FastQs, concatenate them (there were multiple lanes run), and then assess with [FastQC](https://github.com/s-andrews/FastQC) and [MultiQC](https://github.com/MultiQC/MultiQC) [@ewels2016]. ## Inputs Raw FastQ files with the following pattern: - `*.fastq.gz` ## Outputs The expected outputs will be: - `*.fastq.gz`: Concatenated FastQ files. - `*_fastqc.html`: FastQC results, in HTML format. - `multiqc_report.html`: A summary report of the FastQC results generated by [MultiQC](https://github.com/MultiQC/MultiQC), in HTML format. Due to the large file sizes of FastQs, they cannot be added to GitHub. Full output from this notebook is available here: - [https://gannet.fish.washington.edu/gitrepos/project-clam-oa/output/00.00-fastqc-concatenation-raw_reads](https://gannet.fish.washington.edu/gitrepos/project-clam-oa/output/00.00-fastqc-concatenation-raw_reads) ```{r setup, include=FALSE} library(knitr) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` # Create a Bash variables file This allows usage of Bash variables across R Markdown chunks. 
```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
# Write the shared settings to .bashvars so that every later bash chunk can
# `source` them. The here-document delimiter is quoted, so nothing inside is
# expanded now: references such as ${repo_dir} are stored literally and are
# only resolved when .bashvars is sourced.
cat > .bashvars <<'BASHVARS'
#### Assign Variables ####

# Data directories
export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/RobertsLab/project-clam-oa
export output_dir_top=${repo_dir}/output/00.00-fastqc-concatenation-raw_reads
export raw_reads_url="https://owl.fish.washington.edu/nightingales/R_philippinarum/"
export raw_reads_dir="${repo_dir}/data/raw_reads"
export project_dir_1="30-1035633055"
export project_dir_2="30-1035633055-TS01"

# Paths to programs
export programs_dir="/home/shared"
export fastqc="${programs_dir}/FastQC-0.12.1/fastqc"
export multiqc="/home/sam/programs/mambaforge/bin/multiqc"

# Set FastQ filename patterns
export fastq_pattern='*.fastq.gz'
export R1_fastq_pattern='*_R1_*.fastq.gz'
export R2_fastq_pattern='*_R2_*.fastq.gz'

# Set number of CPUs to use
export threads=40

## Inititalize arrays
export fastq_array_R1=()
export fastq_array_R2=()
export trimmed_fastqs_array=()
export R1_names_array=()
export R2_names_array=()

# Print formatting
export line="--------------------------------------------------------"

BASHVARS

# Show the generated file in the rendered notebook
cat .bashvars
```

# Download raw reads

The `--cut-dirs 3` option strips the three leading directory components of the
remote path (i.e. `nightingales/R_philippinarum/30-1035633055/`) so that we just
end up with the reads.

```{r download-raw-reads, engine='bash', eval=FALSE}
# Load bash variables into memory
source .bashvars

# Fetch each sequencing run from its own sub-directory of raw_reads_url into
# the matching local directory. Requesting the parent URL for both targets
# would flatten both runs into each directory (because of --cut-dirs 3), and
# FastQs that share a name across runs would collide.
for project_dir in "${project_dir_1}" "${project_dir_2}"
do
  download_dir="${raw_reads_dir}/${project_dir}"

  # Create directory, if it doesn't exist
  mkdir --parents "${download_dir}"

  # NOTE: certificate verification is disabled here; file integrity is
  # checked afterwards in the "Verify checksums" chunk.
  wget \
    --directory-prefix "${download_dir}" \
    --recursive \
    --no-check-certificate \
    --continue \
    --cut-dirs 3 \
    --no-parent \
    --no-host-directories \
    --quiet \
    "${raw_reads_url}${project_dir}/"

  # Remove extraneous index files (only from the directory just downloaded)
  rm --force "${download_dir}"/index*
done
```

## Overview of downloads

```{r check-downloads, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

tree --du -h "${raw_reads_dir}"
```

## Verify checksums

```{r verify-checksums, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Check every .md5 file in each run directory. md5sum is run from inside the
# directory because it resolves the filenames listed in the checksum files
# relative to the current working directory.
first_dir=true
for project_dir in "${project_dir_1}" "${project_dir_2}"
do
  # Print a separator between (not before) run directories
  if [[ "${first_dir}" == false ]]
  then
    echo ""
    echo "${line}"
    echo ""
  fi
  first_dir=false

  # Stop rather than verify checksums in the wrong directory
  cd "${raw_reads_dir}/${project_dir}" || exit 1

  pwd

  echo ""

  for checksum in *.md5
  do
    md5sum --check "${checksum}"
  done
done
```

# Concatenate reads

Concatenation also handles samples where there might be a missing set of R2
reads in the second round of sequencing.

```{r concatenate-reads, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Make output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"

# Concatenate FastQ files from 1st and 2nd runs
# Do NOT quote fastq_pattern variable; it must be glob-expanded
for first_run_fastq in "${raw_reads_dir}/${project_dir_1}"/${fastq_pattern}
do
  # If the glob matched nothing, Bash leaves the literal pattern; skip it
  [[ -e "${first_run_fastq}" ]] || continue

  # Strip full path to just get filename.
  first_run_fastq_name="${first_run_fastq##*/}"

  # The matching second-run FastQ, if any, has the same filename
  second_run_fastq="${raw_reads_dir}/${project_dir_2}/${first_run_fastq_name}"
  output_fastq="${output_dir_top}/${first_run_fastq_name}"

  if [[ -e "${second_run_fastq}" ]]
  then
    echo "Concatenating ${first_run_fastq} with ${second_run_fastq} to ${output_fastq}"
    echo ""

    # Overwrite (not append): re-running the chunk must not keep growing
    # a previously written output file.
    cat "${first_run_fastq}" "${second_run_fastq}" > "${output_fastq}"
  else
    # No second-run counterpart: carry the first-run file over unchanged
    echo "NO MATCH!"
    echo "Copying ${first_run_fastq} to ${output_dir_top}"
    echo ""

    cp "${first_run_fastq}" "${output_dir_top}"
  fi

  # Generate MD5 checksums
  # Run in a subshell from inside the output directory so the checksum file
  # records a bare filename and the chunk's working directory is untouched.
  # The .md5 file is overwritten so that re-runs do not accumulate lines.
  echo "Generating checksums for concatenated FastQs..."
  (
    cd "${output_dir_top}" \
      && md5sum "${first_run_fastq_name}" | tee "${first_run_fastq_name}.md5"
  )
  echo ""
  echo "${line}"
  echo ""
done
```

# FastQC/MultiQC on raw reads

```{bash raw-fastqc-multiqc, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

############ RUN FASTQC ############

# Create array of raw (concatenated) FastQs
# Do NOT quote fastq_pattern variable; it must be glob-expanded
raw_fastqs_array=("${output_dir_top}"/${fastq_pattern})

# If the glob matched nothing, the array holds the literal pattern; stop early
if [[ ! -e "${raw_fastqs_array[0]}" ]]
then
  echo "No FastQ files matching ${fastq_pattern} found in ${output_dir_top}" >&2
  exit 1
fi

echo "Beginning FastQC on raw reads..."
echo ""

# Run FastQC
# Each array element is passed as its own argument
"${fastqc}" \
  --threads "${threads}" \
  --outdir "${output_dir_top}" \
  --quiet \
  "${raw_fastqs_array[@]}"

echo "FastQC on raw reads complete!"
echo ""

############ END FASTQC ############

############ RUN MULTIQC ############
echo "Beginning MultiQC on raw FastQC..."
echo ""

"${multiqc}" "${output_dir_top}" -o "${output_dir_top}"

echo ""
echo "MultiQC on raw FastQs complete."
echo ""

############ END MULTIQC ############

echo "Removing FastQC zip files."
echo ""
rm "${output_dir_top}"/*.zip
echo "FastQC zip files removed."
echo ""
```

## View directory contents

```{bash list-output-files, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

ls -lh "${output_dir_top}"
```