--- title: "01.00-trimming-fastp-fastqc-multiqc" author: "Sam White" date: "2025-04-17" output: github_document: toc: true number_sections: true bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bibliography: references.bib --- # BACKGROUND This Rmd file trims FastQ files using [fastp](https://github.com/OpenGene/fastp) [@chen2023], followed by quality checks with [FastQC](https://github.com/s-andrews/FastQC) and [MultiQC](https://multiqc.info/) [@ewels2016]. Additionally, it renames the trimmed FastQ files to be compliant with QIIME 2 naming expectations. ::: callout-note If you need to download the raw sequencing reads, please see [00.00-fastqc-multiqc-raw-reads.Rmd](https://github.com/RobertsLab/project-eDNA-yellow/blob/2cf2f152dd2a416360752a72c1bf2602d28cdd03/code/00.00-fastqc-multiqc-raw-reads.Rmd) ::: ```{r setup, include=FALSE} library(knitr) library(dplyr) library(reticulate) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` Inputs: - Paired-end FastQs - Example filename format: `Yell_July18_Site1f_S92_L001_R[12]_001.fastq.gz` Outputs: - Trimmed FastQs. - Example filename format: `Site1z_S129_L001_R1_001.fastq.gz` - fastp JSON files. - FastQC HTML files. - MultiQC HTML report. Software requirements: - [fastp](https://github.com/OpenGene/fastp) - [FastQC](https://github.com/s-andrews/FastQC) - [MultiQC](https://github.com/MultiQC/MultiQC) [@ewels2016] # Set R variables ```{r R-variables, eval=TRUE} # Data directories repo_dir <- "/home/sam/gitrepos/RobertsLab/project-eDNA-yellow" data_dir <- file.path(repo_dir, "data") raw_reads_dir <- file.path(data_dir, "raw-fastqs") output_dir <- file.path(repo_dir, "output", "01.00-trimming-fastp-fastqc-multiqc") # PROGRAMS fastp <- "/home/sam/programs/fastp" fastqc <- "/home/sam/programs/FastQC-0.12.1/fastqc" multiqc <- "/home/sam/programs/miniforge3/bin/multiqc" # CPUs threads <- 4 # Define file patterns R1_fastq_pattern <- "*_R1_001.fastq.gz" R2_fastq_pattern <- "*_R2_001.fastq.gz" # Export these as environment variables for bash chunks. Sys.setenv( fastp = fastp, fastqc = fastqc, data_dir = data_dir, multiqc = multiqc, raw_reads_dir = raw_reads_dir, threads = threads, R1_fastq_pattern = R1_fastq_pattern, R2_fastq_pattern = R2_fastq_pattern, output_dir = output_dir ) ``` # Fastp Trimming [fastp](https://github.com/OpenGene/fastp) [@chen2023] is set to auto-detect Illumina adapters, as well as trim the first 10bp from each read, as past experience shows these first 10bp are more inconsistent than the remainder of the read length. ::: callout-note Appends `_L001` to trimmed filenames for downstream QIIME 2 filename parsing requirements. ::: ```{r fastp-trimming, engine='bash', eval=TRUE} # Make output directories, if it doesn't exist mkdir --parents "${output_dir}" # Change to raw reads directory cd "${raw_reads_dir}" # Initialize metadata file for QIIME2 usage printf "%s\t%s\n" "id" "site" > metadata.tsv # Create arrays of fastq R1 files and sample names # Renames to match QIIME2 regex # Adds id to metadata TSV file for fastq in ${R1_fastq_pattern} do # Extract site information from filename using awk site=$(echo "${fastq}" | awk -F'_' '{print $3}') # Check if site information was extracted if [[ -n "${site}" ]]; then site_name=$(echo "$site" | sed 's/Site//') echo -e "${site}\t${site_name}" >> metadata.tsv else echo "Site information not found in filename: ${fastq}" fi fastq_array_R1+=("${fastq}") R1_names_array+=("$(echo "${fastq}" | awk -F'_' '{print $3"_"$4"_L001"}')") done # Create array of fastq R2 files # Renames to match QIIME2 regex for fastq in ${R2_fastq_pattern} do fastq_array_R2+=("${fastq}") R2_names_array+=("$(echo "${fastq}" | awk -F"_" '{print $3"_"$4"_L001"}')") done # Create list of fastq files used in analysis # Create MD5 checksum for reference if [ ! -f "${output_dir}"/raw-fastq-checksums.md5 ]; then for fastq in *.gz do md5sum "${fastq}" >>"${output_dir}"/raw-fastq-checksums.md5 done fi # Run fastp on files # Adds JSON report output for downstream usage by MultiQC for index in "${!fastq_array_R1[@]}" do R1_sample_name=$(echo "${R1_names_array[index]}") R2_sample_name=$(echo "${R2_names_array[index]}") ${fastp} \ --in1 ${fastq_array_R1[index]} \ --in2 ${fastq_array_R2[index]} \ --detect_adapter_for_pe \ --trim_front1 20 \ --trim_front2 20 \ --trim_poly_g \ --thread ${threads} \ --html "${output_dir}"/"${R1_sample_name}".fastp-trim.report.html \ --json "${output_dir}"/"${R1_sample_name}".fastp-trim.report.json \ --out1 "${output_dir}"/"${R1_sample_name}"_R1_001.fastq.gz \ --out2 "${output_dir}"/"${R2_sample_name}"_R2_001.fastq.gz \ 2>> "${output_dir}"/"${R1_sample_name}"-fastp.stderr # Generate md5 checksums for newly trimmed files cd "${output_dir}" md5sum "${R1_sample_name}"_R1_001.fastq.gz | tee "${R1_sample_name}"_R1_001.fastq.gz.md5 md5sum "${R2_sample_name}"_R2_001.fastq.gz | tee "${R2_sample_name}"_R2_001.fastq.gz.md5 # Change back to previous directory # Directing to /dev/null prevents printing directory each time. cd - > /dev/null done ``` # FastQC/MultiQC on trimmed reads ## Create FastQ list to use across chunks ```{r create-fastq-list, eval=TRUE} # Create the fastq list and string trimmed_fastqs_list <- list.files(path = Sys.getenv("output_dir"), pattern = "\\.fastq\\.gz$", full.names = TRUE) trimmed_fastqs_string <- paste(trimmed_fastqs_list, collapse = " ") # Update the environment variable with the actual string Sys.setenv(trimmed_fastqs_string = trimmed_fastqs_string) ``` ## Run FastQC/MultiQC ```{r trimmed-fastqc-multiqc, engine='bash', eval=TRUE} # Make output directory if it doesn't exist mkdir --parents "${output_dir}" cd "${output_dir}" ############ RUN FASTQC ############ # Create array of trimmed FastQs # Access the individual FastQ files from the array IFS=' ' read -r -a trimmed_fastqs_array <<< "${trimmed_fastqs_string}" # Pass array contents to new variable as space-delimited list trimmed_fastqc_list=$(echo "${trimmed_fastqs_array[*]}") echo "Beginning FastQC on raw reads..." echo "" # Run FastQC ### NOTE: Do NOT quote trimmed_fastqc_list ${fastqc} \ --threads ${threads} \ --outdir ${output_dir} \ --quiet \ ${trimmed_fastqc_list} echo "FastQC on raw reads complete!" echo "" ############ END FASTQC ############ ############ RUN MULTIQC ############ echo "Beginning MultiQC on raw FastQC..." echo "" ${multiqc} --interactive \ ${output_dir} \ -o ${output_dir} echo "" echo "MultiQC on raw FastQs complete." echo "" ############ END MULTIQC ############ echo "Removing FastQC zip files." echo "" rm ${output_dir}/*.zip echo "FastQC zip files removed." echo "" ``` ## List output files ```{r list-outputs, engine='bash', eval=TRUE} cd "${output_dir}" ls -lh ```