---
title: "01.00-trimming-fastp-fastqc"
author: "Sam White"
date: "2024-12-07"
output:
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---

# Description

This notebook will trim and merge R1 and R2 reads. The max length of 31bp is based on the `fastp` insert peak size from previous trimming tests based on the adapter and polyG trimming results, and on previous evaluation of mean read lengths via [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and [`MultiQC`](https://multiqc.info/).

## Inputs:

- sRNAseq paired-end FastQs (e.g. `*.fastq.gz`)

## Outputs:

- `*_fastqc.html`: FastQC results, in HTML format.

- `*fastp-adapters-polyG-31bp-merged.fq.gz`: Trimmed and merged reads with a maximum length of 31bp.

- `multiqc_report.html`: A summary report of the QC results generated by [MultiQC](https://github.com/MultiQC/MultiQC), in HTML format.

Libraries were prepared and sequenced by Azenta:

- Library prep: [NEB nebnext-small-rna-library-prep-set-for-illumina kit](https://www.neb.com/en-us/-/media/nebus/files/manuals/manuale7300_e7330_e7560_e7580.pdf?rev=d0964a2e637843b1afcb9f7d666d07b2&hash=7AC0B0EB012708EFAB0E4DBEEAF1446A) (PDF)

- Sequencing: Illumina HiSeq 4000, 150bp PE

Due to large file sizes of FastQs, they cannot be added to GitHub.
Full output from this notebook is available here:

- [https://gannet.fish.washington.edu/gitrepos/project-clam-oa/output/01.00-trimming-fastp-fastqc](https://gannet.fish.washington.edu/gitrepos/project-clam-oa/output/01.00-trimming-fastp-fastqc)

```{r setup, include=FALSE}
library(knitr)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Do NOT evaluate chunks by default; chunks opt in with eval=TRUE
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```

# Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.
Each bash chunk below starts by sourcing `.bashvars`, because every chunk runs in its own shell.

```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
# Write all shared settings to .bashvars as "export" statements.
# Single-quoted lines are written literally, so ${...} references are
# expanded later, when .bashvars is sourced.
{
echo "#### Assign Variables ####"
echo ""

echo "# Data directories"
echo 'export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/RobertsLab/project-clam-oa'
echo 'export output_dir_top=${repo_dir}/output/01.00-trimming-fastp-fastqc'
echo 'export raw_reads_dir="${repo_dir}/output/00.00-fastqc-concatenation-raw_reads"'
echo 'export trimmed_fastqs_dir="${output_dir_top}"'
echo ""

echo "# Paths to programs"
echo 'export programs_dir="/home/shared"'
echo 'export fastp="${programs_dir}/fastp-v0.24.0/fastp"'
echo 'export fastqc="${programs_dir}/FastQC-0.12.1/fastqc"'
echo 'export multiqc="/home/sam/programs/mambaforge/bin/multiqc"'
echo ""

echo "# Set FastQ filename patterns"
echo "export fastq_pattern='*.fastq.gz'"
echo "export R1_fastq_pattern='*_R1_*.fastq.gz'"
echo "export R2_fastq_pattern='*_R2_*.fastq.gz'"
echo "export trimmed_fastq_pattern='*fastp-trim*.fq.gz'"
echo ""

echo "# Input/output files"
echo 'export fastq_checksums=input_fastq_checksums.md5'
echo 'export NEB_adapters_fasta=NEB-adapters.fasta'
echo ""

echo "## NEB nebnext-small-rna-library-prep-set-for-illumina adapters"
echo 'export first_adapter="AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"'
echo 'export second_adapter="GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT"'
echo ""

echo "# Set maximum read length for fastp merging"
echo 'export max_read_length="31"'
echo ""

echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""

echo "## Inititalize arrays"
echo 'export fastq_array_R1=()'
echo 'export fastq_array_R2=()'
echo 'export trimmed_fastqs_array=()'
echo 'export R1_names_array=()'
echo 'export R2_names_array=()'
echo ""

echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars

# Show the generated file in the rendered notebook
cat .bashvars
```

# Create adapters FastA for use with [`fastp`](https://github.com/OpenGene/fastp) trimming

```{bash create-FastA-of-adapters, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Create output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"

echo "Creating adapters FastA."
echo ""
adapter_count=0

# Check for adapters file first
# Then create adapters file if doesn't exist
if [ -f "${output_dir_top}/${NEB_adapters_fasta}" ]; then
  echo "${output_dir_top}/${NEB_adapters_fasta} already exists. Nothing to do."
else
  # Write one FastA record (">adapter_N" header + sequence) per adapter.
  # The file is known not to exist here, so create it with ">" rather than appending.
  for adapter in "${first_adapter}" "${second_adapter}"
  do
    adapter_count=$((adapter_count + 1))
    printf ">%s\n%s\n" "adapter_${adapter_count}" "${adapter}"
  done > "${output_dir_top}/${NEB_adapters_fasta}"
fi

echo ""
echo "Adapters FastA:"
echo ""
cat "${output_dir_top}/${NEB_adapters_fasta}"
echo ""
```

# Trimming and merging with fastp

```{bash fastp-and-merging, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Create output directory, if it doesn't exist.
mkdir --parents "${trimmed_fastqs_dir}"

# Change to directory with raw reads
# Stop here if it is missing; everything below uses paths relative to it.
cd "${raw_reads_dir}" || exit 1

# Make unmatched globs expand to nothing instead of the literal pattern,
# so a missing set of FastQs is detected below instead of being passed to fastp.
shopt -s nullglob

# Create arrays of FastQ R1 files and sample names
# Do NOT quote R1_fastq_pattern variable
for fastq in ${R1_fastq_pattern}
do
  fastq_array_R1+=("${fastq}")

  # Use parameter substitution to remove everything from the first "."
  # to the end of the string (i.e. strip all file extensions).
  R1_names_array+=("${fastq%%.*}")
done

# Create array of FastQ R2 files
# Do NOT quote R2_fastq_pattern variable
for fastq in ${R2_fastq_pattern}
do
  fastq_array_R2+=("${fastq}")

  # Use parameter substitution to remove everything from the first "."
  # to the end of the string (i.e. strip all file extensions).
  R2_names_array+=("${fastq%%.*}")
done

shopt -u nullglob

# Sanity checks: there must be input, and every R1 needs a matching R2.
if [ "${#fastq_array_R1[@]}" -eq 0 ]; then
  echo "ERROR: No R1 FastQs matching '${R1_fastq_pattern}' found in ${raw_reads_dir}" >&2
  exit 1
fi

if [ "${#fastq_array_R1[@]}" -ne "${#fastq_array_R2[@]}" ]; then
  echo "ERROR: Found ${#fastq_array_R1[@]} R1 FastQs, but ${#fastq_array_R2[@]} R2 FastQs." >&2
  exit 1
fi

############ RUN FASTP ############
# Uses parameter substitution (e.g. ${R1_sample_name%%_*}) to rm the _R[12]
# Uses NEB adapter file

# Run fastp on files
echo "Beginning fastp trimming."
echo ""

time \
for index in "${!fastq_array_R1[@]}"
do
  # Get sample name (everything before the first "_")
  R1_sample_name="${R1_names_array[index]%%_*}"
  R2_sample_name="${R2_names_array[index]%%_*}"

  # R1 and R2 are paired purely by array position; confirm they belong together.
  if [ "${R1_sample_name}" != "${R2_sample_name}" ]; then
    echo "ERROR: Sample name mismatch: ${fastq_array_R1[index]} vs. ${fastq_array_R2[index]}" >&2
    exit 1
  fi

  # Save merged sample name
  merged_sample_name="${R1_sample_name}-fastp-adapters-polyG-${max_read_length}bp-merged"

  # Begin fastp trimming
  # fastp's stderr is captured per-sample in a .stderr file.
  "${fastp}" \
  --in1 "${fastq_array_R1[index]}" \
  --in2 "${fastq_array_R2[index]}" \
  --adapter_fasta "${output_dir_top}/${NEB_adapters_fasta}" \
  --trim_poly_g \
  --overlap_len_require 17 \
  --length_limit "${max_read_length}" \
  --merge \
  --merged_out "${trimmed_fastqs_dir}/${merged_sample_name}.fq.gz" \
  --thread "${threads}" \
  --html "${trimmed_fastqs_dir}/${merged_sample_name}.html" \
  --json "${trimmed_fastqs_dir}/${merged_sample_name}.json" \
  --report_title "${trimmed_fastqs_dir}/${merged_sample_name}" \
  2> "${trimmed_fastqs_dir}/${merged_sample_name}.stderr"

  # Generate md5 checksum for the newly trimmed file.
  # Done from inside the trimmed directory so the checksum file doesn't include
  # excess path. The subshell keeps the loop itself in the raw reads directory.
  # The checksum file is overwritten (not appended to) so re-runs don't
  # accumulate duplicate or stale entries.
  (
    cd "${trimmed_fastqs_dir}" || exit 1
    md5sum "${merged_sample_name}.fq.gz" | tee "${merged_sample_name}.fq.gz.md5"
  ) || exit 1
done

echo ""
echo "fastp trimming complete."
echo ""

############ END fastp ############
```

# FastQC/MultiQC on trimmed reads

```{bash FastQC-MultiQC-trimmed-reads, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Create output directory, if it doesn't exist.
mkdir --parents "${trimmed_fastqs_dir}"

############ RUN FASTQC ############

# Create array of trimmed FastQs
# The directory is quoted; the glob itself must stay unquoted so it expands.
trimmed_fastqs_array=("${trimmed_fastqs_dir}"/*merged.fq.gz)

# An unmatched glob is left as the literal pattern, which is not an existing file.
if [ ! -e "${trimmed_fastqs_array[0]}" ]; then
  echo "ERROR: No trimmed FastQs (*merged.fq.gz) found in ${trimmed_fastqs_dir}" >&2
  exit 1
fi

echo "Beginning FastQC on trimmed reads..."
echo ""

# Run FastQC
# Files are passed directly from the array, one argument per file.
"${fastqc}" \
--threads "${threads}" \
--outdir "${trimmed_fastqs_dir}" \
--quiet \
"${trimmed_fastqs_array[@]}"

echo "FastQC on trimmed reads complete!"
echo ""

############ END FASTQC ############

############ RUN MULTIQC ############
echo "Beginning MultiQC on trimmed FastQC..."
echo ""

"${multiqc}" "${trimmed_fastqs_dir}" -o "${trimmed_fastqs_dir}"

echo ""
echo "MultiQC on trimmed FastQs complete."
echo ""

############ END MULTIQC ############

echo "Removing FastQC zip files."
echo ""
# -f: don't fail if there are no zip files to remove
rm -f -- "${trimmed_fastqs_dir}"/*.zip
echo "FastQC zip files removed."
echo ""
```

# List output files

```{bash list-output-files, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# View directory contents
ls -lh "${trimmed_fastqs_dir}"
```