--- title: "00.00-trimming-fastp" author: "Sam White" date: "2024-11-15" output: bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true github_document: toc: true number_sections: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bibliography: references.bib --- # Intro ## Inputs This notebook will download raw WGBS FastQs and trim them using [fastp](https://github.com/OpenGene/fastp) . Trimming results will be analyszed with FastQC and summarized by [MultiQC](https://github.com/MultiQC/MultiQC) [@ewels2016]. ## Outputs The expected outputs will be: - `multiqc_report.html`: A summary report of the alignment results generated by [MultiQC](https://github.com/MultiQC/MultiQC), in HTML format. Due to the large file sizes of FastQs, these cannot be hosted in the [ceasmallr GitHub repo](https://github.com/sr320/ceasmallr). As such these files are available for download here: - [https://gannet.fish.washington.edu/gitrepos/ceasmallr/output/00.00-trimming-fastp/](https://gannet.fish.washington.edu/gitrepos/ceasmallr/output/00.00-trimming-fastp/) ```{r setup, include=FALSE} library(knitr) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` # Create a Bash variables file This allows usage of Bash variables across R Markdown chunks. ```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE} { echo "#### Assign Variables ####" echo "" echo "# Data directories" echo 'export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/ceasmallr' echo 'export output_dir_top=${repo_dir}/output/00.00-trimming-fastp' echo 'export raw_reads_url="https://owl.fish.washington.edu/nightingales/C_virginica/2018OALarvae_DNAm_discovery/"' echo 'export raw_reads_dir="${repo_dir}/data/raw_reads"' echo 'export trimmed_fastqs_dir="${output_dir_top}/trimmed-fastqs"' echo "" echo "# Paths to programs" echo 'export programs_dir="/home/shared"' echo 'export bbmap_repair="${programs_dir}/bbmap-39.12/repair.sh"' echo 'export bismark_dir="${programs_dir}/Bismark-0.24.0"' echo 'export bowtie2_dir="${programs_dir}/bowtie2-2.4.4-linux-x86_64"' echo 'export fastp="${programs_dir}/fastp-v0.24.0/fastp"' echo 'export fastqc="${programs_dir}/FastQC-0.12.1/fastqc"' echo 'export multiqc="/home/sam/programs/mambaforge/bin/multiqc"' echo 'export samtools_dir="${programs_dir}/samtools-1.12"' echo "" echo "# Set FastQ filename patterns" echo "export fastq_pattern='*.fastq.gz'" echo "export R1_fastq_pattern='*_R1_*.fastq.gz'" echo "export R2_fastq_pattern='*_R2_*.fastq.gz'" echo "export trimmed_fastq_pattern='*fastp-trim*.fq.gz'" echo "" echo "# Set number of CPUs to use" echo 'export threads=40' echo "" echo "## Inititalize arrays" echo 'export fastq_array_R1=()' echo 'export fastq_array_R2=()' echo 'export trimmed_fastqs_array=()' echo 'export R1_names_array=()' echo 'export R2_names_array=()' echo "" echo "# Print formatting" echo 'export line="--------------------------------------------------------"' echo "" } > .bashvars cat .bashvars ``` # Download raw reads The `--cut-dirs 2` command cuts the preceding directory structure (i.e. `nightingales/C_virginica/`) so that we just end up with the reads. ```{r download-raw-reads, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars # Create directory, if it doesn't exist mkdir --parents ${raw_reads_dir} wget \ --directory-prefix ${raw_reads_dir} \ --recursive \ --no-check-certificate \ --continue \ --cut-dirs 2 \ --no-parent \ --no-host-directories \ --quiet \ ${raw_reads_url} ls -lh "${raw_reads_dir}" ``` ## Verify checkums ```{r verify-checksums, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars cd "${raw_reads_dir}"/2018OALarvae_DNAm_discovery md5sum --check md5sum_list.txt echo "" echo "${line}" echo "" cd second_lane md5sum --check md5sum_list.txt ``` # Concatenate reads ```{r concatenate-reads, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars cd "${raw_reads_dir}" # Concatenate FastQ files from 1st and 2nd runs # Do NOT quote fastq_pattern variable # Declare an associative array to keep track of processed files declare -A processed_files for first_run_fastq in "${raw_reads_dir}"/2018OALarvae_DNAm_discovery/${fastq_pattern} do # Strip full path to just get filename. first_run_fastq_name="${first_run_fastq##*/}" # Initialize a flag to check if a match is found match_found=false # Process second run and concatenate with corresponding FastQ from first run # Do NOT quote fastq_pattern variable for second_run_fastq in "${raw_reads_dir}"/2018OALarvae_DNAm_discovery/second_lane/${fastq_pattern} do # Strip full path to just get filename. second_run_fastq_name="${second_run_fastq##*/}" # Concatenate FastQs with same filenames if [[ "${first_run_fastq_name}" == "${second_run_fastq_name}" ]] then echo "Concatenating ${first_run_fastq} with ${second_run_fastq} to ${raw_reads_dir}/${first_run_fastq_name}" echo "" cat "${first_run_fastq}" "${second_run_fastq}" >> "${raw_reads_dir}/${first_run_fastq_name}" match_found=true processed_files["${first_run_fastq_name}"]=true break fi done # If no match is found, copy the file to the target directory if [[ "${match_found}" == false ]] then if [[ -z "${processed_files[${first_run_fastq_name}]}" ]] then echo "NO MATCH!" echo "Copying ${first_run_fastq} to ${raw_reads_dir}" echo "" cp "${first_run_fastq}" "${raw_reads_dir}" processed_files["${first_run_fastq_name}"]=true fi fi echo "Generating checksums for concatenated FastQs..." md5sum "${first_run_fastq_name}" | tee --append "${first_run_fastq_name}".md5 echo "" echo "${line}" echo "" done ``` # Trimming ```{r fastp-trimming, engine='bash', eval=TRUE} # Load bash variables into memory source .bashvars # Make output directory, if it doesn't exist mkdir --parents ${output_dir_top} cd "${raw_reads_dir}" # Create arrays of fastq R1 files and sample names # Do NOT quote R1_fastq_pattern variable for fastq in ${R1_fastq_pattern} do fastq_array_R1+=("${fastq}") # Use parameter substitution to remove all text up to and including last "." from # right side of string. R1_names_array+=("${fastq%%.*}") done # Create array of fastq R2 files # Do NOT quote R2_fastq_pattern variable for fastq in ${R2_fastq_pattern} do fastq_array_R2+=("${fastq}") # Use parameter substitution to remove all text up to and including last "." from # right side of string. R2_names_array+=(${fastq%%.*}) done # Run fastp on files # Adds JSON report output for downstream usage by MultiQC echo "Beginning fastp trimming." echo "" for index in "${!fastq_array_R1[@]}" do R1_sample_name="${R1_names_array[index]}" R2_sample_name="${R2_names_array[index]}" stderr_PE_name=$(echo ${R1_sample_name} | awk -F"_" '{print $1}') ${fastp} \ --in1 ${fastq_array_R1[index]} \ --in2 ${fastq_array_R2[index]} \ --detect_adapter_for_pe \ --trim_poly_g \ --trim_poly_x \ --thread 16 \ --trim_front1 15 \ --trim_front2 15 \ --html ${output_dir_top}/"${R1_sample_name}".fastp-trim.report.html \ --json ${output_dir_top}/"${R1_sample_name}".fastp-trim.report.json \ --out1 ${output_dir_top}/"${R1_sample_name}".fastp-trim.fq.gz \ --out2 ${output_dir_top}/"${R2_sample_name}".fastp-trim.fq.gz \ 2> ${output_dir_top}/"${stderr_PE_name}".fastp-trim.stderr grep --before-context 5 "ERROR" ${output_dir_top}/"${stderr_PE_name}".fastp-trim.stderr # Check for fastp errors and then repair if grep --quiet "ERROR" ${output_dir_top}/"${stderr_PE_name}".fastp-trim.stderr; then rm ${output_dir_top}/"${R1_sample_name}".fastp-trim.fq.gz rm ${output_dir_top}/"${R2_sample_name}".fastp-trim.fq.gz ${bbmap_repair} \ in1=${fastq_array_R1[index]} \ in2=${fastq_array_R2[index]} \ out1="${R1_sample_name}".REPAIRED.fastq.gz \ out2="${R2_sample_name}".REPAIRED.fastq.gz \ outs=/dev/null \ 2> "${R1_sample_name}".REPAIRED.stderr ${fastp} \ --in1 "${R1_sample_name}".REPAIRED.fastq.gz \ --in2 "${R2_sample_name}".REPAIRED.fastq.gz \ --detect_adapter_for_pe \ --trim_poly_g \ --trim_poly_x \ --thread 16 \ --trim_front1 15 \ --trim_front2 15 \ --html ${output_dir_top}/"${R1_sample_name}".fastp-trim.REPAIRED.report.html \ --json ${output_dir_top}/"${R1_sample_name}".fastp-trim.REPAIRED.report.json \ --out1 ${output_dir_top}/"${R1_sample_name}".fastp-trim.REPAIRED.fq.gz \ --out2 ${output_dir_top}/"${R2_sample_name}".fastp-trim.REPAIRED.fq.gz \ 2> ${output_dir_top}/"${stderr_PE_name}".fastp-trim.REPAIRED.stderr if grep --quiet "ERROR" ${output_dir_top}/"${stderr_PE_name}".fastp-trim.REPAIRED.stderr; then echo "These ${stderr_PE_name} samples are broken." echo "Just give up. :(" echo "" fi fi echo "Finished trimming:" echo "${fastq_array_R1[index]}" echo "${fastq_array_R1[index]}" echo "" # Generate md5 checksums for newly trimmed files cd "${output_dir_top}" md5sum "${R1_sample_name}".fastp-trim.fq.gz > "${R1_sample_name}".fastp-trim.fq.gz.md5 md5sum "${R2_sample_name}".fastp-trim.fq.gz > "${R2_sample_name}".fastp-trim.fq.gz.md5 cd "${raw_reads_dir}" done ``` # FastQC ```{r fastqc, engine='bash'} # Load bash variables into memory source .bashvars cd "${output_dir_top}" ############ RUN FASTQC ############ # Create array of trimmed FastQs trimmed_fastqs_array=(${trimmed_fastq_pattern}) # Pass array contents to new variable as space-delimited list trimmed_fastqc_list=$(echo "${trimmed_fastqs_array[*]}") echo "Beginning FastQC on trimmed reads..." echo "" # Run FastQC ### NOTE: Do NOT quote raw_fastqc_list ${fastqc} \ --threads ${threads} \ --outdir "${output_dir_top}" \ --quiet \ ${trimmed_fastqc_list} echo "FastQC on trimmed reads complete!" echo "" ############ END FASTQC ############ ``` # MultiQC ```{r multiqc, engine='bash'} # Load bash variables into memory source .bashvars cd "${output_dir_top}" ${multiqc} . ```