---
title: "01.00-trimming-fastp-fastqc"
author: "Sam White"
date: "2024-12-07"
output: 
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---

# Description

This notebook will trim and merge R1 and R2 reads. The max length of 31bp is based on the `fastp` insert peak size observed in previous trimming tests (adapter and polyG trimming results), and on a previous evaluation of mean read lengths via [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and [`MultiQC`](https://multiqc.info/).

## Inputs:

-   sRNAseq paired-end FastQs (e.g. `*.fastq.gz`)

## Outputs:

- `*_fastqc.html`: FastQC results, in HTML format.

- `*fastp-adapters-polyG-31bp-merged.fq.gz`: Trimmed and merged reads with final length of 31bp.

- `multiqc_report.html`: A summary report of the trimming and FastQC results
    generated by [MultiQC](https://github.com/MultiQC/MultiQC), in HTML
    format.


Libraries were prepared and sequenced by Azenta:

-   Library prep: [NEB nebnext-small-rna-library-prep-set-for-illumina kit](https://www.neb.com/en-us/-/media/nebus/files/manuals/manuale7300_e7330_e7560_e7580.pdf?rev=d0964a2e637843b1afcb9f7d666d07b2&hash=7AC0B0EB012708EFAB0E4DBEEAF1446A) (PDF)

-   Sequencing: Illumina HiSeq 4000, 150bp PE


Due to the large file sizes of FastQs, they cannot be added to GitHub. Full output from this notebook is available here:

- [https://gannet.fish.washington.edu/gitrepos/project-clam-oa/output/01.00-trimming-fastp-fastqc](https://gannet.fish.washington.edu/gitrepos/project-clam-oa/output/01.00-trimming-fastp-fastqc)

```{r setup, include=FALSE}
# Attach knitr so its chunk-option helpers are available
library(knitr)

# Notebook-wide chunk defaults.
# Chunks are NOT evaluated by default (eval = FALSE); a chunk only runs when
# its own header sets eval=TRUE.
knitr::opts_chunk$set(
  comment = "",      # No '##' prefix on lines of printed code output
  message = FALSE,   # Hide messages
  warning = FALSE,   # Hide warnings
  eval = FALSE,      # Do not evaluate code chunks unless overridden per chunk
  echo = TRUE        # Display code chunks
)
```

# Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
# Write every shared setting to .bashvars in one pass.
# The heredoc delimiter is quoted ('EOF'), so nothing below is expanded here:
# references such as ${repo_dir} are stored literally and only resolved when a
# later chunk runs `source .bashvars`.
cat > .bashvars << 'EOF'
#### Assign Variables ####

# Data directories
export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/RobertsLab/project-clam-oa
export output_dir_top=${repo_dir}/output/01.00-trimming-fastp-fastqc
export raw_reads_dir="${repo_dir}/output/00.00-fastqc-concatenation-raw_reads"
export trimmed_fastqs_dir="${output_dir_top}"

# Paths to programs
export programs_dir="/home/shared"
export fastp="${programs_dir}/fastp-v0.24.0/fastp"
export fastqc="${programs_dir}/FastQC-0.12.1/fastqc"
export multiqc="/home/sam/programs/mambaforge/bin/multiqc"

# Set FastQ filename patterns
export fastq_pattern='*.fastq.gz'
export R1_fastq_pattern='*_R1_*.fastq.gz'
export R2_fastq_pattern='*_R2_*.fastq.gz'
export trimmed_fastq_pattern='*fastp-trim*.fq.gz'

# Input/output files
export fastq_checksums=input_fastq_checksums.md5
export NEB_adapters_fasta=NEB-adapters.fasta

## NEB nebnext-small-rna-library-prep-set-for-illumina adapters
export first_adapter="AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
export second_adapter="GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT"

# Set maximum read length for fastp merging
export max_read_length="31"

# Set number of CPUs to use
export threads=40

## Inititalize arrays
export fastq_array_R1=()
export fastq_array_R2=()
export trimmed_fastqs_array=()
export R1_names_array=()
export R2_names_array=()

# Print formatting
export line="--------------------------------------------------------"

EOF

# Echo the generated file into the rendered notebook
cat .bashvars
```

# Create adapters FastA for use with [`fastp`](https://github.com/OpenGene/fastp) trimming

```{bash create-FastA-of-adapters, engine='bash', eval=TRUE}
# Bring the shared paths and adapter sequences into this shell
source .bashvars

# Make sure the top-level output directory is present
mkdir --parents "${output_dir_top}"

# Full path of the adapters FastA
adapters_fasta="${output_dir_top}/${NEB_adapters_fasta}"

# Adapter sequences, in the order they are numbered in the FastA
adapter_seqs=("${first_adapter}" "${second_adapter}")

printf '%s\n\n' "Creating adapters FastA."

# Only write the FastA when it is not already on disk
if [ ! -f "${adapters_fasta}" ]; then
  for idx in "${!adapter_seqs[@]}"
  do
    # Array indices start at 0; FastA headers are numbered from 1
    printf ">adapter_%d\n%s\n" "$((idx + 1))" "${adapter_seqs[idx]}"
  done >> "${adapters_fasta}"
else
  echo "${adapters_fasta} already exists. Nothing to do."
fi

# Show the FastA contents in the rendered notebook
printf '\n%s\n\n' "Adapters FastA:"
cat "${adapters_fasta}"
echo ""
```

# Trimming and merging with fastp

```{bash fastp-and-merging, engine='bash', eval=TRUE}
# Trim adapters/polyG and merge R1/R2 read pairs with fastp, then write an
# md5 checksum for each merged FastQ.
#
# Reads:  ${raw_reads_dir}/${R1_fastq_pattern} and ${R2_fastq_pattern}
# Writes: ${trimmed_fastqs_dir}/<sample>-fastp-adapters-polyG-<max_read_length>bp-merged
#         with extensions .fq.gz, .fq.gz.md5, .html, .json and .stderr
# Exits non-zero if inputs are missing, R1/R2 files do not pair up, or fastp fails.

# Load bash variables into memory
source .bashvars

# Create output directory, if it doesn't exist.
mkdir --parents "${trimmed_fastqs_dir}"

# Change to directory with raw reads.
# Abort if that fails, so the globs below never run in the wrong directory.
cd "${raw_reads_dir}" || { echo "ERROR: Cannot change to ${raw_reads_dir}" >&2; exit 1; }

# Make unmatched globs expand to nothing instead of the literal pattern,
# so missing FastQs are caught below rather than handed to fastp.
shopt -s nullglob

# Create arrays of FastQ R1 files and sample names
# Do NOT quote R1_fastq_pattern variable
for fastq in ${R1_fastq_pattern}
do
  fastq_array_R1+=("${fastq}")

  # "%%.*" removes the longest match of ".*" from the end of the string,
  # i.e. everything from the FIRST "." onwards (all file extensions).
  R1_names_array+=("${fastq%%.*}")
done

# Create arrays of FastQ R2 files and sample names
# Do NOT quote R2_fastq_pattern variable
for fastq in ${R2_fastq_pattern}
do
  fastq_array_R2+=("${fastq}")

  # Same extension stripping as for R1.
  R2_names_array+=("${fastq%%.*}")
done

# Restore default glob behaviour
shopt -u nullglob

# Validate inputs before starting any trimming
if [ "${#fastq_array_R1[@]}" -eq 0 ]; then
  echo "ERROR: No R1 FastQs matching ${R1_fastq_pattern} found in ${raw_reads_dir}" >&2
  exit 1
fi

if [ "${#fastq_array_R1[@]}" -ne "${#fastq_array_R2[@]}" ]; then
  echo "ERROR: Found ${#fastq_array_R1[@]} R1 FastQs but ${#fastq_array_R2[@]} R2 FastQs in ${raw_reads_dir}" >&2
  exit 1
fi

############ RUN FASTP ############
# Uses parameter substitution (e.g. ${R1_sample_name%%_*}) to rm the _R[12]
# and everything after it, leaving the text before the first "_".
# NOTE(review): this assumes sample IDs contain no underscores - confirm
# against the raw FastQ file names.
# Uses NEB adapter file


# Run fastp on files
echo "Beginning fastp trimming."
echo ""

time \
for index in "${!fastq_array_R1[@]}"
do
  # Get sample name
  R1_sample_name="${R1_names_array[index]%%_*}"
  R2_sample_name="${R2_names_array[index]%%_*}"

  # R1 and R2 files are paired purely by array position, so confirm that
  # both files at this position belong to the same sample.
  if [ "${R1_sample_name}" != "${R2_sample_name}" ]; then
    echo "ERROR: R1/R2 mismatch: ${fastq_array_R1[index]} vs ${fastq_array_R2[index]}" >&2
    exit 1
  fi

  # Save merged sample name
  merged_sample_name="${R1_sample_name}-fastp-adapters-polyG-${max_read_length}bp-merged"

  # Begin fastp trimming
  # All expansions are quoted so paths containing whitespace stay intact.
  # fastp's stderr is captured to a per-sample log file.
  if ! "${fastp}" \
    --in1 "${fastq_array_R1[index]}" \
    --in2 "${fastq_array_R2[index]}" \
    --adapter_fasta "${output_dir_top}/${NEB_adapters_fasta}" \
    --trim_poly_g \
    --overlap_len_require 17 \
    --length_limit "${max_read_length}" \
    --merge \
    --merged_out "${trimmed_fastqs_dir}/${merged_sample_name}.fq.gz" \
    --thread "${threads}" \
    --html "${trimmed_fastqs_dir}/${merged_sample_name}.html" \
    --json "${trimmed_fastqs_dir}/${merged_sample_name}.json" \
    --report_title "${trimmed_fastqs_dir}/${merged_sample_name}" \
    2> "${trimmed_fastqs_dir}/${merged_sample_name}.stderr"
  then
    echo "ERROR: fastp failed for ${R1_sample_name}. See ${trimmed_fastqs_dir}/${merged_sample_name}.stderr" >&2
    exit 1
  fi

  # Generate md5 checksum for the newly trimmed file.
  # Run from inside the trimmed directory (in a subshell, so the working
  # directory of the loop is untouched) so the checksum file doesn't include
  # excess path. The checksum file is overwritten rather than appended to,
  # so re-runs do not accumulate stale checksums.
  (
    cd "${trimmed_fastqs_dir}" || exit 1
    md5sum "${merged_sample_name}.fq.gz" | tee "${merged_sample_name}.fq.gz.md5"
  ) || { echo "ERROR: Checksum generation failed for ${merged_sample_name}.fq.gz" >&2; exit 1; }

done

echo ""
echo "fastp trimming complete."
echo ""

############ END fastp ############
```

# FastQC/MultiQC on trimmed reads

```{bash FastQC-MultiQC-trimmed-reads, engine='bash', eval=TRUE}
# Run FastQC on every merged/trimmed FastQ, summarise the output directory
# with MultiQC, then delete the FastQC zip archives.
# Exits non-zero if no trimmed FastQs are found or if FastQC/MultiQC fails.

# Load bash variables into memory
source .bashvars

# Create output directory, if it doesn't exist.
mkdir --parents "${trimmed_fastqs_dir}"

############ RUN FASTQC ############

# Create array of trimmed FastQs
# The directory is quoted; the glob itself must stay unquoted so it expands.
# nullglob makes an unmatched glob yield an empty array instead of the
# literal pattern, so the check below can detect missing inputs.
shopt -s nullglob
trimmed_fastqs_array=("${trimmed_fastqs_dir}"/*merged.fq.gz)
shopt -u nullglob

if [ "${#trimmed_fastqs_array[@]}" -eq 0 ]; then
  echo "ERROR: No trimmed FastQs matching *merged.fq.gz found in ${trimmed_fastqs_dir}" >&2
  exit 1
fi

echo "Beginning FastQC on trimmed reads..."
echo ""

# Run FastQC
# The array is passed directly (one argument per file) instead of being
# flattened to a string and re-split, which breaks on paths with whitespace.
"${fastqc}" \
--threads "${threads}" \
--outdir "${trimmed_fastqs_dir}" \
--quiet \
"${trimmed_fastqs_array[@]}" \
|| { echo "ERROR: FastQC failed." >&2; exit 1; }

echo "FastQC on trimmed reads complete!"
echo ""

############ END FASTQC ############

############ RUN MULTIQC ############
echo "Beginning MultiQC on trimmed FastQC..."
echo ""

"${multiqc}" "${trimmed_fastqs_dir}" -o "${trimmed_fastqs_dir}" \
|| { echo "ERROR: MultiQC failed." >&2; exit 1; }

echo ""
echo "MultiQC on trimmed FastQs complete."
echo ""

############ END MULTIQC ############

# Clean up only after MultiQC has run on the directory.
# Restricted to FastQC's own "*_fastqc.zip" archives so unrelated zip files
# in the output directory are left alone; --force avoids an error when there
# is nothing to remove.
echo "Removing FastQC zip files."
echo ""
rm --force -- "${trimmed_fastqs_dir}"/*_fastqc.zip
echo "FastQC zip files removed."
echo ""
```

# List output files
```{bash list-output-files, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# View directory contents (long listing, human-readable sizes).
# The path is quoted so a directory name containing whitespace is passed to
# ls as a single argument.
ls -lh "${trimmed_fastqs_dir}"
```