---
title: "00.00-trimming-fastp"
author: "Sam White"
date: "2024-11-15"
output: 
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---
# Intro

## Inputs
This notebook will download raw WGBS FastQs and trim them using [fastp](https://github.com/OpenGene/fastp) . Trimming results will be analyszed with FastQC and summarized by [MultiQC](https://github.com/MultiQC/MultiQC) [@ewels2016].

## Outputs

The expected outputs will be:

-   `multiqc_report.html`: A summary report of the alignment results generated by [MultiQC](https://github.com/MultiQC/MultiQC), in HTML format.

Due to the large file sizes of FastQs, these cannot be hosted in the [ceasmallr GitHub repo](https://github.com/sr320/ceasmallr). As such these files are available for download here:

- [https://gannet.fish.washington.edu/gitrepos/ceasmallr/output/00.00-trimming-fastp/](https://gannet.fish.washington.edu/gitrepos/ceasmallr/output/00.00-trimming-fastp/)

```{r setup, include=FALSE}
library(knitr)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```

# Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
{
echo "#### Assign Variables ####"
echo ""

echo "# Data directories"
echo 'export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/ceasmallr'
echo 'export output_dir_top=${repo_dir}/output/00.00-trimming-fastp'
echo 'export raw_reads_url="https://owl.fish.washington.edu/nightingales/C_virginica/2018OALarvae_DNAm_discovery/"'
echo 'export raw_reads_dir="${repo_dir}/data/raw_reads"'
echo 'export trimmed_fastqs_dir="${output_dir_top}/trimmed-fastqs"'
echo ""

echo "# Paths to programs"
echo 'export programs_dir="/home/shared"'
echo 'export bbmap_repair="${programs_dir}/bbmap-39.12/repair.sh"'
echo 'export bismark_dir="${programs_dir}/Bismark-0.24.0"'
echo 'export bowtie2_dir="${programs_dir}/bowtie2-2.4.4-linux-x86_64"'
echo 'export fastp="${programs_dir}/fastp-v0.24.0/fastp"'
echo 'export fastqc="${programs_dir}/FastQC-0.12.1/fastqc"'
echo 'export multiqc="/home/sam/programs/mambaforge/bin/multiqc"'
echo 'export samtools_dir="${programs_dir}/samtools-1.12"'
echo ""


echo "# Set FastQ filename patterns"
echo "export fastq_pattern='*.fastq.gz'"
echo "export R1_fastq_pattern='*_R1_*.fastq.gz'"
echo "export R2_fastq_pattern='*_R2_*.fastq.gz'"
echo "export trimmed_fastq_pattern='*fastp-trim*.fq.gz'"
echo ""

echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""


echo "## Inititalize arrays"
echo 'export fastq_array_R1=()'
echo 'export fastq_array_R2=()'
echo 'export trimmed_fastqs_array=()'
echo 'export R1_names_array=()'
echo 'export R2_names_array=()'
echo ""

echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars

cat .bashvars
```


# Download raw reads

The `--cut-dirs 2` command cuts the preceding directory structure (i.e. `nightingales/C_virginica/`)
so that we just end up with the reads.
```{r download-raw-reads, engine='bash', eval=TRUE}

# Load bash variables into memory
source .bashvars

# Create directory, if it doesn't exist
mkdir --parents ${raw_reads_dir}

wget \
--directory-prefix ${raw_reads_dir} \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 2 \
--no-parent \
--no-host-directories \
--quiet \
${raw_reads_url}

ls -lh "${raw_reads_dir}"
```
## Verify checkums
```{r verify-checksums, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

cd "${raw_reads_dir}"/2018OALarvae_DNAm_discovery

md5sum --check md5sum_list.txt

echo ""
echo "${line}"
echo ""

cd second_lane

md5sum --check md5sum_list.txt
```

# Concatenate reads
```{r concatenate-reads, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

cd "${raw_reads_dir}"


# Concatenate FastQ files from 1st and 2nd runs
# Do NOT quote fastq_pattern variable

# Declare an associative array to keep track of processed files
declare -A processed_files

for first_run_fastq in "${raw_reads_dir}"/2018OALarvae_DNAm_discovery/${fastq_pattern}
do
  # Strip full path to just get filename.
  first_run_fastq_name="${first_run_fastq##*/}"

  # Initialize a flag to check if a match is found
  match_found=false

  # Process second run and concatenate with corresponding FastQ from first run
  # Do NOT quote fastq_pattern variable
  for second_run_fastq in "${raw_reads_dir}"/2018OALarvae_DNAm_discovery/second_lane/${fastq_pattern}
  do
    # Strip full path to just get filename.
    second_run_fastq_name="${second_run_fastq##*/}"

    # Concatenate FastQs with same filenames
    if [[ "${first_run_fastq_name}" == "${second_run_fastq_name}" ]]
    then
      echo "Concatenating ${first_run_fastq} with ${second_run_fastq} to ${raw_reads_dir}/${first_run_fastq_name}"
      echo ""
      cat "${first_run_fastq}" "${second_run_fastq}" >> "${raw_reads_dir}/${first_run_fastq_name}"
      match_found=true
      processed_files["${first_run_fastq_name}"]=true
      break
    fi
  done

  # If no match is found, copy the file to the target directory
  if [[ "${match_found}" == false ]]
  then
    if [[ -z "${processed_files[${first_run_fastq_name}]}" ]]
    then
      echo "NO MATCH!"
      echo "Copying ${first_run_fastq} to ${raw_reads_dir}"
      echo ""
      cp "${first_run_fastq}" "${raw_reads_dir}"
      processed_files["${first_run_fastq_name}"]=true
    fi
  fi

  echo "Generating checksums for concatenated FastQs..."
  md5sum "${first_run_fastq_name}" | tee --append "${first_run_fastq_name}".md5
  echo ""
  echo "${line}"
  echo ""
done
```


# Trimming
```{r fastp-trimming, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Make output directory, if it doesn't exist
mkdir --parents ${output_dir_top}

cd "${raw_reads_dir}"


# Create arrays of fastq R1 files and sample names
# Do NOT quote R1_fastq_pattern variable

for fastq in ${R1_fastq_pattern}
do
  fastq_array_R1+=("${fastq}")

  # Use parameter substitution to remove all text up to and including last "." from
  # right side of string.
  R1_names_array+=("${fastq%%.*}")
done

# Create array of fastq R2 files
# Do NOT quote R2_fastq_pattern variable
for fastq in ${R2_fastq_pattern}
do
  fastq_array_R2+=("${fastq}")

  # Use parameter substitution to remove all text up to and including last "." from
  # right side of string.
  R2_names_array+=(${fastq%%.*})
done


# Run fastp on files
# Adds JSON report output for downstream usage by MultiQC

echo "Beginning fastp trimming."
echo ""

for index in "${!fastq_array_R1[@]}"
do
  R1_sample_name="${R1_names_array[index]}"
  R2_sample_name="${R2_names_array[index]}"
  
  stderr_PE_name=$(echo ${R1_sample_name} | awk -F"_" '{print $1}')
  
  ${fastp} \
  --in1 ${fastq_array_R1[index]} \
  --in2 ${fastq_array_R2[index]} \
  --detect_adapter_for_pe \
  --trim_poly_g \
  --trim_poly_x \
  --thread 16 \
  --trim_front1 15 \
  --trim_front2 15 \
  --html ${output_dir_top}/"${R1_sample_name}".fastp-trim.report.html \
  --json ${output_dir_top}/"${R1_sample_name}".fastp-trim.report.json \
  --out1 ${output_dir_top}/"${R1_sample_name}".fastp-trim.fq.gz \
  --out2 ${output_dir_top}/"${R2_sample_name}".fastp-trim.fq.gz \
  2> ${output_dir_top}/"${stderr_PE_name}".fastp-trim.stderr
  
  grep --before-context 5 "ERROR" ${output_dir_top}/"${stderr_PE_name}".fastp-trim.stderr
  
  # Check for fastp errors and then repair
  if grep --quiet "ERROR" ${output_dir_top}/"${stderr_PE_name}".fastp-trim.stderr; then
      
    rm ${output_dir_top}/"${R1_sample_name}".fastp-trim.fq.gz
    rm ${output_dir_top}/"${R2_sample_name}".fastp-trim.fq.gz
    
    
    ${bbmap_repair} \
    in1=${fastq_array_R1[index]} \
    in2=${fastq_array_R2[index]} \
    out1="${R1_sample_name}".REPAIRED.fastq.gz \
    out2="${R2_sample_name}".REPAIRED.fastq.gz \
    outs=/dev/null \
    2> "${R1_sample_name}".REPAIRED.stderr
    
    ${fastp} \
    --in1 "${R1_sample_name}".REPAIRED.fastq.gz \
    --in2 "${R2_sample_name}".REPAIRED.fastq.gz \
    --detect_adapter_for_pe \
    --trim_poly_g \
    --trim_poly_x \
    --thread 16 \
    --trim_front1 15 \
    --trim_front2 15 \
    --html ${output_dir_top}/"${R1_sample_name}".fastp-trim.REPAIRED.report.html \
    --json ${output_dir_top}/"${R1_sample_name}".fastp-trim.REPAIRED.report.json \
    --out1 ${output_dir_top}/"${R1_sample_name}".fastp-trim.REPAIRED.fq.gz \
    --out2 ${output_dir_top}/"${R2_sample_name}".fastp-trim.REPAIRED.fq.gz \
    2> ${output_dir_top}/"${stderr_PE_name}".fastp-trim.REPAIRED.stderr
 
    
    if grep --quiet "ERROR" ${output_dir_top}/"${stderr_PE_name}".fastp-trim.REPAIRED.stderr; then
      echo "These ${stderr_PE_name} samples are broken."
      echo "Just give up.  :("
      echo ""
    fi
  fi

  echo "Finished trimming:"
  echo "${fastq_array_R1[index]}"
  echo "${fastq_array_R1[index]}"
  echo ""
  
  # Generate md5 checksums for newly trimmed files
  cd "${output_dir_top}"
  md5sum "${R1_sample_name}".fastp-trim.fq.gz > "${R1_sample_name}".fastp-trim.fq.gz.md5
  md5sum "${R2_sample_name}".fastp-trim.fq.gz > "${R2_sample_name}".fastp-trim.fq.gz.md5
  
  cd "${raw_reads_dir}"
done
```


# FastQC
```{r fastqc, engine='bash'}
# Load bash variables into memory
source .bashvars

cd "${output_dir_top}"

############ RUN FASTQC ############


# Create array of trimmed FastQs
trimmed_fastqs_array=(${trimmed_fastq_pattern})

# Pass array contents to new variable as space-delimited list
trimmed_fastqc_list=$(echo "${trimmed_fastqs_array[*]}")

echo "Beginning FastQC on trimmed reads..."
echo ""

# Run FastQC
### NOTE: Do NOT quote raw_fastqc_list
${fastqc} \
--threads ${threads} \
--outdir "${output_dir_top}" \
--quiet \
${trimmed_fastqc_list}

echo "FastQC on trimmed reads complete!"
echo ""

############ END FASTQC ############
```


# MultiQC
```{r multiqc, engine='bash'}
# Load bash variables into memory
source .bashvars

cd "${output_dir_top}"

${multiqc} .
```