---
title: "00.00-fastqc-concatenation-raw_reads"
author: "Sam White"
date: "2024-12-05"
output: 
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---


# Description

This notebook will download raw sRNA-seq FastQs, concatenate them (there
were multiple lanes run), and then assess with [FastQC](https://github.com/s-andrews/FastQC) and
[MultiQC](https://github.com/MultiQC/MultiQC) [@ewels2016].

## Inputs

Raw FastQ files with the following pattern:

- `*.fastq.gz`

## Outputs

The expected outputs will be:

- `*.fastq.gz`: Concatenated FastQ files.

- `*_fastqc.html`: FastQC results, in HTML format.

- `multiqc_report.html`: A summary report of the FastQC results
    generated by [MultiQC](https://github.com/MultiQC/MultiQC), in HTML
    format.
    
Due to the large file sizes of FastQs, they cannot be added to GitHub. The full output from this notebook is available here:

- [https://gannet.fish.washington.edu/gitrepos/project-clam-oa/output/00.00-fastqc-concatenation-raw_reads](https://gannet.fish.washington.edu/gitrepos/project-clam-oa/output/00.00-fastqc-concatenation-raw_reads)

```{r setup, include=FALSE}
# Load knitr so that global chunk options can be set below.
library(knitr)
# Defaults applied to every chunk unless the chunk header overrides them.
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Do NOT evaluate code chunks by default; chunks opt in with eval=TRUE
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```


# Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
# Write the shared settings file in one pass. The here-doc delimiter is quoted,
# so nothing below is expanded now: references such as ${repo_dir} are stored
# literally and only resolved when a later chunk runs `source .bashvars`.
cat > .bashvars <<'BASHVARS'
#### Assign Variables ####

# Data directories
export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/RobertsLab/project-clam-oa
export output_dir_top=${repo_dir}/output/00.00-fastqc-concatenation-raw_reads
export raw_reads_url="https://owl.fish.washington.edu/nightingales/R_philippinarum/"
export raw_reads_dir="${repo_dir}/data/raw_reads"
export project_dir_1="30-1035633055"
export project_dir_2="30-1035633055-TS01"

# Paths to programs
export programs_dir="/home/shared"
export fastqc="${programs_dir}/FastQC-0.12.1/fastqc"
export multiqc="/home/sam/programs/mambaforge/bin/multiqc"

# Set FastQ filename patterns
export fastq_pattern='*.fastq.gz'
export R1_fastq_pattern='*_R1_*.fastq.gz'
export R2_fastq_pattern='*_R2_*.fastq.gz'

# Set number of CPUs to use
export threads=40

## Inititalize arrays
export fastq_array_R1=()
export fastq_array_R2=()
export trimmed_fastqs_array=()
export R1_names_array=()
export R2_names_array=()

# Print formatting
export line="--------------------------------------------------------"

BASHVARS

# Echo the generated file so its contents appear in the rendered notebook.
cat .bashvars
```


# Download raw reads

The `--cut-dirs 3` option strips the three leading directory components from the remote path (i.e. `nightingales/R_philippinarum/30-1035633055/`)
so that we just end up with the reads.

```{r download-raw-reads, engine='bash', eval=FALSE}

# Load bash variables into memory
source .bashvars

# Download each sequencing run into its own local directory.
#
# Every run is fetched from its own remote sub-directory. Pointing wget at the
# parent URL would, together with --cut-dirs 3, flatten the files of all runs
# into every local directory, and identically named FastQs from different runs
# would then collide under --continue.
#
# NOTE(review): this assumes the remote sub-directories carry the same names as
# the local project directories (nightingales/R_philippinarum/<project_dir>/),
# which is the layout under which --cut-dirs 3 leaves only the files. Confirm
# against the server's directory listing.
for project_dir in "${project_dir_1}" "${project_dir_2}"
do
  directory="${raw_reads_dir}/${project_dir}"

  # Create directory, if it doesn't exist
  mkdir --parents "${directory}"

  # NOTE: --no-check-certificate disables TLS certificate verification. It is
  # kept from the original command; the MD5 verification step that follows is
  # what guards file integrity.
  wget \
  --directory-prefix "${directory}" \
  --recursive \
  --no-check-certificate \
  --continue \
  --cut-dirs 3 \
  --no-parent \
  --no-host-directories \
  --quiet \
  "${raw_reads_url}${project_dir}/"

  # Remove extraneous index files from the directory that was just populated.
  # --force keeps this quiet when wget left no index file behind.
  rm --force "${directory}"/index*
done
```

## Overview of downloads

```{r check-downloads, engine='bash', eval=TRUE}
# Bring the shared paths and settings into this chunk's shell
source .bashvars

# Print the raw-reads directory tree with human-readable sizes
tree -h --du "${raw_reads_dir}"
```


## Verify checksums
```{r verify-checksums, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Verify every MD5 checksum file in both raw-read directories.
#
# Failures are collected and reported through the exit status at the end, so a
# bad checksum in any file fails the chunk (previously only the very last
# md5sum call determined the exit status).
checksum_failures=0
first_dir=true

for project_dir in "${project_dir_1}" "${project_dir_2}"
do
  # Print the separator between directories, but not before the first one.
  if [[ "${first_dir}" == true ]]
  then
    first_dir=false
  else
    echo ""
    echo "${line}"
    echo ""
  fi

  # Never run md5sum from the wrong directory if the target is missing.
  if ! cd "${raw_reads_dir}/${project_dir}"
  then
    echo "ERROR: Cannot change to ${raw_reads_dir}/${project_dir}" >&2
    checksum_failures=1
    continue
  fi

  pwd
  echo ""

  for checksum in *.md5
  do
    # An unmatched glob stays as the literal string '*.md5'; report it instead
    # of handing a non-existent file to md5sum.
    if [[ ! -e "${checksum}" ]]
    then
      echo "WARNING: No .md5 files found in ${PWD}" >&2
      checksum_failures=1
      continue
    fi

    md5sum --check "${checksum}" || checksum_failures=1
  done
done

exit "${checksum_failures}"
```

# Concatenate reads

Concatenation also handles samples where there might be a missing set of R2 reads in the second round of sequencing.

```{r concatenate-reads, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# Make output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"

# Combine FastQ files from the 1st and 2nd sequencing runs.
#
# For every FastQ in the first run:
#   - if a file with the same name exists in the second run, write the two
#     files back to back into the output directory;
#   - otherwise copy the first-run file to the output directory unchanged.
# An MD5 checksum file is then written next to each output FastQ.
#
# NOTE(review): files that exist only in the second run are not picked up by
# this loop (same as before) - confirm that this cannot happen for this data.
#
# Do NOT quote fastq_pattern variable; it has to expand as a glob.
for first_run_fastq in "${raw_reads_dir}"/"${project_dir_1}"/${fastq_pattern}
do
  # Skip the literal pattern that is left behind when the glob matches nothing.
  [[ -e "${first_run_fastq}" ]] || continue

  # Strip full path to just get filename.
  first_run_fastq_name="${first_run_fastq##*/}"

  # The partner file, if any, has the same name in the second-run directory,
  # so a direct existence test replaces scanning that directory each time.
  second_run_fastq="${raw_reads_dir}/${project_dir_2}/${first_run_fastq_name}"
  output_fastq="${output_dir_top}/${first_run_fastq_name}"

  if [[ -f "${second_run_fastq}" ]]
  then
    echo "Concatenating ${first_run_fastq} with ${second_run_fastq} to ${output_dir_top}/${first_run_fastq_name}"
    echo ""
    # Overwrite (>) rather than append (>>): this chunk is evaluated on every
    # knit, and appending would duplicate the reads each time it is re-run.
    cat "${first_run_fastq}" "${second_run_fastq}" > "${output_fastq}"
  else
    echo "NO MATCH!"
    echo "Copying ${first_run_fastq} to ${output_dir_top}"
    echo ""
    cp "${first_run_fastq}" "${output_dir_top}"
  fi

  # Generate MD5 checksums
  echo "Generating checksums for concatenated FastQs..."
  # Run inside the output directory (in a subshell, so the working directory of
  # the loop is untouched) so the checksum file records a bare filename.
  # The checksum file is overwritten for the same re-run reason as above.
  (
    cd "${output_dir_top}" || exit 1
    md5sum "${first_run_fastq_name}" | tee "${first_run_fastq_name}".md5
  )
  echo ""
  echo "${line}"
  echo ""
done
```

# FastQC/MultiQC on raw reads

```{bash raw-fastqc-multiqc, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars


############ RUN FASTQC ############


# Create array of the concatenated raw FastQs in the output directory
# Do NOT quote fastq_pattern variable; it has to expand as a glob.
raw_fastqs_array=("${output_dir_top}"/${fastq_pattern})

# An unmatched glob leaves the literal pattern as the only element; stop here
# rather than handing a non-existent file to FastQC.
if [[ ! -e "${raw_fastqs_array[0]}" ]]
then
  echo "ERROR: No FastQ files matching ${fastq_pattern} in ${output_dir_top}" >&2
  exit 1
fi

echo "Beginning FastQC on raw reads..."
echo ""

# Run FastQC
# The array is expanded directly, one argument per file, instead of being
# flattened to a string and re-split by the shell.
"${fastqc}" \
--threads "${threads}" \
--outdir "${output_dir_top}" \
--quiet \
"${raw_fastqs_array[@]}"

echo "FastQC on raw reads complete!"
echo ""

############ END FASTQC ############

############ RUN MULTIQC ############
echo "Beginning MultiQC on raw FastQC..."
echo ""

"${multiqc}" "${output_dir_top}" -o "${output_dir_top}"

echo ""
echo "MultiQC on raw FastQs complete."
echo ""

############ END MULTIQC ############

echo "Removing FastQC zip files."
echo ""
# ${var:?} aborts if output_dir_top is empty/unset, so this can never expand
# to 'rm /*.zip'.
rm "${output_dir_top:?}"/*.zip
echo "FastQC zip files removed."
echo ""
```


## View directory contents
```{bash list-output-files, engine='bash', eval=TRUE}
# Load bash variables into memory
source .bashvars

# List the contents of the output directory with human-readable sizes.
# Quoted so the path is always passed to ls as exactly one argument; an empty
# variable then produces an error instead of silently listing the working
# directory.
ls -lh "${output_dir_top}"

```