---
title: "03-Peve bismark"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`"  
output: 
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```


```{bash}
cd ../data
curl -O https://gannet.fish.washington.edu/seashell/snaps/Porites_evermanni_v1.fa
```


# ALignment

```{bash}
# Directories and programs
bismark_dir="/programs/Bismark-0.21.0"
bowtie2_dir="/home/shared/bowtie2-2.4.4-linux-x86_64"
genome_folder="../data/"

/home/shared/Bismark-0.24.0/bismark_genome_preparation \
--path_to_aligner ${bowtie2_dir} \
--verbose \
--parallel 28 \
${genome_folder}
```


https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/E-Peve/output/01.00-E-Peve-WGBS-trimming-fastp-FastQC-MultiQC/

```{bash}
wget -r \
--no-directories --no-parent \
-P ../data/03-Peve-bismark \
-A "*gz" https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/E-Peve/output/01.00-E-Peve-WGBS-trimming-fastp-FastQC-MultiQC/
```

```{bash}
ls ../data/03-Peve-bismark

```


# test min scores

```{bash}
# Set directories and files
reads_dir="../data/03-Peve-bismark"
genome_folder="../data"
output_dir="../output/03-Peve-bismark"
checkpoint_file="${output_dir}/completed_samples.log"

# Define the file suffix for R1 reads; R2 suffix will be derived by replacing "R1" with "R2"
read_suffix="_R1_001.fastp-trim.fq.gz"

# Enable nullglob so that non-matching globs return an empty array
shopt -s nullglob

# Create the checkpoint file if it doesn't exist
touch "${checkpoint_file}"

# Define the array of score_min parameters to test
score_min_params=(
    "L,0,-0.4"
    "L,0,-0.6"
    "L,0,-0.8"
    "L,0,-1.0"
    "L,-1,-0.6"
)

# Get list of R1 files matching the pattern (using the file suffix variable)
r1_files=("${reads_dir}"/*"${read_suffix}")
if [ ${#r1_files[@]} -eq 0 ]; then
    echo "No R1 files found in ${reads_dir} with the pattern *${read_suffix}. Exiting."
    exit 1
fi

# Loop through each sample based on the R1 files
for r1_file in "${r1_files[@]}"; do
    # Extract sample name by removing the file suffix
    sample_name=$(basename "${r1_file}" | sed "s/${read_suffix}//")
    
    if [ -z "$sample_name" ]; then
        echo "Could not extract sample name from ${r1_file}. Skipping."
        continue
    fi

    # Skip sample if already processed
    if grep -q "^${sample_name}$" "${checkpoint_file}"; then
        echo "Skipping sample ${sample_name} as it is already processed."
        continue
    fi

    # Loop through each score_min parameter for the current sample
    for score_min in "${score_min_params[@]}"; do
        echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"
        
        # Sanitize the score_min string for directory names (replace commas with underscores)
        sanitized_score=$(echo "${score_min}" | tr ',' '_')
        param_output_dir="${output_dir}/${sample_name}_score_${sanitized_score}"
        mkdir -p "${param_output_dir}"

        # Define file names for R1 and R2 (R2 is derived by substituting "R1" with "R2" in the suffix)
        read1="${reads_dir}/${sample_name}${read_suffix}"
        read2="${reads_dir}/${sample_name}${read_suffix/R1/R2}"

        # Run Bismark alignment
        bismark \
            -genome "${genome_folder}" \
            -p 8 \
            -u 10000 \
            -score_min "${score_min}" \
            #--non_directional \
            -1 "${read1}" \
            -2 "${read2}" \
            -o "${param_output_dir}" \
            --basename "${sample_name}_${sanitized_score}" \
            2> "${param_output_dir}/${sample_name}-bismark_summary.txt"

        # Check if the Bismark command was successful
        if [ $? -eq 0 ]; then
            echo "Sample ${sample_name} with score_min ${score_min} processed successfully."
        else
            echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${param_output_dir}/${sample_name}-bismark_summary.txt for details."
        fi
    done

    # Mark sample as completed in the checkpoint file
    echo "${sample_name}" >> "${checkpoint_file}"
    echo "All tests for sample ${sample_name} completed."
done

# Define summary file for parameter comparison
summary_file="${output_dir}/parameter_comparison_summary.csv"

# (Optional) Initialize summary file header
# echo "Sample,Score_Min,Mapping_Efficiency" > "${summary_file}"

# Loop through each parameter output directory to extract summary metrics
for dir in "${output_dir}"/*_score_*; do
    if [ -d "$dir" ]; then
        # Extract sample name and score_min parameter from directory name
        sample_name=$(basename "$dir" | cut -d'_' -f1)
        score_min=$(basename "$dir" | grep -o "score_.*" | sed 's/score_//; s/_/,/g')

        # Locate the summary report file generated by Bismark
        summary_file_path="${dir}/${sample_name}_${score_min}_PE_report.txt"
        
        if [ -f "${summary_file_path}" ]; then
            # Extract the mapping efficiency value from the summary file
            mapping=$(grep "Mapping efficiency:" "${summary_file_path}" | awk '{print $3}')
            
            # Append the extracted metrics to the summary CSV
            echo "${sample_name},${score_min},${mapping}" >> "${summary_file}"
        else
            echo "Summary file not found for ${sample_name} with score_min ${score_min}"
        fi
    fi
done
```


```{bash}
reads_dir="../data/03-Peve-bismark/"
genome_folder="../data/"
output_dir="../output/03-Peve-bismark/"

mkdir -p "${output_dir}"

find ${reads_dir}*_R1_001.fastp-trim.fq.gz \
| xargs -n 1 basename -s _R1_001.fastp-trim.fq.gz \
| while read sample; do
<<<<<<< HEAD
    # Define the expected output file
    expected_output="${output_dir}${sample}_R1_001.fastp-trim_bismark_bt2_pe.bam"

    # Check if the expected output file exists before running
    if [[ ! -f "${expected_output}" ]]; then
        echo "Processing sample: ${sample}"
        /home/shared/Bismark-0.24.0/bismark \
            --genome ${genome_folder} \
            -p 8 \
            -u 10000 \
            --path_to_bowtie2 /home/shared/bowtie2-2.4.4-linux-x86_64 \
            --score_min L,0,-0.6 \
            -1 ${reads_dir}${sample}_R1_001.fastp-trim.fq.gz \
            -2 ${reads_dir}${sample}_R2_001.fastp-trim.fq.gz \
            -o "${output_dir}" \
            > "${output_dir}${sample}_bismark.out" 2> "${output_dir}${sample}_bismark.err"
    else
        echo "Skipping ${sample}, output already exists."
    fi

    bismark \
        --genome ${genome_folder} \
        -p 12 \
        --score_min L,0,-0.6 \
        -1 ${reads_dir}${sample}_R1_001.fastp-trim.fq.gz \
        -2 ${reads_dir}${sample}_R2_001.fastp-trim.fq.gz \
        -o ../output/03-Peve-bismark \
        > ../output/03-Peve-bismark/${sample}_bismark.out 2> ../output/03-Peve-bismark/${sample}_bismark.err
done
```


#dedup

```{bash}
find /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-WGBS-alignment-SLURM_array-bismark/*.bam | \
xargs -n 1 basename -s _R1_001.fastp-trim_bismark_bt2_pe.bam | \
parallel -j 8 deduplicate_bismark \
--bam \
--paired \
--output_dir ../output/15-Apul-bismark \
/gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-WGBS-alignment-SLURM_array-bismark//{}_R1_001.fastp-trim_bismark_bt2_pe.bam
```


```{bash}
find ../output/15-Apul-bismark/*deduplicated.bam | \
xargs basename -s _R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bam | \
xargs -I{} samtools \
sort --threads 24 \
../output/15-Apul-bismark/{}_R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bam \
-o ../output/15-Apul-bismark/{}.sorted.bam
```

# methylation exraction

```
find ../output/09-meth-quant/*deduplicated.bam | xargs -n 1 -I{} /home/shared/Bismark-0.24.0/bismark_methylation_extractor --bedGraph --counts --comprehensive --merge_non_CpG --multicore 24 --buffer_size 75% --output ../output/09-meth-quant {} 
```

```{bash}
find ../output/15-Apul-bismark/*deduplicated.bam | xargs -n 1 -I{} \
bismark_methylation_extractor --bedGraph --counts --comprehensive --merge_non_CpG \
--multicore 24 --buffer_size 75% --output ../output/15-Apul-bismark "{}"
```

# Methylation call

```
find ../output/09-meth-quant/*deduplicated.bismark.cov.gz | \
xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz | \
parallel -j 48 /home/shared/Bismark-0.24.0/coverage2cytosine \
--genome_folder ../data/ \
-o ../output/09-meth-quant/{} \
--merge_CpG \
--zero_based \
../output/09-meth-quant/{}_pe.deduplicated.bismark.cov.gz
```

```{bash}
find ../output/15-Apul-bismark/*deduplicated.bismark.cov.gz | \
xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz | \
parallel -j 24 coverage2cytosine \
--genome_folder /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/data/ \
-o ../output/15-Apul-bismark/{} \
--merge_CpG \
--zero_based \
../output/15-Apul-bismark/{}_pe.deduplicated.bismark.cov.gz
```

```{bash}
head ../output/15-Apul-bismark/1H3*evidence.cov
```