--- title: "22-Apul methylation" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` Methylation files at https://owl.fish.washington.edu/nightingales/E5-coral-deep-dive-expression/genohub2216545/ ![](http://gannet.fish.washington.edu/seashell/snaps/2025-02-05_07-56-52.png) # RAW REads ```{bash} wget -r \ --no-directories --no-parent \ -P ../data/22-Apul-meth \ -A "*.gz" \ https://gannet.fish.washington.edu/gitrepos/urol-e5/deep-dive-expression/D-Apul/output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ ``` ```{bash} # Directories and programs bismark_dir="/home/shared/Bismark-0.24.0/" bowtie2_dir="/home/shared/bowtie2-2.4.4-linux-x86_64/" genome_folder="../data/" ${bismark_dir}/bismark_genome_preparation \ --verbose \ --parallel 28 \ --path_to_aligner ${bowtie2_dir} \ ${genome_folder} ``` # Checking alignment min scores ```{bash} # Set directories and files reads_dir="../data/22-Apul-meth/" genome_folder="../data/" output_dir="../output/22-Apul-methylation" checkpoint_file="../output/22-Apul-methylation/completed_samples.log" bismark_dir="/home/shared/Bismark-0.24.0/" bowtie2_dir="/home/shared/bowtie2-2.4.4-linux-x86_64/" # Create the checkpoint file if it doesn't exist touch ${checkpoint_file} # Get the list of sample files and corresponding sample names for file in ${reads_dir}*_R1.fastp-trim.fq.gz; do sample_name=$(basename "$file" "_R1.fastp-trim.fq.gz") # Check if the sample has already been processed if grep -q "^${sample_name}$" ${checkpoint_file}; then echo "Sample ${sample_name} already processed. Skipping..." continue fi # Define log files for stdout and stderr stdout_log="${output_dir}/${sample_name}_stdout.log" stderr_log="${output_dir}/${sample_name}_stderr.log" # Define the array of score_min parameters to test score_min_params=( "L,0,-0.4" "L,0,-0.6" "L,0,-0.8" "L,0,-1.0" "L,-1,-0.6" ) # Loop through each score_min parameter for score_min in "${score_min_params[@]}"; do echo "Running Bismark for sample ${sample_name} with score_min ${score_min}" # Create a subdirectory for this parameter param_output_dir="${output_dir}/${sample_name}_score_${score_min//,/}" mkdir -p ${param_output_dir} # Run Bismark alignment ${bismark_dir}bismark \ -genome ${genome_folder} \ -p 16 \ -u 25000 \ -score_min ${score_min} \ --non_directional \ --path_to_bowtie ${bowtie2_dir} \ -1 ${reads_dir}${sample_name}_R1.fastp-trim.fq.gz \ -2 ${reads_dir}${sample_name}_R2.fastp-trim.fq.gz \ -o ${param_output_dir} \ --basename ${sample_name}_${score_min//,/} \ 2> "${param_output_dir}/${sample_name}-bismark_summary.txt" # Check if the command was successful if [ $? -eq 0 ]; then echo "Sample ${sample_name} with score_min ${score_min} processed successfully." else echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${stderr_log} for details." fi done # Mark the sample as completed in the checkpoint file if [ $? -eq 0 ]; then echo ${sample_name} >> ${checkpoint_file} echo "All tests for sample ${sample_name} completed." else echo "Sample ${sample_name} encountered errors. Check logs for details." fi done # Define summary file summary_file="${output_dir}/parameter_comparison_summary.csv" # Initialize summary file echo "Sample,Score_Min,Alignment_Rate" > ${summary_file} # Loop through parameter output directories for dir in ${output_dir}/*_score_*; do if [ -d "$dir" ]; then # Extract sample name and score_min parameter from directory name sample_name=$(basename "$dir" | cut -d'_' -f1) score_min=$(basename "$dir" | grep -o "score_.*" | sed 's/score_//; s/_/,/g') # Locate the summary file summary_file_path="${dir}/${sample_name}_${score_min}_PE_report.txt" # Extract metrics if [ -f "$summary_file_path" ]; then mapping=$(grep "Mapping efficiency:" ${summary_file_path} | awk '{print "mapping efficiency ", $3}') echo "${sample_name},${score_min},${mapping}" >> ${summary_file} fi fi done ``` full alignment ```{bash} # Set variables reads_dir="../data/22-Apul-meth/" genome_folder="../data/" output_dir="../output/22-Apul-methylation" score_min="L,0,-1.0" # Single value for score_min # Get the list of sample files and corresponding sample names for file in ${reads_dir}*_R1.fastp-trim.fq.gz; do sample_name=$(basename "$file" "_R1.fastp-trim.fq.gz") echo "Running Bismark for sample ${sample_name} with score_min ${score_min}" # Run Bismark alignment /home/shared/Bismark-0.24.0/bismark \ --path_to_bowtie2 /home/shared/bowtie2-2.4.4-linux-x86_64 \ -genome ${genome_folder} \ -p 10 \ -score_min ${score_min} \ -1 ${reads_dir}${sample_name}_R1.fastp-trim.fq.gz \ -2 ${reads_dir}${sample_name}_R2.fastp-trim.fq.gz \ -o ${output_dir} \ --basename ${sample_name} \ 2> "${output_dir}/${sample_name}-bismark_summary.txt" done ```