--- title: "03-Peve bismark" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` ```{bash} cd ../data curl -O https://gannet.fish.washington.edu/seashell/snaps/Porites_evermanni_v1.fa ``` # ALignment ```{bash} # Directories and programs bismark_dir="/programs/Bismark-0.21.0" bowtie2_dir="/programs/bowtie2-2.3.4.1-linux-x86_64/" genome_folder="../data/" bismark_genome_preparation \ --verbose \ --parallel 28 \ ${genome_folder} ``` https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/E-Peve/output/01.00-E-Peve-WGBS-trimming-fastp-FastQC-MultiQC/ ```{bash} wget -r \ --no-directories --no-parent \ -P ../data/03-Peve-bismark \ -A "*gz" https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/E-Peve/output/01.00-E-Peve-WGBS-trimming-fastp-FastQC-MultiQC/ ``` ```{bash} ls ../data/03-Peve-bismark ``` # test min scores ```{bash} # Set directories and files reads_dir="../data/03-Peve-bismark" genome_folder="../data" output_dir="../output/03-Peve-bismark" checkpoint_file="${output_dir}/completed_samples.log" # Define the file suffix for R1 reads; R2 suffix will be derived by replacing "R1" with "R2" read_suffix="_R1_001.fastp-trim.fq.gz" # Enable nullglob so that non-matching globs return an empty array shopt -s nullglob # Create the checkpoint file if it doesn't exist touch "${checkpoint_file}" # Define the array of score_min parameters to test score_min_params=( "L,0,-0.4" "L,0,-0.6" "L,0,-0.8" "L,0,-1.0" "L,-1,-0.6" ) # Get list of R1 files matching the pattern (using the file suffix variable) r1_files=("${reads_dir}"/*"${read_suffix}") if [ ${#r1_files[@]} -eq 0 ]; then echo "No R1 files found in ${reads_dir} with the pattern *${read_suffix}. Exiting." exit 1 fi # Loop through each sample based on the R1 files for r1_file in "${r1_files[@]}"; do # Extract sample name by removing the file suffix sample_name=$(basename "${r1_file}" | sed "s/${read_suffix}//") if [ -z "$sample_name" ]; then echo "Could not extract sample name from ${r1_file}. Skipping." continue fi # Skip sample if already processed if grep -q "^${sample_name}$" "${checkpoint_file}"; then echo "Skipping sample ${sample_name} as it is already processed." continue fi # Loop through each score_min parameter for the current sample for score_min in "${score_min_params[@]}"; do echo "Running Bismark for sample ${sample_name} with score_min ${score_min}" # Sanitize the score_min string for directory names (replace commas with underscores) sanitized_score=$(echo "${score_min}" | tr ',' '_') param_output_dir="${output_dir}/${sample_name}_score_${sanitized_score}" mkdir -p "${param_output_dir}" # Define file names for R1 and R2 (R2 is derived by substituting "R1" with "R2" in the suffix) read1="${reads_dir}/${sample_name}${read_suffix}" read2="${reads_dir}/${sample_name}${read_suffix/R1/R2}" # Run Bismark alignment bismark \ -genome "${genome_folder}" \ -p 8 \ -u 10000 \ -score_min "${score_min}" \ #--non_directional \ -1 "${read1}" \ -2 "${read2}" \ -o "${param_output_dir}" \ --basename "${sample_name}_${sanitized_score}" \ 2> "${param_output_dir}/${sample_name}-bismark_summary.txt" # Check if the Bismark command was successful if [ $? -eq 0 ]; then echo "Sample ${sample_name} with score_min ${score_min} processed successfully." else echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${param_output_dir}/${sample_name}-bismark_summary.txt for details." fi done # Mark sample as completed in the checkpoint file echo "${sample_name}" >> "${checkpoint_file}" echo "All tests for sample ${sample_name} completed." done # Define summary file for parameter comparison summary_file="${output_dir}/parameter_comparison_summary.csv" # (Optional) Initialize summary file header # echo "Sample,Score_Min,Mapping_Efficiency" > "${summary_file}" # Loop through each parameter output directory to extract summary metrics for dir in "${output_dir}"/*_score_*; do if [ -d "$dir" ]; then # Extract sample name and score_min parameter from directory name sample_name=$(basename "$dir" | cut -d'_' -f1) score_min=$(basename "$dir" | grep -o "score_.*" | sed 's/score_//; s/_/,/g') # Locate the summary report file generated by Bismark summary_file_path="${dir}/${sample_name}_${score_min}_PE_report.txt" if [ -f "${summary_file_path}" ]; then # Extract the mapping efficiency value from the summary file mapping=$(grep "Mapping efficiency:" "${summary_file_path}" | awk '{print $3}') # Append the extracted metrics to the summary CSV echo "${sample_name},${score_min},${mapping}" >> "${summary_file}" else echo "Summary file not found for ${sample_name} with score_min ${score_min}" fi fi done ``` ```{bash} reads_dir="../data/03-Peve-bismark/" genome_folder="../data/" find ${reads_dir}*_R1_001.fastp-trim.fq.gz \ | xargs -n 1 basename -s _R1_001.fastp-trim.fq.gz \ | while read sample; do bismark \ --genome ${genome_folder} \ -p 12 \ --score_min L,0,-0.6 \ -1 ${reads_dir}${sample}_R1_001.fastp-trim.fq.gz \ -2 ${reads_dir}${sample}_R2_001.fastp-trim.fq.gz \ -o ../output/03-Peve-bismark \ > ../output/03-Peve-bismark/${sample}_bismark.out 2> ../output/03-Peve-bismark/${sample}_bismark.err done ``` #dedup ```{bash} find /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-WGBS-alignment-SLURM_array-bismark/*.bam | \ xargs -n 1 basename -s _R1_001.fastp-trim_bismark_bt2_pe.bam | \ parallel -j 8 deduplicate_bismark \ --bam \ --paired \ --output_dir ../output/15-Apul-bismark \ /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-WGBS-alignment-SLURM_array-bismark//{}_R1_001.fastp-trim_bismark_bt2_pe.bam ``` ```{bash} find ../output/15-Apul-bismark/*deduplicated.bam | \ xargs basename -s _R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bam | \ xargs -I{} samtools \ sort --threads 24 \ ../output/15-Apul-bismark/{}_R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bam \ -o ../output/15-Apul-bismark/{}.sorted.bam ``` # methylation exraction ``` find ../output/09-meth-quant/*deduplicated.bam | xargs -n 1 -I{} /home/shared/Bismark-0.24.0/bismark_methylation_extractor --bedGraph --counts --comprehensive --merge_non_CpG --multicore 24 --buffer_size 75% --output ../output/09-meth-quant {} ``` ```{bash} find ../output/15-Apul-bismark/*deduplicated.bam | xargs -n 1 -I{} \ bismark_methylation_extractor --bedGraph --counts --comprehensive --merge_non_CpG \ --multicore 24 --buffer_size 75% --output ../output/15-Apul-bismark "{}" ``` # Methylation call ``` find ../output/09-meth-quant/*deduplicated.bismark.cov.gz | \ xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz | \ parallel -j 48 /home/shared/Bismark-0.24.0/coverage2cytosine \ --genome_folder ../data/ \ -o ../output/09-meth-quant/{} \ --merge_CpG \ --zero_based \ ../output/09-meth-quant/{}_pe.deduplicated.bismark.cov.gz ``` ```{bash} find ../output/15-Apul-bismark/*deduplicated.bismark.cov.gz | \ xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz | \ parallel -j 24 coverage2cytosine \ --genome_folder /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/data/ \ -o ../output/15-Apul-bismark/{} \ --merge_CpG \ --zero_based \ ../output/15-Apul-bismark/{}_pe.deduplicated.bismark.cov.gz ``` ```{bash} head ../output/15-Apul-bismark/1H3*evidence.cov ```