---
title: "15.5-Apul bismark"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`"  
output: 
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```


Sam did alingment


```{bash}
ls /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-WGBS-alignment-SLURM_array-bismark 
```


#dedup

```{bash}
find /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-WGBS-alignment-SLURM_array-bismark/*.bam | \
xargs -n 1 basename -s _R1_001.fastp-trim_bismark_bt2_pe.bam | \
parallel -j 8 deduplicate_bismark \
--bam \
--paired \
--output_dir ../output/15-Apul-bismark \
/gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-WGBS-alignment-SLURM_array-bismark//{}_R1_001.fastp-trim_bismark_bt2_pe.bam
```


```{bash}
find ../output/15-Apul-bismark/*deduplicated.bam | \
xargs basename -s _R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bam | \
xargs -I{} samtools \
sort --threads 24 \
../output/15-Apul-bismark/{}_R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bam \
-o ../output/15-Apul-bismark/{}.sorted.bam
```

# methylation exraction

```
find ../output/09-meth-quant/*deduplicated.bam | xargs -n 1 -I{} /home/shared/Bismark-0.24.0/bismark_methylation_extractor --bedGraph --counts --comprehensive --merge_non_CpG --multicore 24 --buffer_size 75% --output ../output/09-meth-quant {} 
```

```{bash}
find ../output/15-Apul-bismark/*deduplicated.bam | xargs -n 1 -I{} \
bismark_methylation_extractor --bedGraph --counts --comprehensive --merge_non_CpG \
--multicore 24 --buffer_size 75% --output ../output/15-Apul-bismark "{}"
```

# Methylation call

```
find ../output/09-meth-quant/*deduplicated.bismark.cov.gz | \
xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz | \
parallel -j 48 /home/shared/Bismark-0.24.0/coverage2cytosine \
--genome_folder ../data/ \
-o ../output/09-meth-quant/{} \
--merge_CpG \
--zero_based \
../output/09-meth-quant/{}_pe.deduplicated.bismark.cov.gz
```

```{bash}
find ../output/15-Apul-bismark/*deduplicated.bismark.cov.gz | \
xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz | \
parallel -j 24 coverage2cytosine \
--genome_folder /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/data/ \
-o ../output/15-Apul-bismark/{} \
--merge_CpG \
--zero_based \
../output/15-Apul-bismark/{}_pe.deduplicated.bismark.cov.gz
```

```{bash}
head ../output/15-Apul-bismark/1H3*evidence.cov
```


# above was on klone...

# lets look at it..

```{bash}
ls ../output/15-Apul-bismark/
```


```{bash}

# Define paths
CSV_FILE="../../M-multi-species/data/e5_DNA_Azenta_metadata.csv"  # Change this to your actual CSV file path
SOURCE_DIR="../output/15-Apul-bismark"  # Change to your actual directory containing the files
DEST_DIR="../output/15.5-Apul-bismark"  # Change to your desired output directory

# Create the destination directory if it doesn't exist
mkdir -p "$DEST_DIR"

# Read the CSV and create a mapping of old IDs to new IDs
declare -A rename_map
while IFS=, read -r sample_name azenta_sample_name _; do
    # Trim whitespace
    sample_name=$(echo "$sample_name" | tr -d '[:space:]')
    azenta_sample_name=$(echo "$azenta_sample_name" | tr -d '[:space:]')
    
    # Add to mapping if both fields exist
    if [[ -n "$sample_name" && -n "$azenta_sample_name" ]]; then
        rename_map["$azenta_sample_name"]="$sample_name"
    fi
done < <(tail -n +2 "$CSV_FILE")  # Skip the header row

# Loop through files in the source directory, filtering only .cov files
for file in "$SOURCE_DIR"/*.cov; do
    filename=$(basename -- "$file")  # Extract filename

    # Extract sample ID from filename
    if [[ $filename =~ ([A-Za-z0-9]+)_R1_001 ]]; then
        old_id="${BASH_REMATCH[1]}"
        
        # Check if there's a match in the rename map
        if [[ -n "${rename_map[$old_id]}" ]]; then
            new_id="${rename_map[$old_id]}"
            new_filename="${filename/$old_id/$new_id}"
            cp "$file" "$DEST_DIR/$new_filename"
            echo "Copied: $filename → $DEST_DIR/$new_filename"
        fi
    fi
done

echo "Copying complete!"
```


did not work

```{bash}
cp ../output/15-Apul-bismark/*cov ../output/15.5-Apul-bismark
```


```{bash}
ls ../output/15.5-Apul-bismark | wc -l
```


```{bash}
# Read the CSV file and create an associative array
declare -A sample_map
while IFS=, read -r col1 sample_name col3 col4 azenta_sample_name colony_id timepoint col7 col8 col9 col10 col11 col12 col13 col14 col15 col16 col17 col18; do
  sample_map["$azenta_sample_name"]="${colony_id}-${timepoint}"
done < ../../M-multi-species/data/e5_DNA_Azenta_metadata.csv

cd ../output/15.5-Apul-bismark

# Iterate over all *.cov files and rename them
for file in *.cov; do
  # Extract the portion before the first underscore
  prefix="${file%%_*}"
  if [[ -n "${sample_map[$prefix]}" ]]; then
    new_file="${file//$prefix/${sample_map[$prefix]}}"
    mv "$file" "$new_file"
  fi
done

```


# Converting cov files to other typesl


```{bash}
cd ../output/15.5-Apul-bismark/

for f in *merged_CpG_evidence.cov
do
  STEM=$(basename "${f}" _R1_001.fastp-trim_bismark_bt2.CpG_report.merged_CpG_evidence.cov)
  cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4}}' \
  > "${STEM}"_5x.bedgraph
done
```


```{bash}
cd ../output/15.5-Apul-bismark/

for f in *merged_CpG_evidence.cov
do
  STEM=$(basename "${f}" _R1_001.fastp-trim_bismark_bt2.CpG_report.merged_CpG_evidence.cov)
  cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4}}' \
  > "${STEM}"_10x.bedgraph
done
```


```{bash}
head ../output/15.5-Apul-bismark/*229-TP1*
```

# take all 10x bedgraph and given CG loci unique ID

```{bash}
cd ../output/15.5-Apul-bismark/

awk '{print "CpG_"_$1"_"$2, $4}' ACR-229-TP1_10x.bedgraph | head
```

```{bash}
rm ../output/15.5-Apul-bismark/*processed*
```


```{bash}
for file in ../output/15.5-Apul-bismark/*10x.bedgraph; do
    awk '{print "CpG_"_$1"_"$2, $4}' "$file" > "${file%.bedgraph}_processed.txt"
done
```

```{bash}
for file in ../output/15.5-Apul-bismark/*10x_processed.txt; do
    head -1 $file
done

```


```{bash}
cd ../output/15.5-Apul-bismark/
awk '{print $1}' ACR-229-TP1_10x.bedgraph | sort | uniq
```


```{bash}
multiqc ../output/15-Apul-bismark/ \
-o ../output/15-Apul-bismark/
```

```{bash}
multiqc /gscratch/srlab/sam/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-WGBS-alignment-SLURM_array-bismark/ \
-o ../output/15-Apul-bismark/
```