{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WGBS on Ronit's Samples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ran with no limit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "#!/bin/bash\n",
    "## Job Name\n",
    "#SBATCH --job-name=hw-bsnlP\n",
    "## Allocation Definition\n",
    "#SBATCH --account=coenv\n",
    "#SBATCH --partition=coenv\n",
    "## Nodes\n",
    "#SBATCH --nodes=1\n",
    "## Walltime (days-hours:minutes:seconds format)\n",
    "#SBATCH --time=10-00:00:00\n",
    "## Memory per node\n",
    "#SBATCH --mem=100G\n",
    "#SBATCH --mail-type=ALL\n",
    "#SBATCH --mail-user=sr320@uw.edu\n",
    "## Specify the working directory for this job\n",
    "#SBATCH --chdir=/gscratch/scrubbed/sr320/021921-hw-bsnP\n",
    "  \n",
    "  \n",
    "  \n",
    "# Directories and programs\n",
    "bismark_dir=\"/gscratch/srlab/programs/Bismark-0.21.0\"\n",
    "bowtie2_dir=\"/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/\"\n",
    "samtools=\"/gscratch/srlab/programs/samtools-1.9/samtools\"\n",
    "reads_dir=\"/gscratch/srlab/sr320/data/cg/\"\n",
    "genome_folder=\"/gscratch/srlab/sr320/data/Cgig-genome/Crassostrea_gigas.oyster_v9.dna_sm.toplevel/\"\n",
    "  \n",
    "source /gscratch/srlab/programs/scripts/paths.sh\n",
    "  \n",
    "  \n",
    "  \n",
    "#${bismark_dir}/bismark_genome_preparation \\\n",
    "#--verbose \\\n",
    "#--parallel 28 \\\n",
    "#--path_to_aligner ${bowtie2_dir} \\\n",
    "#${genome_folder}\n",
    "  \n",
    "  \n",
    "#/zr3644_11_R2.fastp-trim.20201206.fq.gz\n",
    "  \n",
    "find ${reads_dir}*_R1.fastp-trim.20201206.fq.gz \\\n",
    "| xargs basename -s _R1.fastp-trim.20201206.fq.gz | xargs -n 1 -P 6 -I{} ${bismark_dir}/bismark \\\n",
    "--path_to_bowtie ${bowtie2_dir} \\\n",
    "-genome ${genome_folder} \\\n",
    "-p 4 \\\n",
    "--non_directional \\\n",
    "-1 ${reads_dir}{}_R1.fastp-trim.20201206.fq.gz \\\n",
    "-2 ${reads_dir}{}_R2.fastp-trim.20201206.fq.gz \\\n",
    "  \n",
    "  \n",
    "  \n",
    "find *.bam | \\\n",
    "xargs basename -s .bam | \\\n",
    "xargs -I{} ${bismark_dir}/deduplicate_bismark \\\n",
    "--bam \\\n",
    "--paired \\\n",
    "{}.bam\n",
    "  \n",
    "  \n",
    "  \n",
    "${bismark_dir}/bismark_methylation_extractor \\\n",
    "--bedGraph --counts --scaffolds \\\n",
    "--multicore 14 \\\n",
    "--buffer_size 75% \\\n",
    "*deduplicated.bam\n",
    "  \n",
    "  \n",
    "  \n",
    "# Bismark processing report\n",
    "  \n",
    "${bismark_dir}/bismark2report\n",
    "  \n",
    "#Bismark summary report\n",
    "  \n",
    "${bismark_dir}/bismark2summary\n",
    "  \n",
    "  \n",
    "  \n",
    "# Sort files for methylkit and IGV\n",
    "  \n",
    "find *deduplicated.bam | \\\n",
    "xargs basename -s .bam | \\\n",
    "xargs -I{} ${samtools} \\\n",
    "sort --threads 28 {}.bam \\\n",
    "-o {}.sorted.bam\n",
    "  \n",
    "# Index sorted files for IGV\n",
    "# The \"-@ 16\" below specifies number of CPU threads to use.\n",
    "  \n",
    "find *.sorted.bam | \\\n",
    "xargs basename -s .sorted.bam | \\\n",
    "xargs -I{} ${samtools} \\\n",
    "index -@ 28 {}.sorted.bam\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "[sr320@mox1 021421-hw-bsn]$ head *report* | grep \"Mapping efficiency:\"\n",
    "Mapping efficiency:\t37.7% \n",
    "Mapping efficiency:\t37.7% \n",
    "Mapping efficiency:\t37.9% \n",
    "Mapping efficiency:\t37.7% \n",
    "Mapping efficiency:\t37.4% \n",
    "Mapping efficiency:\t38.1% \n",
    "Mapping efficiency:\t38.3% \n",
    "Mapping efficiency:\t38.5% \n",
    "Mapping efficiency:\t39.0% \n",
    "Mapping efficiency:\t38.1% \n",
    "Mapping efficiency:\t38.2% \n",
    "Mapping efficiency:\t34.2% \n",
    "Mapping efficiency:\t37.2% \n",
    "Mapping efficiency:\t38.6% \n",
    "Mapping efficiency:\t38.5% \n",
    "Mapping efficiency:\t38.5% \n",
    "Mapping efficiency:\t37.3% \n",
    "Mapping efficiency:\t37.5% \n",
    "Mapping efficiency:\t37.3% \n",
    "Mapping efficiency:\t36.8% \n",
    "Mapping efficiency:\t38.1% \n",
    "Mapping efficiency:\t37.2% \n",
    "Mapping efficiency:\t37.8% \n",
    "Mapping efficiency:\t38.8%\n",
    "```    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "multiQC https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/021421-hw-bsn/multiqc_report.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**And with the `-score_min L,0,-0.6`**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "#!/bin/bash\n",
    "## Job Name\n",
    "#SBATCH --job-name=hw-bs\n",
    "## Allocation Definition\n",
    "#SBATCH --account=srlab\n",
    "#SBATCH --partition=srlab\n",
    "## Nodes\n",
    "#SBATCH --nodes=1\n",
    "## Walltime (days-hours:minutes:seconds format)\n",
    "#SBATCH --time=20-00:00:00\n",
    "## Memory per node\n",
    "#SBATCH --mem=100G\n",
    "#SBATCH --mail-type=ALL\n",
    "#SBATCH --mail-user=sr320@uw.edu\n",
    "## Specify the working directory for this job\n",
    "#SBATCH --chdir=/gscratch/scrubbed/sr320/021321-hw-bs\n",
    " \n",
    " \n",
    " \n",
    "# Directories and programs\n",
    "bismark_dir=\"/gscratch/srlab/programs/Bismark-0.21.0\"\n",
    "bowtie2_dir=\"/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/\"\n",
    "samtools=\"/gscratch/srlab/programs/samtools-1.9/samtools\"\n",
    "reads_dir=\"/gscratch/srlab/sr320/data/cg/\"\n",
    "genome_folder=\"/gscratch/srlab/sr320/data/Cgig-genome/Crassostrea_gigas.oyster_v9.dna_sm.toplevel/\"\n",
    " \n",
    "source /gscratch/srlab/programs/scripts/paths.sh\n",
    " \n",
    " \n",
    " \n",
    "#${bismark_dir}/bismark_genome_preparation \\\n",
    "#--verbose \\\n",
    "#--parallel 28 \\\n",
    "#--path_to_aligner ${bowtie2_dir} \\\n",
    "#${genome_folder}\n",
    " \n",
    " \n",
    "#/zr3644_11_R2.fastp-trim.20201206.fq.gz\n",
    " \n",
    "find ${reads_dir}*_R1.fastp-trim.20201206.fq.gz \\\n",
    "| xargs basename -s _R1.fastp-trim.20201206.fq.gz | xargs -I{} ${bismark_dir}/bismark \\\n",
    "--path_to_bowtie ${bowtie2_dir} \\\n",
    "-genome ${genome_folder} \\\n",
    "-p 4 \\\n",
    "-score_min L,0,-0.6 \\\n",
    "--non_directional \\\n",
    "-1 ${reads_dir}{}_R1.fastp-trim.20201206.fq.gz \\\n",
    "-2 ${reads_dir}{}_R2.fastp-trim.20201206.fq.gz \\\n",
    " \n",
    " \n",
    " \n",
    "find *.bam | \\\n",
    "xargs basename -s .bam | \\\n",
    "xargs -I{} ${bismark_dir}/deduplicate_bismark \\\n",
    "--bam \\\n",
    "--paired \\\n",
    "{}.bam\n",
    " \n",
    " \n",
    " \n",
    "${bismark_dir}/bismark_methylation_extractor \\\n",
    "--bedGraph --counts --scaffolds \\\n",
    "--multicore 14 \\\n",
    "--buffer_size 75% \\\n",
    "*deduplicated.bam\n",
    " \n",
    " \n",
    " \n",
    "# Bismark processing report\n",
    " \n",
    "${bismark_dir}/bismark2report\n",
    " \n",
    "#Bismark summary report\n",
    " \n",
    "${bismark_dir}/bismark2summary\n",
    " \n",
    " \n",
    " \n",
    "# Sort files for methylkit and IGV\n",
    " \n",
    "find *deduplicated.bam | \\\n",
    "xargs basename -s .bam | \\\n",
    "xargs -I{} ${samtools} \\\n",
    "sort --threads 28 {}.bam \\\n",
    "-o {}.sorted.bam\n",
    " \n",
    "# Index sorted files for IGV\n",
    "# The \"-@ 16\" below specifies number of CPU threads to use.\n",
    " \n",
    "find *.sorted.bam | \\\n",
    "xargs basename -s .sorted.bam | \\\n",
    "xargs -I{} ${samtools} \\\n",
    "index -@ 28 {}.sorted.bam\n",
    " \n",
    " \n",
    " \n",
    "# \n",
    "# \n",
    "# find *deduplicated.bismark.cov.gz \\\n",
    "# | xargs basename -s _R1_001_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz \\\n",
    "# | xargs -I{} ${bismark_dir}/coverage2cytosine \\\n",
    "# --genome_folder ${genome_folder} \\\n",
    "# -o {} \\\n",
    "# --merge_CpG \\\n",
    "# --zero_based \\\n",
    "# {}_R1_001_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz\n",
    "# \n",
    "# \n",
    "# #creating bedgraphs post merge\n",
    "# \n",
    "# for f in *merged_CpG_evidence.cov\n",
    "# do\n",
    "#   STEM=$(basename \"${f}\" .CpG_report.merged_CpG_evidence.cov)\n",
    "#   cat \"${f}\" | awk -F $'\\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4}}' \\\n",
    "#   > \"${STEM}\"_10x.bedgraph\n",
    "# done\n",
    "# \n",
    "# \n",
    "# \n",
    "# for f in *merged_CpG_evidence.cov\n",
    "# do\n",
    "#   STEM=$(basename \"${f}\" .CpG_report.merged_CpG_evidence.cov)\n",
    "#   cat \"${f}\" | awk -F $'\\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4}}' \\\n",
    "#   > \"${STEM}\"_5x.bedgraph\n",
    "# done\n",
    "# \n",
    "# \n",
    "# #creating tab files with raw count for glms\n",
    "# \n",
    "# for f in *merged_CpG_evidence.cov\n",
    "# do\n",
    "#   STEM=$(basename \"${f}\" .CpG_report.merged_CpG_evidence.cov)\n",
    "#   cat \"${f}\" | awk -F $'\\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4, $5, $6}}' \\\n",
    "#   > \"${STEM}\"_10x.tab\n",
    "# done\n",
    "# \n",
    "# \n",
    "# for f in *merged_CpG_evidence.cov\n",
    "# do\n",
    "#   STEM=$(basename \"${f}\" .CpG_report.merged_CpG_evidence.cov)\n",
    "#   cat \"${f}\" | awk -F $'\\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4, $5, $6}}' \\\n",
    "#   > \"${STEM}\"_5x.tab\n",
    "# done\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "[sr320@mox1 021321-hw-bs]$ head *report* | grep \"Mapping efficiency:\"\n",
    "Mapping efficiency:\t61.6% \n",
    "Mapping efficiency:\t61.6% \n",
    "Mapping efficiency:\t62.2% \n",
    "Mapping efficiency:\t61.9% \n",
    "Mapping efficiency:\t62.4% \n",
    "Mapping efficiency:\t62.1% \n",
    "Mapping efficiency:\t62.5% \n",
    "Mapping efficiency:\t62.1% \n",
    "Mapping efficiency:\t62.7% \n",
    "Mapping efficiency:\t62.3% \n",
    "Mapping efficiency:\t62.7% \n",
    "Mapping efficiency:\t61.3% \n",
    "Mapping efficiency:\t61.7% \n",
    "Mapping efficiency:\t62.5% \n",
    "Mapping efficiency:\t62.4% \n",
    "Mapping efficiency:\t62.6% \n",
    "Mapping efficiency:\t61.6% \n",
    "Mapping efficiency:\t61.6% \n",
    "Mapping efficiency:\t61.8% \n",
    "Mapping efficiency:\t61.8% \n",
    "Mapping efficiency:\t62.3% \n",
    "Mapping efficiency:\t61.6% \n",
    "Mapping efficiency:\t61.9% \n",
    "Mapping efficiency:\t61.9% \n",
    "```    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "MultiQC https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/021321-hw-bs/multiqc_report.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}