{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# WGBS on Ronit's Samples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ran with no limit (no `-score_min` option, so the aligner's default score threshold applies)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "#!/bin/bash\n", "## Job Name\n", "#SBATCH --job-name=hw-bsnlP\n", "## Allocation Definition\n", "#SBATCH --account=coenv\n", "#SBATCH --partition=coenv\n", "## Nodes\n", "#SBATCH --nodes=1\n", "## Walltime (days-hours:minutes:seconds format)\n", "#SBATCH --time=10-00:00:00\n", "## Memory per node\n", "#SBATCH --mem=100G\n", "#SBATCH --mail-type=ALL\n", "#SBATCH --mail-user=sr320@uw.edu\n", "## Specify the working directory for this job\n", "#SBATCH --chdir=/gscratch/scrubbed/sr320/021921-hw-bsnP\n", " \n", " \n", " \n", "# Directories and programs\n", "bismark_dir=\"/gscratch/srlab/programs/Bismark-0.21.0\"\n", "bowtie2_dir=\"/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/\"\n", "samtools=\"/gscratch/srlab/programs/samtools-1.9/samtools\"\n", "reads_dir=\"/gscratch/srlab/sr320/data/cg/\"\n", "genome_folder=\"/gscratch/srlab/sr320/data/Cgig-genome/Crassostrea_gigas.oyster_v9.dna_sm.toplevel/\"\n", " \n", "source /gscratch/srlab/programs/scripts/paths.sh\n", " \n", " \n", " \n", "#${bismark_dir}/bismark_genome_preparation \\\n", "#--verbose \\\n", "#--parallel 28 \\\n", "#--path_to_aligner ${bowtie2_dir} \\\n", "#${genome_folder}\n", " \n", " \n", "#/zr3644_11_R2.fastp-trim.20201206.fq.gz\n", " \n", "find ${reads_dir}*_R1.fastp-trim.20201206.fq.gz \\\n", "| xargs basename -s _R1.fastp-trim.20201206.fq.gz | xargs -n 1 -P 6 -I{} ${bismark_dir}/bismark \\\n", "--path_to_bowtie ${bowtie2_dir} \\\n", "-genome ${genome_folder} \\\n", "-p 4 \\\n", "--non_directional \\\n", "-1 ${reads_dir}{}_R1.fastp-trim.20201206.fq.gz \\\n", "-2 ${reads_dir}{}_R2.fastp-trim.20201206.fq.gz \\\n", " \n", " \n", " \n", "find *.bam | \\\n", "xargs basename -s .bam | \\\n", "xargs -I{} ${bismark_dir}/deduplicate_bismark \\\n", "--bam \\\n", 
"--paired \\\n", "{}.bam\n", " \n", " \n", " \n", "${bismark_dir}/bismark_methylation_extractor \\\n", "--bedGraph --counts --scaffolds \\\n", "--multicore 14 \\\n", "--buffer_size 75% \\\n", "*deduplicated.bam\n", " \n", " \n", " \n", "# Bismark processing report\n", " \n", "${bismark_dir}/bismark2report\n", " \n", "#Bismark summary report\n", " \n", "${bismark_dir}/bismark2summary\n", " \n", " \n", " \n", "# Sort files for methylkit and IGV\n", " \n", "find *deduplicated.bam | \\\n", "xargs basename -s .bam | \\\n", "xargs -I{} ${samtools} \\\n", "sort --threads 28 {}.bam \\\n", "-o {}.sorted.bam\n", " \n", "# Index sorted files for IGV\n", "# The \"-@ 28\" below specifies number of CPU threads to use.\n", " \n", "find *.sorted.bam | \\\n", "xargs basename -s .sorted.bam | \\\n", "xargs -I{} ${samtools} \\\n", "index -@ 28 {}.sorted.bam\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "[sr320@mox1 021421-hw-bsn]$ head *report* | grep \"Mapping efficiency:\"\n", "Mapping efficiency:\t37.7% \n", "Mapping efficiency:\t37.7% \n", "Mapping efficiency:\t37.9% \n", "Mapping efficiency:\t37.7% \n", "Mapping efficiency:\t37.4% \n", "Mapping efficiency:\t38.1% \n", "Mapping efficiency:\t38.3% \n", "Mapping efficiency:\t38.5% \n", "Mapping efficiency:\t39.0% \n", "Mapping efficiency:\t38.1% \n", "Mapping efficiency:\t38.2% \n", "Mapping efficiency:\t34.2% \n", "Mapping efficiency:\t37.2% \n", "Mapping efficiency:\t38.6% \n", "Mapping efficiency:\t38.5% \n", "Mapping efficiency:\t38.5% \n", "Mapping efficiency:\t37.3% \n", "Mapping efficiency:\t37.5% \n", "Mapping efficiency:\t37.3% \n", "Mapping efficiency:\t36.8% \n", "Mapping efficiency:\t38.1% \n", "Mapping efficiency:\t37.2% \n", "Mapping efficiency:\t37.8% \n", "Mapping efficiency:\t38.8%\n", "``` " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "MultiQC https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/021421-hw-bsn/multiqc_report.html" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "**And with the `-score_min L,0,-0.6`**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "#!/bin/bash\n", "## Job Name\n", "#SBATCH --job-name=hw-bs\n", "## Allocation Definition\n", "#SBATCH --account=srlab\n", "#SBATCH --partition=srlab\n", "## Nodes\n", "#SBATCH --nodes=1\n", "## Walltime (days-hours:minutes:seconds format)\n", "#SBATCH --time=20-00:00:00\n", "## Memory per node\n", "#SBATCH --mem=100G\n", "#SBATCH --mail-type=ALL\n", "#SBATCH --mail-user=sr320@uw.edu\n", "## Specify the working directory for this job\n", "#SBATCH --chdir=/gscratch/scrubbed/sr320/021321-hw-bs\n", " \n", " \n", " \n", "# Directories and programs\n", "bismark_dir=\"/gscratch/srlab/programs/Bismark-0.21.0\"\n", "bowtie2_dir=\"/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/\"\n", "samtools=\"/gscratch/srlab/programs/samtools-1.9/samtools\"\n", "reads_dir=\"/gscratch/srlab/sr320/data/cg/\"\n", "genome_folder=\"/gscratch/srlab/sr320/data/Cgig-genome/Crassostrea_gigas.oyster_v9.dna_sm.toplevel/\"\n", " \n", "source /gscratch/srlab/programs/scripts/paths.sh\n", " \n", " \n", " \n", "#${bismark_dir}/bismark_genome_preparation \\\n", "#--verbose \\\n", "#--parallel 28 \\\n", "#--path_to_aligner ${bowtie2_dir} \\\n", "#${genome_folder}\n", " \n", " \n", "#/zr3644_11_R2.fastp-trim.20201206.fq.gz\n", " \n", "find ${reads_dir}*_R1.fastp-trim.20201206.fq.gz \\\n", "| xargs basename -s _R1.fastp-trim.20201206.fq.gz | xargs -I{} ${bismark_dir}/bismark \\\n", "--path_to_bowtie ${bowtie2_dir} \\\n", "-genome ${genome_folder} \\\n", "-p 4 \\\n", "-score_min L,0,-0.6 \\\n", "--non_directional \\\n", "-1 ${reads_dir}{}_R1.fastp-trim.20201206.fq.gz \\\n", "-2 ${reads_dir}{}_R2.fastp-trim.20201206.fq.gz \\\n", " \n", " \n", " \n", "find *.bam | \\\n", "xargs basename -s .bam | \\\n", "xargs -I{} ${bismark_dir}/deduplicate_bismark \\\n", "--bam \\\n", "--paired \\\n", "{}.bam\n", " \n", " \n", " \n", 
"${bismark_dir}/bismark_methylation_extractor \\\n", "--bedGraph --counts --scaffolds \\\n", "--multicore 14 \\\n", "--buffer_size 75% \\\n", "*deduplicated.bam\n", " \n", " \n", " \n", "# Bismark processing report\n", " \n", "${bismark_dir}/bismark2report\n", " \n", "#Bismark summary report\n", " \n", "${bismark_dir}/bismark2summary\n", " \n", " \n", " \n", "# Sort files for methylkit and IGV\n", " \n", "find *deduplicated.bam | \\\n", "xargs basename -s .bam | \\\n", "xargs -I{} ${samtools} \\\n", "sort --threads 28 {}.bam \\\n", "-o {}.sorted.bam\n", " \n", "# Index sorted files for IGV\n", "# The \"-@ 28\" below specifies number of CPU threads to use.\n", " \n", "find *.sorted.bam | \\\n", "xargs basename -s .sorted.bam | \\\n", "xargs -I{} ${samtools} \\\n", "index -@ 28 {}.sorted.bam\n", " \n", " \n", " \n", "# \n", "# \n", "# find *deduplicated.bismark.cov.gz \\\n", "# | xargs basename -s _R1_001_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz \\\n", "# | xargs -I{} ${bismark_dir}/coverage2cytosine \\\n", "# --genome_folder ${genome_folder} \\\n", "# -o {} \\\n", "# --merge_CpG \\\n", "# --zero_based \\\n", "# {}_R1_001_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz\n", "# \n", "# \n", "# #creating bedgraphs post merge\n", "# \n", "# for f in *merged_CpG_evidence.cov\n", "# do\n", "# STEM=$(basename \"${f}\" .CpG_report.merged_CpG_evidence.cov)\n", "# cat \"${f}\" | awk -F $'\\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4}}' \\\n", "# > \"${STEM}\"_10x.bedgraph\n", "# done\n", "# \n", "# \n", "# \n", "# for f in *merged_CpG_evidence.cov\n", "# do\n", "# STEM=$(basename \"${f}\" .CpG_report.merged_CpG_evidence.cov)\n", "# cat \"${f}\" | awk -F $'\\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4}}' \\\n", "# > \"${STEM}\"_5x.bedgraph\n", "# done\n", "# \n", "# \n", "# #creating tab files with raw count for glms\n", "# \n", "# for f in *merged_CpG_evidence.cov\n", "# do\n", "# STEM=$(basename \"${f}\" 
.CpG_report.merged_CpG_evidence.cov)\n", "# cat \"${f}\" | awk -F $'\\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4, $5, $6}}' \\\n", "# > \"${STEM}\"_10x.tab\n", "# done\n", "# \n", "# \n", "# for f in *merged_CpG_evidence.cov\n", "# do\n", "# STEM=$(basename \"${f}\" .CpG_report.merged_CpG_evidence.cov)\n", "# cat \"${f}\" | awk -F $'\\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4, $5, $6}}' \\\n", "# > \"${STEM}\"_5x.tab\n", "# done\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "[sr320@mox1 021321-hw-bs]$ head *report* | grep \"Mapping efficiency:\"\n", "Mapping efficiency:\t61.6% \n", "Mapping efficiency:\t61.6% \n", "Mapping efficiency:\t62.2% \n", "Mapping efficiency:\t61.9% \n", "Mapping efficiency:\t62.4% \n", "Mapping efficiency:\t62.1% \n", "Mapping efficiency:\t62.5% \n", "Mapping efficiency:\t62.1% \n", "Mapping efficiency:\t62.7% \n", "Mapping efficiency:\t62.3% \n", "Mapping efficiency:\t62.7% \n", "Mapping efficiency:\t61.3% \n", "Mapping efficiency:\t61.7% \n", "Mapping efficiency:\t62.5% \n", "Mapping efficiency:\t62.4% \n", "Mapping efficiency:\t62.6% \n", "Mapping efficiency:\t61.6% \n", "Mapping efficiency:\t61.6% \n", "Mapping efficiency:\t61.8% \n", "Mapping efficiency:\t61.8% \n", "Mapping efficiency:\t62.3% \n", "Mapping efficiency:\t61.6% \n", "Mapping efficiency:\t61.9% \n", "Mapping efficiency:\t61.9% \n", "``` " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "MultiQC https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/021321-hw-bs/multiqc_report.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", 
"version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }