{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Comparing genome wide CpG coverage of bismark alignments to v070 and v074 for _P.generosa_ juveniles (samples 205,206,214,215,220,221,226,and 227)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Variables to be set by user\n", "\n", "Re: two \"work_dir\" variables -\n", "Both are needed, as one is needed by Bash and the other by Python (respectively)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: work_dir=/Users/Shelly/Documents/GitHub/Shelly_Pgenerosa/analyses/JuviPgen_ALSL2lowd145/compare_coverage/\n" ] } ], "source": [ "%env work_dir = /Users/Shelly/Documents/GitHub/Shelly_Pgenerosa/analyses/JuviPgen_ALSL2lowd145/compare_coverage/\n", "work_dir = \"/Users/Shelly/Documents/GitHub/Shelly_Pgenerosa/analyses/JuviPgen_ALSL2lowd145/compare_coverage/\"\n", "output_plot = \"201900807_Pgenr_cov_comparison.png\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Import necessary modules" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas\n", "import os\n", "import numpy\n", "from IPython.display import display\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Make new working directory, download files and rename using wget ```--output-document``` argument" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/Shelly/Documents/GitHub/Shelly_Pgenerosa/analyses/JuviPgen_ALSL2lowd145/compare_coverage\n" ] } ], "source": [ "cd $work_dir" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Loop through coverage files to calculate percent sequencing coverage for each Bismark subset\n", "\n", "** had to unzip my files first (gunzip *CpG_report.txt.gz)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-205_S26_L004_cytosine_CpG_cov_report.CpG_report.txt', '/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-206_S27_L004_cytosine_CpG_cov_report.CpG_report.txt', '/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-214_S30_L004_cytosine_CpG_cov_report.CpG_report.txt', '/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-215_S31_L004_cytosine_CpG_cov_report.CpG_report.txt', '/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-220_S32_L004_cytosine_CpG_cov_report.CpG_report.txt', '/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-221_S33_L004_cytosine_CpG_cov_report.CpG_report.txt', '/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-226_S34_L004_cytosine_CpG_cov_report.CpG_report.txt', '/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-227_S35_L004_cytosine_CpG_cov_report.CpG_report.txt']\n", "----------------------------------------------\n", "----------------------------------------------\n", "/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-205_S26_L004_cytosine_CpG_cov_report.CpG_report.txt\n" ] }, { "data": { "text/plain": [ "'Total coverage: 6879517'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'3x coverage: 1255993'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'5x coverage: 420589'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'10x coverage: 99839'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Mean coverage: 0.5'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'No coverage: 24544891'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent coverage: 21.9'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 3x coverage: 4.0'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 5x coverage: 1.3'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 10x coverage: 0.3'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromposstrandmethunmethC-contexttrinucleotidecoverage
0PGA_scaffold1__77_contigs__length_896438574+00CGCGC0
1PGA_scaffold1__77_contigs__length_896438575-00CGCGC0
2PGA_scaffold1__77_contigs__length_8964385718+00CGCGG0
3PGA_scaffold1__77_contigs__length_8964385719-00CGCGG0
4PGA_scaffold1__77_contigs__length_8964385754+00CGCGT0
\n", "
" ], "text/plain": [ " chrom pos strand meth unmeth \\\n", "0 PGA_scaffold1__77_contigs__length_89643857 4 + 0 0 \n", "1 PGA_scaffold1__77_contigs__length_89643857 5 - 0 0 \n", "2 PGA_scaffold1__77_contigs__length_89643857 18 + 0 0 \n", "3 PGA_scaffold1__77_contigs__length_89643857 19 - 0 0 \n", "4 PGA_scaffold1__77_contigs__length_89643857 54 + 0 0 \n", "\n", " C-context trinucleotide coverage \n", "0 CG CGC 0 \n", "1 CG CGC 0 \n", "2 CG CGG 0 \n", "3 CG CGG 0 \n", "4 CG CGT 0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "----------------------------------------------\n", "----------------------------------------------\n", "/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-206_S27_L004_cytosine_CpG_cov_report.CpG_report.txt\n" ] }, { "data": { "text/plain": [ "'Total coverage: 8851618'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'3x coverage: 2075782'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'5x coverage: 768131'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'10x coverage: 182693'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Mean coverage: 0.7'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'No coverage: 22572790'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent coverage: 28.2'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 3x coverage: 6.6'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 5x coverage: 2.4'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 10x coverage: 0.6'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromposstrandmethunmethC-contexttrinucleotidecoverage
0PGA_scaffold1__77_contigs__length_896438574+00CGCGC0
1PGA_scaffold1__77_contigs__length_896438575-00CGCGC0
2PGA_scaffold1__77_contigs__length_8964385718+00CGCGG0
3PGA_scaffold1__77_contigs__length_8964385719-00CGCGG0
4PGA_scaffold1__77_contigs__length_8964385754+00CGCGT0
\n", "
" ], "text/plain": [ " chrom pos strand meth unmeth \\\n", "0 PGA_scaffold1__77_contigs__length_89643857 4 + 0 0 \n", "1 PGA_scaffold1__77_contigs__length_89643857 5 - 0 0 \n", "2 PGA_scaffold1__77_contigs__length_89643857 18 + 0 0 \n", "3 PGA_scaffold1__77_contigs__length_89643857 19 - 0 0 \n", "4 PGA_scaffold1__77_contigs__length_89643857 54 + 0 0 \n", "\n", " C-context trinucleotide coverage \n", "0 CG CGC 0 \n", "1 CG CGC 0 \n", "2 CG CGG 0 \n", "3 CG CGG 0 \n", "4 CG CGT 0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "----------------------------------------------\n", "----------------------------------------------\n", "/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-214_S30_L004_cytosine_CpG_cov_report.CpG_report.txt\n" ] }, { "data": { "text/plain": [ "'Total coverage: 11274157'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'3x coverage: 3256840'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'5x coverage: 1283308'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'10x coverage: 283539'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Mean coverage: 1.0'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'No coverage: 20150251'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent coverage: 35.9'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 3x coverage: 10.4'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 5x coverage: 4.1'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 10x coverage: 0.9'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromposstrandmethunmethC-contexttrinucleotidecoverage
0PGA_scaffold1__77_contigs__length_896438574+00CGCGC0
1PGA_scaffold1__77_contigs__length_896438575-00CGCGC0
2PGA_scaffold1__77_contigs__length_8964385718+00CGCGG0
3PGA_scaffold1__77_contigs__length_8964385719-00CGCGG0
4PGA_scaffold1__77_contigs__length_8964385754+01CGCGT1
\n", "
" ], "text/plain": [ " chrom pos strand meth unmeth \\\n", "0 PGA_scaffold1__77_contigs__length_89643857 4 + 0 0 \n", "1 PGA_scaffold1__77_contigs__length_89643857 5 - 0 0 \n", "2 PGA_scaffold1__77_contigs__length_89643857 18 + 0 0 \n", "3 PGA_scaffold1__77_contigs__length_89643857 19 - 0 0 \n", "4 PGA_scaffold1__77_contigs__length_89643857 54 + 0 1 \n", "\n", " C-context trinucleotide coverage \n", "0 CG CGC 0 \n", "1 CG CGC 0 \n", "2 CG CGG 0 \n", "3 CG CGG 0 \n", "4 CG CGT 1 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "----------------------------------------------\n", "----------------------------------------------\n", "/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-215_S31_L004_cytosine_CpG_cov_report.CpG_report.txt\n" ] }, { "data": { "text/plain": [ "'Total coverage: 8549789'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'3x coverage: 2079399'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'5x coverage: 801290'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'10x coverage: 181652'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Mean coverage: 0.7'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'No coverage: 22874619'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent coverage: 27.2'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 3x coverage: 6.6'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 5x coverage: 2.5'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 10x coverage: 0.6'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromposstrandmethunmethC-contexttrinucleotidecoverage
0PGA_scaffold1__77_contigs__length_896438574+00CGCGC0
1PGA_scaffold1__77_contigs__length_896438575-00CGCGC0
2PGA_scaffold1__77_contigs__length_8964385718+01CGCGG1
3PGA_scaffold1__77_contigs__length_8964385719-00CGCGG0
4PGA_scaffold1__77_contigs__length_8964385754+01CGCGT1
\n", "
" ], "text/plain": [ " chrom pos strand meth unmeth \\\n", "0 PGA_scaffold1__77_contigs__length_89643857 4 + 0 0 \n", "1 PGA_scaffold1__77_contigs__length_89643857 5 - 0 0 \n", "2 PGA_scaffold1__77_contigs__length_89643857 18 + 0 1 \n", "3 PGA_scaffold1__77_contigs__length_89643857 19 - 0 0 \n", "4 PGA_scaffold1__77_contigs__length_89643857 54 + 0 1 \n", "\n", " C-context trinucleotide coverage \n", "0 CG CGC 0 \n", "1 CG CGC 0 \n", "2 CG CGG 1 \n", "3 CG CGG 0 \n", "4 CG CGT 1 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "----------------------------------------------\n", "----------------------------------------------\n", "/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-220_S32_L004_cytosine_CpG_cov_report.CpG_report.txt\n" ] }, { "data": { "text/plain": [ "'Total coverage: 7168694'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'3x coverage: 1300534'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'5x coverage: 421024'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'10x coverage: 96790'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Mean coverage: 0.5'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'No coverage: 24255714'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent coverage: 22.8'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 3x coverage: 4.1'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 5x coverage: 1.3'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 10x coverage: 0.3'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromposstrandmethunmethC-contexttrinucleotidecoverage
0PGA_scaffold1__77_contigs__length_896438574+00CGCGC0
1PGA_scaffold1__77_contigs__length_896438575-00CGCGC0
2PGA_scaffold1__77_contigs__length_8964385718+00CGCGG0
3PGA_scaffold1__77_contigs__length_8964385719-00CGCGG0
4PGA_scaffold1__77_contigs__length_8964385754+00CGCGT0
\n", "
" ], "text/plain": [ " chrom pos strand meth unmeth \\\n", "0 PGA_scaffold1__77_contigs__length_89643857 4 + 0 0 \n", "1 PGA_scaffold1__77_contigs__length_89643857 5 - 0 0 \n", "2 PGA_scaffold1__77_contigs__length_89643857 18 + 0 0 \n", "3 PGA_scaffold1__77_contigs__length_89643857 19 - 0 0 \n", "4 PGA_scaffold1__77_contigs__length_89643857 54 + 0 0 \n", "\n", " C-context trinucleotide coverage \n", "0 CG CGC 0 \n", "1 CG CGC 0 \n", "2 CG CGG 0 \n", "3 CG CGG 0 \n", "4 CG CGT 0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "----------------------------------------------\n", "----------------------------------------------\n", "/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-221_S33_L004_cytosine_CpG_cov_report.CpG_report.txt\n" ] }, { "data": { "text/plain": [ "'Total coverage: 7545672'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'3x coverage: 1420487'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'5x coverage: 467875'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'10x coverage: 106656'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Mean coverage: 0.5'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'No coverage: 23878736'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent coverage: 24.0'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 3x coverage: 4.5'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 5x coverage: 1.5'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 10x coverage: 0.3'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromposstrandmethunmethC-contexttrinucleotidecoverage
0PGA_scaffold1__77_contigs__length_896438574+00CGCGC0
1PGA_scaffold1__77_contigs__length_896438575-00CGCGC0
2PGA_scaffold1__77_contigs__length_8964385718+00CGCGG0
3PGA_scaffold1__77_contigs__length_8964385719-00CGCGG0
4PGA_scaffold1__77_contigs__length_8964385754+01CGCGT1
\n", "
" ], "text/plain": [ " chrom pos strand meth unmeth \\\n", "0 PGA_scaffold1__77_contigs__length_89643857 4 + 0 0 \n", "1 PGA_scaffold1__77_contigs__length_89643857 5 - 0 0 \n", "2 PGA_scaffold1__77_contigs__length_89643857 18 + 0 0 \n", "3 PGA_scaffold1__77_contigs__length_89643857 19 - 0 0 \n", "4 PGA_scaffold1__77_contigs__length_89643857 54 + 0 1 \n", "\n", " C-context trinucleotide coverage \n", "0 CG CGC 0 \n", "1 CG CGC 0 \n", "2 CG CGG 0 \n", "3 CG CGG 0 \n", "4 CG CGT 1 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "----------------------------------------------\n", "----------------------------------------------\n", "/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-226_S34_L004_cytosine_CpG_cov_report.CpG_report.txt\n" ] }, { "data": { "text/plain": [ "'Total coverage: 6183923'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'3x coverage: 1035846'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'5x coverage: 330392'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'10x coverage: 75313'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Mean coverage: 0.4'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'No coverage: 25240485'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent coverage: 19.7'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 3x coverage: 3.3'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 5x coverage: 1.1'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 10x coverage: 0.2'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromposstrandmethunmethC-contexttrinucleotidecoverage
0PGA_scaffold1__77_contigs__length_896438574+00CGCGC0
1PGA_scaffold1__77_contigs__length_896438575-00CGCGC0
2PGA_scaffold1__77_contigs__length_8964385718+00CGCGG0
3PGA_scaffold1__77_contigs__length_8964385719-00CGCGG0
4PGA_scaffold1__77_contigs__length_8964385754+00CGCGT0
\n", "
" ], "text/plain": [ " chrom pos strand meth unmeth \\\n", "0 PGA_scaffold1__77_contigs__length_89643857 4 + 0 0 \n", "1 PGA_scaffold1__77_contigs__length_89643857 5 - 0 0 \n", "2 PGA_scaffold1__77_contigs__length_89643857 18 + 0 0 \n", "3 PGA_scaffold1__77_contigs__length_89643857 19 - 0 0 \n", "4 PGA_scaffold1__77_contigs__length_89643857 54 + 0 0 \n", "\n", " C-context trinucleotide coverage \n", "0 CG CGC 0 \n", "1 CG CGC 0 \n", "2 CG CGG 0 \n", "3 CG CGG 0 \n", "4 CG CGT 0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "----------------------------------------------\n", "----------------------------------------------\n", "/Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/EPI-227_S35_L004_cytosine_CpG_cov_report.CpG_report.txt\n" ] }, { "data": { "text/plain": [ "'Total coverage: 7059572'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'3x coverage: 1231454'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'5x coverage: 387772'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'10x coverage: 87477'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Mean coverage: 0.5'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'No coverage: 24364836'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent coverage: 22.5'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 3x coverage: 3.9'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 5x coverage: 1.2'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Percent 10x coverage: 0.3'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromposstrandmethunmethC-contexttrinucleotidecoverage
0PGA_scaffold1__77_contigs__length_896438574+00CGCGC0
1PGA_scaffold1__77_contigs__length_896438575-00CGCGC0
2PGA_scaffold1__77_contigs__length_8964385718+00CGCGG0
3PGA_scaffold1__77_contigs__length_8964385719-00CGCGG0
4PGA_scaffold1__77_contigs__length_8964385754+00CGCGT0
\n", "
" ], "text/plain": [ " chrom pos strand meth unmeth \\\n", "0 PGA_scaffold1__77_contigs__length_89643857 4 + 0 0 \n", "1 PGA_scaffold1__77_contigs__length_89643857 5 - 0 0 \n", "2 PGA_scaffold1__77_contigs__length_89643857 18 + 0 0 \n", "3 PGA_scaffold1__77_contigs__length_89643857 19 - 0 0 \n", "4 PGA_scaffold1__77_contigs__length_89643857 54 + 0 0 \n", "\n", " C-context trinucleotide coverage \n", "0 CG CGC 0 \n", "1 CG CGC 0 \n", "2 CG CGG 0 \n", "3 CG CGG 0 \n", "4 CG CGT 0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Variable declaration\n", "bismark_subset_list = []\n", "mean_seq_coverage = []\n", "percent_seq_coverage = []\n", "percent_3x_seq_coverage = []\n", "percent_5x_seq_coverage = []\n", "percent_10x_seq_coverage = []\n", "\n", "# Create list of coverage files in current directory\n", "\n", "cov_files = !find /Volumes/web/metacarcinus/Pgenerosa/analyses/20190806_v074/*.CpG_report.txt\n", "print(cov_files)\n", "\n", "# Loop through coverage files\n", "for file in cov_files:\n", " print(\"----------------------------------------------\")\n", " print(\"----------------------------------------------\")\n", " print (file)\n", " subset_name = file[:-4] # Remove file suffix (.txt)\n", " bismark_subset_list.append(subset_name)\n", " #\n", " #\n", " # Create dataframe and add column names (taken from Bismark documentation)\n", " dataframe = pandas.read_csv(\n", " file,\n", " sep='\\t',\n", " header=None,\n", " names=[\"chrom\", \"pos\", \"strand\", \"meth\", \"unmeth\", \"C-context\", \"trinucleotide\"])\n", " \n", " dataframe['coverage'] = dataframe['meth'] + dataframe['unmeth'] # Sum of methylated and unmethylated coverage for each position.\n", " \n", " total_CpG = len(dataframe) # Count of all CpGs in genome.\n", " \n", " \n", " coverage = sum(dataframe['coverage']>0) # Count of all CpG positions with sequence coverage\n", " coverage_3x = sum(dataframe['coverage']>=3)\n", " coverage_5x = sum(dataframe['coverage']>=5)\n", " coverage_10x = sum(dataframe['coverage']>=10)\n", " mean_coverage = round(dataframe[\"coverage\"].mean(), 1)\n", " \n", " display(\"Total coverage: \" + str(coverage))\n", " display(\"3x coverage: \" + str(coverage_3x))\n", " display(\"5x coverage: \" + str(coverage_5x))\n", " display(\"10x coverage: \" + str(coverage_10x))\n", " display(\"Mean coverage: \" + str(mean_coverage))\n", " \n", " no_coverage = sum(dataframe['coverage']==0) # Count of all CpG posiitions with no sequence coverage\n", " percent_coverage = round((coverage / total_CpG * 100.0), 1) # Rounds to 1 decimal\n", " percent_3x_coverage = round((coverage_3x / total_CpG * 100.0), 1) # Rounds to 1 decimal\n", " percent_5x_coverage = round((coverage_5x / total_CpG * 100.0), 1) # Rounds to 1 decimal\n", " percent_10x_coverage = round((coverage_10x / total_CpG * 100.0), 1) # Rounds to 1 decimal\n", " \n", " mean_seq_coverage.append(mean_coverage)\n", " percent_seq_coverage.append(percent_coverage)\n", " percent_3x_seq_coverage.append(percent_3x_coverage)\n", " percent_5x_seq_coverage.append(percent_5x_coverage)\n", " percent_10x_seq_coverage.append(percent_10x_coverage)\n", " \n", " display(\"No coverage: \" + str(no_coverage))\n", " display(\"Percent coverage: \" + str(percent_coverage))\n", " display(\"Percent 3x coverage: \" + str(percent_3x_coverage))\n", " display(\"Percent 5x coverage: \" + str(percent_5x_coverage))\n", " display(\"Percent 10x coverage: \" + str(percent_10x_coverage))\n", " display(dataframe.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create new dataframe" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Bismark SampleMean CoveragePercent 10x CoveragePercent 3x CoveragePercent 5x CoveragePercent Coverage
02050.50.34.01.321.9
12060.70.66.62.428.2
22141.00.910.44.135.9
32150.70.66.62.527.2
42200.50.34.11.322.8
52210.50.34.51.524.0
62260.40.23.31.119.7
72270.50.33.91.222.5
\n", "
" ], "text/plain": [ " Bismark Sample Mean Coverage Percent 10x Coverage Percent 3x Coverage \\\n", "0 205 0.5 0.3 4.0 \n", "1 206 0.7 0.6 6.6 \n", "2 214 1.0 0.9 10.4 \n", "3 215 0.7 0.6 6.6 \n", "4 220 0.5 0.3 4.1 \n", "5 221 0.5 0.3 4.5 \n", "6 226 0.4 0.2 3.3 \n", "7 227 0.5 0.3 3.9 \n", "\n", " Percent 5x Coverage Percent Coverage \n", "0 1.3 21.9 \n", "1 2.4 28.2 \n", "2 4.1 35.9 \n", "3 2.5 27.2 \n", "4 1.3 22.8 \n", "5 1.5 24.0 \n", "6 1.1 19.7 \n", "7 1.2 22.5 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coverage_dataframe = pandas.DataFrame(\n", " {\n", " 'Bismark Sample': ['205','206','214','215', '220', '221', '226', '227'],\n", " 'Mean Coverage': mean_seq_coverage,\n", " 'Percent Coverage': percent_seq_coverage,\n", " 'Percent 3x Coverage': percent_3x_seq_coverage,\n", " 'Percent 5x Coverage': percent_5x_seq_coverage,\n", " 'Percent 10x Coverage': percent_10x_seq_coverage,\n", " \n", " })\n", "\n", "coverage_dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create line plot overlayed on bar chart, showing percent sequencing coverage for each Bismark subset option" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Sort data\n", "coverage_dataframe = coverage_dataframe.sort_values(by=['Percent Coverage'])\n", "\n", "\n", "exclude = ['Mean Coverage']\n", "coverage_dataframe = coverage_dataframe.loc[:, coverage_dataframe.columns.difference(exclude)] # Don't plot Mean Coverage\n", "ax = coverage_dataframe.plot(x='Bismark Sample',marker='o') # Line plot with marker points.\n", "coverage_dataframe.plot(kind='bar', x='Bismark Sample', ax=ax) # Overlays bar chart using axes defined in line plot\n", "\n", "# Save figure to file.\n", "# \"bbox_inches\" argument needed to prevent x asis lables from getting cut off.\n", "plt.savefig(output_plot, bbox_inches = \"tight\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }