{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Bedtools for finding overlap of DMLS and Genomic Regions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In examining _C viriginca_ gonad tissue for the influence of ocean acidification a bed file was created with coordinates of diffferentially methylated loci (DMLs)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t346071\t346073\t-\t50\n", "NC_035780.1\t990995\t990997\t-\t-51\n", "NC_035780.1\t1958998\t1959000\t-\t53\n", "NC_035780.1\t2541726\t2541728\t-\t-58\n", "NC_035780.1\t2584492\t2584494\t+\t56\n", "NC_035780.1\t2586508\t2586510\t-\t-57\n", "NC_035780.1\t2588794\t2588796\t+\t-51\n", "NC_035780.1\t2588794\t2588796\t-\t-57\n", "NC_035780.1\t2589720\t2589722\t-\t50\n", "NC_035780.1\t4286286\t4286288\t+\t54\n" ] } ], "source": [ "!head R/0510/analyses/dml05251200.bed" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 422 R/0510/analyses/dml05251200.bed\n" ] } ], "source": [ "!wc -l R/0510/analyses/dml05251200.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Columns: Chromosome, start, end, stand, fold difference (with direction)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Genome tracks" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 731279 data/ref_C_virginica-3.0_Gnomon_exon.bed\n", "NC_035780.1\t13578\t13603\n", "NC_035780.1\t14237\t14290\n", "NC_035780.1\t14557\t14594\n", "NC_035780.1\t28961\t29073\n", "NC_035780.1\t30524\t31557\n", "NC_035780.1\t31736\t31887\n", "NC_035780.1\t31977\t32565\n", "NC_035780.1\t32959\t33324\n", "NC_035780.1\t66869\t66897\n", "NC_035780.1\t64123\t64334\n" ] } ], "source": [ "!wc -l data/ref_C_virginica-3.0_Gnomon_exon.bed\n", "!head data/ref_C_virginica-3.0_Gnomon_exon.bed" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 319262 data/ref_C_virginica-3.0_intron-mrna.merged.bed\n", "NC_035780.1\t28961\t28961\n", "NC_035780.1\t29074\t30524\n", "NC_035780.1\t31558\t31736\n", "NC_035780.1\t31888\t31977\n", "NC_035780.1\t32566\t32959\n", "NC_035780.1\t43110\t43112\n", "NC_035780.1\t44359\t45913\n", "NC_035780.1\t46507\t64123\n", "NC_035780.1\t64335\t66869\n", "NC_035780.1\t85606\t85606\n" ] } ], "source": [ "!wc -l data/ref_C_virginica-3.0_intron-mrna.merged.bed\n", "!head data/ref_C_virginica-3.0_intron-mrna.merged.bed" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 60201 data/ref_C_virginica-3.0_Gnomon_mRNA.gff3\n", "NC_035780.1\tGnomon\tmRNA\t28961\t33324\t.\t+\t.\tID=rna1;Parent=gene1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;Name=XM_022471938.1;gbkey=mRNA;gene=LOC111126949;model_evidence=Supporting evidence includes similarity to: 3 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 21 samples with support for all annotated introns;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\tmRNA\t43111\t66897\t.\t-\t.\tID=rna2;Parent=gene2;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;Name=XM_022447324.1;gbkey=mRNA;gene=LOC111110729;model_evidence=Supporting evidence includes similarity to: 1 Protein%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\n" ] } ], "source": [ "!wc -l data/ref_C_virginica-3.0_Gnomon_mRNA.gff3\n", "!head -2 data/ref_C_virginica-3.0_Gnomon_mRNA.gff3" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 14458703 analyses/Cvirg_v3_CG-motif.bed\n", "NC_007175.2\t16801\t16803\tmisc_feature\t0\t+\n", "NC_007175.2\t16924\t16926\tmisc_feature\t0\t+\n", "NC_007175.2\t16943\t16945\tmisc_feature\t0\t+\n", "NC_007175.2\t17006\t17008\tmisc_feature\t0\t+\n", "NC_007175.2\t17020\t17022\tmisc_feature\t0\t+\n", "NC_007175.2\t17045\t17047\tmisc_feature\t0\t+\n", "NC_007175.2\t17086\t17088\tmisc_feature\t0\t+\n", "NC_007175.2\t17120\t17122\tmisc_feature\t0\t+\n", "NC_007175.2\t17130\t17132\tmisc_feature\t0\t+\n", "NC_007175.2\t17164\t17166\tmisc_feature\t0\t+\n" ] } ], "source": [ "!wc -l analyses/Cvirg_v3_CG-motif.bed\n", "!tail analyses/Cvirg_v3_CG-motif.bed" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t28\t30\tCG_motif\n", "NC_035780.1\t54\t56\tCG_motif\n", "NC_035780.1\t75\t77\tCG_motif\n", "NC_035780.1\t93\t95\tCG_motif\n", "NC_035780.1\t103\t105\tCG_motif\n", "NC_035780.1\t116\t118\tCG_motif\n", "NC_035780.1\t134\t136\tCG_motif\n", "NC_035780.1\t159\t161\tCG_motif\n", "NC_035780.1\t209\t211\tCG_motif\n", "NC_035780.1\t224\t226\tCG_motif\n" ] } ], "source": [ "!sed 's/misc_feature/CG_motif/g' analyses/Cvirg_v3_CG-motif.bed \\\n", "| cut -f $1-4 > analyses/C_virginica-3.0_CG-motif.bed\n", "!head analyses/C_virginica-3.0_CG-motif.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## IGV CHECK" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![igv](img/igv_0530.png)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-wa \\\n", "-a data/ref_C_virginica-3.0_Gnomon_exon.bed \\\n", "-b data/ref_C_virginica-3.0_Gnomon_mRNA.gff3 \\\n", "> analyses/test.bed" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t28961\t29073\n", "NC_035780.1\t30524\t31557\n", "NC_035780.1\t31736\t31887\n", "NC_035780.1\t31977\t32565\n", "NC_035780.1\t32959\t33324\n", "NC_035780.1\t43111\t44358\n", "NC_035780.1\t43111\t44358\n", "NC_035780.1\t43111\t44358\n", "NC_035780.1\t43111\t44358\n", "NC_035780.1\t45913\t46506\n", "sort: Broken pipe\n" ] } ], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-a data/ref_C_virginica-3.0_Gnomon_exon.bed \\\n", "-b data/ref_C_virginica-3.0_Gnomon_mRNA.gff3 | sort -k1,1 -k2,2n | head" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t13578\t13603\n", "NC_035780.1\t14237\t14290\n", "NC_035780.1\t14557\t14594\n", "NC_035780.1\t28961\t29073\n", "NC_035780.1\t30524\t31557\n", "NC_035780.1\t31736\t31887\n", "NC_035780.1\t31977\t32565\n", "NC_035780.1\t32959\t33324\n", "NC_035780.1\t66869\t66897\n", "NC_035780.1\t64123\t64334\n" ] } ], "source": [ "!head data/ref_C_virginica-3.0_Gnomon_exon.bed" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "All tracks expanded. While intron has been merged, exon is not. **Really concerned about exonswhere there is no mRNA...!**" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!cp data/ref_C_virginica-3.0_Gnomon_mRNA.gff3 /Volumes/web/Cvirg_tracks/C_virginica-3.0_Gnomon_mRNA.gff3" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!cp data/ref_C_virginica-3.0_Gnomon_exon.bed /Volumes/web/Cvirg_tracks/C_virginica-3.0_Gnomon_exon.bed" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!cp data/ref_C_virginica-3.0_intron-mrna.merged.bed /Volumes/web/Cvirg_tracks/C_virginica-3.0_intron.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bedtools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Bedtools Documentation](http://bedtools.readthedocs.io/en/latest/content/bedtools-suite.html) : [Specifically IntersectBed](http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Old Code\n", "```\n", "for i in (\"exon\",\"intron\",\"TE\",\"gene\",\"1k5p_gene_promoter\"):\n", " !intersectbed \\\n", " -wb \\\n", " -a cglarv_development_differ.bed \\\n", " -b ./genome_tracks/Cgigas_v9_{i}.gff \\\n", " > {i}_intersect_DML_dev_wb.txt \n", " !intersectbed \\\n", " -u \\\n", " -a cglarv_development_differ.bed \\\n", " -b ./genome_tracks/Cgigas_v9_{i}.gff \\\n", " > {i}_intersect_DML_dev_u.txt\n", " !wc -l {i}_intersect_DML_dev_u.txt\n", "``` " ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 393\n", "overlaps with mRNA\n" ] } ], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-u \\\n", "-a R/0510/analyses/dml05251200.bed \\\n", "-b data/ref_C_virginica-3.0_Gnomon_mRNA.gff3 \\\n", "| wc -l\n", "!echo \"overlaps with mRNA\"" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 321\n", "overlaps with exon\n" ] } ], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-u \\\n", "-a R/0510/analyses/dml05251200.bed \\\n", "-b data/ref_C_virginica-3.0_Gnomon_exon.bed \\\n", "| wc -l\n", "!echo \"overlaps with exon\"" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 78\n", "overlaps with intron\n" ] } ], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-u \\\n", "-a R/0510/analyses/dml05251200.bed \\\n", "-b data/ref_C_virginica-3.0_intron-mrna.merged.bed \\\n", "| wc -l\n", "!echo \"overlaps with intron\"" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "72" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "393-321" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t346071\t346073\t-\t50\tNC_035780.1\tGnomon\tmRNA\t341638\t349379\t.\t-\t.\tID=rna30;Parent=gene22;Dbxref=GeneID:111113503,Genbank:XM_022451800.1;Name=XM_022451800.1;gbkey=mRNA;gene=LOC111113503;model_evidence=Supporting evidence includes similarity to: 4 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 2 samples with support for all annotated introns;product=F-box only protein 47-like;transcript_id=XM_022451800.1\n", "NC_035780.1\t990995\t990997\t-\t-51\tNC_035780.1\tGnomon\tmRNA\t984471\t995318\t.\t-\t.\tID=rna117;Parent=gene66;Dbxref=GeneID:111137104,Genbank:XM_022488366.1;Name=XM_022488366.1;gbkey=mRNA;gene=LOC111137104;model_evidence=Supporting evidence includes similarity to: 1 EST%2C 3 Proteins%2C and 99%25 coverage of the annotated genomic feature by RNAseq alignments;product=SWI/SNF complex subunit SMARCC2-like;transcript_id=XM_022488366.1\n" ] } ], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-wb \\\n", "-a R/0510/analyses/dml05251200.bed \\\n", "-b data/ref_C_virginica-3.0_Gnomon_mRNA.gff3 | head -2" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ID=rna30;Parent=gene22;Dbxref=GeneID:111113503,Genbank:XM_022451800.1;Name=XM_022451800.1;gbkey=mRNA;gene=LOC111113503;model_evidence=Supporting evidence includes similarity to: 4 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 2 samples with support for all annotated introns;product=F-box only protein 47-like;transcript_id=XM_022451800.1\n", "ID=rna117;Parent=gene66;Dbxref=GeneID:111137104,Genbank:XM_022488366.1;Name=XM_022488366.1;gbkey=mRNA;gene=LOC111137104;model_evidence=Supporting evidence includes similarity to: 1 EST%2C 3 Proteins%2C and 99%25 coverage of the annotated genomic feature by RNAseq alignments;product=SWI/SNF complex subunit SMARCC2-like;transcript_id=XM_022488366.1\n" ] } ], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-wb \\\n", "-a R/0510/analyses/dml05251200.bed \\\n", "-b data/ref_C_virginica-3.0_Gnomon_mRNA.gff3 \\\n", "| cut -f $14 | head -2" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "product=F-box only protein 47-like\n", "product=SWI/SNF complex subunit SMARCC2-like\n", "product=double-stranded RNA-specific editase 1-like%2C transcript variant X5\n", "product=double-stranded RNA-specific editase 1-like%2C transcript variant X6\n", "product=double-stranded RNA-specific editase 1-like%2C transcript variant X2\n", "product=double-stranded RNA-specific editase 1-like%2C transcript variant X3\n", "product=double-stranded RNA-specific editase 1-like%2C transcript variant X1\n", "product=double-stranded RNA-specific editase 1-like%2C transcript variant X4\n", "product=oxysterol-binding protein-related protein 8-like%2C transcript variant X1\n", "product=oxysterol-binding protein-related protein 8-like%2C transcript variant X4\n" ] } ], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-wb \\\n", "-a R/0510/analyses/dml05251200.bed \\\n", "-b data/ref_C_virginica-3.0_Gnomon_mRNA.gff3 \\\n", "| cut -f $14 | cut -f8 -d$\";\" | head" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "F-box only protein 47-like\n", "SWI/SNF complex subunit SMARCC2-like\n", "double-stranded RNA-specific editase 1-like%2C transcript variant X5\n", "double-stranded RNA-specific editase 1-like%2C transcript variant X6\n", "double-stranded RNA-specific editase 1-like%2C transcript variant X2\n", "double-stranded RNA-specific editase 1-like%2C transcript variant X3\n", "double-stranded RNA-specific editase 1-like%2C transcript variant X1\n", "double-stranded RNA-specific editase 1-like%2C transcript variant X4\n", "oxysterol-binding protein-related protein 8-like%2C transcript variant X1\n", "oxysterol-binding protein-related protein 8-like%2C transcript variant X4\n" ] } ], "source": [ "!/Applications/bioinfo/bedtools2/bin/intersectBed \\\n", "-wb \\\n", "-a R/0510/analyses/dml05251200.bed \\\n", "-b data/ref_C_virginica-3.0_Gnomon_mRNA.gff3 \\\n", "| cut -f $14 | cut -f8 -d$\";\" | cut -f2 -d$\"=\" | head" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }