{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# GOterm Annotation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this notebook, I'll annotate the CpG background and DML lists from `methylKit` and `DSS` with GOterms. I will use these annotations for gene enrichment.\n", "\n", "1. Create master annotation table\n", "2. Match CpG background and DML lists with GOterms\n", "3. Modify lists for downstream gene enrichment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0. Prepare to run script" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 0a. Set working directory" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'/Users/yaamini/Documents/project-oyster-oa/code/Haws'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pwd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/yaamini/Documents/project-oyster-oa/analyses\n" ] } ], "source": [ "cd ../../analyses/" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!mkdir Haws_08-GOterm-annotation/" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/yaamini/Documents/project-oyster-oa/analyses/Haws_08-GOterm-annotation\n" ] } ], "source": [ "cd Haws_08-GOterm-annotation/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Create master annotation table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1a. Isolate transcript ID from nucleotide sequence information" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011413559.3\tD-aspartate oxidase, transcript variant X2\n", "XM_011413566.3\tuncharacterized LOC105317040, transcript variant X1\n", "XM_011413583.3\tPDZ and LIM domain protein Zasp, transcript variant X1\n", "XM_011413585.3\tPDZ and LIM domain protein Zasp, transcript variant X2\n", "XM_011413589.3\tzinc finger and BTB domain-containing protein 11, transcript variant X1\n", "XM_011413590.3\tzinc finger and BTB domain-containing protein 11, transcript variant X3\n", "XM_011413596.3\tuncharacterized LOC105317059\n", "XM_011413597.3\tuncharacterized LOC105317060\n", "XM_011413601.3\tmitogen-activated protein kinase HOG1, transcript variant X1\n", "XM_011413602.3\tmitogen-activated protein kinase HOG1, transcript variant X2\n", " 62377 cgigas_uk_roslin_v1_rna_from_genomic_annot.transcript.tab\n" ] } ], "source": [ "!head cgigas_uk_roslin_v1_rna_from_genomic_annot.transcript.tab\n", "!wc -l cgigas_uk_roslin_v1_rna_from_genomic_annot.transcript.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1a. Format `DIAMOND blastx` output" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-06-07 10:19:13-- https://gannet.fish.washington.edu/spartina/project-gigas-oa-meth/output/blastx/20210605-cgigas-roslin-mito-blastx.outfmt6\n", "Resolving gannet.fish.washington.edu (gannet.fish.washington.edu)... 128.95.149.52\n", "Connecting to gannet.fish.washington.edu (gannet.fish.washington.edu)|128.95.149.52|:443... connected.\n", "WARNING: cannot verify gannet.fish.washington.edu's certificate, issued by ‘CN=InCommon RSA Server CA,OU=InCommon,O=Internet2,L=Ann Arbor,ST=MI,C=US’:\n", " Unable to locally verify the issuer's authority.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 10442376 (10.0M)\n", "Saving to: ‘20210605-cgigas-roslin-mito-blastx.outfmt6’\n", "\n", "20210605-cgigas-ros 100%[===================>] 9.96M 46.6MB/s in 0.2s \n", "\n", "2021-06-07 10:19:14 (46.6 MB/s) - ‘20210605-cgigas-roslin-mito-blastx.outfmt6’ saved [10442376/10442376]\n", "\n" ] } ], "source": [ "#Download blastx output\n", "!wget https://gannet.fish.washington.edu/spartina/project-gigas-oa-meth/output/blastx/20210605-cgigas-roslin-mito-blastx.outfmt6 \\\n", "--no-check-certificate" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "lcl|NC_047559.1_mrna_XM_034463066.1_26\tsp|A1KZ92|PXDNL_HUMAN\t24.891\t229\t153\t6\t867\t1535\t278\t493\t4.55e-05\t51.2\n", "lcl|NC_047559.1_mrna_XM_034463224.1_35\tsp|Q7LHG5|YI31B_YEAST\t50.658\t152\t71\t1\t1072\t1515\t617\t768\t8.84e-40\t157\n", "lcl|NC_047559.1_mrna_XM_034456887.1_39\tsp|A1A5V9|ELP5_DANRE\t28.873\t284\t194\t4\t356\t1198\t1\t279\t6.43e-29\t118\n", "lcl|NC_047559.1_mrna_XM_011455176.3_47\tsp|A1A5V9|ELP5_DANRE\t28.873\t284\t194\t4\t376\t1218\t1\t279\t7.04e-29\t118\n", "lcl|NC_047559.1_mrna_XM_011455177.3_48\tsp|A1A5V9|ELP5_DANRE\t28.873\t284\t194\t4\t245\t1087\t1\t279\t3.80e-29\t118\n", "lcl|NC_047559.1_mrna_XM_011419438.3_53\tsp|A0A159BP93|CITB_MONRU\t30.795\t302\t179\t11\t3053\t2199\t3\t291\t3.63e-29\t123\n", "lcl|NC_047559.1_mrna_XM_020064467.2_54\tsp|A0A159BP93|CITB_MONRU\t30.795\t302\t179\t11\t3307\t2453\t3\t291\t3.97e-29\t123\n", "lcl|NC_047559.1_mrna_XM_011419437.3_55\tsp|O35077|GPDA_RAT\t59.259\t351\t140\t3\t111\t1160\t1\t349\t4.39e-140\t434\n", "lcl|NC_047559.1_mrna_XM_011419460.2_56\tsp|A0A159BP93|CITB_MONRU\t30.795\t302\t179\t11\t1\t855\t3\t291\t4.56e-31\t122\n", "lcl|NC_047559.1_mrna_XM_011419435.3_57\tsp|Q2KJE0|TAXB1_BOVIN\t26.958\t779\t414\t20\t174\t2330\t7\t690\t1.25e-54\t209\n", " 93983 20210605-cgigas-roslin-mito-blastx.outfmt6\n" ] } ], "source": [ "#Check output\n", "!head 20210605-cgigas-roslin-mito-blastx.outfmt6\n", "!wc -l 20210605-cgigas-roslin-mito-blastx.outfmt6" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#convert pipes to tab to isolate Uniprot accession code\n", "!tr '|' '\\t' < 20210605-cgigas-roslin-mito-blastx.outfmt6 \\\n", "> cgigas-roslin-mito-blastx.outfmt6.codeIsolated" ] }, { "cell_type": "code", "execution_count": 149, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "lcl\tNC_047559.1_mrna_XM_034463066.1_26\tsp\tA1KZ92\tPXDNL_HUMAN\t24.891\t229\t153\t6\t867\t1535\t278\t493\t4.55e-05\t51.2\n", "lcl\tNC_047559.1_mrna_XM_034463224.1_35\tsp\tQ7LHG5\tYI31B_YEAST\t50.658\t152\t71\t1\t1072\t1515\t617\t768\t8.84e-40\t157\n", "lcl\tNC_047559.1_mrna_XM_034456887.1_39\tsp\tA1A5V9\tELP5_DANRE\t28.873\t284\t194\t4\t356\t1198\t1\t279\t6.43e-29\t118\n", "lcl\tNC_047559.1_mrna_XM_011455176.3_47\tsp\tA1A5V9\tELP5_DANRE\t28.873\t284\t194\t4\t376\t1218\t1\t279\t7.04e-29\t118\n", "lcl\tNC_047559.1_mrna_XM_011455177.3_48\tsp\tA1A5V9\tELP5_DANRE\t28.873\t284\t194\t4\t245\t1087\t1\t279\t3.80e-29\t118\n", "lcl\tNC_047559.1_mrna_XM_011419438.3_53\tsp\tA0A159BP93\tCITB_MONRU\t30.795\t302\t179\t11\t3053\t2199\t3\t291\t3.63e-29\t123\n", "lcl\tNC_047559.1_mrna_XM_020064467.2_54\tsp\tA0A159BP93\tCITB_MONRU\t30.795\t302\t179\t11\t3307\t2453\t3\t291\t3.97e-29\t123\n", "lcl\tNC_047559.1_mrna_XM_011419437.3_55\tsp\tO35077\tGPDA_RAT\t59.259\t351\t140\t3\t111\t1160\t1\t349\t4.39e-140\t434\n", "lcl\tNC_047559.1_mrna_XM_011419460.2_56\tsp\tA0A159BP93\tCITB_MONRU\t30.795\t302\t179\t11\t1\t855\t3\t291\t4.56e-31\t122\n", "lcl\tNC_047559.1_mrna_XM_011419435.3_57\tsp\tQ2KJE0\tTAXB1_BOVIN\t26.958\t779\t414\t20\t174\t2330\t7\t690\t1.25e-54\t209\n", " 93983 cgigas-roslin-mito-blastx.outfmt6.codeIsolated\n" ] } ], "source": [ "!head cgigas-roslin-mito-blastx.outfmt6.codeIsolated\n", "!wc -l cgigas-roslin-mito-blastx.outfmt6.codeIsolated" ] }, { "cell_type": "code", "execution_count": 147, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Extract column with transcript ID\n", "#Convert _ to tab\n", "#Extract column with transcript ID\n", "#Add XM_ to the front of each ID\n", "\n", "!cut -f2 cgigas-roslin-mito-blastx.outfmt6.codeIsolated \\\n", "| tr \"_\" \"\\t\" \\\n", "| cut -f5 \\\n", "| sed 's/^/XM_/' \\\n", "> cgigas-roslin-mito-blastx.outfmt6.transcriptID" ] }, { "cell_type": "code", "execution_count": 148, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_034463066.1\n", "XM_034463224.1\n", "XM_034456887.1\n", "XM_011455176.3\n", "XM_011455177.3\n", "XM_011419438.3\n", "XM_020064467.2\n", "XM_011419437.3\n", "XM_011419460.2\n", "XM_011419435.3\n", " 93983 cgigas-roslin-mito-blastx.outfmt6.transcriptID\n" ] } ], "source": [ "!head cgigas-roslin-mito-blastx.outfmt6.transcriptID\n", "!wc -l cgigas-roslin-mito-blastx.outfmt6.transcriptID" ] }, { "cell_type": "code", "execution_count": 150, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_034463066.1\tlcl\tNC_047559.1_mrna_XM_034463066.1_26\tsp\tA1KZ92\tPXDNL_HUMAN\t24.891\t229\t153\t6\t867\t1535\t278\t493\t4.55e-05\t51.2\r\n", "XM_034463224.1\tlcl\tNC_047559.1_mrna_XM_034463224.1_35\tsp\tQ7LHG5\tYI31B_YEAST\t50.658\t152\t71\t1\t1072\t1515\t617\t768\t8.84e-40\t157\r\n", "XM_034456887.1\tlcl\tNC_047559.1_mrna_XM_034456887.1_39\tsp\tA1A5V9\tELP5_DANRE\t28.873\t284\t194\t4\t356\t1198\t1\t279\t6.43e-29\t118\r\n", "XM_011455176.3\tlcl\tNC_047559.1_mrna_XM_011455176.3_47\tsp\tA1A5V9\tELP5_DANRE\t28.873\t284\t194\t4\t376\t1218\t1\t279\t7.04e-29\t118\r\n", "XM_011455177.3\tlcl\tNC_047559.1_mrna_XM_011455177.3_48\tsp\tA1A5V9\tELP5_DANRE\t28.873\t284\t194\t4\t245\t1087\t1\t279\t3.80e-29\t118\r\n", "XM_011419438.3\tlcl\tNC_047559.1_mrna_XM_011419438.3_53\tsp\tA0A159BP93\tCITB_MONRU\t30.795\t302\t179\t11\t3053\t2199\t3\t291\t3.63e-29\t123\r\n", "XM_020064467.2\tlcl\tNC_047559.1_mrna_XM_020064467.2_54\tsp\tA0A159BP93\tCITB_MONRU\t30.795\t302\t179\t11\t3307\t2453\t3\t291\t3.97e-29\t123\r\n", "XM_011419437.3\tlcl\tNC_047559.1_mrna_XM_011419437.3_55\tsp\tO35077\tGPDA_RAT\t59.259\t351\t140\t3\t111\t1160\t1\t349\t4.39e-140\t434\r\n", "XM_011419460.2\tlcl\tNC_047559.1_mrna_XM_011419460.2_56\tsp\tA0A159BP93\tCITB_MONRU\t30.795\t302\t179\t11\t1\t855\t3\t291\t4.56e-31\t122\r\n", "XM_011419435.3\tlcl\tNC_047559.1_mrna_XM_011419435.3_57\tsp\tQ2KJE0\tTAXB1_BOVIN\t26.958\t779\t414\t20\t174\t2330\t7\t690\t1.25e-54\t209\r\n" ] } ], "source": [ "#Paste original transcript ID with codeIsolated file\n", "#Check output\n", "\n", "!paste cgigas-roslin-mito-blastx.outfmt6.transcriptID cgigas-roslin-mito-blastx.outfmt6.codeIsolated \\\n", "> cgigas-roslin-mito-blastx.outfmt6.codeIsolated.transcriptID\n", "!head cgigas-roslin-mito-blastx.outfmt6.codeIsolated.transcriptID" ] }, { "cell_type": "code", "execution_count": 151, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Reduce the number of columns using awk: accession code, transcript ID, and e-value\n", "#Sort, and save as a new file.\n", "!awk -v OFS='\\t' '{print $5, $1, $15}' < cgigas-roslin-mito-blastx.outfmt6.codeIsolated.transcriptID | sort \\\n", "> gigas-blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 152, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A060WQA3\tXM_011455784.3\t4.79e-06\r\n", "A0A060WQA3\tXM_011455785.3\t4.65e-06\r\n", "A0A061ACU2\tXM_034483499.1\t0.0\r\n", "A0A061ACU2\tXM_034483499.1\t1.40e-12\r\n", "A0A061ACU2\tXM_034483499.1\t2.13e-27\r\n", "A0A061ACU2\tXM_034483500.1\t0.0\r\n", "A0A061ACU2\tXM_034483500.1\t1.40e-12\r\n", "A0A061ACU2\tXM_034483500.1\t2.44e-27\r\n", "A0A061ACU2\tXM_034483501.1\t0.0\r\n", "A0A061ACU2\tXM_034483501.1\t1.40e-12\r\n" ] } ], "source": [ "!head gigas-blast-sort.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1b. Join with GOterms" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 340M 100 340M 0 0 26.1M 0 0:00:12 0:00:12 --:--:-- 50.6M\n" ] } ], "source": [ "#Download Uniprot database with GOterm information\n", "!curl -O http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A023GPI8\tLECA_CANBL\treviewed\tLectin alpha chain (CboL) [Cleaved into: Lectin beta chain; Lectin gamma chain]\t\tCanavalia boliviana\t237\t\t\tmannose binding [GO:0005537]; metal ion binding [GO:0046872]\tmannose binding [GO:0005537]; metal ion binding [GO:0046872]\tGO:0005537; GO:0046872\n", "A0A023GPJ0\tCDII_ENTCC\treviewed\tImmunity protein CdiI\tcdiI ECL_04450.1\tEnterobacter cloacae subsp. cloacae (strain ATCC 13047 / DSM 30054 / NBRC 13535 / NCDC 279-56)\t145\t\t\t\t\t\n", "A0A023PXA5\tYA19A_YEAST\treviewed\tPutative uncharacterized protein YAL019W-A\tYAL019W-A\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t189\t\t\t\t\t\n", "A0A023PXB0\tYA019_YEAST\treviewed\tPutative uncharacterized protein YAR019W-A\tYAR019W-A\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t110\t\t\t\t\t\n", "A0A023PXB5\tIRC2_YEAST\treviewed\tPutative uncharacterized membrane protein IRC2 (Increased recombination centers protein 2)\tIRC2 YDR112W\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t102\t\tintegral component of membrane [GO:0016021]\t\tintegral component of membrane [GO:0016021]\tGO:0016021\n", "A0A023PXB9\tYD99W_YEAST\treviewed\tPutative uncharacterized membrane protein YDR199W\tYDR199W\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t121\t\tintegral component of membrane [GO:0016021]\t\tintegral component of membrane [GO:0016021]\tGO:0016021\n", "A0A023PXC2\tYE53A_YEAST\treviewed\tPutative uncharacterized membrane protein YEL053W-A\tYEL053W-A\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t115\t\tintegral component of membrane [GO:0016021]\t\tintegral component of membrane [GO:0016021]\tGO:0016021\n", "A0A023PXC7\tYE068_YEAST\treviewed\tPutative uncharacterized membrane protein YER068C-A\tYER068C-A\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t143\t\tintegral component of membrane [GO:0016021]\t\tintegral component of membrane [GO:0016021]\tGO:0016021\n", "A0A023PXD3\tYE88A_YEAST\treviewed\tPutative uncharacterized protein YER088C-A\tYER088C-A\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t107\t\t\t\t\t\n", "A0A023PXD5\tYE147_YEAST\treviewed\tPutative uncharacterized membrane protein YER147C-A\tYER147C-A\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t136\t\tintegral component of membrane [GO:0016021]\t\tintegral component of membrane [GO:0016021]\tGO:0016021\n", " 555101 uniprot-SP-GO.sorted\n" ] } ], "source": [ "!head uniprot-SP-GO.sorted\n", "!wc -l uniprot-SP-GO.sorted" ] }, { "cell_type": "code", "execution_count": 153, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A0A7DNP6\tXM_001305294.1\t8.75e-10\tGRHLP_RUDPH\treviewed\tPrepro-gonadotropin-releasing hormone-like protein (rp-GnRH) [Cleaved into: GnRH dodecapeptide; GnRH-associated peptide (GAP)]\t\tRuditapes philippinarum (Japanese littleneck clam) (Venerupis philippinarum)\t94\tneuropeptide signaling pathway [GO:0007218]\textracellular region [GO:0005576]\t\textracellular region [GO:0005576]; neuropeptide signaling pathway [GO:0007218]\tGO:0005576; GO:0007218\n", "A0A0B5A7M7\tXM_001308866.2\t3.97e-12\tINS1_CONIM\treviewed\tCon-Ins Im1 (Insulin 1) [Cleaved into: Con-Ins I1 B chain; Con-Ins I1 A chain]\t\tConus imperialis (Imperial cone)\t150\tglucose metabolic process [GO:0006006]\textracellular region [GO:0005576]\thormone activity [GO:0005179]\textracellular region [GO:0005576]; hormone activity [GO:0005179]; glucose metabolic process [GO:0006006]\tGO:0005179; GO:0005576; GO:0006006\n", "A0A0B5A7M7\tXM_034475035.1\t1.48e-12\tINS1_CONIM\treviewed\tCon-Ins Im1 (Insulin 1) [Cleaved into: Con-Ins I1 B chain; Con-Ins I1 A chain]\t\tConus imperialis (Imperial cone)\t150\tglucose metabolic process [GO:0006006]\textracellular region [GO:0005576]\thormone activity [GO:0005179]\textracellular region [GO:0005576]; hormone activity [GO:0005179]; glucose metabolic process [GO:0006006]\tGO:0005179; GO:0005576; GO:0006006\n", "A0A0B5A8P8\tXM_011417422.3\t4.54e-15\tINS2_CONIM\treviewed\tCon-Ins Im2 (Insulin 2) [Cleaved into: Con-Ins I2 B chain; Con-Ins I2 A chain]\t\tConus imperialis (Imperial cone)\t140\tglucose metabolic process [GO:0006006]\textracellular region [GO:0005576]\thormone activity [GO:0005179]\textracellular region [GO:0005576]; hormone activity [GO:0005179]; glucose metabolic process [GO:0006006]\tGO:0005179; GO:0005576; GO:0006006\n", "A0A0F7YYX3\tXM_011419035.3\t2.09e-19\tCPROH_CONVC\treviewed\tNeuropeptide prohormone-4\t\tConus victoriae (Queen Victoria cone)\t196\t\textracellular region [GO:0005576]\t\textracellular region [GO:0005576]\tGO:0005576\n", "A0A0F7YYX3\tXM_011419037.3\t2.71e-19\tCPROH_CONVC\treviewed\tNeuropeptide prohormone-4\t\tConus victoriae (Queen Victoria cone)\t196\t\textracellular region [GO:0005576]\t\textracellular region [GO:0005576]\tGO:0005576\n", "A0A0F7YYX3\tXM_011419038.3\t2.28e-19\tCPROH_CONVC\treviewed\tNeuropeptide prohormone-4\t\tConus victoriae (Queen Victoria cone)\t196\t\textracellular region [GO:0005576]\t\textracellular region [GO:0005576]\tGO:0005576\n", "A0A0F7YYX3\tXM_011419039.3\t2.39e-19\tCPROH_CONVC\treviewed\tNeuropeptide prohormone-4\t\tConus victoriae (Queen Victoria cone)\t196\t\textracellular region [GO:0005576]\t\textracellular region [GO:0005576]\tGO:0005576\n", "A0A0F7YYX3\tXM_020064360.2\t1.79e-19\tCPROH_CONVC\treviewed\tNeuropeptide prohormone-4\t\tConus victoriae (Queen Victoria cone)\t196\t\textracellular region [GO:0005576]\t\textracellular region [GO:0005576]\tGO:0005576\n", "A0A0F7YZI5\tXM_011442798.3\t8.33e-33\tCTHB5_CONVC\treviewed\tThyrostimulin beta-5 subunit\t\tConus victoriae (Queen Victoria cone)\t135\t\textracellular region [GO:0005576]\thormone activity [GO:0005179]\textracellular region [GO:0005576]; hormone activity [GO:0005179]\tGO:0005179; GO:0005576\n", " 92197 gigas-blast-annot.tab\n" ] } ], "source": [ "#Join the first column in the first file with the first column in the second file\n", "#The files are tab delimited, and the output should also be tab delimited (-t $'\\t')\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "gigas-blast-sort.tab \\\n", "uniprot-SP-GO.sorted \\\n", "> gigas-blast-annot.tab\n", "!head gigas-blast-annot.tab\n", "!wc -l gigas-blast-annot.tab" ] }, { "cell_type": "code", "execution_count": 154, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A0A7DNP6\tXM_001305294.1\tGO:0005576; GO:0007218\r\n", "A0A0B5A7M7\tXM_001308866.2\tGO:0005179; GO:0005576; GO:0006006\r\n", "A0A0B5A7M7\tXM_034475035.1\tGO:0005179; GO:0005576; GO:0006006\r\n", "A0A0B5A8P8\tXM_011417422.3\tGO:0005179; GO:0005576; GO:0006006\r\n", "A0A0F7YYX3\tXM_011419035.3\tGO:0005576\r\n", "A0A0F7YYX3\tXM_011419037.3\tGO:0005576\r\n", "A0A0F7YYX3\tXM_011419038.3\tGO:0005576\r\n", "A0A0F7YYX3\tXM_011419039.3\tGO:0005576\r\n", "A0A0F7YYX3\tXM_020064360.2\tGO:0005576\r\n", "A0A0F7YZI5\tXM_011442798.3\tGO:0005179; GO:0005576\r\n" ] } ], "source": [ "#Extract columns 1 (accession), 2 (transcript ID), and 14 (GOterms)\n", "#Save output\n", "!cut -f1,2,14 gigas-blast-annot.tab \\\n", "> _blast-annot.tab\n", "!head _blast-annot.tab" ] }, { "cell_type": "code", "execution_count": 155, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash \n", "\n", "# This script was originally written to address a specific problem that Rhonda was having\n", "\n", "# input_file is the initial, \"problem\" file\n", "# file is an intermediate file that most of the program works upon\n", "# output_file is the final file produced by the script\n", "input_file=\"_blast-annot.tab\"\n", "file=\"_intermediate.file\"\n", "output_file=\"_blast-GO-unfolded.tab\"\n", "\n", "# sed command substitutes the \"; \" sequence to a tab and writes the new format to a new file.\n", "# This character sequence is how the GO terms are delimited in their field.\n", "sed $'s/; /\\t/g' \"$input_file\" > \"$file\"\n", "\n", "# Identify first field containing a GO term.\n", "# Search file with grep for \"GO:\" and pipe to awk.\n", "# Awk sets tab as field delimiter (-F'\\t'), runs a for loop that looks for \"GO:\" (~/GO:/), and then prints the field number).\n", "# Awk results are piped to sort, which sorts unique by number (-ug).\n", "# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; \"-n1\").\n", "begin_goterms=$(grep \"GO:\" \"$file\" | awk -F'\\t' '{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}' | sort -ug | head -n1)\n", "\n", "# While loop to process each line of the input file.\n", "while read -r line\n", "\tdo\n", "\t\n", "\t# Send contents of the current line to awk.\n", "\t# Set the field separator as a tab (-F'\\t') and print the number of fields in that line.\n", "\t# Save the results of the echo/awk pipe (i.e. number of fields) to the variable \"max_field\".\n", "\tmax_field=$(echo \"$line\" | awk -F'\\t' '{print NF}')\n", "\n", "\t# Send contents of current line to cut.\n", "\t# Cut fields (i.e. retain those fields) 1-12.\n", "\t# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable \"fixed_fields\"\n", "\tfixed_fields=$(echo \"$line\" | cut -f1-2)\n", "\n", "\t# Since not all the lines contain the same number of fields (e.g. may not have GO terms),\n", "\t# evaluate the number of fields in each line to determine how to handle current line.\n", "\n", "\t# If the value in max_field is less than the field number where the GO terms begin,\n", "\t# then just print the current line (%s) followed by a newline (\\n).\n", "\tif (( \"$max_field\" < \"$begin_goterms\" ))\n", "\t\tthen printf \"%s\\n\" \"$line\"\n", "\t\t\telse\n", "\n", "\t\t\t# Send contents of current line (which contains GO terms) to cut.\n", "\t\t\t# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.\n", "\t\t\t# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable \"goterms\".\n", "\t\t\tgoterms=$(echo \"$line\" | cut -f\"$begin_goterms\"-\"$max_field\")\n", "\t\t\t\n", "\t\t\t# Assign values in the variable \"goterms\" to a new indexed array (called \"array\"), \n", "\t\t\t# with tab delimiter (IFS=$'\\t')\n", "\t\t\tIFS=$'\\t' read -r -a array <<<\"$goterms\"\n", "\t\t\t\n", "\t\t\t# Iterate through each element of the array.\n", "\t\t\t# Print the first 12 fields (i.e. the fields stored in \"fixed_fields\") followed by a tab (%s\\t).\n", "\t\t\t# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\\n).\n", "\t\t\tfor element in \"${!array[@]}\"\t\n", "\t\t\t\tdo printf \"%s\\t%s\\n\" \"$fixed_fields\" \"${array[$element]}\"\n", "\t\t\tdone\n", "\tfi\n", "\n", "# Send the input file into the while loop and send the output to a file named \"rhonda_fixed.txt\".\n", "done < \"$file\" > \"$output_file\"" ] }, { "cell_type": "code", "execution_count": 156, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A0A7DNP6\tXM_001305294.1\tGO:0005576\r\n", "A0A0A7DNP6\tXM_001305294.1\tGO:0007218\r\n", "A0A0B5A7M7\tXM_001308866.2\tGO:0005179\r\n", "A0A0B5A7M7\tXM_001308866.2\tGO:0005576\r\n", "A0A0B5A7M7\tXM_001308866.2\tGO:0006006\r\n", "A0A0B5A7M7\tXM_034475035.1\tGO:0005179\r\n", "A0A0B5A7M7\tXM_034475035.1\tGO:0005576\r\n", "A0A0B5A7M7\tXM_034475035.1\tGO:0006006\r\n", "A0A0B5A8P8\tXM_011417422.3\tGO:0005179\r\n", "A0A0B5A8P8\tXM_011417422.3\tGO:0005576\r\n" ] } ], "source": [ "!head _blast-GO-unfolded.tab" ] }, { "cell_type": "code", "execution_count": 157, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Reorganize and sort columns\n", "!awk '{print $3\"\\t\"$2}' _blast-GO-unfolded.tab | gsort -V > _blast-GO-unfolded.sorted" ] }, { "cell_type": "code", "execution_count": 158, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GO:0000002\tXM_004596953.1\n", "GO:0000002\tXM_004599087.1\n", "GO:0000002\tXM_004599974.1\n", "GO:0000002\tXM_004602618.1\n", "GO:0000002\tXM_004604080.1\n", "GO:0000002\tXM_011413926.3\n", "GO:0000002\tXM_011416774.2\n", "GO:0000002\tXM_011428102.3\n", "GO:0000002\tXM_011430231.3\n", "GO:0000002\tXM_011434743.3\n", " 1565846 3129025 40681933 _blast-GO-unfolded.sorted\n" ] } ], "source": [ "!head _blast-GO-unfolded.sorted\n", "!wc _blast-GO-unfolded.sorted" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1c. Join with GO Slim terms" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 2314k 100 2314k 0 0 662k 0 0:00:03 0:00:03 --:--:-- 663k\n" ] } ], "source": [ "#Get GO to GOSlim matching\n", "!curl -O http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GO:0000001\tmitochondrion inheritance\tcell organization and biogenesis\tP\n", "GO:0000002\tmitochondrial genome maintenance\tcell organization and biogenesis\tP\n", "GO:0000003\treproduction\tother biological processes\tP\n", "GO:0000006\thigh affinity zinc uptake transmembrane transporter activity\ttransporter activity\tF\n", "GO:0000007\tlow-affinity zinc ion transmembrane transporter activity\ttransporter activity\tF\n", "GO:0000009\t\"alpha-1,6-mannosyltransferase activity\"\tother molecular function\tF\n", "GO:0000010\ttrans-hexaprenyltranstransferase activity\tother molecular function\tF\n", "GO:0000011\tvacuole inheritance\tcell organization and biogenesis\tP\n", "GO:0000012\tsingle strand break repair\tDNA metabolism\tP\n", "GO:0000012\tsingle strand break repair\tstress response\tP\n", " 30796 GO-GOslim.sorted\n" ] } ], "source": [ "!head GO-GOslim.sorted\n", "!wc -l GO-GOslim.sorted" ] }, { "cell_type": "code", "execution_count": 162, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_\tGO:0016021\tintegral to membrane\tother membranes\tC\n", "XM_001305288.1\tGO:0001501\tskeletal system development\tdevelopmental processes\tP\n", "XM_001305288.1\tGO:0005576\textracellular region\tnon-structural extracellular\tC\n", "XM_001305288.1\tGO:0005595\tcollagen type XII\textracellular matrix\tC\n", "XM_001305288.1\tGO:0005615\textracellular space\tnon-structural extracellular\tC\n", "XM_001305288.1\tGO:0005788\tendoplasmic reticulum lumen\tER/Golgi\tC\n", "XM_001305288.1\tGO:0007155\tcell adhesion\tcell adhesion\tP\n", "XM_001305288.1\tGO:0030020\textracellular matrix structural constituent conferring tensile strength\textracellular structural activity\tF\n", "XM_001305288.1\tGO:0030199\tcollagen fibril organization\tcell organization and biogenesis\tP\n", "XM_001305288.1\tGO:0030574\tcollagen catabolic process\tother metabolic processes\tP\n", " 754502 Blastquery-GOslim.tab\n" ] } ], "source": [ "#Join files to get GOslim for each query (with duplicate GOslim / query removed)\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "_blast-GO-unfolded.sorted \\\n", "GO-GOslim.sorted \\\n", "| uniq | awk -F'\\t' -v OFS='\\t' '{print $2, $1, $3, $4, $5}' \\\n", "| sort \\\n", "> Blastquery-GOslim.tab\n", "!head Blastquery-GOslim.tab\n", "!wc -l Blastquery-GOslim.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Match CpG background and DML lists with GOterms" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### 2a. Obtain transcript and gene ID information from mRNA track" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_047559.1\tGnomon\tmRNA\t14114\t15804\t.\t+\t.\tID=rna-XM_034463183.1;Parent=gene-LOC109621113;Dbxref=GeneID:109621113,Genbank:XM_034463183.1;Name=XM_034463183.1;gbkey=mRNA;gene=LOC109621113;model_evidence=Supporting evidence includes similarity to: 78%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 21 samples with support for all annotated introns;product=uncharacterized LOC109621113;transcript_id=XM_034463183.1\n", "NC_047559.1\tGnomon\tmRNA\t16867\t19160\t.\t-\t.\tID=rna-XM_034463195.1;Parent=gene-LOC117687066;Dbxref=GeneID:117687066,Genbank:XM_034463195.1;Name=XM_034463195.1;gbkey=mRNA;gene=LOC117687066;model_evidence=Supporting evidence includes similarity to: 98%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 14 samples with support for all annotated introns;product=uncharacterized LOC117687066;transcript_id=XM_034463195.1\n", "NC_047559.1\tGnomon\tmRNA\t61887\t71038\t.\t-\t.\tID=rna-XM_034471753.1;Parent=gene-LOC117689737;Dbxref=GeneID:117689737,Genbank:XM_034471753.1;Name=XM_034471753.1;gbkey=mRNA;gene=LOC117689737;model_evidence=Supporting evidence includes similarity to: 1 long SRA read%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 3 samples with support for all annotated introns;product=uncharacterized LOC117689737%2C transcript variant X8;transcript_id=XM_034471753.1\n", "NC_047559.1\tGnomon\tmRNA\t80321\t86124\t.\t+\t.\tID=rna-XM_034463075.1;Parent=gene-LOC117687025;Dbxref=GeneID:117687025,Genbank:XM_034463075.1;Name=XM_034463075.1;gbkey=mRNA;gene=LOC117687025;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 16 samples with support for all annotated introns;product=uncharacterized LOC117687025;transcript_id=XM_034463075.1\n", "NC_047559.1\tGnomon\tmRNA\t97820\t126387\t.\t+\t.\tID=rna-XM_034463066.1;Parent=gene-LOC105333279;Dbxref=GeneID:105333279,Genbank:XM_034463066.1;Name=XM_034463066.1;gbkey=mRNA;gene=LOC105333279;model_evidence=Supporting evidence includes similarity to: 31 long SRA reads%2C and 99%25 coverage of the annotated genomic feature by RNAseq alignments;product=uncharacterized LOC105333279;transcript_id=XM_034463066.1\n", "NC_047559.1\tGnomon\tmRNA\t139293\t141527\t.\t-\t.\tID=rna-XM_034463207.1;Parent=gene-LOC105321471;Dbxref=GeneID:105321471,Genbank:XM_034463207.1;Name=XM_034463207.1;gbkey=mRNA;gene=LOC105321471;model_evidence=Supporting evidence includes similarity to: 16 Proteins;product=uncharacterized LOC105321471;transcript_id=XM_034463207.1\n", "NC_047559.1\tGnomon\tmRNA\t142829\t146045\t.\t-\t.\tID=rna-XM_034464216.1;Parent=gene-LOC105321472;Dbxref=GeneID:105321472,Genbank:XM_034464216.1;Name=XM_034464216.1;gbkey=mRNA;gene=LOC105321472;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 32 samples with support for all annotated introns;product=uncharacterized LOC105321472;transcript_id=XM_034464216.1\n", "NC_047559.1\tGnomon\tmRNA\t151758\t185673\t.\t+\t.\tID=rna-XM_034463215.1;Parent=gene-LOC117687070;Dbxref=GeneID:117687070,Genbank:XM_034463215.1;Name=XM_034463215.1;gbkey=mRNA;gene=LOC117687070;model_evidence=Supporting evidence includes similarity to: 1 long SRA read%2C and 92%25 coverage of the annotated genomic feature by RNAseq alignments;product=uncharacterized LOC117687070;transcript_id=XM_034463215.1\n", "NC_047559.1\tGnomon\tmRNA\t203632\t209757\t.\t+\t.\tID=rna-XM_034481799.1;Parent=gene-LOC105344774;Dbxref=GeneID:105344774,Genbank:XM_034481799.1;Name=XM_034481799.1;gbkey=mRNA;gene=LOC105344774;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments;product=uncharacterized LOC105344774;transcript_id=XM_034481799.1\n", "NC_047559.1\tGnomon\tmRNA\t265780\t267611\t.\t-\t.\tID=rna-XM_034463224.1;Parent=gene-LOC117687074;Dbxref=GeneID:117687074,Genbank:XM_034463224.1;Name=XM_034463224.1;gbkey=mRNA;gene=LOC117687074;model_evidence=Supporting evidence includes similarity to: 1 Protein;product=uncharacterized LOC117687074;transcript_id=XM_034463224.1\n", " 63341 ../../genome-feature-files/cgigas_uk_roslin_v1_mRNA.gff\n" ] } ], "source": [ "!head ../../genome-feature-files/cgigas_uk_roslin_v1_mRNA.gff\n", "!wc -l ../../genome-feature-files/cgigas_uk_roslin_v1_mRNA.gff" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Isolate column with ID information\n", "#Convert multiple delimiters to tabs\n", "#Isolate transcript and gene ID columns\n", "#Ensure they are tab-delimited\n", "#Sort by gene ID\n", "#Save output\n", "\n", "!cut -f9 ../../genome-feature-files/cgigas_uk_roslin_v1_mRNA.gff \\\n", "| tr \"=;:-,\" \"\\t\" \\\n", "| cut -f3,9 \\\n", "| awk '{print $2\"\\t\"$1}' \\\n", "| sort \\\n", "> cgigas_uk_roslin_v1_mRNA.transcriptID.geneID" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105317035\tXM_011413559.3\n", "105317035\tXM_034443160.1\n", "105317035\tXM_034443162.1\n", "105317035\tXM_034443163.1\n", "105317035\tXM_034443164.1\n", "105317036\tXM_034443166.1\n", "105317036\tXM_034443167.1\n", "105317036\tXM_034443168.1\n", "105317040\tXM_011413566.3\n", "105317040\tXM_034443178.1\n", " 63341 cgigas_uk_roslin_v1_mRNA.transcriptID.geneID\n" ] } ], "source": [ "!head cgigas_uk_roslin_v1_mRNA.transcriptID.geneID\n", "!wc -l cgigas_uk_roslin_v1_mRNA.transcriptID.geneID" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2b. ploidy-DML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Format file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I want chr, start, end, and gene ID." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_047559.1\t3159595\t3159597\tNC_047559.1\tGnomon\tgene\t3158575\t3169070\t.\t+\t.\tID=gene-LOC105342725;Dbxref=GeneID:105342725;Name=LOC105342725;gbkey=Gene;gene=LOC105342725;gene_biotype=protein_coding\r\n", "NC_047559.1\t3159620\t3159622\tNC_047559.1\tGnomon\tgene\t3158575\t3169070\t.\t+\t.\tID=gene-LOC105342725;Dbxref=GeneID:105342725;Name=LOC105342725;gbkey=Gene;gene=LOC105342725;gene_biotype=protein_coding\r\n", "NC_047559.1\t30739063\t30739065\tNC_047559.1\tGnomon\tgene\t30728582\t30741948\t.\t-\t.\tID=gene-LOC105344651;Dbxref=GeneID:105344651;Name=LOC105344651;gbkey=Gene;gene=LOC105344651;gene_biotype=protein_coding\r\n", "NC_047559.1\t43886947\t43886949\tNC_047559.1\tGnomon\tgene\t43877299\t43899559\t.\t+\t.\tID=gene-LOC105339780;Dbxref=GeneID:105339780;Name=LOC105339780;gbkey=Gene;gene=LOC105339780;gene_biotype=protein_coding\r\n", "NC_047559.1\t44191406\t44191408\tNC_047559.1\tGnomon\tgene\t44187569\t44214377\t.\t+\t.\tID=gene-LOC117687755;Dbxref=GeneID:117687755;Name=LOC117687755;gbkey=Gene;gene=LOC117687755;gene_biotype=protein_coding\r\n", "NC_047559.1\t45984057\t45984059\tNC_047559.1\tGnomon\tgene\t45976550\t45993139\t.\t-\t.\tID=gene-LOC105333378;Dbxref=GeneID:105333378;Name=LOC105333378;gbkey=Gene;gene=LOC105333378;gene_biotype=protein_coding\r\n", "NC_047559.1\t47884062\t47884064\tNC_047559.1\tGnomon\tgene\t47880293\t47888292\t.\t+\t.\tID=gene-LOC117684625;Dbxref=GeneID:117684625;Name=LOC117684625;gbkey=Gene;gene=LOC117684625;gene_biotype=protein_coding\r\n", "NC_047559.1\t48771720\t48771722\tNC_047559.1\tGnomon\tgene\t48767452\t48775109\t.\t+\t.\tID=gene-LOC105341853;Dbxref=GeneID:105341853;Name=LOC105341853;gbkey=Gene;gene=LOC105341853;gene_biotype=protein_coding\r\n", "NC_047559.1\t50090321\t50090323\tNC_047559.1\tGnomon\tgene\t50064798\t50106863\t.\t-\t.\tID=gene-LOC105320585;Dbxref=GeneID:105320585;Name=LOC105320585;gbkey=Gene;gene=LOC105320585;gene_biotype=protein_coding\r\n", "NC_047559.1\t53771128\t53771130\tNC_047559.1\tGnomon\tgene\t53755998\t53781200\t.\t+\t.\tID=gene-LOC105341160;Dbxref=GeneID:105341160;Name=LOC105341160;gbkey=Gene;gene=LOC105341160;gene_biotype=protein_coding\r\n" ] } ], "source": [ "!head ../Haws_07-DML-characterization/DML-ploidy-DSS-Gene-wb.bed" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Isolate column with gene ID information\n", "#Convert =, :, and ; to \\t\n", "#Isolate gene ID\n", "\n", "!cut -f12 ../Haws_07-DML-characterization/DML-ploidy-DSS-Gene-wb.bed \\\n", "| tr \"=:;\" \"\\t\" \\\n", "| cut -f5 \\\n", "> DML-ploidy-DSS.GeneIDs" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105342725\n", "105342725\n", "105344651\n", "105339780\n", "117687755\n", "105333378\n", "117684625\n", "105341853\n", "105320585\n", "105341160\n", " 161 DML-ploidy-DSS.GeneIDs\n" ] } ], "source": [ "!head DML-ploidy-DSS.GeneIDs\n", "!wc -l DML-ploidy-DSS.GeneIDs" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!paste DML-ploidy-DSS.GeneIDs ../Haws_07-DML-characterization/DML-ploidy-DSS-Gene-wb.bed \\\n", "| awk -F'\\t' -v OFS='\\t' '{print $1, $2, $3, $4}' \\\n", "| sort \\\n", "> DML-ploidy-DSS.GeneIDs.geneOverlap" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105318605\tNC_047568.1\t27826391\t27826393\n", "105319896\tNC_047568.1\t34528505\t34528507\n", "105320032\tNC_047566.1\t54523863\t54523865\n", "105320110\tNC_047561.1\t13511702\t13511704\n", "105320498\tNC_047564.1\t31732003\t31732005\n", "105320498\tNC_047564.1\t31752912\t31752914\n", "105320585\tNC_047559.1\t50090321\t50090323\n", "105320771\tNC_047565.1\t2751626\t2751628\n", "105321034\tNC_047566.1\t30263504\t30263506\n", "105321354\tNC_047561.1\t29067202\t29067204\n", " 161 DML-ploidy-DSS.GeneIDs.geneOverlap\n" ] } ], "source": [ "!head DML-ploidy-DSS.GeneIDs.geneOverlap\n", "!wc -l DML-ploidy-DSS.GeneIDs.geneOverlap" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with transcript IDs" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Join files to get transcript ID for DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-ploidy-DSS.GeneIDs.geneOverlap \\\n", "cgigas_uk_roslin_v1_mRNA.transcriptID.geneID \\\n", "| uniq | awk -F'\\t' -v OFS='\\t' '{print $5, $1, $2, $3, $4}' \\\n", "| sort \\\n", "> DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\n", "XM_011418571.3\t105320585\tNC_047559.1\t50090321\t50090323\n", "XM_011420963.3\t105322323\tNC_047564.1\t32019304\t32019306\n", "XM_011420964.3\t105322323\tNC_047564.1\t32019304\t32019306\n", "XM_011420965.3\t105322323\tNC_047564.1\t32019304\t32019306\n", "XM_011420966.3\t105322324\tNC_047564.1\t32019304\t32019306\n", "XM_011423648.3\t105324542\tNC_047561.1\t40362698\t40362700\n", "XM_011423649.3\t105324542\tNC_047561.1\t40362698\t40362700\n", "XM_011424110.3\t105324874\tNC_047559.1\t53948058\t53948060\n", "XM_011424135.3\t105324874\tNC_047559.1\t53948058\t53948060\n", " 597 DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs\n" ] } ], "source": [ "#Col names: transcript IDs, gene IDs, chr, start, end\n", "!head DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs\n", "!wc -l DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with annotations" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#Join files to get GO annotations for DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs \\\n", "Blastquery-GOslim.tab \\\n", "| uniq \\\n", "| sort \\\n", "> DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0000077\tDNA damage checkpoint\tcell cycle and proliferation\tP\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0000077\tDNA damage checkpoint\tsignal transduction\tP\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0000077\tDNA damage checkpoint\tstress response\tP\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0003677\tDNA binding\tnucleic acid binding activity\tF\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0005654\tnucleoplasm\tnucleus\tC\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0005794\tGolgi apparatus\tER/Golgi\tC\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0006260\tDNA replication\tDNA metabolism\tP\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0006281\tDNA repair\tDNA metabolism\tP\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0006281\tDNA repair\tstress response\tP\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0010997\tanaphase-promoting complex binding\tother molecular function\tF\n", " 7292 DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot\n" ] } ], "source": [ "!head DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot\n", "!wc -l DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "#### Join with gene product names" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Join files to get product names for genes with DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot \\\n", "cgigas_uk_roslin_v1_rna_from_genomic_annot.transcript.tab \\\n", "| uniq \\\n", "| sort \\\n", "> DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0000077\tDNA damage checkpoint\tcell cycle and proliferation\tP\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0000077\tDNA damage checkpoint\tsignal transduction\tP\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0000077\tDNA damage checkpoint\tstress response\tP\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0003677\tDNA binding\tnucleic acid binding activity\tF\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0005654\tnucleoplasm\tnucleus\tC\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0005794\tGolgi apparatus\tER/Golgi\tC\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0006260\tDNA replication\tDNA metabolism\tP\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0006281\tDNA repair\tDNA metabolism\tP\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0006281\tDNA repair\tstress response\tP\tclaspin, transcript variant X2\n", "XM_011417910.3\t105320110\tNC_047561.1\t13511702\t13511704\tGO:0010997\tanaphase-promoting complex binding\tother molecular function\tF\tclaspin, transcript variant X2\n", " 7147 DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID\n" ] } ], "source": [ "!head DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID\n", "!wc -l DML-ploidy-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2c. pH-DML" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_047559.1\t41205913\t41205915\tNC_047559.1\tGnomon\tgene\t41204179\t41236908\t.\t-\t.\tID=gene-LOC105323174;Dbxref=GeneID:105323174;Name=LOC105323174;gbkey=Gene;gene=LOC105323174;gene_biotype=protein_coding\r\n", "NC_047559.1\t44191406\t44191408\tNC_047559.1\tGnomon\tgene\t44187569\t44214377\t.\t+\t.\tID=gene-LOC117687755;Dbxref=GeneID:117687755;Name=LOC117687755;gbkey=Gene;gene=LOC117687755;gene_biotype=protein_coding\r\n", "NC_047559.1\t47000336\t47000338\tNC_047559.1\tGnomon\tgene\t47000029\t47008715\t.\t-\t.\tID=gene-LOC105328838;Dbxref=GeneID:105328838;Name=LOC105328838;gbkey=Gene;gene=LOC105328838;gene_biotype=protein_coding\r\n", "NC_047559.1\t50090321\t50090323\tNC_047559.1\tGnomon\tgene\t50064798\t50106863\t.\t-\t.\tID=gene-LOC105320585;Dbxref=GeneID:105320585;Name=LOC105320585;gbkey=Gene;gene=LOC105320585;gene_biotype=protein_coding\r\n", "NC_047560.1\t4561420\t4561422\tNC_047560.1\tGnomon\tgene\t4523027\t4567751\t.\t-\t.\tID=gene-LOC117687305;Dbxref=GeneID:117687305;Name=LOC117687305;gbkey=Gene;gene=LOC117687305;gene_biotype=protein_coding\r\n", "NC_047560.1\t4561420\t4561422\tNC_047560.1\tGnomon\tgene\t4387457\t4700231\t.\t-\t.\tID=gene-LOC117687382;Dbxref=GeneID:117687382;Name=LOC117687382;gbkey=Gene;gene=LOC117687382;gene_biotype=protein_coding\r\n", "NC_047560.1\t4561429\t4561431\tNC_047560.1\tGnomon\tgene\t4523027\t4567751\t.\t-\t.\tID=gene-LOC117687305;Dbxref=GeneID:117687305;Name=LOC117687305;gbkey=Gene;gene=LOC117687305;gene_biotype=protein_coding\r\n", "NC_047560.1\t4561429\t4561431\tNC_047560.1\tGnomon\tgene\t4387457\t4700231\t.\t-\t.\tID=gene-LOC117687382;Dbxref=GeneID:117687382;Name=LOC117687382;gbkey=Gene;gene=LOC117687382;gene_biotype=protein_coding\r\n", "NC_047560.1\t4561492\t4561494\tNC_047560.1\tGnomon\tgene\t4523027\t4567751\t.\t-\t.\tID=gene-LOC117687305;Dbxref=GeneID:117687305;Name=LOC117687305;gbkey=Gene;gene=LOC117687305;gene_biotype=protein_coding\r\n", "NC_047560.1\t4561492\t4561494\tNC_047560.1\tGnomon\tgene\t4387457\t4700231\t.\t-\t.\tID=gene-LOC117687382;Dbxref=GeneID:117687382;Name=LOC117687382;gbkey=Gene;gene=LOC117687382;gene_biotype=protein_coding\r\n" ] } ], "source": [ "!head ../Haws_07-DML-characterization/DML-pH-DSS-Gene-wb.bed" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Isolate column with gene ID information\n", "#Convert =, :, and ; to \\t\n", "#Isolate gene ID\n", "\n", "!cut -f12 ../Haws_07-DML-characterization/DML-pH-DSS-Gene-wb.bed \\\n", "| tr \"=:;\" \"\\t\" \\\n", "| cut -f5 \\\n", "> DML-pH-DSS.GeneIDs" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105323174\n", "117687755\n", "105328838\n", "105320585\n", "117687305\n", "117687382\n", "117687305\n", "117687382\n", "117687305\n", "117687382\n", " 141 DML-pH-DSS.GeneIDs\n" ] } ], "source": [ "!head DML-pH-DSS.GeneIDs\n", "!wc -l DML-pH-DSS.GeneIDs" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 94\r\n" ] } ], "source": [ "!cat DML-pH-DSS.GeneIDs | sort | uniq | wc -l" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!paste DML-pH-DSS.GeneIDs ../Haws_07-DML-characterization/DML-pH-DSS-Gene-wb.bed \\\n", "| awk -F'\\t' -v OFS='\\t' '{print $1, $2, $3, $4}' \\\n", "| sort \\\n", "> DML-pH-DSS.GeneIDs.geneOverlap" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105317186\tNC_047561.1\t18022752\t18022754\n", "105318605\tNC_047568.1\t27826391\t27826393\n", "105320585\tNC_047559.1\t50090321\t50090323\n", "105320905\tNC_047560.1\t67053212\t67053214\n", "105322131\tNC_047561.1\t22518848\t22518850\n", "105322268\tNC_047565.1\t17719849\t17719851\n", "105322305\tNC_047565.1\t30437934\t30437936\n", "105323174\tNC_047559.1\t41205913\t41205915\n", "105323811\tNC_047561.1\t11873876\t11873878\n", "105324005\tNC_047561.1\t32523988\t32523990\n", " 141 DML-pH-DSS.GeneIDs.geneOverlap\n" ] } ], "source": [ "!head DML-pH-DSS.GeneIDs.geneOverlap\n", "!wc -l DML-pH-DSS.GeneIDs.geneOverlap" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105328744\r\n", "105318174\r\n", "117683699\r\n", "117683566\r\n", "105337362\r\n", "105338358\r\n", "105326431\r\n", "105342022\r\n", "105317492\r\n", "105329325\r\n" ] } ], "source": [ "!head /Users/yaamini/Documents/project-gigas-oa-meth/output/11-GOterm-annotation/DML-pH-50-Cov5-All.GeneIDs" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Filter the gene ID and DML file based on common genes with Manchester data\n", "#Only retain the gene IDs (column 1)\n", "!awk 'BEGIN { while(getline <\"/Users/yaamini/Documents/project-gigas-oa-meth/output/11-GOterm-annotation/DML-pH-50-Cov5-All.GeneIDs\") id[$0]=1; } id[$1] ' DML-pH-DSS.GeneIDs.geneOverlap \\\n", "| cut -f1 \\\n", "> common-pH.GeneIDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with transcript IDs" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Join files to get transcript ID for DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-pH-DSS.GeneIDs.geneOverlap \\\n", "cgigas_uk_roslin_v1_mRNA.transcriptID.geneID \\\n", "| uniq | awk -F'\\t' -v OFS='\\t' '{print $5, $1, $2, $3, $4}' \\\n", "| sort \\\n", "> DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\n", "XM_011418571.3\t105320585\tNC_047559.1\t50090321\t50090323\n", "XM_011420945.3\t105322305\tNC_047565.1\t30437934\t30437936\n", "XM_011425890.3\t105326056\tNC_047566.1\t3996134\t3996136\n", "XM_011425892.3\t105326056\tNC_047566.1\t3996134\t3996136\n", "XM_011425894.3\t105326056\tNC_047566.1\t3996134\t3996136\n", "XM_011425895.3\t105326056\tNC_047566.1\t3996134\t3996136\n", "XM_011425896.3\t105326056\tNC_047566.1\t3996134\t3996136\n", "XM_011425897.3\t105326056\tNC_047566.1\t3996134\t3996136\n", "XM_011428454.3\t105327826\tNC_047564.1\t48296668\t48296670\n", " 452 DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs\n" ] } ], "source": [ "#Col names: transcript IDs, gene IDs, chr, start, end\n", "!head DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs\n", "!wc -l DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with annotations" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#Join files to get GO annotations for DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs \\\n", "Blastquery-GOslim.tab \\\n", "| uniq \\\n", "| sort \\\n", "> DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0000460\tmaturation of 5.8S rRNA\tRNA metabolism\tP\n", "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0000470\tmaturation of LSU-rRNA\tRNA metabolism\tP\n", "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0003723\tRNA binding\tnucleic acid binding activity\tF\n", "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0005730\tnucleolus\tnucleus\tC\n", "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0030687\t\"preribosome, large subunit precursor\"\tother cellular component\tC\n", "XM_011418571.3\t105320585\tNC_047559.1\t50090321\t50090323\tGO:0005576\textracellular region\tnon-structural extracellular\tC\n", "XM_011418571.3\t105320585\tNC_047559.1\t50090321\t50090323\tGO:0016020\tmembrane\tother membranes\tC\n", "XM_011420945.3\t105322305\tNC_047565.1\t30437934\t30437936\tGO:0005604\tbasement membrane\textracellular matrix\tC\n", "XM_011420945.3\t105322305\tNC_047565.1\t30437934\t30437936\tGO:0005605\tbasal lamina\textracellular matrix\tC\n", "XM_011420945.3\t105322305\tNC_047565.1\t30437934\t30437936\tGO:0007435\tsalivary gland morphogenesis\tdevelopmental processes\tP\n", " 4463 DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot\n" ] } ], "source": [ "!head DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot\n", "!wc -l DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with gene product names" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Join files to get product names for genes with DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot \\\n", "cgigas_uk_roslin_v1_rna_from_genomic_annot.transcript.tab \\\n", "| uniq \\\n", "| sort \\\n", "> DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0000460\tmaturation of 5.8S rRNA\tRNA metabolism\tP\tribosome biogenesis protein NSA2 homolog\n", "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0000470\tmaturation of LSU-rRNA\tRNA metabolism\tP\tribosome biogenesis protein NSA2 homolog\n", "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0003723\tRNA binding\tnucleic acid binding activity\tF\tribosome biogenesis protein NSA2 homolog\n", "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0005730\tnucleolus\tnucleus\tC\tribosome biogenesis protein NSA2 homolog\n", "XM_011413748.3\t105317186\tNC_047561.1\t18022752\t18022754\tGO:0030687\t\"preribosome, large subunit precursor\"\tother cellular component\tC\tribosome biogenesis protein NSA2 homolog\n", "XM_011418571.3\t105320585\tNC_047559.1\t50090321\t50090323\tGO:0005576\textracellular region\tnon-structural extracellular\tC\tuncharacterized LOC105320585\n", "XM_011418571.3\t105320585\tNC_047559.1\t50090321\t50090323\tGO:0016020\tmembrane\tother membranes\tC\tuncharacterized LOC105320585\n", "XM_011420945.3\t105322305\tNC_047565.1\t30437934\t30437936\tGO:0005604\tbasement membrane\textracellular matrix\tC\tlaminin subunit gamma-1\n", "XM_011420945.3\t105322305\tNC_047565.1\t30437934\t30437936\tGO:0005605\tbasal lamina\textracellular matrix\tC\tlaminin subunit gamma-1\n", "XM_011420945.3\t105322305\tNC_047565.1\t30437934\t30437936\tGO:0007435\tsalivary gland morphogenesis\tdevelopmental processes\tP\tlaminin subunit gamma-1\n", " 4365 DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID\n" ] } ], "source": [ "!head DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID\n", "!wc -l DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "serine protease HTRA2, mitochondrial\r\n", "uncharacterized LOC105327207, transcript variant X1\r\n", "uncharacterized LOC105326615, transcript variant X1\r\n", "uncharacterized LOC105326615, transcript variant X2\r\n", "Golgi pH regulator, transcript variant X1\r\n", "Golgi pH regulator, transcript variant X2\r\n", "Golgi pH regulator, transcript variant X3\r\n", "Golgi pH regulator, transcript variant X4\r\n", "Golgi pH regulator, transcript variant X6\r\n", "Golgi pH regulator, transcript variant X7\r\n", "integrin alpha pat-2, transcript variant X1\r\n", "integrin alpha pat-2, transcript variant X2\r\n", "integrin alpha pat-2, transcript variant X3\r\n", "integrin alpha pat-2, transcript variant X4\r\n", "integrin alpha pat-2, transcript variant X5\r\n", "integrin alpha pat-2, transcript variant X6\r\n", "integrin alpha pat-2, transcript variant X7\r\n", "semaphorin-2A-like, transcript variant X2\r\n", "semaphorin-2A-like, transcript variant X12\r\n", "semaphorin-2A-like, transcript variant X14\r\n", "semaphorin-2A-like, transcript variant X17\r\n", "E3 ubiquitin-protein ligase RNF213-like\r\n", "DNA-directed RNA polymerase III subunit RPC2, transcript variant X1\r\n", "DNA-directed RNA polymerase III subunit RPC2, transcript variant X2\r\n", "DNA-directed RNA polymerase III subunit RPC2, transcript variant X3\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X1\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X2\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X3\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X4\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X5\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X6\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X7\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X8\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X9\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X10\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X11\r\n", "mediator of RNA polymerase II transcription subunit 15, transcript variant X12\r\n" ] } ], "source": [ "#Filter the gene ID (column 2) based on common genes in Manchester and Hawaii data\n", "#Only retain the GO information and gene products (column 7-10)\n", "!awk 'BEGIN { while(getline <\"common-pH.GeneIDs\") id[$0]=1; } id[$2] ' DML-pH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID \\\n", "| cut -f10 | uniq" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2d. interaction-DML" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_047559.1\t3022288\t3022290\tNC_047559.1\tGnomon\tgene\t3020010\t3024608\t.\t-\t.\tID=gene-LOC105337361;Dbxref=GeneID:105337361;Name=LOC105337361;gbkey=Gene;gene=LOC105337361;gene_biotype=protein_coding\r\n", "NC_047559.1\t6445629\t6445631\tNC_047559.1\tGnomon\tgene\t6434298\t6448829\t.\t+\t.\tID=gene-LOC105342013;Dbxref=GeneID:105342013;Name=LOC105342013;gbkey=Gene;gene=LOC105342013;gene_biotype=protein_coding\r\n", "NC_047559.1\t46813912\t46813914\tNC_047559.1\tGnomon\tgene\t46813603\t46821281\t.\t-\t.\tID=gene-LOC105330521;Dbxref=GeneID:105330521;Name=LOC105330521;gbkey=Gene;gene=LOC105330521;gene_biotype=protein_coding\r\n", "NC_047559.1\t46813912\t46813914\tNC_047559.1\tGnomon\tgene\t46808865\t46814128\t.\t+\t.\tID=gene-LOC105330522;Dbxref=GeneID:105330522;Name=LOC105330522;gbkey=Gene;gene=LOC105330522;gene_biotype=protein_coding\r\n", "NC_047559.1\t47000336\t47000338\tNC_047559.1\tGnomon\tgene\t47000029\t47008715\t.\t-\t.\tID=gene-LOC105328838;Dbxref=GeneID:105328838;Name=LOC105328838;gbkey=Gene;gene=LOC105328838;gene_biotype=protein_coding\r\n", "NC_047560.1\t4561492\t4561494\tNC_047560.1\tGnomon\tgene\t4523027\t4567751\t.\t-\t.\tID=gene-LOC117687305;Dbxref=GeneID:117687305;Name=LOC117687305;gbkey=Gene;gene=LOC117687305;gene_biotype=protein_coding\r\n", "NC_047560.1\t4561492\t4561494\tNC_047560.1\tGnomon\tgene\t4387457\t4700231\t.\t-\t.\tID=gene-LOC117687382;Dbxref=GeneID:117687382;Name=LOC117687382;gbkey=Gene;gene=LOC117687382;gene_biotype=protein_coding\r\n", "NC_047560.1\t55499797\t55499799\tNC_047560.1\tGnomon\tgene\t55225485\t55583389\t.\t+\t.\tID=gene-LOC105317430;Dbxref=GeneID:105317430;Name=LOC105317430;gbkey=Gene;gene=LOC105317430;gene_biotype=protein_coding\r\n", "NC_047560.1\t59701557\t59701559\tNC_047560.1\tGnomon\tgene\t59603941\t59706112\t.\t-\t.\tID=gene-LOC105348685;Dbxref=GeneID:105348685;Name=LOC105348685;gbkey=Gene;gene=LOC105348685;gene_biotype=protein_coding\r\n", "NC_047561.1\t25296188\t25296190\tNC_047561.1\tGnomon\tgene\t25296061\t25301591\t.\t-\t.\tID=gene-LOC105345208;Dbxref=GeneID:105345208;Name=LOC105345208;gbkey=Gene;gene=LOC105345208;gene_biotype=protein_coding\r\n" ] } ], "source": [ "!head ../Haws_07-DML-characterization/DML-ploidypH-DSS-Gene-wb.bed" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Isolate column with gene ID information\n", "#Convert =, :, and ; to \\t\n", "#Isolate gene ID\n", "\n", "!cut -f12 ../Haws_07-DML-characterization/DML-ploidypH-DSS-Gene-wb.bed \\\n", "| tr \"=:;\" \"\\t\" \\\n", "| cut -f5 \\\n", "> DML-ploidypH-DSS.GeneIDs" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105337361\n", "105342013\n", "105330521\n", "105330522\n", "105328838\n", "117687305\n", "117687382\n", "105317430\n", "105348685\n", "105345208\n", " 51 DML-ploidypH-DSS.GeneIDs\n" ] } ], "source": [ "!head DML-ploidypH-DSS.GeneIDs\n", "!wc -l DML-ploidypH-DSS.GeneIDs" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 29\r\n" ] } ], "source": [ "!cat DML-ploidypH-DSS.GeneIDs | sort | uniq | wc -l" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!paste DML-ploidypH-DSS.GeneIDs ../Haws_07-DML-characterization/DML-ploidypH-DSS-Gene-wb.bed \\\n", "| awk -F'\\t' -v OFS='\\t' '{print $1, $2, $3, $4}' \\\n", "| sort \\\n", "> DML-ploidypH-DSS.GeneIDs.geneOverlap" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105317430\tNC_047560.1\t55499797\t55499799\n", "105317678\tNC_047565.1\t32545006\t32545008\n", "105325608\tNC_047568.1\t52333625\t52333627\n", "105327826\tNC_047564.1\t48296668\t48296670\n", "105328838\tNC_047559.1\t47000336\t47000338\n", "105330521\tNC_047559.1\t46813912\t46813914\n", "105330522\tNC_047559.1\t46813912\t46813914\n", "105332451\tNC_047567.1\t31559025\t31559027\n", "105332451\tNC_047567.1\t31559038\t31559040\n", "105332451\tNC_047567.1\t31559058\t31559060\n", " 51 DML-ploidypH-DSS.GeneIDs.geneOverlap\n" ] } ], "source": [ "!head DML-ploidypH-DSS.GeneIDs.geneOverlap\n", "!wc -l DML-ploidypH-DSS.GeneIDs.geneOverlap" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with transcript IDs" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Join files to get transcript ID for DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-ploidypH-DSS.GeneIDs.geneOverlap \\\n", "cgigas_uk_roslin_v1_mRNA.transcriptID.geneID \\\n", "| uniq | awk -F'\\t' -v OFS='\\t' '{print $5, $1, $2, $3, $4}' \\\n", "| sort \\\n", "> DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011414393.3\t105317678\tNC_047565.1\t32545006\t32545008\n", "XM_011414394.3\t105317678\tNC_047565.1\t32545006\t32545008\n", "XM_011414395.3\t105317678\tNC_047565.1\t32545006\t32545008\n", "XM_011414396.3\t105317678\tNC_047565.1\t32545006\t32545008\n", "XM_011414398.3\t105317678\tNC_047565.1\t32545006\t32545008\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\n", "XM_011425245.3\t105325608\tNC_047568.1\t52333625\t52333627\n", "XM_011425246.3\t105325608\tNC_047568.1\t52333625\t52333627\n", "XM_011425247.3\t105325608\tNC_047568.1\t52333625\t52333627\n", "XM_011428454.3\t105327826\tNC_047564.1\t48296668\t48296670\n", " 123 DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs\n" ] } ], "source": [ "#Col names: transcript IDs, gene IDs, chr, start, end\n", "!head DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs\n", "!wc -l DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with annotations" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#Join files to get GO annotations for DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs \\\n", "Blastquery-GOslim.tab \\\n", "| uniq \\\n", "| sort \\\n", "> DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0003924\tGTPase activity\tother molecular function\tF\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0005509\tcalcium ion binding\tother molecular function\tF\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0005525\tGTP binding\tother molecular function\tF\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0007264\tsmall GTPase mediated signal transduction\tsignal transduction\tP\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0019725\tcellular homeostasis\tother biological processes\tP\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0031307\tintegral to mitochondrial outer membrane\tmitochondrion\tC\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0031307\tintegral to mitochondrial outer membrane\tother membranes\tC\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0047497\tmitochondrion transport along microtubule\ttransport\tP\n", "XM_011425245.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0003924\tGTPase activity\tother molecular function\tF\n", "XM_011425245.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0005509\tcalcium ion binding\tother molecular function\tF\n", " 1138 DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot\n" ] } ], "source": [ "!head DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot\n", "!wc -l DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with gene product names" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Join files to get product names for genes with DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot \\\n", "cgigas_uk_roslin_v1_rna_from_genomic_annot.transcript.tab \\\n", "| uniq \\\n", "| sort \\\n", "> DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0003924\tGTPase activity\tother molecular function\tF\tmitochondrial Rho GTPase 1, transcript variant X1\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0005509\tcalcium ion binding\tother molecular function\tF\tmitochondrial Rho GTPase 1, transcript variant X1\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0005525\tGTP binding\tother molecular function\tF\tmitochondrial Rho GTPase 1, transcript variant X1\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0007264\tsmall GTPase mediated signal transduction\tsignal transduction\tP\tmitochondrial Rho GTPase 1, transcript variant X1\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0019725\tcellular homeostasis\tother biological processes\tP\tmitochondrial Rho GTPase 1, transcript variant X1\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0031307\tintegral to mitochondrial outer membrane\tmitochondrion\tC\tmitochondrial Rho GTPase 1, transcript variant X1\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0031307\tintegral to mitochondrial outer membrane\tother membranes\tC\tmitochondrial Rho GTPase 1, transcript variant X1\n", "XM_011425244.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0047497\tmitochondrion transport along microtubule\ttransport\tP\tmitochondrial Rho GTPase 1, transcript variant X1\n", "XM_011425245.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0003924\tGTPase activity\tother molecular function\tF\tmitochondrial Rho GTPase 1, transcript variant X2\n", "XM_011425245.3\t105325608\tNC_047568.1\t52333625\t52333627\tGO:0005509\tcalcium ion binding\tother molecular function\tF\tmitochondrial Rho GTPase 1, transcript variant X2\n", " 1126 DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID\n" ] } ], "source": [ "!head DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID\n", "!wc -l DML-ploidypH-DSS.GeneIDs.geneOverlap.transcriptIDs.GOAnnot.productID" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2e. CpG background (1x CpGs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Format file" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_047559.1\t10155\t10157\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10215\t10217\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10270\t10272\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10292\t10294\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10314\t10316\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10358\t10360\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10380\t10382\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10391\t10393\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10402\t10404\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n", "NC_047559.1\t10413\t10415\tNC_047559.1\tGnomon\tgene\t9839\t11386\t.\t+\t.\tID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA\r\n" ] } ], "source": [ "!head ../Haws_06-methylation-landscape/union_1x-Gene-wb.bed" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Isolate column with gene ID information\n", "#Convert =, :, and ; to \\t\n", "#Isolate gene ID\n", "\n", "!cut -f12 ../Haws_06-methylation-landscape/union_1x-Gene-wb.bed \\\n", "| tr \"=:;\" \"\\t\" \\\n", "| cut -f5 \\\n", "> union_1x.GeneIDs" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "117693020\n", "117693020\n", "117693020\n", "117693020\n", "117693020\n", "117693020\n", "117693020\n", "117693020\n", "117693020\n", "117693020\n", " 7852582 union_1x.GeneIDs\n" ] } ], "source": [ "!head union_1x.GeneIDs\n", "!wc -l union_1x.GeneIDs" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!paste union_1x.GeneIDs ../Haws_06-methylation-landscape/union_1x-Gene-wb.bed \\\n", "| awk -F'\\t' -v OFS='\\t' '{print $1, $2, $3, $4}' \\\n", "| sort \\\n", "> union_1x.GeneIDs.geneOverlap" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "105317035\tNC_047564.1\t46227067\t46227069\n", "105317035\tNC_047564.1\t46227126\t46227128\n", "105317035\tNC_047564.1\t46227330\t46227332\n", "105317035\tNC_047564.1\t46227355\t46227357\n", "105317035\tNC_047564.1\t46227472\t46227474\n", "105317035\tNC_047564.1\t46227513\t46227515\n", "105317035\tNC_047564.1\t46227569\t46227571\n", "105317035\tNC_047564.1\t46227587\t46227589\n", "105317035\tNC_047564.1\t46227597\t46227599\n", "105317035\tNC_047564.1\t46227606\t46227608\n", " 7852582 union_1x.GeneIDs.geneOverlap\n" ] } ], "source": [ "!head union_1x.GeneIDs.geneOverlap\n", "!wc -l union_1x.GeneIDs.geneOverlap" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with transcript IDs" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Join files to get transcript ID for DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "union_1x.GeneIDs.geneOverlap \\\n", "cgigas_uk_roslin_v1_mRNA.transcriptID.geneID \\\n", "| uniq | awk -F'\\t' -v OFS='\\t' '{print $5, $1, $2, $3, $4}' \\\n", "| sort \\\n", "> union_1x.GeneIDs.geneOverlap.transcriptIDs" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NM_001305288.1\t105326593\tNC_047559.1\t17277556\t17277558\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277568\t17277570\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277625\t17277627\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277631\t17277633\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277647\t17277649\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277650\t17277652\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277700\t17277702\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277734\t17277736\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277761\t17277763\n", "NM_001305288.1\t105326593\tNC_047559.1\t17277898\t17277900\n", " 25949861 union_1x.GeneIDs.geneOverlap.transcriptIDs\n" ] } ], "source": [ "#Col names: transcript IDs, gene IDs, chr, start, end\n", "!head union_1x.GeneIDs.geneOverlap.transcriptIDs\n", "!wc -l union_1x.GeneIDs.geneOverlap.transcriptIDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join with annotations" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "#Join files to get GO annotations for DML\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "union_1x.GeneIDs.geneOverlap.transcriptIDs \\\n", "Blastquery-GOslim.tab \\\n", "| sort \\\n", "| uniq \\\n", "> union_1x.GeneIDs.geneOverlap.transcriptIDs.GOAnnot" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011413559.3\t105317035\tNC_047564.1\t46227067\t46227069\tGO:0005777\tperoxisome\tother cytoplasmic organelle\tC\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227067\t46227069\tGO:0008445\tD-aspartate oxidase activity\tother molecular function\tF\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227067\t46227069\tGO:0046416\tD-amino acid metabolic process\tother metabolic processes\tP\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227126\t46227128\tGO:0005777\tperoxisome\tother cytoplasmic organelle\tC\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227126\t46227128\tGO:0008445\tD-aspartate oxidase activity\tother molecular function\tF\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227126\t46227128\tGO:0046416\tD-amino acid metabolic process\tother metabolic processes\tP\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227330\t46227332\tGO:0005777\tperoxisome\tother cytoplasmic organelle\tC\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227330\t46227332\tGO:0008445\tD-aspartate oxidase activity\tother molecular function\tF\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227330\t46227332\tGO:0046416\tD-amino acid metabolic process\tother metabolic processes\tP\n", "XM_011413559.3\t105317035\tNC_047564.1\t46227355\t46227357\tGO:0005777\tperoxisome\tother cytoplasmic organelle\tC\n", " 278789481 union_1x.GeneIDs.geneOverlap.transcriptIDs.GOAnnot\n" ] } ], "source": [ "!head union_1x.GeneIDs.geneOverlap.transcriptIDs.GOAnnot\n", "!wc -l union_1x.GeneIDs.geneOverlap.transcriptIDs.GOAnnot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Prepare for `topGO` enrichment analysis" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Extract transcript IDs\n", "#Filter and save unique IDs\n", "!cut -f1 union_1x.GeneIDs.geneOverlap.transcriptIDs.GOAnnot | uniq \\\n", "> union_1x.transcriptIDs.unique" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_011413559.3\n", "XM_011413583.3\n", "XM_011413585.3\n", "XM_011413590.3\n", "XM_011413601.3\n", "XM_011413602.3\n", "XM_011413626.3\n", "XM_011413636.3\n", "XM_011413642.3\n", "XM_011413649.3\n", " 43121 union_1x.transcriptIDs.unique\n" ] } ], "source": [ "!head union_1x.transcriptIDs.unique\n", "!wc -l union_1x.transcriptIDs.unique" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_001305294.1\tGO:0005576, GO:0007218\r\n", "XM_001308866.2\tGO:0005179, GO:0005576, GO:0006006\r\n", "XM_034475035.1\tGO:0005179, GO:0005576, GO:0006006\r\n", "XM_011417422.3\tGO:0005179, GO:0005576, GO:0006006\r\n", "XM_011419035.3\tGO:0005576\r\n", "XM_011419037.3\tGO:0005576\r\n", "XM_011419038.3\tGO:0005576\r\n", "XM_011419039.3\tGO:0005576\r\n", "XM_020064360.2\tGO:0005576\r\n", "XM_011442798.3\tGO:0005179, GO:0005576\r\n" ] } ], "source": [ "#Extract columns 2 and 3 to create list needed for topGO gene enrichment\n", "#Convert ; to , for topGO formatting\n", "!cut -f2,3 _blast-annot.tab \\\n", "| tr \";\" \",\" \\\n", "> geneid2go.tab\n", "!head geneid2go.tab" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Filter the gene ID-GO file based on transcript IDs in 1x union data\n", "!awk 'BEGIN { while(getline <\"union_1x.transcriptIDs.unique\") id[$0]=1; } id[$1] ' geneid2go.tab \\\n", "> geneid2go-union1x.tab" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XM_034475035.1\tGO:0005179, GO:0005576, GO:0006006\n", "XM_011417422.3\tGO:0005179, GO:0005576, GO:0006006\n", "XM_011419035.3\tGO:0005576\n", "XM_011419037.3\tGO:0005576\n", "XM_011419038.3\tGO:0005576\n", "XM_011419039.3\tGO:0005576\n", "XM_020064360.2\tGO:0005576\n", "XM_011442798.3\tGO:0005179, GO:0005576\n", "XM_011442799.3\tGO:0005179, GO:0005576\n", "XM_034482735.1\tGO:0001525, GO:0002250, GO:0004674, GO:0005524, GO:0005737, GO:0005829, GO:0005886, GO:0005942, GO:0005943, GO:0006468, GO:0006909, GO:0006954, GO:0014065, GO:0014870, GO:0016303, GO:0030168, GO:0035005, GO:0043201, GO:0045087, GO:0046854, GO:0046934, GO:0060326, GO:0071548, GO:1903544\n", " 81446 geneid2go-union1x.tab\n" ] } ], "source": [ "!head geneid2go-union1x.tab\n", "!wc -l geneid2go-union1x.tab" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }