{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculating CpG ratio for the *Stylophora pistillata* transcriptome" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This workflow calculates CpG ratio, or CpG O/E, for contigs in the *Stylophora pistillata* [transcriptome](http://data.centrescientifique.mc/Data/454Isotigs.fas.zip). CpG ratio is an estimate of germline DNA methylation.\n", "\n", "This workflow is an extension of another IPython notebook workflow, `Spist_blast_anno.ipynb`, that generates an annotation of the same transcriptome. This workflow assumes that you have created the directories and files specified in the annotation workflow." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/data/Spist\n" ] } ], "source": [ "cd ../data/Spist" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">Spi_contig00032 gene=isogroup00001 length=1566\n", 
"GgATCCATCGAAAGAAAAaGTCGTAGTTGACAGTGATTGCATGACTTCTGTACAACGTCCGATTAGTTTTtGCGTTTTATTGTCTtCTTTTGTACTGgAAATTCGACTCTAGCAAGTTGATTTCGTTTAGTTATGTCTGGCATTGCTTTGGGCAGATTATCAGAAGAGAGAAAAGCTTGGCGTAAAGACcATCCTTTCGGATTTGTGGCCAAACCGGTTAAAAATCCTGATGGTACTCTAAATCTGATGAACTGGGAATGTGCGATCCCaGGAAAGAAAGCGACCCCATGGGAAGGAGGCTCTTTCAAATTGAAGATGATATTTAAAGACGACTATCCATCCTCTCCGCCAAAATGTAAATTTGATCCTCCAATTTTCCATCCTAATGTATACCCATCTGGCACAGTGTGTCTGTCTCTTCTAGATGAAGAGAAAGACTGGAGACCTGCCGTCACTATAAAACAGATTTTATTGGGAATTCAAGACTTACTGAaTGATCCAAaCaTAaGaGATCCAGCTCAAGCAGAAGCATACaCCATTTACTGCCAAAACAGATCAGAATATGAGAAACGAGTCAGAAGTCAAGCTGCAAaGTTCTCAAGTTCATAGTGAATAAAGAAAAATATAAAATACTTCGCTAGCAGACGTGTAGGTTAACaGTGACTGGCAGAAATTGGATTTATTTTTtCTCcGTTtGTAAacAaCAaTGTGGAGTGGAGCAaGAATTCATCCgTAACTGCGTTAATACCACAGCTGTTGTTGTGAATTGGAaGGACAATCATGTCTGTTATTTtAGGAAGTTTCACAATGTACAGTTCAACTAACTTATACTAAAAAaCAGCTAAAGTtCACCTTAGTCTATGTtAGTTTtGTTAAAGCTTCTCTGTAgAAGGTTTGGGAGGgTtGTAATGGCTGTAACTATTCAGGgAGATCTGTCAAAGTACACTCTCTAATCCCATTGTTTCTTAAGGAATGAGTCATTATTTATTGTCTGGTGGGACGGaGGaTTTTAATCGTGTCACAATAAAATTTACCTGATCTTCCCatAagACCTGTAGTAATCTAaTGGTCCCTGTCATTGGCAGTCAaTTTTCTATAGTTTCCTGTTAACCTCTGTTAGCTACAACTGATGCCCCcTCTGTACCCCCCTAAAATTGTGTTCCCCCAAAgAAaTCCTTCCACCCACCCCATaCCTCCAGGTGATAACTAATGActGGTCCTTAATTCATTCTCTTTTTTTtGTTCTTGATTATTTGACTAAACCAAAGGCGatAAGTAAGATATACCCcAAtACGATTTGACGAACAATCTCTAGATTTTTTATTtAATCTTTAGTTGTAGATGTAATCAGAGGTGCAAAGCTTTACTGTTTtGAGTGTAAAGTGCCTGTTTCTAGGGATCCAGTGAAACCAGTAGTGTTGGGATTGTAAAGAGaTCAAAGAaTACCTCAAAGCACCAATCATAATGCTCTAATGTTGACAACTGCTAATGATTGGTtCAGCCTaTGGCTTTCATTAAAAaCGTCAAAAAaTAACGCTTCACTTCATTGCAAGTGATTCAGGTGGa\n", "\n", "number of seqs =\n", "15052\n" ] } ], "source": [ "#fasta file\n", "!head -2 Spist.fasta\n", "!echo \n", "!echo number of seqs =\n", "!fgrep -c \">\" Spist.fasta" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "awk: cannot open Spist.fasta (No such file or directory)\r\n" ] } ], "source": [ "#Just printing first 
field of each line (the contig name, w/out the header annotations) and looking at contig names\n", "!awk '{print $1}' Spist.fasta > Spist2.fasta\n", "!head -10 Spist2.fasta\n", "!tail -10 Spist2.fasta" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "Converted 15052 FASTA records in 30104 lines to tabular format\r\n", "Total sequence length: 16227896\r\n", "\r\n" ] } ], "source": [ "#Converting FASTA to tabular format and placing output file in analyses directory\n", "!perl -e '$count=0; $len=0; while(<>) {s/\\r?\\n//; s/\\t/ /g; if (s/^>//) { if ($. != 1) {print \"\\n\"} s/ |$/\\t/; $count++; $_ .= \"\\t\";} else {s/ //g; $len += length($_)} print $_;} print \"\\n\"; warn \"\\nConverted $count FASTA records in $. lines to tabular format\\nTotal sequence length: $len\\n\\n\";' \\\n", "Spist2.fasta > ../../analyses/Spist/fasta2tab" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/jay/Documents/fish546-2015/Analyses/Spist\n" ] } ], "source": [ "cd ../../analyses/Spist" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Spi_contig00032\t\tGgATCCATCGAAAGAAAAaGTCGTAGTTGACAGTGATTGCATGACTTCTGTACAACGTCCGATTAGTTTTtGCGTTTTATTGTCTtCTTTTGTACTGgAAATTCGACTCTAGCAAGTTGATTTCGTTTAGTTATGTCTGGCATTGCTTTGGGCAGATTATCAGAAGAGAGAAAAGCTTGGCGTAAAGACcATCCTTTCGGATTTGTGGCCAAACCGGTTAAAAATCCTGATGGTACTCTAAATCTGATGAACTGGGAATGTGCGATCCCaGGAAAGAAAGCGACCCCATGGGAAGGAGGCTCTTTCAAATTGAAGATGATATTTAAAGACGACTATCCATCCTCTCCGCCAAAATGTAAATTTGATCCTCCAATTTTCCATCCTAATGTATACCCATCTGGCACAGTGTGTCTGTCTCTTCTAGATGAAGAGAAAGACTGGAGACCTGCCGTCACTATAAAACAGATTTTATTGGGAATTCAAGACTTACTGAaTGATCCAAaCaTAaGaGATCCAGCTCAAGCAGAAGCATACaCCATTTACTGCCAAAACAGATCAGAATATGAGAAACGAGTCAGAAGTCAAGCTGCAAaGTTCTCAAGTTCATAGTGAATAAAGAAAAATATAAAATACTTCGCTAGCAGACGTGTAGGTTAACaGTGACTGGCAGAAATTGGATTTATTTTTtCTCcGTTtGTAAacAaCAaTGTGGAGTGGAGCAaGAATTCATCCgTAACTGCGTTAATACCACAGCTGTTGTTGTGAATTGGAaGGACAATCATGTCTGTTATTTtAGGAAGTTTCACAATGTACAGTTCAACTAACTTATACTAAAAAaCAGCTAAAGTtCACCTTAGTCTATGTtAGTTTtGTTAAAGCTTCTCTGTAgAAGGTTTGGGAGGgTtGTAATGGCTGTAACTATTCAGGgAGATCTGTCAAAGTACACTCTCTAATCCCATTGTTTCTTAAGGAATGAGTCATTATTTATTGTCTGGTGGGACGGaGGaTTTTAATCGTGTCACAATAAAATTTACCTGATCTTCCCatAagACCTGTAGTAATCTAaTGGTCCCTGTCATTGGCAGTCAaTTTTCTATAGTTTCCTGTTAACCTCTGTTAGCTACAACTGATGCCCCcTCTGTACCCCCCTAAAATTGTGTTCCCCCAAAgAAaTCCTTCCACCCACCCCATaCCTCCAGGTGATAACTAATGActGGTCCTTAATTCATTCTCTTTTTTTtGTTCTTGATTATTTGACTAAACCAAAGGCGatAAGTAAGATATACCCcAAtACGATTTGACGAACAATCTCTAGATTTTTTATTtAATCTTTAGTTGTAGATGTAATCAGAGGTGCAAAGCTTTACTGTTTtGAGTGTAAAGTGCCTGTTTCTAGGGATCCAGTGAAACCAGTAGTGTTGGGATTGTAAAGAGaTCAAAGAaTACCTCAAAGCACCAATCATAATGCTCTAATGTTGACAACTGCTAATGATTGGTtCAGCCTaTGGCTTTCATTAAAAaCGTCAAAAAaTAACGCTTCACTTCATTGCAAGTGATTCAGGTGGa\r\n", 
"Spi_contig00035\t\tAGCAGAGATCTATCAGACGACTATGTTACGACATGAAAaCCTTTTAGGCTTtGTAGCTGCTGACAaCAAaGATAATGGTGCCTGGACACAGCTTTGGCTGGTTACAGATTATTTagAAAGGGgCTCATTGTATGACTATCTTCAACTTGTTACCCTGAATGTTGAATCCATGCTGAAGTTAGCAGTGTCAATAGCCAGTGGACTTGCTCATCTCCACATtGAGATtGTAGGCACCCAAGgAAAGCCAGCTATTGCACATCGTGACTTAAAAaGTAAAAaCATTCTGGTTAAAAGAAATGGGACATGTTGCATAGGTGATTTGGGTCTTGCTGTTCGTCATAGTTCCATTACTGACACAGTTGATGTTCCTCCTGGGAACAGAgTTGGTACAAAGCGCTACATGCCACCTGAATTTTtAGATGACAATATACAGGTGCGGCACTTTGATGCATACAAGCGAGGTGATATGTATGCATTTGGTCTAGTGTTATGGGAAaTTGCAAGGACgTGTGTTTGTGGTGGATTGTGTGATGAATATCAGCTGCCCTACTATGACAGAGTTCCCTGTGATCCCAGCATTGAAGAtATGAGAAAGGTGGTCTGTGTGGAGAGGTATCGCCCTGCCTTTCCAAATAGGTGGATACAAGATCAGACACTACAGTCTATGTCCAAGTTGATGAAAGAATGTTGGTATGCCAATAGTGCTGCAAGACTTACAGCTCTCCGTGTTAAGAAGACTCTTACAACAcTGTGTCAGTTACATGATGTTGATATTATGGTTTAGAGGGaTCCATCAaGAAAACTCATGTAAAACAAGTgCAAGATGGGTGTATCACCAGGACTTATATAACAGGAGGAACCTCCTGATAATCCTTGCAGGCAATTTGTAGGTCCTGTAATGTGATACAGTACATGTCTACCATCATGCTATTGgTTTGCAGTTCAAACaGCAATGCTGTCTTTtATTTTTtGATAAAGCCATGAAGAATAAACGCGACAGAGAAAaTTTATGACTTCCTAGTTAACAAGAAGCATTCtCATTTTTTTTTTTTtttGCCTGCATAATTcTTGtATTTATATCCACAGCAACTGTCACTGACTCAAAGTgTtGgtAaTCatCAGCAACAAAGAACCCAACTTGgTTTTATGAAAGCACGTACATtCAAATGAGCAATTTAAATTGAAAACCTAAAAGGACATATCCATGTGCAACAACTGTAGAACTATGTATGAGCAGTGTCCACTATTCTCCTCCTTAGTTTATTTTAGATGTTTCGAATGAGAATACAGAATAGAGCTTCATCCTTTTGACATTGAAAGCATTTCATACTCAAGTATtGAAGGGTATGATTCTTTGTTGTGAGGAACGACACTCACCAAGTAATCCTCAAAATATTTAAGAGCATACATGGATTTTACCAAAAGATGTGGGTTGGTCATATCTACATTAAGAATTTGTTACAATAGTAAGGTAGTTGCAGTGCTGTAGCTAGCTCTTAGATGAAGCGTGTTTATTGCATTCAAATTGAAATGTTATTGTTTAAAGTGATTCTCAAATTAAAAGACATTGTATTTTTAGACAACTTTTTATCTTCGAAAACCCAGTGGTGTAAGGTAATTACATTATTTTATGTTCATAACAAATACATTTTAACTGTATTCCACCAAACAATGTGAAAGGGGCAGTTTAAGAATTAATGAAATCTTTCTTGATACCCGTTCTtcAGAACGCTTTATAGTATCTTCTTTATAGTTGTAAATAAaCCGAaTACTTTGCGTGTGAGGGGCATTGAaGAAACTGAAGGGTTTTGTTTGACTCTGTTCTAACAGAAtCAAACATAATGTAATTACTCGGAACTTCATTAAACTGGTGGCC\r\n" ] } ], "source": [ "#Checking header on new tabular format file\n", "!head -2 fasta2tab" ] }, { 
"cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "Added column with length of column 2 for 15052 lines.\r\n", "\r\n" ] } ], "source": [ "#Add column with length of sequence\n", "!perl -e '$col = 2;' -e 'while (<>) { s/\\r?\\n//; @F = split /\\t/, $_; $len = length($F[$col]); print \"$_\\t$len\\n\" } warn \"\\nAdded column with length of column $col for $. lines.\\n\\n\";' \\\n", "fasta2tab > tab_1" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 15052 45156 16565757 tab_1\r\n" ] } ], "source": [ "!wc tab_1" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#The file used to count Cs and Gs will only include the sequence\n", "!awk '{print $2}' tab_1 > tab_2" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#This counts CGs - both cases\n", "!echo \"CG\" | awk -F\\[Cc][Gg] '{print NF-1}' tab_2 > CG " ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Counts Cs\n", "!echo \"C\" | awk -F\\[Cc] '{print NF-1}' tab_2 > C " ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Counts Gs\n", "!echo \"G\" | awk -F\\[Gg] '{print NF-1}' tab_2 > G " ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Spi_contig00032\t\tGgATCCATCGAAAGAAAAaGTCGTAGTTGACAGTGATTGCATGACTTCTGTACAACGTCCGATTAGTTTTtGCGTTTTATTGTCTtCTTTTGTACTGgAAATTCGACTCTAGCAAGTTGATTTCGTTTAGTTATGTCTGGCATTGCTTTGGGCAGATTATCAGAAGAGAGAAAAGCTTGGCGTAAAGACcATCCTTTCGGATTTGTGGCCAAACCGGTTAAAAATCCTGATGGTACTCTAAATCTGATGAACTGGGAATGTGCGATCCCaGGAAAGAAAGCGACCCCATGGGAAGGAGGCTCTTTCAAATTGAAGATGATATTTAAAGACGACTATCCATCCTCTCCGCCAAAATGTAAATTTGATCCTCCAATTTTCCATCCTAATGTATACCCATCTGGCACAGTGTGTCTGTCTCTTCTAGATGAAGAGAAAGACTGGAGACCTGCCGTCACTATAAAACAGATTTTATTGGGAATTCAAGACTTACTGAaTGATCCAAaCaTAaGaGATCCAGCTCAAGCAGAAGCATACaCCATTTACTGCCAAAACAGATCAGAATATGAGAAACGAGTCAGAAGTCAAGCTGCAAaGTTCTCAAGTTCATAGTGAATAAAGAAAAATATAAAATACTTCGCTAGCAGACGTGTAGGTTAACaGTGACTGGCAGAAATTGGATTTATTTTTtCTCcGTTtGTAAacAaCAaTGTGGAGTGGAGCAaGAATTCATCCgTAACTGCGTTAATACCACAGCTGTTGTTGTGAATTGGAaGGACAATCATGTCTGTTATTTtAGGAAGTTTCACAATGTACAGTTCAACTAACTTATACTAAAAAaCAGCTAAAGTtCACCTTAGTCTATGTtAGTTTtGTTAAAGCTTCTCTGTAgAAGGTTTGGGAGGgTtGTAATGGCTGTAACTATTCAGGgAGATCTGTCAAAGTACACTCTCTAATCCCATTGTTTCTTAAGGAATGAGTCATTATTTATTGTCTGGTGGGACGGaGGaTTTTAATCGTGTCACAATAAAATTTACCTGATCTTCCCatAagACCTGTAGTAATCTAaTGGTCCCTGTCATTGGCAGTCAaTTTTCTATAGTTTCCTGTTAACCTCTGTTAGCTACAACTGATGCCCCcTCTGTACCCCCCTAAAATTGTGTTCCCCCAAAgAAaTCCTTCCACCCACCCCATaCCTCCAGGTGATAACTAATGActGGTCCTTAATTCATTCTCTTTTTTTtGTTCTTGATTATTTGACTAAACCAAAGGCGatAAGTAAGATATACCCcAAtACGATTTGACGAACAATCTCTAGATTTTTTATTtAATCTTTAGTTGTAGATGTAATCAGAGGTGCAAAGCTTTACTGTTTtGAGTGTAAAGTGCCTGTTTCTAGGGATCCAGTGAAACCAGTAGTGTTGGGATTGTAAAGAGaTCAAAGAaTACCTCAAAGCACCAATCATAATGCTCTAATGTTGACAACTGCTAATGATTGGTtCAGCCTaTGGCTTTCATTAAAAaCGTCAAAAAaTAACGCTTCACTTCATTGCAAGTGATTCAGGTGGa\t1566\t28\t297\t305\r\n" ] } ], "source": [ "#Combining counts\n", "!paste tab_1 \\\n", "CG \\\n", "C \\\n", "G \\\n", "> comb\n", "!head -1 comb" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculating CpG O/E based on [Gavery and Roberts (2010)](http://www.biomedcentral.com/1471-2164/11/483)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$\\mathrm{CpG\\ O/E} = \\frac{\\text{number of CpG}}{\\text{number of C} \\times \\text{number of G}} \\times \\frac{l^2}{l - 1}$$\n", "\n", "where $l$ is the length of the sequence in nucleotides." ] }, { "cell_type": "code", 
"execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))}' comb > ID_CpG #use ^ instead of ** for exponent\n" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_contig00032 \t 0.484363\n", "Spi_contig00035 \t 0.335179\n", "Spi_contig00040 \t 0.854266\n", "Spi_contig00044 \t 0.867137\n", "Spi_contig00046 \t 0.196592\n", "Spi_contig00075 \t 0.871739\n", "Spi_contig00091 \t 0.937628\n", "Spi_contig00094 \t 0.131292\n", "Spi_contig00095 \t 0.684932\n", "Spi_contig00098 \t 0.584255\n", " 15052 30104 402540 ID_CpG\n" ] } ], "source": [ "!head ID_CpG\n", "!wc ID_CpG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now joining CpG to annotation, but first must sort files." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_contig00035\tsp\tP80204\tTGFR1_RAT\t69.77\t258\t78\t0\t2\t775\t244\t501\t2e-129\t 397\r\n", "Spi_contig00040\tsp\tQ35101\tCOX1_METSE\t82.85\t449\t72\t1\t605\t1936\t70\t518\t0.0\t 665\r\n", "Spi_contig00046\tsp\tE7EY42\tPTSS2_DANRE\t56.41\t234\t101\t1\t19\t720\t26\t258\t6e-91\t 281\r\n", "Spi_contig00075\tsp\tQ37556\tNU1M_METSE\t80.16\t247\t48\t1\t1719\t2456\t29\t275\t3e-104\t 333\r\n", "Spi_contig00094\tsp\tP00519\tABL1_HUMAN\t48.15\t54\t28\t0\t2494\t2333\t443\t496\t2e-09\t65.1\r\n", "Spi_contig00095\tsp\tP20693\tFCER2_MOUSE\t28.57\t203\t122\t5\t831\t229\t128\t309\t6e-18\t87.0\r\n", "Spi_contig00834\tsp\tP19615\tMYP_STRPU\t32.94\t598\t321\t18\t50\t1741\t132\t683\t2e-84\t 294\r\n", "Spi_contig04187\tsp\tQ9CYA0\tCREL2_MOUSE\t43.57\t140\t77\t2\t82\t501\t193\t330\t1e-27\t 110\r\n", "Spi_isotig00002\tsp\tQ5I2B1\tACTPG_OULOR\t56.49\t131\t56\t1\t1429\t1037\t7\t136\t6e-41\t 151\r\n", 
"Spi_isotig00005\tsp\tC9EIC7\tACTP1_URTCR\t51.67\t120\t57\t1\t1444\t1085\t54\t172\t2e-31\t 126\r\n" ] } ], "source": [ "#Sorting Spist Uniprot/Swissprot annotation file. This file was the result of work done in another notebook: Spist_blast_anno.ipynb\n", "#NOTE(review): tail -n +2 runs after sort, so it drops whichever line sorts first - presumably the header; confirm it is not a data row\n", "!sort Spist_blastx_uniprot_sql.tab | tail -n +2 > Spist_blastx_uniprot_sql.tab.sorted\n", "!head Spist_blastx_uniprot_sql.tab.sorted" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sort: open failed: Spist_GOSlim.tab: No such file or directory\r\n" ] } ], "source": [ "#Sorting GOSlim annotation file. This file was the result of work done in another notebook: Spist_blast_anno.ipynb\n", "!sort Spist_GOSlim.tab | tail -n +2 > Spist_GOSlim.sorted\n", "!head Spist_GOSlim.sorted" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_contig00032 \t 0.484363\r\n", "Spi_contig00035 \t 0.335179\r\n", "Spi_contig00040 \t 0.854266\r\n", "Spi_contig00044 \t 0.867137\r\n", "Spi_contig00046 \t 0.196592\r\n", "Spi_contig00075 \t 0.871739\r\n", "Spi_contig00091 \t 0.937628\r\n", "Spi_contig00094 \t 0.131292\r\n", "Spi_contig00095 \t 0.684932\r\n", "Spi_contig00098 \t 0.584255\r\n" ] } ], "source": [ "#Sorting CpG file\n", "!sort ID_CpG > ID_CpG.sorted\n", "!head ID_CpG.sorted" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# For this analysis, *Symbiodinium* sequences were removed. 
The joins below use ID_CpG.sorted2, the file generated by Spist_zoox_removal.ipynb" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!join ID_CpG.sorted2 Spist_blastx_uniprot_sql.tab.sorted | awk '{print $1, \"\\t\", $2}' > Spist_cpg_anno" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_contig00035 \t 0.335179\n", "Spi_contig00040 \t 0.854266\n", "Spi_contig00046 \t 0.196592\n", "Spi_contig00075 \t 0.871739\n", "Spi_contig00094 \t 0.131292\n", "Spi_contig00095 \t 0.684932\n", "Spi_contig00834 \t 0.654594\n", "Spi_contig04187 \t 0.876088\n", "Spi_isotig00002 \t 0.910243\n", "Spi_isotig00005 \t 0.900018\n", " 7061 14122 189272 Spist_cpg_anno\n" ] } ], "source": [ "!head Spist_cpg_anno\n", "!wc Spist_cpg_anno" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!join ID_CpG.sorted2 Spist_GOSlim.sorted > Spist_cpg_GOslim" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_contig00032 0.484363 cell cycle and proliferation\r\n", "Spi_contig00032 0.484363 cell organization and biogenesis\r\n", "Spi_contig00032 0.484363 developmental processes\r\n", "Spi_contig00032 0.484363 other biological processes\r\n", "Spi_contig00032 0.484363 protein metabolism\r\n", "Spi_contig00035 0.335179 cell cycle and proliferation\r\n", "Spi_contig00035 0.335179 cell organization and biogenesis\r\n", "Spi_contig00035 0.335179 death\r\n", "Spi_contig00035 0.335179 developmental processes\r\n", "Spi_contig00035 0.335179 other biological processes\r\n" ] } ], "source": [ "!head Spist_cpg_GOslim" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_contig00032 \t 
0.484363 \t cell cycle and proliferation\r\n", "Spi_contig00032 \t 0.484363 \t cell organization and biogenesis\r\n", "Spi_contig00032 \t 0.484363 \t developmental processes \r\n", "Spi_contig00032 \t 0.484363 \t other biological processes \r\n", "Spi_contig00032 \t 0.484363 \t protein metabolism \r\n", "Spi_contig00035 \t 0.335179 \t cell cycle and proliferation\r\n", "Spi_contig00035 \t 0.335179 \t cell organization and biogenesis\r\n", "Spi_contig00035 \t 0.335179 \t death \r\n", "Spi_contig00035 \t 0.335179 \t developmental processes \r\n", "Spi_contig00035 \t 0.335179 \t other biological processes \r\n" ] } ], "source": [ "#Putting tabs in between columns (the GOSlim term is rebuilt from fields $3-$6 only, so a term longer than four words would be truncated)\n", "!awk '{print $1, \"\\t\", $2, \"\\t\", $3, $4, $5, $6}' Spist_cpg_GOslim > Spist_cpg_GOslim.tab\n", "!head Spist_cpg_GOslim.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now time to plot the data using pandas and matplotlib" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "1 | \n", "2 | \n", "
---|---|---|---|
0 | \n", "Spi_contig00032 | \n", "0.484363 | \n", "cell cycle and proliferation | \n", "
1 | \n", "Spi_contig00032 | \n", "0.484363 | \n", "cell organization and biogenesis | \n", "
2 | \n", "Spi_contig00032 | \n", "0.484363 | \n", "developmental processes | \n", "
3 | \n", "Spi_contig00032 | \n", "0.484363 | \n", "other biological processes | \n", "
4 | \n", "Spi_contig00032 | \n", "0.484363 | \n", "protein metabolism | \n", "
5 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "cell cycle and proliferation | \n", "
6 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "cell organization and biogenesis | \n", "
7 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "death | \n", "
8 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "developmental processes | \n", "
9 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "other biological processes | \n", "
10 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "other metabolic processes | \n", "
11 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "protein metabolism | \n", "
12 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "RNA metabolism | \n", "
13 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "signal transduction | \n", "
14 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "stress response | \n", "
15 | \n", "Spi_contig00035 | \n", "0.335179 | \n", "transport | \n", "
16 | \n", "Spi_contig00040 | \n", "0.854266 | \n", "other metabolic processes | \n", "
17 | \n", "Spi_contig00040 | \n", "0.854266 | \n", "transport | \n", "
18 | \n", "Spi_contig00046 | \n", "0.196592 | \n", "other metabolic processes | \n", "
19 | \n", "Spi_contig00075 | \n", "0.871739 | \n", "other metabolic processes | \n", "
20 | \n", "Spi_contig00075 | \n", "0.871739 | \n", "transport | \n", "
21 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "cell adhesion | \n", "
22 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "cell cycle and proliferation | \n", "
23 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "cell organization and biogenesis | \n", "
24 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "death | \n", "
25 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "developmental processes | \n", "
26 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "DNA metabolism | \n", "
27 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "other biological processes | \n", "
28 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "other metabolic processes | \n", "
29 | \n", "Spi_contig00094 | \n", "0.131292 | \n", "protein metabolism | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
13603 | \n", "Spi_isotig14901 | \n", "0.815894 | \n", "other metabolic processes | \n", "
13604 | \n", "Spi_isotig14913 | \n", "0.841359 | \n", "signal transduction | \n", "
13605 | \n", "Spi_isotig14926 | \n", "0.391261 | \n", "protein metabolism | \n", "
13606 | \n", "Spi_isotig14932 | \n", "0.626017 | \n", "protein metabolism | \n", "
13607 | \n", "Spi_isotig14936 | \n", "0.253558 | \n", "other metabolic processes | \n", "
13608 | \n", "Spi_isotig14955 | \n", "0.534380 | \n", "cell cycle and proliferation | \n", "
13609 | \n", "Spi_isotig14955 | \n", "0.534380 | \n", "cell organization and biogenesis | \n", "
13610 | \n", "Spi_isotig14955 | \n", "0.534380 | \n", "developmental processes | \n", "
13611 | \n", "Spi_isotig14956 | \n", "0.431596 | \n", "protein metabolism | \n", "
13612 | \n", "Spi_isotig14959 | \n", "0.958655 | \n", "cell organization and biogenesis | \n", "
13613 | \n", "Spi_isotig14962 | \n", "0.348816 | \n", "other metabolic processes | \n", "
13614 | \n", "Spi_isotig14962 | \n", "0.348816 | \n", "transport | \n", "
13615 | \n", "Spi_isotig14966 | \n", "0.683519 | \n", "protein metabolism | \n", "
13616 | \n", "Spi_isotig14969 | \n", "0.480887 | \n", "DNA metabolism | \n", "
13617 | \n", "Spi_isotig14969 | \n", "0.480887 | \n", "protein metabolism | \n", "
13618 | \n", "Spi_isotig14969 | \n", "0.480887 | \n", "signal transduction | \n", "
13619 | \n", "Spi_isotig14969 | \n", "0.480887 | \n", "stress response | \n", "
13620 | \n", "Spi_isotig14970 | \n", "0.269739 | \n", "cell organization and biogenesis | \n", "
13621 | \n", "Spi_isotig14970 | \n", "0.269739 | \n", "other biological processes | \n", "
13622 | \n", "Spi_isotig14970 | \n", "0.269739 | \n", "other metabolic processes | \n", "
13623 | \n", "Spi_isotig14970 | \n", "0.269739 | \n", "RNA metabolism | \n", "
13624 | \n", "Spi_isotig14970 | \n", "0.269739 | \n", "stress response | \n", "
13625 | \n", "Spi_isotig14981 | \n", "0.202625 | \n", "signal transduction | \n", "
13626 | \n", "Spi_isotig14993 | \n", "0.663237 | \n", "death | \n", "
13627 | \n", "Spi_isotig14993 | \n", "0.663237 | \n", "other biological processes | \n", "
13628 | \n", "Spi_isotig14993 | \n", "0.663237 | \n", "stress response | \n", "
13629 | \n", "Spi_isotig14993 | \n", "0.663237 | \n", "transport | \n", "
13630 | \n", "Spi_isotig14997 | \n", "0.264544 | \n", "other metabolic processes | \n", "
13631 | \n", "Spi_isotig14998 | \n", "0.575532 | \n", "other biological processes | \n", "
13632 | \n", "Spi_isotig15025 | \n", "0.885773 | \n", "protein metabolism | \n", "
13633 rows × 3 columns
\n", "\n", " | 0 | \n", "
---|---|
0 | \n", "0.484363 | \n", "
1 | \n", "0.335179 | \n", "
2 | \n", "0.854266 | \n", "
3 | \n", "0.867137 | \n", "
4 | \n", "0.196592 | \n", "
5 | \n", "0.871739 | \n", "
6 | \n", "0.937628 | \n", "
7 | \n", "0.131292 | \n", "
8 | \n", "0.684932 | \n", "
9 | \n", "0.584255 | \n", "
10 | \n", "0.857523 | \n", "
11 | \n", "1.046390 | \n", "
12 | \n", "0.981477 | \n", "
13 | \n", "0.654594 | \n", "
14 | \n", "0.631948 | \n", "
15 | \n", "0.798651 | \n", "
16 | \n", "0.876088 | \n", "
17 | \n", "0.926440 | \n", "
18 | \n", "0.910725 | \n", "
19 | \n", "0.910243 | \n", "
20 | \n", "0.896155 | \n", "
21 | \n", "0.895683 | \n", "
22 | \n", "0.900018 | \n", "
23 | \n", "0.844576 | \n", "
24 | \n", "0.932216 | \n", "
25 | \n", "0.987775 | \n", "
26 | \n", "0.987220 | \n", "
27 | \n", "0.908013 | \n", "
28 | \n", "0.972397 | \n", "
29 | \n", "0.971853 | \n", "
... | \n", "... | \n", "
14883 | \n", "0.077346 | \n", "
14884 | \n", "0.995899 | \n", "
14885 | \n", "0.805340 | \n", "
14886 | \n", "0.193988 | \n", "
14887 | \n", "1.762730 | \n", "
14888 | \n", "1.116090 | \n", "
14889 | \n", "0.735748 | \n", "
14890 | \n", "0.629234 | \n", "
14891 | \n", "1.161310 | \n", "
14892 | \n", "0.552857 | \n", "
14893 | \n", "0.122963 | \n", "
14894 | \n", "0.717703 | \n", "
14895 | \n", "0.896540 | \n", "
14896 | \n", "1.407900 | \n", "
14897 | \n", "0.808355 | \n", "
14898 | \n", "0.322698 | \n", "
14899 | \n", "1.115180 | \n", "
14900 | \n", "0.466681 | \n", "
14901 | \n", "0.419544 | \n", "
14902 | \n", "0.427099 | \n", "
14903 | \n", "0.843343 | \n", "
14904 | \n", "0.790032 | \n", "
14905 | \n", "0.885773 | \n", "
14906 | \n", "0.000000 | \n", "
14907 | \n", "1.242950 | \n", "
14908 | \n", "1.104080 | \n", "
14909 | \n", "0.430511 | \n", "
14910 | \n", "0.640071 | \n", "
14911 | \n", "0.899106 | \n", "
14912 | \n", "0.506757 | \n", "
14913 rows × 1 columns
\n", "