{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculating CpG ratio for the *Acropora palmata* transcriptome" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This workflow calculates CpG ratio, or CpG O/E, for contigs in the *Acropora palmata* [transcriptome](https://usegalaxy.org/datasets/cb51c4a06d7ae94e/display?to_ext=fasta). CpG ratio is an estimate of germline DNA methylation.\n", "\n", "This workflow is an extension of another IPython notebook workflow, `Apalm_blast_anno.ipynb`, that generates an annotation of the same transcriptome. This workflow assumes that you have created the directories and files specified in the annotation workflow." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/data/Apalm\n" ] } ], "source": [ "cd .data/Apalm" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">AOKF1013_g2_c length=710 Acc=Q9DE13 Description=Bromodomainadjacenttozincfingerdomainprotein2B\n", "AGGGCAATTGAGTCGCAAAGAAAACAAGAGGAGCGTGAAAGATTAAAGGAAGAGAAAAAAATGGAAAAGGAACTTCAAAGAGAGAAAAAGCTTGAGCAAAAGAGAAGGGAGATGATTTTAGCCCGTGAACTGAAAAAGCCAGTAGAAGATATGGTTTTAAAGGATAGCAAGACACTTCCTGCTTTCTCCAGAGTTGTGGGCCTTAAAATACCAGGGGACGCATTTGCTGACTTGTTGATGGTTCAGGAATTTGTGCACAATTTTAGTGAAGCCTTGGAACTTGATTCCAACGAAGTCCCTTCCTTGTGGGAAATGCAGTTGTCATTGTTAAATGACAGCAGTGAGGATGTCCTCGTGCCACTTTGTCAGAGTCTTCTGATGTCTGCATTAGAGGATCCTGGCTGTGAGGGGCCTGATTCATTCACAATGCTTGGAGTTGCATTAGCCAAAGTGGAATTGAATGAAACAAACTTCTCTGAAGTCTTGAGGCTGTTTATAATTTCAAGAAATGCTGGTGACCCTCATCCTTTGGCAGAAGCTTTCATCAGTACACCTTTCCAAGCACTCACCATGTCAGCTAAGGCTGGAGTCTTGGGTTACCTGTGCAATGAACTGCTGTGCAGTAGAACAATATGCAAGGAAATAGAGAATAGTATTGAACACATGTCAAATTTACGTCGAGATAAGTGGGTTGTGGAAGGCAAGTTTGG\n", "\n", "number of seqs =\n", "88020\n" ] } ], "source": [ "#fasta file generated in 
Apalm_blast_anno.ipynb\n", "!head -2 Apalmata_assembled.fasta\n", "!echo \n", "!echo number of seqs =\n", "!fgrep -c \">\" Apalmata_assembled.fasta" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">AOKF1013_g2_c\n", "AGGGCAATTGAGTCGCAAAGAAAACAAGAGGAGCGTGAAAGATTAAAGGAAGAGAAAAAAATGGAAAAGGAACTTCAAAGAGAGAAAAAGCTTGAGCAAAAGAGAAGGGAGATGATTTTAGCCCGTGAACTGAAAAAGCCAGTAGAAGATATGGTTTTAAAGGATAGCAAGACACTTCCTGCTTTCTCCAGAGTTGTGGGCCTTAAAATACCAGGGGACGCATTTGCTGACTTGTTGATGGTTCAGGAATTTGTGCACAATTTTAGTGAAGCCTTGGAACTTGATTCCAACGAAGTCCCTTCCTTGTGGGAAATGCAGTTGTCATTGTTAAATGACAGCAGTGAGGATGTCCTCGTGCCACTTTGTCAGAGTCTTCTGATGTCTGCATTAGAGGATCCTGGCTGTGAGGGGCCTGATTCATTCACAATGCTTGGAGTTGCATTAGCCAAAGTGGAATTGAATGAAACAAACTTCTCTGAAGTCTTGAGGCTGTTTATAATTTCAAGAAATGCTGGTGACCCTCATCCTTTGGCAGAAGCTTTCATCAGTACACCTTTCCAAGCACTCACCATGTCAGCTAAGGCTGGAGTCTTGGGTTACCTGTGCAATGAACTGCTGTGCAGTAGAACAATATGCAAGGAAATAGAGAATAGTATTGAACACATGTCAAATTTACGTCGAGATAAGTGGGTTGTGGAAGGCAAGTTTGG\n", ">AOKF1022_b2_c\n", "GGGCAAAACGAACAAATTTTGACAATAATCTCTCAAATCTGTCAAGTCACGGCAGGGCTGCAAATAGCTATCGGGGAGGCGCCGGTCACGTCCGGTCAAACATGATTTTGCTCGGACAAGACCCGCTTTTGGCCGGTCAAATTTTAACAGTCGTAACTCTTACGATAGTGAACCCAGATTGCGCAGTAATCCTTTTATAACTACAAAACAATTGAATCCAAGTCGGTTTGGCAATAAAAGGTACTACTTTTACCACTCTTTGCTTTTCGCACTTTGCAATAAATTCTACGTAGAGGATTCTTGGTGTAGCGAGATTATTCTTCGTGGGAGTGCTTTCCGATCATTCAATCAATCAATCAATCACTTTATTTGTGAGTCAATCACGGTATCTCTCCAAAGATAAAACCCTCTACCAAGTGGGAACACCTAAGGCTAATAAAAATAACGGACGACTCGATGATTTGCCGTCGTGACAGGACTTGATGACATCGTGGAAATTTTCTAGTACCGGGAATTTCACTACCAAGAATTTGTCTAGTTTTATATTCGTTTTTTTTTATCATACATGTCCCTCGTGATTATCAAATAGTTAAAACTTAAAACTTGTCTGAACGAGTGAATAAAGGGTT\n", ">AOKF1022_g2_c\n", 
"TTTGGGGGGGGGCCGGTCCCGTCCGGTCAAACATGATTTTGCTCGGACAAGACCCGCTTTTGGCCGGTCAAATTTTAACAGTGGTAACTCTTTCGATAGGGAACCCAGATTGGGCCGTAATCCTTTTTTAACTTCAAAACAATTGAATCCAAGTCGGTTTGGCAATAAAAGGGACTACTTTTACCCCTCTTTGCTTTTCGCCCTTTGCAAAAAATTTTACGTAGGGGGTTTTTGGGGTAGGGGGATTTTTTTTTGGGGGGGGGGTTTCCGGTCATTCAATCAATCAATCAATCCCTTTTTTTGGGGGGCAATCCCGGGATTTCTCCAAAGATAAAACCCTTTTCCAAGGGGGGACCCCTAAGGGTAATAAAAAAAACGGGCGGCCCGATGATTTGCCGTCGTGACAGGACTTGAAGACATCGGGGAAATTTTTTAGTTCCGGGAATTTCCCTCCCAAGAATTTGTCTAGTTTTAAATTCGTTTTTTTTTATCAAACAAGTCCCCCGGGGTTTTCAAAAAGTTAAAACTTAAAACTTGTTTGAACGGGGGAATAAAGGGTTTaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaagccgg\n", ">AOKF1024_g2_c\n", "TTTTTTTAAACCCCTTTTTTTAAACGGTAGGGGGCCAAAAAATGTTGTTAAAAATTCCTTTAACTAAGGGTTTTTTTTGGGAAAAAAAAAAAAGGGGGGCCTTGTCCTTTTTTTTTTTTTTTTTTTTTTaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaccc\n", ">AOKF1029_g2_c\n", "TAAGCTGCTCCAACCACTGGTACATACATTAAATTTTATTTCCACTGAAAGGGTATAAATGACCCGATCAATTTTCATGTTTTTTTCCCCTCAAAGACAGTATGCCAACTATGGTATTTTCCATTTTACACGATTCCTTGTTCTTTTTTTTTTTGAGAGACCTTGTTTCTGTAACATGCAAATTGTCCCCAAGCTGAGGTAGGCATAAGTGTCCTACTGTTGTGATGATTTTCTCTTAATATATTTTAACTGGACATCATTGTATAGTTGCATATAGTTGTTTGGCTTTGCCTAGAACAAGAGGGTAAGACATTTTCTAACCTACCGACCTATAATCTGACTTTTAATATAGAAGAATTTTCATGAATAAACTGTTTCATGTCTCTAGTTCACTAACATAACATGTTTCATAAAAAAGTTCTTTTGAAAGTAAAATAAAGCCATTATTGACTTCTTTCATAATTTTAAAAATTAATCCAGGAAAAATTTATTTGCAAAAAGAGAAAATGGAAACATCAATAAATCACCATCAGCTACTTTCTTTTAATCTCTTTATGCAAAACCAAAATTTGCATTGGTTGTAAATTAGTCTGGTAACTAAAGTTTCTACCAGTTTCAAAACTGGCAGCTTTGAGTACCAACTCGATACCAATAGTAAATCTGTTTAGATCTAATCCAGCTGTAATGATTGTCGAAGACCAGAGCGTACTGCCCTGCATCAACTAACCCCC\n", ">F66KHFO02JZYYU\n", 
"ACTAAGTCTGGATATTCTAGCTTGGACTGCAGGGATATTATTAATCAACACAATAGATAATAATAATAATAATATTATTACTTCAACATTTGACTGACTGATTGGCTGACAAATGGGAAATAGTACCCATAATAGTCTGTAACATGATTTACACAGATTTCTCCTCAGGGACAATACAGTTGCGAAAATGATTCAGTTTCATGGGTGGGAGCAGTCTCCATCTGAGAGATGGCTTCCAGTAGTGGAAGCTCACGACGTTGTTTCTCAGGAATCAAACCTTCACACTGCAACTCAGCTGTTAAACCAAAACATTAGCATGCATGCAATTCCTGACCTCAATGAAAATAATAACCTTGAATCAACTTTGTATCTGGAAACTGCCATTTCAAATTCCTCTGGCAGTAAAAGAACAACAGCACATGCAGTCTTCGACATCTGATTCACATTTGTNCAGATGACTTGGTCAATC\n", ">F66KHFO02JZZ42\n", "ATTTAACTCAGTATCAGAATTCATCCGCTCTACTTGTTTTGCACCGTGACAAGTTGTTTTCGGGTGAAAGGATTTCAAGCGGAACTGAGCTTCATCTCGCTTTCAACCCACTACACAGTGGCTGAAAGACGAGCCTTAAGCTCTCTAATAACGCAACAGCAATAAGGAGAAATGAACGCTGGGAGAACAGAAGACGAGTTTTTCTGCGAAGGAGATTCAAGAAGGCACCACCGCTCCGTCTCCCATTTGTGGCGCGTCTGGCACTTATACGAGGAATTTGGCTTCCTCTAATGTCCCTCATCTACTCAGCTTGTCGGGTGGCGAGGAAACACTATTATAAGAGCCCCCACATCAACTCAACAGGTGGTTCCTCTTTGTTTAACTTGGCAACAACTTTGTGGACTCATACCCGTAATTTTAGCTCTTTAGTAGAATAAACATCATGTCCACCACTTATGACATATGATTGATATTATTCTGGTATTT\n", ">F66KHFO02JZZ9J\n", "GCAACAATGAAGTGGACAAAAACACTGACAAGTTTGCGACTTGTCCAGGAATGCACTTTAACGAAATAAAAAACATTCAATAAGCATAATCAAATGAAGTCCAAGAAGGGCCGTGAATGAGATTGCGAATACTACAAAATTGCAAAATTTACAATAAATAGGATCAGGACAGTTAGGTAAATAGTCTGAGAAAAAGAACAAAAGTTATACAAAACCGAAATAAGGCCCTGAAGGGAAGGCTATACCTCTTGATACTTGATAAATGGTGTACATAGCAGCTGGTACTCCGGAGTTCAAACGACATGAATGATTAAAGCAATTTTGGGTAGGTTTATTTAAGCAAGGACTTTCTTGGGTGTTAAAAAAAATGAGAGGATTCAGGATAGTTTCCACGTATCCCTGAGTG\n", ">F66KHFO02JZZML\n", "ACTTTCATTCAAGGGAGNACCAAAGACAGCCTTCCACACAGTAAGATGTAAGACTGCATCTCTATGTTTCATAAAACTATTGCAATTTAACTGCCGCATTCCTCTCAATTTCCGCGGTCAAAAACACAGCTGTTTATAACCCCTTATATACTTTCTGTGAAATTCACACGCAAAGTTAAATTCTTGATCTTTGTATTGAAATTTACTTGTAAGTGGACGGGTATTCTGCAATCTAGCCTTTGCTTCTATTCATTTTAGATGTGAGAACTTGTCTACCGGCGAAGAAGATAATTTTACATTATATTTTTTCCGGCGTTTAAGCATAATTAGGAATAATTATGGTGAAAGATGAAACAGGATGTTGATCCGACAAAGCAGGAGCCATCTGGCTGTTCAGCGAGACC\n", ">F66KHFO02JZZVO\n", 
"GTGAATTTCCAACTGATAAGCTCTTCCTTTATCACTCAAAGTTCACTCGTGTGAGGCACACAGTCCTGGATGACCGCATTGTATGTCCGGAAGACTTTGTTCTTGATCTAAACCAAAGTTTCTAAGATCATGTTACCAATTTGAAATTTCGGCATTTGGAGCTTCTTTATCTTCTGCCACAAGAATCTTTGGAACTGTAACTCGAGGTGAAGGGGTACGACTAACACCCATCTTTGCAAACTCAAATTTAAACCCAGGCGTGTGGTGAAAGGGTACCACTAACACTCATCTTTGACAAACTCAAATTCAAACCCAGGTGTCTATTACTATATTTTACTCTAGCGGCAAGCGAACATCAAAGCCGAATGTATTGCCAAGAAAATCGCACCAAGCAAAATACCAAAAGGGGCGTACTATACAGAGACCAAAGTGACATTTAAATCGTGTTTATTGATTGATCTCGTAAACTGCAAATAAACGATAAAACGCAGGTATAAGACGTCTA\n" ] } ], "source": [ "#I remembered that this fasta is full of \" marks before some of the \">\"\n", "#Removing \" from fasta and printing first line w/out comments and looking at contig names\n", "!sed 's/\"//g' Apalmata_assembled.fasta | awk '{print $1}' > Apalm.fasta\n", "!head -10 Apalm.fasta\n", "!tail -10 Apalm.fasta" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "Converted 88020 FASTA records in 176040 lines to tabular format\r\n", "Total sequence length: 63829400\r\n", "\r\n" ] } ], "source": [ "#Converting FASTA to tabular format and placing output file in analyses directory\n", "!perl -e '$count=0; $len=0; while(<>) {s/\\r?\\n//; s/\\t/ /g; if (s/^>//) { if ($. != 1) {print \"\\n\"} s/ |$/\\t/; $count++; $_ .= \"\\t\";} else {s/ //g; $len += length($_)} print $_;} print \"\\n\"; warn \"\\nConverted $count FASTA records in $. 
lines to tabular format\\nTotal sequence length: $len\\n\\n\";' \\\n", "Apalm.fasta > ../../analyses/Apalm/fasta2tab" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/analyses/Apalm\n" ] } ], "source": [ "cd ../../analyses/Apalm" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1013_g2_c\t\tAGGGCAATTGAGTCGCAAAGAAAACAAGAGGAGCGTGAAAGATTAAAGGAAGAGAAAAAAATGGAAAAGGAACTTCAAAGAGAGAAAAAGCTTGAGCAAAAGAGAAGGGAGATGATTTTAGCCCGTGAACTGAAAAAGCCAGTAGAAGATATGGTTTTAAAGGATAGCAAGACACTTCCTGCTTTCTCCAGAGTTGTGGGCCTTAAAATACCAGGGGACGCATTTGCTGACTTGTTGATGGTTCAGGAATTTGTGCACAATTTTAGTGAAGCCTTGGAACTTGATTCCAACGAAGTCCCTTCCTTGTGGGAAATGCAGTTGTCATTGTTAAATGACAGCAGTGAGGATGTCCTCGTGCCACTTTGTCAGAGTCTTCTGATGTCTGCATTAGAGGATCCTGGCTGTGAGGGGCCTGATTCATTCACAATGCTTGGAGTTGCATTAGCCAAAGTGGAATTGAATGAAACAAACTTCTCTGAAGTCTTGAGGCTGTTTATAATTTCAAGAAATGCTGGTGACCCTCATCCTTTGGCAGAAGCTTTCATCAGTACACCTTTCCAAGCACTCACCATGTCAGCTAAGGCTGGAGTCTTGGGTTACCTGTGCAATGAACTGCTGTGCAGTAGAACAATATGCAAGGAAATAGAGAATAGTATTGAACACATGTCAAATTTACGTCGAGATAAGTGGGTTGTGGAAGGCAAGTTTGG\n", "AOKF1022_b2_c\t\tGGGCAAAACGAACAAATTTTGACAATAATCTCTCAAATCTGTCAAGTCACGGCAGGGCTGCAAATAGCTATCGGGGAGGCGCCGGTCACGTCCGGTCAAACATGATTTTGCTCGGACAAGACCCGCTTTTGGCCGGTCAAATTTTAACAGTCGTAACTCTTACGATAGTGAACCCAGATTGCGCAGTAATCCTTTTATAACTACAAAACAATTGAATCCAAGTCGGTTTGGCAATAAAAGGTACTACTTTTACCACTCTTTGCTTTTCGCACTTTGCAATAAATTCTACGTAGAGGATTCTTGGTGTAGCGAGATTATTCTTCGTGGGAGTGCTTTCCGATCATTCAATCAATCAATCAATCACTTTATTTGTGAGTCAATCACGGTATCTCTCCAAAGATAAAACCCTCTACCAAGTGGGAACACCTAAGGCTAATAAAAATAACGGACGACTCGATGATTTGCCGTCGTGACAGGACTTGATGACATCGTGGAAATTTTCTAGTACCGGGAATTTCACTACCAAGAATTTGTCTAGTTTTATATTCGTTTTTTTTTATCATACATGTCCCTCGTGATTATCAAATAGTTAAAACTTAAAACTTGTCTGAACGAGTGAATAAAGGGTT\n", 
"F66KHFO02JZZML\t\tACTTTCATTCAAGGGAGNACCAAAGACAGCCTTCCACACAGTAAGATGTAAGACTGCATCTCTATGTTTCATAAAACTATTGCAATTTAACTGCCGCATTCCTCTCAATTTCCGCGGTCAAAAACACAGCTGTTTATAACCCCTTATATACTTTCTGTGAAATTCACACGCAAAGTTAAATTCTTGATCTTTGTATTGAAATTTACTTGTAAGTGGACGGGTATTCTGCAATCTAGCCTTTGCTTCTATTCATTTTAGATGTGAGAACTTGTCTACCGGCGAAGAAGATAATTTTACATTATATTTTTTCCGGCGTTTAAGCATAATTAGGAATAATTATGGTGAAAGATGAAACAGGATGTTGATCCGACAAAGCAGGAGCCATCTGGCTGTTCAGCGAGACC\n", "F66KHFO02JZZVO\t\tGTGAATTTCCAACTGATAAGCTCTTCCTTTATCACTCAAAGTTCACTCGTGTGAGGCACACAGTCCTGGATGACCGCATTGTATGTCCGGAAGACTTTGTTCTTGATCTAAACCAAAGTTTCTAAGATCATGTTACCAATTTGAAATTTCGGCATTTGGAGCTTCTTTATCTTCTGCCACAAGAATCTTTGGAACTGTAACTCGAGGTGAAGGGGTACGACTAACACCCATCTTTGCAAACTCAAATTTAAACCCAGGCGTGTGGTGAAAGGGTACCACTAACACTCATCTTTGACAAACTCAAATTCAAACCCAGGTGTCTATTACTATATTTTACTCTAGCGGCAAGCGAACATCAAAGCCGAATGTATTGCCAAGAAAATCGCACCAAGCAAAATACCAAAAGGGGCGTACTATACAGAGACCAAAGTGACATTTAAATCGTGTTTATTGATTGATCTCGTAAACTGCAAATAAACGATAAAACGCAGGTATAAGACGTCTA\n" ] } ], "source": [ "#Checking header on new tabular format file\n", "!head -2 fasta2tab\n", "!tail -2 fasta2tab" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "Added column with length of column 2 for 88020 lines.\r\n", "\r\n" ] } ], "source": [ "#Add column with length of sequence\n", "!perl -e '$col = 2;' -e 'while (<>) { s/\\r?\\n//; @F = split /\\t/, $_; $len = length($F[$col]); print \"$_\\t$len\\n\" } warn \"\\nAdded column with length of column $col for $. 
lines.\\n\\n\";' \\\n", "fasta2tab > tab_1\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 88020 264060 65591859 tab_1\r\n" ] } ], "source": [ "!wc tab_1" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#The file used to count Cs and Gs will only include the sequence\n", "!awk '{print $2}' tab_1 > tab_2" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#This counts CGs - both cases\n", "!echo \"CG\" | awk -F\\[Cc][Gg] '{print NF-1}' tab_2 > CG " ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Counts Cs\n", "!echo \"C\" | awk -F\\[Cc] '{print NF-1}' tab_2 > C " ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Counts Gs\n", "!echo \"G\" | awk -F\\[Gg] '{print NF-1}' tab_2 > G " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1013_g2_c\t\tAGGGCAATTGAGTCGCAAAGAAAACAAGAGGAGCGTGAAAGATTAAAGGAAGAGAAAAAAATGGAAAAGGAACTTCAAAGAGAGAAAAAGCTTGAGCAAAAGAGAAGGGAGATGATTTTAGCCCGTGAACTGAAAAAGCCAGTAGAAGATATGGTTTTAAAGGATAGCAAGACACTTCCTGCTTTCTCCAGAGTTGTGGGCCTTAAAATACCAGGGGACGCATTTGCTGACTTGTTGATGGTTCAGGAATTTGTGCACAATTTTAGTGAAGCCTTGGAACTTGATTCCAACGAAGTCCCTTCCTTGTGGGAAATGCAGTTGTCATTGTTAAATGACAGCAGTGAGGATGTCCTCGTGCCACTTTGTCAGAGTCTTCTGATGTCTGCATTAGAGGATCCTGGCTGTGAGGGGCCTGATTCATTCACAATGCTTGGAGTTGCATTAGCCAAAGTGGAATTGAATGAAACAAACTTCTCTGAAGTCTTGAGGCTGTTTATAATTTCAAGAAATGCTGGTGACCCTCATCCTTTGGCAGAAGCTTTCATCAGTACACCTTTCCAAGCACTCACCATGTCAGCTAAGGCTGGAGTCTTGGGTTACCTGTGCAATGAACTGCTGTGCAGTAGAACAATATGCAAGGAAATAGAGAATAGTATTGAACACATGTCAAATTTACGTCGAGATAAGTGGGTTGTGGAAGGCAAGTTTGG\t710\t8\t119\t183\r\n" ] } ], "source": [ 
"#Combining counts\n", "!paste tab_1 \\\n", "CG \\\n", "C \\\n", "G \\\n", "> comb\n", "!head -1 comb" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculating CpG o/e based on [Gavery and Roberts (2010)](http://www.biomedcentral.com/1471-2164/11/483)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$\\mathrm{CpG\\ o/e} = \\frac{\\text{number of CpG}}{\\text{number of C} \\times \\text{number of G}} \\times \\frac{l^{2}}{l-1}$$\n", "\n", "where $l$ is the sequence length in nucleotides." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Calculation of CpG o/e\n", "!awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))}' comb > ID_CpG #use ^ instead of ** for exponent" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1013_g2_c \t 0.261194\r\n", "AOKF1022_b2_c \t 1.21084\r\n", "AOKF1022_g2_c \t 0.933676\r\n", "AOKF1024_g2_c \t 0.46793\r\n", "AOKF1029_g2_c \t 0.305319\r\n", "AOKF1031_g2_c \t 0.476647\r\n", "AOKF1034_g2_c \t 0.250371\r\n", "AOKF1040_g2_c \t 1.11148\r\n", "AOKF1045_g2_c \t 0.415524\r\n", "AOKF1046_g2_c \t 0.278746\r\n" ] } ], "source": [ "!head ID_CpG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now joining CpG to annotation, but first must sort files." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1031_g2_c\tsp\tQ66I12\tCCD47_DANRE\t46.03\t239\t126\t3\t8\t721\t229\t465\t3e-48\t 171\r\n", "AOKF1045_g2_c\tsp\tB5DFQ4\tRHG26_XENTR\t45.99\t237\t126\t1\t2\t712\t74\t308\t1e-60\t 208\r\n", "AOKF1050_b2_c\tsp\tP81004\tVDAC2_XENLA\t67.41\t135\t44\t0\t315\t719\t3\t137\t5e-60\t 197\r\n", "AOKF1057_b2_c\tsp\tP56616\tUBE2C_XENLA\t55.48\t146\t60\t4\t109\t537\t34\t177\t3e-49\t 166\r\n", "AOKF1062_g2_c\tsp\tQ5PR73\tDIRA2_MOUSE\t37.35\t166\t99\t4\t4\t501\t7\t167\t2e-29\t 113\r\n", "AOKF1091_g2_c\tsp\tA2RRV3\tPATL1_DANRE\t38.69\t168\t83\t4\t195\t695\t402\t550\t4e-26\t 111\r\n", "AOKF1100_g2_c\tsp\tL0N7N1\tKIF14_MOUSE\t54.02\t87\t40\t0\t397\t657\t424\t510\t1e-26\t 112\r\n", "AOKF1114_g2_c\tsp\tQ9BX66\tSRBS1_HUMAN\t47.17\t53\t28\t0\t188\t346\t798\t850\t2e-10\t63.9\r\n", "AOKF1132_g2_c\tsp\tQ8VDS4\tRPR1A_MOUSE\t53.04\t247\t111\t2\t2\t727\t20\t266\t8e-84\t 259\r\n", "AOKF1164_g2_c\tsp\tQ9VHH9\tJHD1_DROME\t67.72\t127\t41\t0\t2\t382\t250\t376\t1e-59\t 209\r\n" ] } ], "source": [ "#Sorting Apalm Uniprot/Swissprot annotation file. 
This file was the result of work done in another notebook: \n", "#Apalm_blast_anno.ipynb\n", "!sort Apalm_blastx_uniprot_sql.tab | tail -n +2 > Apalm_blastx_uniprot_sql.tab.sorted\n", "!head Apalm_blastx_uniprot_sql.tab.sorted" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1045_g2_c\tcell organization and biogenesis\r", "\r\n", "AOKF1045_g2_c\tother biological processes\r", "\r\n", "AOKF1045_g2_c\tsignal transduction\r", "\r\n", "AOKF1050_b2_c\ttransport\r", "\r\n", "AOKF1057_b2_c\tcell cycle and proliferation\r", "\r\n", "AOKF1057_b2_c\tcell organization and biogenesis\r", "\r\n", "AOKF1057_b2_c\tother biological processes\r", "\r\n", "AOKF1057_b2_c\tprotein metabolism\r", "\r\n", "AOKF1062_g2_c\tother biological processes\r", "\r\n", "AOKF1062_g2_c\tother metabolic processes\r", "\r\n" ] } ], "source": [ "#Sorting Apalm GOSlim annotation file. This file was the result of work done in another notebook: Apalm_blast_anno.ipynb\n", "!sort Apalm_GOSlim.tab | tail -n +2 > Apalm_GOSlim.sorted\n", "!head Apalm_GOSlim.sorted" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1013_g2_c \t 0.261194\r\n", "AOKF1022_b2_c \t 1.21084\r\n", "AOKF1022_g2_c \t 0.933676\r\n", "AOKF1024_g2_c \t 0.46793\r\n", "AOKF1029_g2_c \t 0.305319\r\n", "AOKF1031_g2_c \t 0.476647\r\n", "AOKF1034_g2_c \t 0.250371\r\n", "AOKF1040_g2_c \t 1.11148\r\n", "AOKF1045_g2_c \t 0.415524\r\n", "AOKF1046_g2_c \t 0.278746\r\n" ] } ], "source": [ "#Sorting Apalm CpG file\n", "!sort ID_CpG > ID_CpG.sorted\n", "!head ID_CpG.sorted" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!join ID_CpG.sorted Apalm_blastx_uniprot_sql.tab.sorted | awk '{print $1, \"\\t\", $2}' > Apalm_cpg_anno" ] }, { "cell_type": "code", "execution_count": 
4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1031_g2_c \t 0.476647\r\n", "AOKF1045_g2_c \t 0.415524\r\n", "AOKF1050_b2_c \t 0.405247\r\n", "AOKF1057_b2_c \t 0.337031\r\n", "AOKF1062_g2_c \t 1.00104\r\n", "AOKF1091_g2_c \t 0.503552\r\n", "AOKF1100_g2_c \t 0.616876\r\n", "AOKF1114_g2_c \t 0.964931\r\n", "AOKF1132_g2_c \t 0.228244\r\n", "AOKF1164_g2_c \t 0.905474\r\n" ] } ], "source": [ "!head Apalm_cpg_anno" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!join ID_CpG.sorted Apalm_GOSlim.sorted > Apalm_cpg_GOslim" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1045_g2_c 0.415524 cell organization and biogenesis\r", "\r\n", "AOKF1045_g2_c 0.415524 other biological processes\r", "\r\n", "AOKF1045_g2_c 0.415524 signal transduction\r", "\r\n", "AOKF1050_b2_c 0.405247 transport\r", "\r\n", "AOKF1057_b2_c 0.337031 cell cycle and proliferation\r", "\r\n", "AOKF1057_b2_c 0.337031 cell organization and biogenesis\r", "\r\n", "AOKF1057_b2_c 0.337031 other biological processes\r", "\r\n", "AOKF1057_b2_c 0.337031 protein metabolism\r", "\r\n", "AOKF1062_g2_c 1.00104 other biological processes\r", "\r\n", "AOKF1062_g2_c 1.00104 other metabolic processes\r", "\r\n" ] } ], "source": [ "!head Apalm_cpg_GOslim" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AOKF1045_g2_c \t 0.415524 \t cell organization and biogenesis\r", "\r\n", "AOKF1045_g2_c \t 0.415524 \t other biological processes\r", " \r\n", "AOKF1045_g2_c \t 0.415524 \t signal transduction\r", " \r\n", "AOKF1050_b2_c \t 0.405247 \t transport\r", " \r\n", "AOKF1057_b2_c \t 0.337031 \t cell cycle and proliferation\r", "\r\n", "AOKF1057_b2_c \t 0.337031 \t cell organization and 
biogenesis\r", "\r\n", "AOKF1057_b2_c \t 0.337031 \t other biological processes\r", " \r\n", "AOKF1057_b2_c \t 0.337031 \t protein metabolism\r", " \r\n", "AOKF1062_g2_c \t 1.00104 \t other biological processes\r", " \r\n", "AOKF1062_g2_c \t 1.00104 \t other metabolic processes\r", " \r\n" ] } ], "source": [ "#Putting tabs in between columns\n", "!awk '{print $1, \"\\t\", $2, \"\\t\", $3, $4, $5, $6}' Apalm_cpg_GOslim > Apalm_cpg_GOslim.tab\n", "!head Apalm_cpg_GOslim.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now time to plot data using pandas and matplotlib" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "1 | \n", "2 | \n", "
---|---|---|---|
0 | \n", "AOKF1045_g2_c | \n", "0.415524 | \n", "cell organization and biogenesis | \n", "
1 | \n", "AOKF1045_g2_c | \n", "0.415524 | \n", "other biological processes | \n", "
2 | \n", "\n", " | NaN | \n", "NaN | \n", "
3 | \n", "AOKF1045_g2_c | \n", "0.415524 | \n", "signal transduction | \n", "
4 | \n", "\n", " | NaN | \n", "NaN | \n", "
5 | \n", "AOKF1050_b2_c | \n", "0.405247 | \n", "transport | \n", "
6 | \n", "\n", " | NaN | \n", "NaN | \n", "
7 | \n", "AOKF1057_b2_c | \n", "0.337031 | \n", "cell cycle and proliferation | \n", "
8 | \n", "AOKF1057_b2_c | \n", "0.337031 | \n", "cell organization and biogenesis | \n", "
9 | \n", "AOKF1057_b2_c | \n", "0.337031 | \n", "other biological processes | \n", "
10 | \n", "\n", " | NaN | \n", "NaN | \n", "
11 | \n", "AOKF1057_b2_c | \n", "0.337031 | \n", "protein metabolism | \n", "
12 | \n", "\n", " | NaN | \n", "NaN | \n", "
13 | \n", "AOKF1062_g2_c | \n", "1.001040 | \n", "other biological processes | \n", "
14 | \n", "\n", " | NaN | \n", "NaN | \n", "
15 | \n", "AOKF1062_g2_c | \n", "1.001040 | \n", "other metabolic processes | \n", "
16 | \n", "\n", " | NaN | \n", "NaN | \n", "
17 | \n", "AOKF1062_g2_c | \n", "1.001040 | \n", "signal transduction | \n", "
18 | \n", "\n", " | NaN | \n", "NaN | \n", "
19 | \n", "AOKF1091_g2_c | \n", "0.503552 | \n", "RNA metabolism | \n", "
20 | \n", "\n", " | NaN | \n", "NaN | \n", "
21 | \n", "AOKF1091_g2_c | \n", "0.503552 | \n", "cell organization and biogenesis | \n", "
22 | \n", "AOKF1114_g2_c | \n", "0.964931 | \n", "cell adhesion | \n", "
23 | \n", "\n", " | NaN | \n", "NaN | \n", "
24 | \n", "AOKF1114_g2_c | \n", "0.964931 | \n", "cell organization and biogenesis | \n", "
25 | \n", "AOKF1114_g2_c | \n", "0.964931 | \n", "other biological processes | \n", "
26 | \n", "\n", " | NaN | \n", "NaN | \n", "
27 | \n", "AOKF1114_g2_c | \n", "0.964931 | \n", "other metabolic processes | \n", "
28 | \n", "\n", " | NaN | \n", "NaN | \n", "
29 | \n", "AOKF1114_g2_c | \n", "0.964931 | \n", "signal transduction | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
133709 | \n", "F66KHFO02JZND7 | \n", "0.645784 | \n", "protein metabolism | \n", "
133710 | \n", "\n", " | NaN | \n", "NaN | \n", "
133711 | \n", "F66KHFO02JZND7 | \n", "0.645784 | \n", "signal transduction | \n", "
133712 | \n", "\n", " | NaN | \n", "NaN | \n", "
133713 | \n", "F66KHFO02JZO8X | \n", "0.858204 | \n", "RNA metabolism | \n", "
133714 | \n", "\n", " | NaN | \n", "NaN | \n", "
133715 | \n", "F66KHFO02JZO8X | \n", "0.858204 | \n", "other biological processes | \n", "
133716 | \n", "\n", " | NaN | \n", "NaN | \n", "
133717 | \n", "F66KHFO02JZO8X | \n", "0.858204 | \n", "stress response | \n", "
133718 | \n", "\n", " | NaN | \n", "NaN | \n", "
133719 | \n", "F66KHFO02JZOT7 | \n", "0.657806 | \n", "other metabolic processes | \n", "
133720 | \n", "\n", " | NaN | \n", "NaN | \n", "
133721 | \n", "F66KHFO02JZQBN | \n", "0.852378 | \n", "protein metabolism | \n", "
133722 | \n", "\n", " | NaN | \n", "NaN | \n", "
133723 | \n", "F66KHFO02JZTL8 | \n", "0.713145 | \n", "death | \n", "
133724 | \n", "\n", " | NaN | \n", "NaN | \n", "
133725 | \n", "F66KHFO02JZTL8 | \n", "0.713145 | \n", "transport | \n", "
133726 | \n", "\n", " | NaN | \n", "NaN | \n", "
133727 | \n", "F66KHFO02JZV0J | \n", "0.375928 | \n", "protein metabolism | \n", "
133728 | \n", "\n", " | NaN | \n", "NaN | \n", "
133729 | \n", "F66KHFO02JZWQ6 | \n", "0.488831 | \n", "transport | \n", "
133730 | \n", "\n", " | NaN | \n", "NaN | \n", "
133731 | \n", "F66KHFO02JZX3R | \n", "0.916672 | \n", "other metabolic processes | \n", "
133732 | \n", "\n", " | NaN | \n", "NaN | \n", "
133733 | \n", "F66KHFO02JZX9S | \n", "0.545160 | \n", "other biological processes | \n", "
133734 | \n", "\n", " | NaN | \n", "NaN | \n", "
133735 | \n", "F66KHFO02JZXKF | \n", "0.926412 | \n", "other biological processes | \n", "
133736 | \n", "\n", " | NaN | \n", "NaN | \n", "
133737 | \n", "F66KHFO02JZXKF | \n", "0.926412 | \n", "other metabolic processes | \n", "
133738 | \n", "\n", " | NaN | \n", "NaN | \n", "
133739 rows × 3 columns
\n", "\n", " | 0 | \n", "1 | \n", "
---|---|---|
0 | \n", "AOKF1031_g2_c | \n", "0.476647 | \n", "
1 | \n", "AOKF1045_g2_c | \n", "0.415524 | \n", "
2 | \n", "AOKF1050_b2_c | \n", "0.405247 | \n", "
3 | \n", "AOKF1057_b2_c | \n", "0.337031 | \n", "
4 | \n", "AOKF1062_g2_c | \n", "1.001040 | \n", "
5 | \n", "AOKF1091_g2_c | \n", "0.503552 | \n", "
6 | \n", "AOKF1100_g2_c | \n", "0.616876 | \n", "
7 | \n", "AOKF1114_g2_c | \n", "0.964931 | \n", "
8 | \n", "AOKF1132_g2_c | \n", "0.228244 | \n", "
9 | \n", "AOKF1164_g2_c | \n", "0.905474 | \n", "
10 | \n", "AOKF1206_g2_c | \n", "0.820971 | \n", "
11 | \n", "AOKF1221_g2_c | \n", "0.424964 | \n", "
12 | \n", "AOKF1230_g2_c | \n", "0.716592 | \n", "
13 | \n", "AOKF1238_g2_c | \n", "0.487211 | \n", "
14 | \n", "AOKF1251_g2_c | \n", "0.253120 | \n", "
15 | \n", "AOKF1269_g2_c | \n", "0.330159 | \n", "
16 | \n", "AOKF1336_g2_c | \n", "0.368121 | \n", "
17 | \n", "AOKF1356_g2_c | \n", "0.461341 | \n", "
18 | \n", "AOKF1427_b2_c | \n", "0.764298 | \n", "
19 | \n", "AOKF1478_g2_c | \n", "0.936591 | \n", "
20 | \n", "AOKF1514_g2_c | \n", "0.295433 | \n", "
21 | \n", "AOKF1531_b2_c | \n", "0.237932 | \n", "
22 | \n", "AOKF1549_b2_c | \n", "0.881497 | \n", "
23 | \n", "AOKF1585_g2_c | \n", "0.720776 | \n", "
24 | \n", "AOKF1597_g2_c | \n", "0.435898 | \n", "
25 | \n", "AOKF1614_g2_c | \n", "0.720420 | \n", "
26 | \n", "AOKF1658_g2_c | \n", "0.285014 | \n", "
27 | \n", "AOKF1670_g2_c | \n", "0.326164 | \n", "
28 | \n", "AOKF1737_g2_c | \n", "0.563409 | \n", "
29 | \n", "AOKF1741_g2_c | \n", "0.272728 | \n", "
... | \n", "... | \n", "... | \n", "
35273 | \n", "F66KHFO02JZ9IQ | \n", "0.481931 | \n", "
35274 | \n", "F66KHFO02JZ9XY | \n", "1.224440 | \n", "
35275 | \n", "F66KHFO02JZCNS | \n", "0.779820 | \n", "
35276 | \n", "F66KHFO02JZCTR | \n", "0.524576 | \n", "
35277 | \n", "F66KHFO02JZD9L | \n", "0.242735 | \n", "
35278 | \n", "F66KHFO02JZDTN | \n", "0.583960 | \n", "
35279 | \n", "F66KHFO02JZFC0 | \n", "0.668225 | \n", "
35280 | \n", "F66KHFO02JZG27 | \n", "0.846390 | \n", "
35281 | \n", "F66KHFO02JZG3R | \n", "0.852755 | \n", "
35282 | \n", "F66KHFO02JZG9W | \n", "0.825320 | \n", "
35283 | \n", "F66KHFO02JZH8B | \n", "0.594062 | \n", "
35284 | \n", "F66KHFO02JZJET | \n", "0.320926 | \n", "
35285 | \n", "F66KHFO02JZJKP | \n", "0.548232 | \n", "
35286 | \n", "F66KHFO02JZK7V | \n", "0.438391 | \n", "
35287 | \n", "F66KHFO02JZL1Y | \n", "0.429955 | \n", "
35288 | \n", "F66KHFO02JZLMO | \n", "0.494197 | \n", "
35289 | \n", "F66KHFO02JZMKS | \n", "0.631184 | \n", "
35290 | \n", "F66KHFO02JZMWV | \n", "0.838499 | \n", "
35291 | \n", "F66KHFO02JZND4 | \n", "0.282860 | \n", "
35292 | \n", "F66KHFO02JZND7 | \n", "0.645784 | \n", "
35293 | \n", "F66KHFO02JZO8X | \n", "0.858204 | \n", "
35294 | \n", "F66KHFO02JZOT7 | \n", "0.657806 | \n", "
35295 | \n", "F66KHFO02JZQBN | \n", "0.852378 | \n", "
35296 | \n", "F66KHFO02JZRK4 | \n", "0.620927 | \n", "
35297 | \n", "F66KHFO02JZTL8 | \n", "0.713145 | \n", "
35298 | \n", "F66KHFO02JZV0J | \n", "0.375928 | \n", "
35299 | \n", "F66KHFO02JZWQ6 | \n", "0.488831 | \n", "
35300 | \n", "F66KHFO02JZX3R | \n", "0.916672 | \n", "
35301 | \n", "F66KHFO02JZX9S | \n", "0.545160 | \n", "
35302 | \n", "F66KHFO02JZXKF | \n", "0.926412 | \n", "
35303 rows × 2 columns
\n", "\n", " | 0 | \n", "1 | \n", "2 | \n", "
---|---|---|---|
0 | \n", "AOKF1050_b2_c | \n", "0.405247 | \n", "transport | \n", "
1 | \n", "\n", " | NaN | \n", "NaN | \n", "
2 | \n", "AOKF386_g2_c | \n", "1.023260 | \n", "cell organization and biogenesis | \n", "
3 | \n", "AOKG1730_b2_c | \n", "0.852003 | \n", "RNA metabolism | \n", "
4 | \n", "\n", " | NaN | \n", "NaN | \n", "
5 | \n", "AOKG1730_b2_c | \n", "0.852003 | \n", "cell organization and biogenesis | \n", "
6 | \n", "CAOG977_b1_c | \n", "0.424244 | \n", "protein metabolism | \n", "
7 | \n", "\n", " | NaN | \n", "NaN | \n", "
8 | \n", "CAOH2436_g1_c | \n", "0.546901 | \n", "other biological processes | \n", "
9 | \n", "\n", " | NaN | \n", "NaN | \n", "
10 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "cell adhesion | \n", "
11 | \n", "\n", " | NaN | \n", "NaN | \n", "
12 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "cell organization and biogenesis | \n", "
13 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "developmental processes | \n", "
14 | \n", "\n", " | NaN | \n", "NaN | \n", "
15 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "other biological processes | \n", "
16 | \n", "\n", " | NaN | \n", "NaN | \n", "
17 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "other metabolic processes | \n", "
18 | \n", "\n", " | NaN | \n", "NaN | \n", "
19 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "protein metabolism | \n", "
20 | \n", "\n", " | NaN | \n", "NaN | \n", "
21 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "signal transduction | \n", "
22 | \n", "\n", " | NaN | \n", "NaN | \n", "
23 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "stress response | \n", "
24 | \n", "\n", " | NaN | \n", "NaN | \n", "
25 | \n", "CAOI641_b2_c | \n", "0.898445 | \n", "other metabolic processes | \n", "
26 | \n", "\n", " | NaN | \n", "NaN | \n", "
27 | \n", "CAOI641_b2_c | \n", "0.898445 | \n", "protein metabolism | \n", "
28 | \n", "\n", " | NaN | \n", "NaN | \n", "
29 | \n", "CAWS1371_b2_c | \n", "0.331062 | \n", "RNA metabolism | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
3531 | \n", "\n", " | NaN | \n", "NaN | \n", "
3532 | \n", "F66KHFO02JQPWC | \n", "0.436226 | \n", "other biological processes | \n", "
3533 | \n", "\n", " | NaN | \n", "NaN | \n", "
3534 | \n", "F66KHFO02JQPWC | \n", "0.436226 | \n", "other metabolic processes | \n", "
3535 | \n", "\n", " | NaN | \n", "NaN | \n", "
3536 | \n", "F66KHFO02JQPWC | \n", "0.436226 | \n", "transport | \n", "
3537 | \n", "\n", " | NaN | \n", "NaN | \n", "
3538 | \n", "F66KHFO02JRJIB | \n", "0.379576 | \n", "cell-cell signaling | \n", "
3539 | \n", "\n", " | NaN | \n", "NaN | \n", "
3540 | \n", "F66KHFO02JRJIB | \n", "0.379576 | \n", "developmental processes | \n", "
3541 | \n", "\n", " | NaN | \n", "NaN | \n", "
3542 | \n", "F66KHFO02JRJIB | \n", "0.379576 | \n", "other metabolic processes | \n", "
3543 | \n", "\n", " | NaN | \n", "NaN | \n", "
3544 | \n", "F66KHFO02JSUBP | \n", "0.724542 | \n", "other metabolic processes | \n", "
3545 | \n", "\n", " | NaN | \n", "NaN | \n", "
3546 | \n", "F66KHFO02JUJ1P | \n", "0.259713 | \n", "other biological processes | \n", "
3547 | \n", "\n", " | NaN | \n", "NaN | \n", "
3548 | \n", "F66KHFO02JUJ1P | \n", "0.259713 | \n", "stress response | \n", "
3549 | \n", "\n", " | NaN | \n", "NaN | \n", "
3550 | \n", "F66KHFO02JUJ1P | \n", "0.259713 | \n", "transport | \n", "
3551 | \n", "\n", " | NaN | \n", "NaN | \n", "
3552 | \n", "F66KHFO02JUUKW | \n", "0.593258 | \n", "RNA metabolism | \n", "
3553 | \n", "\n", " | NaN | \n", "NaN | \n", "
3554 | \n", "F66KHFO02JVQ28 | \n", "0.469748 | \n", "cell organization and biogenesis | \n", "
3555 | \n", "F66KHFO02JVQ28 | \n", "0.469748 | \n", "other biological processes | \n", "
3556 | \n", "\n", " | NaN | \n", "NaN | \n", "
3557 | \n", "F66KHFO02JVQ28 | \n", "0.469748 | \n", "other metabolic processes | \n", "
3558 | \n", "\n", " | NaN | \n", "NaN | \n", "
3559 | \n", "F66KHFO02JVXP5 | \n", "0.203597 | \n", "other metabolic processes | \n", "
3560 | \n", "\n", " | NaN | \n", "NaN | \n", "
3561 rows × 3 columns
\n", "\n", " | 0 | \n", "1 | \n", "
---|---|---|
0 | \n", "AOKF1050_b2_c | \n", "0.405247 | \n", "
1 | \n", "AOKF386_g2_c | \n", "1.023260 | \n", "
2 | \n", "AOKG1730_b2_c | \n", "0.852003 | \n", "
3 | \n", "AOKG1840_b2_c | \n", "0.623692 | \n", "
4 | \n", "CAOG977_b1_c | \n", "0.424244 | \n", "
5 | \n", "CAOH2044_b1_c | \n", "0.720682 | \n", "
6 | \n", "CAOH2436_g1_c | \n", "0.546901 | \n", "
7 | \n", "CAOH2554_b1_c | \n", "0.589779 | \n", "
8 | \n", "CAOI2629_b1_c | \n", "1.070110 | \n", "
9 | \n", "CAOI641_b2_c | \n", "0.898445 | \n", "
10 | \n", "CAWS1371_b2_c | \n", "0.331062 | \n", "
11 | \n", "CAWS1482_b2_c | \n", "0.706679 | \n", "
12 | \n", "CCHX11969_b1_c | \n", "0.932186 | \n", "
13 | \n", "CCHX12472_b1_c | \n", "0.844341 | \n", "
14 | \n", "CCHX12647_b1_c | \n", "0.909217 | \n", "
15 | \n", "CCHX1314_b1_c | \n", "0.788706 | \n", "
16 | \n", "CCHX13764_b1_c | \n", "1.162150 | \n", "
17 | \n", "CCHX14952_b1_c | \n", "0.363122 | \n", "
18 | \n", "CCHX15242_b1_c | \n", "0.408760 | \n", "
19 | \n", "CCHX16619_b1_c | \n", "0.783179 | \n", "
20 | \n", "CCHX2147_b1_c | \n", "0.287108 | \n", "
21 | \n", "CCHX3889_b1_c | \n", "0.041551 | \n", "
22 | \n", "CCHX4084_b1_c | \n", "0.752735 | \n", "
23 | \n", "CCHX5422_b1_c | \n", "0.549186 | \n", "
24 | \n", "CCHX7039_b1_c | \n", "0.308911 | \n", "
25 | \n", "CCHX7275_b1_c | \n", "0.905557 | \n", "
26 | \n", "CCHX8585_b1_c | \n", "0.209281 | \n", "
27 | \n", "CCHX9155_b1_c | \n", "0.750502 | \n", "
28 | \n", "CCHX9618_b1_c | \n", "0.726452 | \n", "
29 | \n", "Contig_10042 | \n", "0.886648 | \n", "
... | \n", "... | \n", "... | \n", "
964 | \n", "F66KHFO02IGNHO | \n", "0.558197 | \n", "
965 | \n", "F66KHFO02IHEKI | \n", "0.691784 | \n", "
966 | \n", "F66KHFO02II348 | \n", "0.215927 | \n", "
967 | \n", "F66KHFO02IJ17R | \n", "0.797105 | \n", "
968 | \n", "F66KHFO02IKGMI | \n", "0.962609 | \n", "
969 | \n", "F66KHFO02ILIKA | \n", "0.256732 | \n", "
970 | \n", "F66KHFO02ILJOW | \n", "0.829998 | \n", "
971 | \n", "F66KHFO02ILW21 | \n", "0.873883 | \n", "
972 | \n", "F66KHFO02IN4WY | \n", "0.301022 | \n", "
973 | \n", "F66KHFO02INB9A | \n", "0.708336 | \n", "
974 | \n", "F66KHFO02INJ32 | \n", "0.753148 | \n", "
975 | \n", "F66KHFO02IO4B5 | \n", "0.737794 | \n", "
976 | \n", "F66KHFO02IOCNQ | \n", "0.982825 | \n", "
977 | \n", "F66KHFO02IRSL9 | \n", "0.862706 | \n", "
978 | \n", "F66KHFO02J2KQP | \n", "0.859879 | \n", "
979 | \n", "F66KHFO02J2R55 | \n", "0.703131 | \n", "
980 | \n", "F66KHFO02J3DCX | \n", "0.777464 | \n", "
981 | \n", "F66KHFO02J3VJ6 | \n", "1.039200 | \n", "
982 | \n", "F66KHFO02JCQIY | \n", "0.831741 | \n", "
983 | \n", "F66KHFO02JEH8U | \n", "0.473770 | \n", "
984 | \n", "F66KHFO02JH11M | \n", "0.380762 | \n", "
985 | \n", "F66KHFO02JH6QK | \n", "0.813869 | \n", "
986 | \n", "F66KHFO02JLTK7 | \n", "0.677554 | \n", "
987 | \n", "F66KHFO02JQPWC | \n", "0.436226 | \n", "
988 | \n", "F66KHFO02JRJIB | \n", "0.379576 | \n", "
989 | \n", "F66KHFO02JSUBP | \n", "0.724542 | \n", "
990 | \n", "F66KHFO02JUJ1P | \n", "0.259713 | \n", "
991 | \n", "F66KHFO02JUUKW | \n", "0.593258 | \n", "
992 | \n", "F66KHFO02JVQ28 | \n", "0.469748 | \n", "
993 | \n", "F66KHFO02JVXP5 | \n", "0.203597 | \n", "
994 rows × 2 columns
\n", "