{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculating CpG ratio for *Pocillopora damicornis* transcriptome"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
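    "This workflow calculates the CpG ratio (the observed-to-expected CpG frequency, CpG O/E) for each contig in the *Pocillopora damicornis* [transcriptome](http://2ei.univ-perp.fr/telechargement/transcriptomes/blast2go_fasta_Pdamv2.zip). The CpG ratio is used as an estimate of germline DNA methylation: a low ratio indicates CpG depletion, which is associated with historically methylated sequences.\n",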
    "\n",
    "This workflow extends another IPython notebook, `Pdam_blast_anno.ipynb`, which generates an annotation of the same transcriptome. It assumes that you have already created the directories and files specified in that annotation workflow.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/data/Pdam\n"
     ]
    }
   ],
   "source": [
    "# Switch to the Pdam data directory. The path is relative to the current\n",
    "# working directory, so run this cell only once per kernel session.\n",
    "%cd ../data/Pdam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1|spectrin alpha chain\n",
      "tatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgcc
attctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaactt
agcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgca
gtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\n",
      "\n",
      "number of seqs =\n",
      "72890\n"
     ]
    }
   ],
   "source": [
    "# FASTA file: show the first two lines (the first header and its sequence),\n",
    "# then report how many sequences the file contains.\n",
    "!head -2 blast2go_fasta_Pdamv2.fasta\n",
    "!echo \n",
    "!echo number of seqs =\n",
    "# Anchor the pattern at line start so only header lines are counted; an\n",
    "# unanchored \">\" would also count any line that merely contains that character.\n",
    "!grep -c \"^>\" blast2go_fasta_Pdamv2.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1tspectrin\n",
      "tatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgcc
attctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaactt
agcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgca
gtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\n",
      ">Locus_1685_Transcript_2/2_Confidence_1.000_Length_7457_transcripts_v2_2tspectrin\n",
      "tatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttagtgagagactgatagagctcagcctcagtaacatagagacgttttccaccctcagtgagcgccttgaaggcgttgatgacttcctggctagagccaacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgcc
attctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaactt
agcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgca
gtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\n",
      ">Locus_177_Transcript_12/12_Confidence_0.500_Length_6585_transcripts_v2_3tvitellogenin\n",
      "gctcgagtttgtgtgatgcctctttacagagagcacagattccttggtttggctcggcctcaaacaaagcttgcttcccacaggtgtaacaaagacgattttcatatgacgcggatgcagtccgtcactcttctcagcattatgcgaaggcgaattcaagggcaagtcactgatatagcgccatttctcctcatatcccttctgtggttcaatatcttcgatcatgtgttggaataacgtgacttgatccgtgttcatagggacggttgcgcgggtccattccacagggttcttggatgcatctttgtgaatatacacaactgtcgcgaggcgaacatgacggcgaccattcacgaatgtgatcttgaagagcttggggtagcgattatcatgcacagtctctaaacacccttgggaatttggcctcaccttaacgaatccctttccggtgacaaaaagtttcacaaattgatgatttccgtcgataaagaagactttgaagacagtaggtagctcgcttaagacgagttcattgtgctcattcacagacaggtattttccatttcctgccttcagagtccagttggcataacgatgatgatgatgtcccctggcaatctcagtaacttctccgatctcaaatccgtccagctctgtgctcctgtagattgaagaccaactcttggaacctgtgacaggtttcctgtacatcagccaggtgttgctcgccttgttagttgataaaatctcctgtctctcttgtggcaagtccatttctgtggtgtagctttgatttctgatgttgtactggaatctgccacctaatccagtagtgatattggtgtggatcgttagcttgctaccagtctccaactgaggcagagatactttcatttttccccaaagagaatgaagaacactaatgttcaattttccactaagagtgacttgatcgggaagaggaatgaagaatcggggttgaacctcagctttgaccgtggtgttcaaacgcaggaacaaaacgctgtccacagagaaatccaggggcagaccggcacacgagggctggcgtatcttcacttcaattggacggaggattttggagatattccattttttacctttcttcagcaggtccttcagatcgttgctaactctgatgactccatcttccattaactctttgatgtcatcataggtgaaataattatagaacagttcttttccaaaaactttgaagtagaaggaacctttggctttctcctcttcatgttcagcggttggcagctggtgttggatttccttgatttctttcccattcagactgttgtctaagcttcgtttcggtctgagcaagccaaacagactcttatcttcgctgtagtgtccccttggtcccatgacacgttggagcagctcttggattccctcaccacggatgccagtctcgaatagattcagagattttccaagcataccggtgtgtagtttgactttagagttgatagctctcgggatgaagcttgatggcgtggaaatcatgcgaagatcaactgcacttcccatctggagaagatcggagaagtcgccaaagtgaagccatttggagttcactgggtttatcttgtatctcttaagcgctcgaagggcgattcggagagcgattcgtgcggtttgagccattttttcgtcacatgggtatctagagaaagccatccccttgatgtaagacaccacaaaagatcgcacttgattggtgcgctcgtaatagagctgcttgactaccaagttgaacaccgctggaccaggtttgcagtcgcagatgatcacaaaacaagccatgcgcagttcggcgtcgttgtttggttgacggaagatctctaacaacactggcaataccttttgtgaaattttcggagcgattcttcgtagagcatacacagctgtgacgcgtaattccagagagttcctgttgtccttgataatttccaaa
agaggttggtaaaaatctctgtgaccgaaattaccgatgcccttcaagatgaagatcttttcacggtaactggcatactgcaaacgactcttaatctcagccacttggttggcaaggcaagtgacgtccttgtctttgcagaagtcatgcatcaaggcaccggcagtaaggtaacaggttttcttcagagtctcgtctgcttggacaacagaaccagaacagagatccacaacggtcaagcacatctttcctgttgggtttggagttaaggctagaccacgaataagtgagactgccctcccagagtccagttccaatgtcttgattttttctgagatgagttcaaagacctcctccttgtacacataaggcagagcttccagcagccattgcctgcgtttctcgtcattgtagcagttgtcccagatcttgataagagtttttttgcaggtcctgcgtaaggtttccaccaaacgactgaactgccaagacatcaaattattctcctccacgctcttgatcaggtcgtcgatcaattttctcaccatgtttgcagtctgggtacgttgttcctcagtctctgctttgtcctcatcctcatccacagtcatttccaatgtcctagctgtaacaccactaacggtatagctttgggattcagacctcacttccttgaatttcaggtattgcttggtcaaagttctggcatttcctttcatctgtgttggtgcaaaaatgtatcttcccatcgttctacaatccttcaccacaaattgttcctcggttccgaaaagatcacatgatgctgacacagagatctgtaggggctcatcaatgacgctgcagtcctcgctaagacagcgcactcctggtacgtttgaaaacacgcggggttttccaatgcagttgtgcaggtttaaggtctttgtaatttgcaattccttgattgtgttggtctccagtgtgttgtgacgcacatcatacaggttttcacacattccatgagtgctgagctcccagttcttgtacagtcttggttcacggtgatgtaatgaagactcaggcttggtgaagttgtggtggatcatggaaagaactcctcgtttgatgttcaaaatgtactcaggctcatcgtcattgcagaagatctgaccaacttttccatctttgtactcaaacttgattggtcgttccagcaacggcttcaattcattggagactggggagacagcgtgtgacacagacttctacttcgaaacagacactttggcttatttgaagataagaaacccaattctgtacgaagttaacgtaacatccttcgaccaatcagagtcacacactgtctccccagtctccaatgaattgaagccgttgctggaacgaccaatcaagtttgagtacaaagatggaaaagttggtcagatcttctgcaatgacgatgagcctgagtacattttgaacatcaaacgaggagttctttccatgatccaccacaacttcaccaagcctgagtcttcattacatcaccgtgaaccaagactgtacaagaactgggagctcagcactcatggaatgtgtgaaaacctgtatgatgtgcgtcacaacacactggagaccaacacaatcaaggaattgcaaattacaaagaccttaaacctgcacaactgcattggaaaaccccgcgtgttttcaaacgtaccaggagtgcgctgtcttagcgaggactgcagcgtcattgatgagcccctacagatctctgtgtcagcatcatgtgatcttttcggaaccgaggaacaatttgtggtgaaggattgtagaacgatgggaagatacatttttgcaccaacacagatgaaaggaaatgccagaactttgaccaagcaatacctgaaattcaaggaagtgaggtctgaatcccaaagctataccgttagtggtgttacagctaggacattggaaatgactgtggatgaggatgaggacaaagcagagactga
ggaacaacgtacccagactgcaaacatggtgagaaaattgatcgacgacctgatcaagagcgtggaggagaataatttgatgtcttggcagttcagtcgtttggtggaaaccttacgcaggacctgcaaaaaaactcttatcaagatctgggacaactgctacaatgacgagaaacgcaggcaatggctgctggaagctctgccttatgtgtacaaggaggaggtctttgaactcatctcagaaaaaatcaagacattggaactggactctgggagggcagtctcacttattcgtggtctagccttaactccaaacccaacaggaaagatgtgcttgaccgttgtggatctctgttctggttctgttgtccaagcagacgagactctgaagaaaacctgttaccttactgccggtgccttgatgcatgacttctgcaaagacaaggacgtcacttgccttgccaaccaagtggctgagattaagagtcgtttgcagtatgccagttaccgtgaaaagatcttcatcttgaagggcatcggtaatttcggtcacagagatttttaccaacctcttttggaaattatcaaggacaacaggaactctctggaattacgcgtcacagctgtgtatgctctacgaagaatcgctccgaaaatttcacaaaaggtattgccagtgttgttagagatcttccgtcaaccaaacaacgacgccgaactgcgcatggcttgttttgtgatcatctgcgactgcaaacctggtccagcggtgttcaacttggtagtcaagcagctctattacgagcgcaccaatcaagtgcgatcttttgtggtgtcttacatcaaggggatggctttctctagatacccatgtgacgaaaaaatggctcaaaccgcacgaatcgctctccgaatcgcccttcgagcgcttaagagatacaagataaacccagtgaactccaaatggcttcactttggcgacttctccgatcttctccagatgggaagtgcagttgatcttcgcatgatttccacgccatcaagcttcatcccgagagctatcaactctaaagtcaaactacacaccggtatgcttggaaaatctctgaatctattcgagactggcatccgtggtgagggaatccaagagctgctccaacgtgtcatgggaccaaggggacactacagcgaagataagagtctgtttggcttgctcagaccgaaacgaagcttagacaacagtctgaatgggaaagaaatcaaggaaatccaacaccagctgccaaccgctgaacatgaagaggagaaagccaaaggttccttctacttcaaagtttttggaaaagaactgttctataattatttcacctatgatgacatcaaagagttaatggaagatggagtcatcagagttagcaacgatctgaaggacctgctgaagaaaggtaaaaaatggaatatctccaaaatcctccgtccaattgaagtgaagatacgccagccctcgtgtgccggtctgcccctggatttctctgtggacagcgttttgttcctgcgtttgaacaccacggtcaaagctgaggttcaaccccgattcttcattcctcttcccgatcaagtcactcttagtggaaaattgaacattagtgttcttcattctctttggggaaaaatgaaagtatctctgcctcagttggagactggtagcaagctaacgatccacaccaatatcactactggattaggtggcagattccagtacaacatcagaaatcaaagctacaccacagaaatggacttgccacaagagagacaggagattttatcaactaacaaggcgagcaacacctggctgatgtacaggaaacctgtcacaggttccaagagttggtcttcaatctacaggagcacagagctggacggatttgagatcggagaagttactgagattgccaggggacatcatcatcatcgttatgccaactggactctgaaggcag
gaaatggaaaatacctgtctgtgaatgagcacaatgaactcgtcttaagcgagctacctactgtcttcaaagtcttctttatcgacggaaatcatcaatttgtgaaactttttgtcaccggaaagggattcgttaaggtgaggccaaattcccaagggtgtttagagactgtgcatgataatcgctaccccaagctcttcaagatcacattcgtgaatggtcgccgtcatgttcgcctcgcgacagttgtgtatattcacaaagatgcatccaagaaccctgtggaatggacccgcgcaaccgtccctatgaacacggatcaagtcacgttattccaacacatgatcgaagatattgaaccacagaagggatatgaggagaaatggcgctatatcagtgacttgcccttgaattcgccttcgcataatgctgagaagagtgacggactgcatccgcgtcatatgaaaatcgtctttgttacacctgtgggaagcaagctttgtttgaggccgagccaaaccaaggaatctgtgctctctgtaaagaggcatcacacaaactcgagcgaactgtgttatggcaaacaaatgct\n",
      ">transcripts_v2_5t---NA---\n",
      "gtgaagatgaggaatctaaggaatggcctgtttgcaaattgagttactggatttctctggcaagtagttttggatcaatttttcatactatttttcttgtgtgataagcaaatgattgcattccctgagacagttttttccaggacggtgaaatcagtgaaacaagagcctgtatgtcactagtgactctgagtaaggccttcatttgagcccagcattgcagtggcacttgtttaagttcttggagaagtgactgtaatcaactgatgacctctgcacagcagcaagaactagatgcaggcaaaagttttaaagtccctcagctttgctctgtagcatgctacctaaagttttcagatagtgagaattcatctgatggaagaagctaaatctttggattactatgagagtcagaaagcagtgtgtaacatgggtattactcataatttcaaagtatgtcaaggatgtcattaaggtagtgaacattttcttgagcttttgagtcttttttttctcttgtttattctgaattcggttttcagcagttttgtggaggaatctgtcggcaatgatcagtggtgattttctgccatagtggtggtgttaaatttcaattggcaacttcttgaggtggcattccagcttttagcaaggtgcaagttccactatcacttttagatgttaaatcagcaggcaaacccaaccaatttggacatttcaccattcacgtggctttcagaaaaaggcttgcagcatatcagtaggactggtctgtatgggtctggcctcaggtagctgaatttaactgagataatcctgcagaaaaaaagaggacagaagtacacgtacaatgactgttggtggaacttcctcagaagtggtaatgttaggcaaccaaattcattcttccattgttttaaagaaagaatcagaaacactttgcaccatttcaattgatatgagcagtcctaattatctggtaaggaaatatttcaaacaagtttatgcatggactcagttattttgggtatattttttggatctcttttcaagtgtctttcagaatgtatacagtctgtgtacagtagcagtttcttggaaaagcatttttctcaattttgaacaagacttcttagcatccagcagctttccctggtccttgttaaaaaccttaaatgtgcatcaagcatgaaggaaggccctactaacccaaaggtgtgaacttaatgggtattaaacgttctccgccttacaatgggtgcagtttgtgtggaaactttttgccaggttttcagccccagcgtgtggttcacaattcactctattggtggaattagtgcattaggacaagggcaattttcagagtaatcctagggtcaggaggcacattttcagggtcaaatgtaagtgcgtgctgtaattcttctcaagtgagttttgtcctattggctaccctgtcagttttttttgtgagttgtgtgtgcatggtttttaatgaatttgttaggaatatgtatgcagagttagcaacgatggttttttaggactgtattgtacccaattcaagtccaaaatggaattgtcagaccataaagtggtttttcagaatttggcaagcacctcggaggcaaattcagtggtacacaaaagactgtattgtactcaattcaagtccaaaatggaattgtcagaccaccaagaaatttttcagaatttggcaagcacctcagaggcaaattcagtggtacacaaaagaagtgaatataccacatcccggagaacttaaaccttcttcagaaatgctgagtaaagttaaccactctgaagtccttcatcattttggaccaaagcaacgcttttacctgtttgatattaatatttacaatagtgcagaccatctggtaagggaaaccttcaaatcaaccatatatgagtatgtatttcctgtggatttcttttttaaggagtcttttttcaggggataagcctagtt
atttcagcagcactttaccaaaccaactgaaattatggagcagctttgaaaattatccaagtccttttgtgccaagttagcatcactgtggtaatatccagtccagttaagtggttaaaagttaacctcctagatcctcgaaagagttttgccatacctgtccaagttaagggccatgtgctgtcactcaagtgtagctgaacatgctgtgtcgtagagaaccacatccctcttgtgtatttatgtgccagtgcattgtctataccagtttcaggactgcgcaatgtaccgtgtattttgttgcaacaaatttatcagaacttgtcactctggggtaaaacaaccagcatattgggtgcattttcgacctattcctaaagagcaaattcatctacacacatcttgtattccatgtgttgtgccacctgccttttagatgtcaagtgtgtattgcttcatggaaagtgtctgtctgttcaacaaaaaacagcccactcatcaacaaacttttaagatgaacatttcaaaaaatctggttatgttactccttcaaagtctgtgattgtcttgctcccatgggcagtcccaagtcctagtcactgattctctaagtaggctgatcagaaatttaccagtacattcatggaggtgcaattcacatctgaatccttacttggtggttaaagattcttgtagaaggttaaattaaaggttggttttatgttttggttaatttgtagtatttgtaattagacaattcatatttctcaattaagttatccagggatgaggtaatataaagaccatactgtctctcaagaattgttcctatgtacttgattaagtgacttaattaataagatatagaattaagtttaacctagaagtatcatcaaaaccagtacatactttgtcagaccttacagatattgagtatcaccaagtaatggaagttttttggcttcggcaagtatttgttttggaatgttgccttttttgttgttaatatgaaaagtatgctttgtttgtctgtaattgcgtgtttttcatgtcatgagaatcagtggaagggtttactgtgaaataccatctactaagatttatttctcttagaaatgcatcattaactattgttaacatattagctccatacctatttatgcttgtgtataaagtcaaaactgtgaagagctttgtctgtgattggggggatatttttgttctcttttgattgttaccaggcaaatttgcattttttggatggaatgtcaagaatatgaaaaaggcaagcttaactttctgatctttagtttcattatagattactctttttttcattttgtgatctaacatgcattttgatagtggtagtttgtaattaacctattccacacgtttttttttttaaatttatttatttactgtaactgaaaatcttagttaggttggaatacttcattatgtaatctaggatttattatgggcaaacagtgcatatttttgtcaacctctaaagcaactaatgtgatagatgaagctcagtttcactcatttggtctgtgagcaaaagcagctgggaatactttttacaggataacaatcaaggcttaatcaaaaatttctgctctgaactgcaagttgtcatcatgcagcaatcatttggttgatgcaggtattgattttaactgttgccttttgaattgttaggtctttgaaatattttgcatactctgattaattcacagcgcttttttctggatcaacaattgttgccaggtcatctttgcccaaaatggttatggccttgacttgctgtttgctgtcggtcagttcagtagtttgtcttgttcaaaatttttatgcagaaatctgtgcatatatcctctctacacagtgttgtaatcttgttggaaatgaagggctttcaaatgcatcctcttcatccagttgatacaggtatcgatgtgaccatttggcatctgctttaattgtttctctga
gatgcattacatactttgatcaattcatagtgatttctgttggaatatactagaattcaacttaattttgaagttttcttggcatcgactttcaaggctgagttggtcagtgattgaagttcaaaattgtcgcccagaatcatttcatttcatctcttcaagcagatttgtattttcactgggtgttgtcacccttccaagtgctgcctcgtcatttagtttacacaggtattcgcttgaccttttgccacctggagcattttttctttgaaatatattgcatactctgatctttttacagcattttttattgaatgtgctactgttgacagctcaacttggcctaagattttatggccttcactttccggcatgcataggccagttccatgatgtggcagttgaaatcgatgtgtagaaatctgtgtacatttcctctatatacagcttcgttaatcgattggatgcatatgtccttcaaatgcgatcttgtcattgtgttgattcaggtattcacttaacgttttgccttctgaactgtttttcttcgaaatgtatgacatactctggtcaattcacagcaatttctgtttgaatgtggaacggtggacagcccaacttggtcttggaacataatggccgtgacttcttttgtccatgaacgttagttttacgatttgtctagtttcaaactgttatgcagaaatccgtgcattatatcctctttacacagctttgaacatctcctctaagctagttggaatttgtgagcattttggacaaagccagatttttccattccatcggcacgctttcagaagccaattaccagtaaatcttgagatttgtcaagggaaaaaattctaagcaaactggatccgaaaagcagaggtacttcttcttgtcatctttttcaacttcagtgtttctctcgagaccatgtatgtttcgttagtatcttcaatagtttgtccagccaagcgcagcacaattaaggttgaggattaggtaccaacttttgaggtaattgcaatatgaattaccatatcattatttcaaattagtggcaatgtggtgcatgctttgttcagtcttgacagaacttttgttttttggtttgaacttgctataagcataactacacttgggcagctcaactccaatcatgaaayggtttccttgtacaaggtatttccagcatgaattatcacttcaaattattggcaatgcagtggatactttgcgaagtcgacagcatttttgtttctagttcattcatgtgttagtgattccttggttacctttatcatttgagatgtcgtcaaactttggctaaatcttggaaggctttcaagatatcaggtaccaaaatcaagtatctttcaaattccaatttgagtagatgtggtgtatgctctgttcacttttcacagcagtttaatgtgccagatggaatttacattaattgaacttggtcaccttcaagctaaggcctggtgtgaacatcgtggtgtaagtcattacttgccaatctatggacagtcgacaagcactttctcattggtgaatgcaaattttttggacattgtacttgtcagttgtggcatttattccaaaattttttttgtatgtgcaaatgttggatattttgtcaggttccacagtcttctggtaatttttctgtcccactccgaaataaggtggattgtgtgcctgcagtatcgaaagcatcaaacatttcctcttgacagtcagctgtttgtagtctttctaatcacatttgttcagataaactctggtgaactggatacctaaaattcaggtattgaatcttattttttgccactattatggtttgatttcaacatgcatgtcaacactatgtgtgttccatcagtatcttgagcagtttctgctgtggcaagctccacttcaactcggatatacaaagtaagacatgattttagtctcatttgccatcacattcagtttgaaatggt
tgtctgggaaacagtgtatactttgtaagtctcggcagcgttgtgaacaactcctacaggatgaaggttagttttcagatatgtatttgtttctggtcgcaaaggttttgtatatttctttttcaagtgtagtgaagcattggatgaagg\n",
      ">Locus_180_Transcript_15/16_Confidence_0.327_Length_6143_transcripts_v2_6t---NA---\n",
      "gtgaagatgaggaatctaaggaatggcctgtttgcaaattgagttactggatttctctggcaagtagttttggatcaatttttcatactatttttcttgtgtgataagcaaatgattgcattccctgagacagttttttccaggacggtgaaatcagtgaaacaagagcctgtatgtcactagtgactctgagtaaggccttcatttgagcccagcattgcagtggcacttgtttaagttcttggagaagtgactgtaatcaactgatgacctctgcacagcagcaagaactagatgcaggcaaaagttttaaagtccctcagctttgctctgtagcatgctacctaaagttttcagatagtgagaattcatctgatggaagaagctaaatctttggattactatgagagtcagaaagcagtgtgtaacatgggtattactcataatttcaaagtatgtcaaggatgtcattaaggtagtgaacattttcttgagcttttgagtcttttttttctcttgtttattctgaattcggttttcagcagttttgtggaggaatctgtcggcaatgatcagtggtgattttctgccatagtggtggtgttaaatttcaattggcaacttcttgaggtggcattccagcttttagcaaggtgcaagttccactatcacttttagatgttaaatcagcaggcaaacccaaccaatttggacatttcaccattcacgtggctttcagaaaaaggcttgcagcatatcagtaggactggtctgtatgggtctggcctcaggtagctgaatttaactgagataatcctgcagaaaaaaagaggacagaagtacacgtacaatgactgttggtggaacttcctcagaagtggtaatgttaggcaaccaaattcattcttccattgttttaaagaaagaatcagaaacactttgcaccatttcaattgatatgagcagtcctaattatctggtaaggaaatatttcaaacaagtttatgcatggactcagttattttgggtatattttttggatctcttttcaagtgtctttcagaatgtatacagtctgtgtacagtagcagtttcttggaaaagcatttttctcaattttgaacaagacttcttagcatccagcagctttccctggtccttgttaaaaaccttaaatgtgcatcaagcatgaaggaaggccctactaacccaaaggtgtgaacttaatgggtattaaacgttctccgccttacaatgggtgcagtttgtgtggaaactttttgccaggttttcagccccagcgtgtggttcacaattcactctattggtggaattagtgcattaggacaagggcaattttcagagtaatcctagggtcaggaggcacattttcagggtcaaatgtaagtgcgtgctgtaattcttctcaagtgagttttgtcctattggctaccctgtcagttttttttgtgagttgtgtgtgcatggtttttaatgaatttgttaggaatatgtatgcagagttagcaacgatggttttttaggactgtattgtacccaattcaagtccaaaatggaattgtcagaccataaagtggtttttcagaatttggcaagcacctcggaggcaaattcagtggtacacaaaagactgtattgtactcaattcaagtccaaaatggaattgtcagaccaccaagaaatttttcagaatttggcaagcacctcagaggcaaattcagtggtacacaaaagaagtgaatataccacatcccggagaacttaaaccttcttcagaaatgctgagtaaagttaaccactctgaagtccttcatcattttggaccaaagcaacgcttttacctgtttgatattaatatttacaatagtgcagaccatctggtaagggaaaccttcaaatcaaccatatatgagtatgtatttcctgtggatttcttttttaaggagtcttttttcaggggataagcctagtt
atttcagcagcactttaccaaaccaactgaaattatggagcagctttgaaaattatccaagtccttttgtgccaagttagcatcactgtggtaatatccagtccagttaagtggttaaaagttaacctcctagatcctcgaaagagttttgccatacctgtccaagttaagggccatgtgctgtcactcaagtgtagctgaacatgctgtgtcgtagagaaccacatccctcttgtgtatttatgtgccagtgcattgtctataccagtttcaggactgcgcaatgtaccgtgtattttgttgcaacaaatttatcagaacttgtcactctggggtaaaacaaccagcatattgggtgcattttcgacctattcctaaagagcaaattcatctacacacatcttgtattccatgtgttgtgccacctgccttttagatgtcaagtgtgtattgcttcatggaaagtgtctgtctgttcaacaaaaaacagcccactcatcaacaaacttttaagatgaacatttcaaaaaatctggttatgttactccttcaaagtctgtgattgtcttgctcccatgggcagtcccaagtcctagtcactgattctctaagtaggctgatcagaaatttaccagtacattcatggaggtgcaattcacatctgaatccttacttggtggttaaagattcttgtagaaggttaaattaaaggttggttttatgttttggttaatttgtagtatttgtaattagacaattcatatttctcaattaagttatccagggatgaggtaatataaagaccatactgtctctcaagaattgttcctatgtacttgattaagtgacttaattaataagatatagaattaagtttaacctagaagtatcatcaaaaccagtacatactttgtcagaccttacagatattgagtatcaccaagtaatggaagttttttggcttcggcaagtatttgttttggaatgttgccttttttgttgttaatatgaaaagtatgctttgtttgtctgtaattgcgtgtttttcatgtcatgagaatcagtggaagggtttactgtgaaataccatctactaagatttatttctcttagaaatgcatcattaactattgttaacatattagctccatacctatttatgcttgtgtataaagtcaaaactgtgaagagctttgtctgtgattggggggatatttttgttctcttttgattgttaccaggcaaatttgcattttttggatggaatgtcaagaatatgaaaaaggcaagcttaactttctgatctttagtttcattatagattactctttttttcattttgtgatctaacatgcattttgatagtggtagtttgtaattaacctattccacacgtttttttttttaaatttatttatttactgtaactgaaaatcttagttaggttggaatacttcattatgtaatctaggatttattatgggcaaacagtgcatatttttgtcaacctctaaagcaactaatgtgatagatgaagctcagtttcactcatttggtctgtgagcaaaagcagctgggaatactttttacaggataacaatcaaggcttaatcaaaaatttctgctctgaactgcaagttgtcatcatgcagcaatcatttggttgatgcaggtattgattttaactgttgccttttgaattgttaggtctttgaaatattttgcatactctgattaattcacagcgcttttttctggatcaacaattgttgccaggtcatctttgcccaaaatggttatggccttgacttgctgtttgctgtcggtcagttcagtagtttgtcttgttcaaaatttttatgcagaaatctgtgcatatatcctctctacacagtgttgtaatcttgttggaaatgaagggctttcaaatgcatcctcttcatccagttgatacaggtatcgatgtgaccatttggcatctgctttaattgtttctctga
gatgcattacatactttgatcaattcatagtgatttctgttggaatatactagaattcaacttaattttgaagttttcttggcatcgactttcaaggctgagttggtcagtgattgaagttcaaaattgtcgcccagaatcatttcatttcatctcttcaagcagatttgtattttcactgggtgttgtcacccttccaagtgctgcctcgtcatttagtttacacaggtattcgcttgaccttttgccacctggagcattttttctttgaaatatattgcatactctgatctttttacagcattttttattgaatgtgctactgttgacagctcaacttggcctaagattttatggccttcactttccggcatgcataggccagttccatgatgtggcagttgaaatcgatgtgtagaaatctgtgtacatttcctctatatacagcttcgttaatcgattggatgcatatgtccttcaaatgcgatcttgtcattgtgttgattcaggtattcacttaacgttttgccttctgaactgtttttcttcgaaatgtatgacatactctggtcaattcacagcaatttctgtttgaatgtggaacggtggacagcccaacttggtcttggaacataatggccgtgacttcttttgtccatgaacgttagttttacgatttgtctagtttcaaactgttatgcagaaatccgtgcattatatcctctttacacagctttgaacatctcctctaagctagttggaatttgtgagcattttggacaaagccagatttttccattccatcggcacgctttcagaagccaattaccagtaaatcttgagatttgtcaagggaaaaaattctaagcaaactggatccgaaaagcagaggtacttcttcttgtcatctttttcaacttcagtgtttctctcgagaccatgtatgtttcgttagtatcttcaatagtttgtccagccaagcgcagcacaattaaggttgaggattaggtaccaacttttgaggtaattgcaatatgaattaccatatcattatttcaaattagtggcaatgtggtgcatgctttgttcagtcttgacagaacttttgttttttggtttgaacttgctataagcataactacacttgggcagctcaactccaatcatgaaacggtttccttgtacaaggtatttccagcatgaattatcacttcaaattattggcaatgcagtggatactttgcgaagtcgacagcatttttgtttctagttcattcatgtgttagtgattccttggttacctttatcatttgagatgtcgtcaaactttggctaaatcttggaaggctttcaagatatcaggtaccaaaatcaagtatctttcaaattccaatttgagtagatgtggtgtatgctctgttcacttttcacagcagtttaatgtgccagatggaatttacattaattgaacttggtcaccttcaagctaaggcctggtgtgaacatcgtggcgtaagtcattacttgccaatctatggacagtcgacaagctctttctcgttggtgaatgcaaattttttggacattgtacttgtcagttgtggcattcattccaaaattttttttgtatgtgaaaatgttggatattttgtcaggttccacagtcttctggtaatttttctgtcccactccgaaataaggtggattgtgtgcctgcagtatcgaaagcatcaaacatttcctcttgacagtcagctgtttgtagtctttctaatcacatttgttcagataaactctggtgaactggatacctaaaattcaggtattgaatcttattttttgccactattatggtttgatttcaacatgcatgtcaacactatgtgtgttccatcagtatcttgagcagtttctgctgtggcaagctccacttcaactcggatatacaaagtaagacatgattttagtctcatttgccatcacattcagtttgaaatggt
tgtctgggaaacagtgtatactttgtaagtctcggcagcgttgtgaacaactcctacaggatgaaggttagttttcagatatgtatttgtttctggtcgcaaaggttttgtatatttctttttcaagtgtagtgaagcattggatgaagg\n",
      ">Locus_9682_Transcript_1/1_Confidence_1.000_Length_116_transcripts_v2_72920t---NA---\n",
      "ataaaaataccaggaattgaggaagcaagcagcggcatgccggagcagacaggcaaaaaggaaaaccagaatacgaaaagaaaggaagacgggaatctgcaagacgctttggtgga\n",
      ">Locus_9570_Transcript_1/1_Confidence_1.000_Length_112_transcripts_v2_72921t---NA---\n",
      "gctagtttcaggtgtgcattcattgaataaatgtatttgtatttagtacgagtgtataataaagcagtaaatacaaatacatttattcaatgaatgcacacctgaaactagc\n",
      ">Locus_9787_Transcript_1/1_Confidence_0.714_Length_111_transcripts_v2_72922t---NA---\n",
      "tgcgtagctcggtggatgtatagagaatgggaattcagtttcagattaggtatgagaccatggatatttgtagnnnnnnnnnnncagcactctcagcacctgttgtagcag\n",
      ">transcripts_v2_72923t---NA---\n",
      "ggacgatgaggannnnnnnnnnnnnnctgatgacagtaacgatgatgatcttgatgatgatagcgttgacgagaacgacgaggatgaagactatgaagtga\n",
      ">Locus_9072_Transcript_1/1_Confidence_0.667_Length_101_transcripts_v2_72924t---NA---\n",
      "ttcttgaagatttttttaagacaatcgtgttcagttgtaataatttttacataagtaatctaaatattattttttnnnnnnnnnnnnnnnnnagtcaaggg\n"
     ]
    }
   ],
   "source": [
    "#Removing pipes from fasta and replacing with tab, then printing first line w/out comments and looking at contig names\n",
    "!sed 's/|/\\t/g' blast2go_fasta_Pdamv2.fasta | awk '{print $1}' > Pdam.fasta\n",
    "!head -10 Pdam.fasta\n",
    "!tail -10 Pdam.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Converted 72890 FASTA records in 145780 lines to tabular format\r\n",
      "Total sequence length: 28141387\r\n",
      "\r\n"
     ]
    }
   ],
   "source": [
    "#Converting FASTA to tabular format and placing output file in analyses directory\n",
    "!perl -e '$count=0; $len=0; while(<>) {s/\\r?\\n//; s/\\t/ /g; if (s/^>//) { if ($. != 1) {print \"\\n\"} s/ |$/\\t/; $count++; $_ .= \"\\t\";} else {s/ //g; $len += length($_)} print $_;} print \"\\n\"; warn \"\\nConverted $count FASTA records in $. lines to tabular format\\nTotal sequence length: $len\\n\\n\";' \\\n",
    "Pdam.fasta > ../../analyses/Pdam/fasta2tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/analyses/Pdam\n"
     ]
    }
   ],
   "source": [
    "cd ../../analyses/Pdam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1tspectrin\t\ttatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcacca
atccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttc
tctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggat
ccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\r\n",
      "Locus_1685_Transcript_2/2_Confidence_1.000_Length_7457_transcripts_v2_2tspectrin\t\ttatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttagtgagagactgatagagctcagcctcagtaacatagagacgttttccaccctcagtgagcgccttgaaggcgttgatgacttcctggctagagccaacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcacca
atccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttc
tctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggat
ccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\r\n"
     ]
    }
   ],
   "source": [
    "#Checking header on new tabular format file\n",
    "!head -2 fasta2tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Added column with length of column 2 for 72890 lines.\r\n",
      "\r\n"
     ]
    }
   ],
   "source": [
    "#Add column with length of sequence\n",
    "!perl -e '$col = 2;' -e 'while (<>) { s/\\r?\\n//; @F = split /\\t/, $_; $len = length($F[$col]); print \"$_\\t$len\\n\" } warn \"\\nAdded column with length of column $col for $. lines.\\n\\n\";' \\\n",
    "fasta2tab > tab_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   72890  218670 34198048 tab_1\r\n"
     ]
    }
   ],
   "source": [
    "!wc tab_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#File used to count Cs and Gs will only include the sequence\n",
    "!awk '{print $2}' tab_1 > tab_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#This counts CGs - both cases\n",
    "!echo \"CG\" | awk -F\\[Cc][Gg] '{print NF-1}' tab_2 > CG "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Counts Cs\n",
    "!echo \"C\" | awk -F\\[Cc] '{print NF-1}' tab_2 > C "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Counts Gs\n",
    "!echo \"G\" | awk -F\\[Gg] '{print NF-1}' tab_2 > G "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1tspectrin\t\ttatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcacca
atccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttc
tctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggat
ccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\t7457\t185\t1914\t1444\n",
      "Locus_9072_Transcript_1/1_Confidence_0.667_Length_101_transcripts_v2_72924t---NA---\t\tttcttgaagatttttttaagacaatcgtgttcagttgtaataatttttacataagtaatctaaatattattttttnnnnnnnnnnnnnnnnnagtcaaggg\t101\t1\t7\t12\n"
     ]
    }
   ],
   "source": [
    "#Combining counts\n",
    "!paste tab_1 \\\n",
    "CG \\\n",
    "C \\\n",
    "G \\\n",
    "> comb\n",
    "!head -1 comb\n",
    "!tail -1 comb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_9419_Transcript_1/1_Confidence_1.000_Length_142_transcripts_v2_72905t---NA---\t\tttggctcatccttcttgtctgtcttcttggccctctttcctctggtctttggctcctgatctgcctcctcgcccttctcctcttcctttggctcatccttcttgtctgtcttcttggccctctttcctctggtctttggctt\t142\t1\t51\t23\r\n",
      "Locus_9418_Transcript_2/3_Confidence_0.286_Length_141_transcripts_v2_72906t---NA---\t\tgaaagagggccaagaagacagacaagaaggatgagccaaaggaagaggagaaggtcgagaagaatgaggatgaagggaaggaagatgagaagccaaagaccagaggaaagagggccaagaagacagacaacaaggatgagc\t141\t1\t17\t52\r\n",
      "transcripts_v2_72907t---NA---\t\tcaccttaggaaatgattagagaatagaagggagaatatacatactgatgttaggatttaatgggtcactttaaccctttaaaccctaacatcagtatgtatattctccnnnnnnnnnnnncttctattctctaatca\t137\t0\t23\t19\r\n",
      "transcripts_v2_72908t---NA---\t\tttgttggatgggtagagaatgggaattcagtttcagattaggtatgagaccatggatagttgtagctttctcagcacctgttgtagcagcttttgcagtatttgttgtttatnnnnnnnnnnggacaagctagcttttt\t139\t0\t16\t35\r\n",
      "transcripts_v2_72909thypothetical\t\tgaactacgtcacgtgatgacaaacttgggagagaaacttacagatgaggaagttgatgagatgatccgagaagcagatactgacagtgaagaggagatcaaggaagcctttagagtgtttgacaaagatggaaacg\t136\t4\t18\t41\r\n",
      "Locus_9529_Transcript_1/1_Confidence_1.000_Length_136_transcripts_v2_72910tp700\t\tccaggattggacgaatagatcatttggttcatttatataccctatcggtccaggggatttatatgtacatcatgcaatagcacttggcttacatgtaactgtcctcatcctactaaagggaggtcttgaagctcgt\t136\t3\t28\t29\r\n",
      "transcripts_v2_72911t---NA---\t\tgttaagtgtagctgatggagattaatcctttgtaatgttgagggtgaccataaatgaaaaaaaaagtgtgaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnna\t131\t0\t5\t19\r\n",
      "Locus_990_Transcript_3/3_Confidence_0.600_Length_134_transcripts_v2_72912t---NA---\t\tgctatttctgatgacaacatataagaattaccacaccacatagtaattagacacaatatgagcacaacacaaggacaagagtagatcatactaaacatagaagatgtacctaattgagaagatgtaatgatgaa\t134\t0\t22\t22\r\n",
      "transcripts_v2_72913t---NA---\t\tttccgatctagcggttaacctttccttttcctttcgtacaccatcaatctcatgttacacggttaataaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnna\t127\t4\t19\t8\r\n",
      "Locus_9106_Transcript_3/7_Confidence_0.200_Length_128_transcripts_v2_72914t---NA---\t\tttccgatctagcggttaacctttccttttcctttcgtacaccatcaatctcatgttacacggttaataaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnnnn\t128\t4\t19\t8\r\n",
      "Locus_9106_Transcript_7/7_Confidence_0.300_Length_128_transcripts_v2_72915t---NA---\t\tttccgatctagcggttaacctttccttttcctttcgtacaccatcaatctcatgttacacggttaataaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnnct\t128\t4\t20\t8\r\n",
      "Locus_9936_Transcript_1/1_Confidence_1.000_Length_124_transcripts_v2_72916t---NA---\t\ttagtcgattagttactggaataaaaagtgcaagtctgttgagggtttgccatgtaatttgctgcaaacacggcaaaccctcaacagacttgcactttttattccagtaactaatcgactagtaa\t124\t3\t24\t24\r\n",
      "Locus_9701_Transcript_1/1_Confidence_1.000_Length_123_transcripts_v2_72917t---NA---\t\tggtgcttttgcagctgctcctgctggcgctggcgccagagcaggtgctgctggagccgcctctggcggtggtgcttttgcagctgctcctgctggcgctggcgccagagcaggtgctgctgga\t123\t6\t36\t48\r\n",
      "Locus_9530_Transcript_1/1_Confidence_1.000_Length_118_transcripts_v2_72918t---NA---\t\tagacaggcagacaaacagacaggcagacaaacagacaggcagacaaacagacagacagacaaacagacaggcagacaaacagacaggcagacaaacagacaggcagacaaacagacag\t118\t0\t29\t29\r\n",
      "Locus_9226_Transcript_1/1_Confidence_1.000_Length_116_transcripts_v2_72919t---NA---\t\taataaatcatcttctgaagggtttgttattgggtgcacagagtcaatgaaatggggataattttgtttgactctgtgcacccaataacaaacccttcagaagatgatttattaaca\t116\t0\t18\t23\r\n",
      "Locus_9682_Transcript_1/1_Confidence_1.000_Length_116_transcripts_v2_72920t---NA---\t\tataaaaataccaggaattgaggaagcaagcagcggcatgccggagcagacaggcaaaaaggaaaaccagaatacgaaaagaaaggaagacgggaatctgcaagacgctttggtgga\t116\t5\t19\t35\r\n",
      "Locus_9570_Transcript_1/1_Confidence_1.000_Length_112_transcripts_v2_72921t---NA---\t\tgctagtttcaggtgtgcattcattgaataaatgtatttgtatttagtacgagtgtataataaagcagtaaatacaaatacatttattcaatgaatgcacacctgaaactagc\t112\t1\t15\t19\r\n",
      "Locus_9787_Transcript_1/1_Confidence_0.714_Length_111_transcripts_v2_72922t---NA---\t\ttgcgtagctcggtggatgtatagagaatgggaattcagtttcagattaggtatgagaccatggatatttgtagnnnnnnnnnnncagcactctcagcacctgttgtagcag\t111\t2\t16\t29\r\n",
      "transcripts_v2_72923t---NA---\t\tggacgatgaggannnnnnnnnnnnnnctgatgacagtaacgatgatgatcttgatgatgatagcgttgacgagaacgacgaggatgaagactatgaagtga\t101\t6\t10\t29\r\n",
      "Locus_9072_Transcript_1/1_Confidence_0.667_Length_101_transcripts_v2_72924t---NA---\t\tttcttgaagatttttttaagacaatcgtgttcagttgtaataatttttacataagtaatctaaatattattttttnnnnnnnnnnnnnnnnnagtcaaggg\t101\t1\t7\t12\r\n"
     ]
    }
   ],
   "source": [
    "!tail -20 comb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculating CpGo/e based on [Gavery and Roberts (2010)](http://www.biomedcentral.com/1471-2164/11/483)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "
"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "awk: division by zero\r\n",
      " input record number 9164, file comb\r\n",
      " source line number 1\r\n"
     ]
    }
   ],
   "source": [
    "!awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))}' comb > ID_CpG #use ^ instead of ** for exponent\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1tspectrin \t 0.499212\n",
      "Locus_1685_Transcript_2/2_Confidence_1.000_Length_7457_transcripts_v2_2tspectrin \t 0.494242\n",
      "Locus_177_Transcript_12/12_Confidence_0.500_Length_6585_transcripts_v2_3tvitellogenin \t 0.669218\n",
      "transcripts_v2_5t---NA--- \t 0.266393\n",
      "Locus_180_Transcript_15/16_Confidence_0.327_Length_6143_transcripts_v2_6t---NA--- \t 0.279666\n",
      "Locus_180_Transcript_14/16_Confidence_0.308_Length_6142_transcripts_v2_7t---NA--- \t 0.279907\n",
      "Locus_180_Transcript_13/16_Confidence_0.308_Length_6140_transcripts_v2_8t---NA--- \t 0.280334\n",
      "Locus_180_Transcript_7/16_Confidence_0.288_Length_6119_transcripts_v2_9t---NA--- \t 0.280651\n",
      "Locus_180_Transcript_11/16_Confidence_0.250_Length_5898_transcripts_v2_10t---NA--- \t 0.292432\n",
      "Locus_1199_Transcript_3/4_Confidence_0.750_Length_5569_transcripts_v2_11tserine \t 0.464167\n",
      "transcripts_v2_9163 \t 0.849883\n",
      "Locus_17783_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9164tmps \t 0\n",
      "transcripts_v2_9165t---NA--- \t 0.636027\n",
      "Locus_17917_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9166t---NA--- \t 0.410266\n",
      "Locus_18343_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9167t---NA--- \t 0.562175\n",
      "Locus_18403_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9168t---NA--- \t 0.814834\n",
      "Locus_18577_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9169t---NA--- \t 0\n",
      "Locus_18650_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9170t---NA--- \t 0\n",
      "transcripts_v2_9171t---NA--- \t 0.496778\n",
      "Locus_18916_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9172t---NA--- \t     9163   18327  729602 ID_CpG\n"
     ]
    }
   ],
   "source": [
    "!head ID_CpG\n",
    "!tail ID_CpG\n",
    "!wc ID_CpG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Now joining CpG to annotation, but first must sort files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389\tnogo-b\tsp\tQ99LJ8\tNGBR_MOUSE\t35.48\t93\t57\t2\t283\t5\t155\t244\t6e-12\t63.5\r\n",
      "Locus_10015_Transcript_1/2_Confidence_1.000_Length_905_transcripts_v2_1500\tsp\tQ86UC2\tRSPH3_HUMAN\t71.43\t175\t50\t0\t17\t541\t297\t471\t3e-50\t  180\r\n",
      "Locus_10015_Transcript_2/2_Confidence_1.000_Length_896_transcripts_v2_1531\tsp\tQ86UC2\tRSPH3_HUMAN\t71.43\t175\t50\t0\t17\t541\t297\t471\t3e-50\t  180\r\n",
      "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529\tadp-ribosylation\tsp\tQ99PE9\tARL4D_MOUSE\t41.98\t131\t71\t2\t29\t409\t1\t130\t2e-31\t  116\r\n",
      "Locus_10027_Transcript_1/1_Confidence_1.000_Length_375_transcripts_v2_4989\tkelch\tsp\tQ5R8W1\tKLDC4_PONAB\t28.81\t118\t79\t4\t23\t364\t136\t252\t2e-06\t49.3\r\n",
      "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337\thypothetical\tsp\tC3YZ51\tUBA5_BRAFL\t62.24\t143\t47\t2\t2\t421\t239\t377\t1e-40\t  145\r\n",
      "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420\tprotein\tsp\tQ22A30\tRL15_TETTS\t70.09\t107\t32\t0\t321\t1\t1\t107\t2e-39\t  137\r\n",
      "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512\tphotosystem\tsp\tQ5ENP6\tPSAL_ISOGA\t57.58\t66\t27\t1\t139\t336\t10\t74\t7e-18\t79.0\r\n",
      "Locus_10044_Transcript_1/1_Confidence_1.000_Length_273_transcripts_v2_6833\ttransmembrane\tsp\tQ96HH6\tTMM19_HUMAN\t58.14\t86\t34\t1\t5\t262\t174\t257\t5e-14\t69.7\r\n",
      "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383\tsp\tQ9QY36\tNAA10_MOUSE\t70.28\t212\t44\t1\t1\t579\t2\t213\t5e-99\t  295\r\n"
     ]
    }
   ],
   "source": [
    "#Sorting Pdam Uniprot/Swissprot annotation file. This file was the result of work done in another notebook: \n",
    "#Pdam_blast_anno.ipynb\n",
    "!sort Pdam_blastx_uniprot_sql.tab | tail -n +2 > Pdam_blastx_uniprot_sql.tab.sorted\n",
    "!head Pdam_blastx_uniprot_sql.tab.sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976\ttransport\r",
      "\r\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389\tdevelopmental processes\r",
      "\r\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389\tother biological processes\r",
      "\r\n",
      "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529\tsignal transduction\r",
      "\r\n",
      "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337\tother metabolic processes\r",
      "\r\n",
      "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420\tprotein metabolism\r",
      "\r\n",
      "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512\tother metabolic processes\r",
      "\r\n",
      "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383\tprotein metabolism\r",
      "\r\n",
      "Locus_10059_Transcript_1/1_Confidence_1.000_Length_589_transcripts_v2_3011\ttransport\r",
      "\r\n",
      "Locus_10063_Transcript_1/1_Confidence_1.000_Length_177_transcripts_v2_10761\tother biological processes\r",
      "\r\n"
     ]
    }
   ],
   "source": [
    "#Sorting GOSlim annotation file. This file was the result of work done in another notebook: Pdam_blast_anno.ipynb\n",
    "!sort Pdam_GOSlim.tab | tail -n +2 > Pdam_GOSlim.sorted\n",
    "!head Pdam_GOSlim.sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_10000_Transcript_2/3_Confidence_0.667_Length_676_transcripts_v2_2486 \t 0.761861\r\n",
      "Locus_10001_Transcript_1/1_Confidence_1.000_Length_199_transcripts_v2_9515 \t 0.635946\r\n",
      "Locus_10002_Transcript_1/1_Confidence_1.000_Length_695_transcripts_v2_2386 \t 0.695709\r\n",
      "Locus_10003_Transcript_1/1_Confidence_1.000_Length_609_transcripts_v2_2870 \t 0.12449\r\n",
      "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976 \t 0.230119\r\n",
      "Locus_10005_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9134 \t 0.530625\r\n",
      "Locus_10006_Transcript_1/1_Confidence_1.000_Length_167_transcripts_v2_11475 \t 1.34405\r\n",
      "Locus_10007_Transcript_1/2_Confidence_0.857_Length_1261_transcripts_v2_788 \t 0.746746\r\n",
      "Locus_10007_Transcript_2/2_Confidence_0.857_Length_1272_transcripts_v2_779 \t 0.758383\r\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367\r\n"
     ]
    }
   ],
   "source": [
    "#Sorting CpG file\n",
    "!sort ID_CpG > ID_CpG.sorted\n",
    "!head ID_CpG.sorted"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# For this analysis, *Symbiodinium* sequences were removed. Using file generated from Pdam_zoox_removal.ipynb, ID_CpG.sorted2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!join ID_CpG.sorted2 Pdam_blastx_uniprot_sql.tab.sorted | awk '{print $1, \"\\t\", $2}' > Pdam_cpg_anno"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367\n",
      "Locus_10015_Transcript_1/2_Confidence_1.000_Length_905_transcripts_v2_1500 \t 0.364383\n",
      "Locus_10015_Transcript_2/2_Confidence_1.000_Length_896_transcripts_v2_1531 \t 0.368691\n",
      "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529 \t 1.4668\n",
      "Locus_10027_Transcript_1/1_Confidence_1.000_Length_375_transcripts_v2_4989 \t 0.847464\n",
      "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337 \t 0.578951\n",
      "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420 \t 0.914415\n",
      "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512 \t 0.52846\n",
      "Locus_10044_Transcript_1/1_Confidence_1.000_Length_273_transcripts_v2_6833 \t 0.533378\n",
      "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383 \t 0.206723\n",
      "   19133   38266 1468482 Pdam_cpg_anno\n"
     ]
    }
   ],
   "source": [
    "!head Pdam_cpg_anno\n",
    "!wc Pdam_cpg_anno"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!join ID_CpG.sorted2 Pdam_GOSlim.sorted > Pdam_cpg_GOslim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976 0.230119 transport\r",
      "\r\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 0.266367 developmental processes\r",
      "\r\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 0.266367 other biological processes\r",
      "\r\n",
      "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529 1.4668 signal transduction\r",
      "\r\n",
      "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337 0.578951 other metabolic processes\r",
      "\r\n",
      "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420 0.914415 protein metabolism\r",
      "\r\n",
      "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512 0.52846 other metabolic processes\r",
      "\r\n",
      "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383 0.206723 protein metabolism\r",
      "\r\n",
      "Locus_10059_Transcript_1/1_Confidence_1.000_Length_589_transcripts_v2_3011 0.384943 transport\r",
      "\r\n",
      "Locus_10063_Transcript_1/1_Confidence_1.000_Length_177_transcripts_v2_10761 0.499315 other biological processes\r",
      "\r\n"
     ]
    }
   ],
   "source": [
    "!head Pdam_cpg_GOslim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976 \t 0.230119 \t transport\r",
      "   \r\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367 \t developmental processes\r",
      "  \r\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367 \t other biological processes\r",
      " \r\n",
      "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529 \t 1.4668 \t signal transduction\r",
      "  \r\n",
      "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337 \t 0.578951 \t other metabolic processes\r",
      " \r\n",
      "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420 \t 0.914415 \t protein metabolism\r",
      "  \r\n",
      "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512 \t 0.52846 \t other metabolic processes\r",
      " \r\n",
      "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383 \t 0.206723 \t protein metabolism\r",
      "  \r\n",
      "Locus_10059_Transcript_1/1_Confidence_1.000_Length_589_transcripts_v2_3011 \t 0.384943 \t transport\r",
      "   \r\n",
      "Locus_10063_Transcript_1/1_Confidence_1.000_Length_177_transcripts_v2_10761 \t 0.499315 \t other biological processes\r",
      " \r\n"
     ]
    }
   ],
   "source": [
    "#Putting tabs in between columns\n",
    "!awk '{print $1, \"\\t\", $2, \"\\t\", $3, $4, $5, $6}' Pdam_cpg_GOslim > Pdam_cpg_GOslim.tab\n",
    "!head Pdam_cpg_GOslim.tab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Now time to plot data using pandas and matplot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | 0 | \n", "1 | \n", "2 | \n", "
|---|---|---|---|
| 0 | \n", "Locus_10004_Transcript_1/1_Confidence_1.000_Le... | \n", "0.230119 | \n", "transport | \n", "
| 1 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 2 | \n", "Locus_1000_Transcript_1/1_Confidence_1.000_Len... | \n", "0.266367 | \n", "developmental processes | \n", "
| 3 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 4 | \n", "Locus_1000_Transcript_1/1_Confidence_1.000_Len... | \n", "0.266367 | \n", "other biological processes | \n", "
| 5 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 6 | \n", "Locus_10024_Transcript_1/1_Confidence_1.000_Le... | \n", "1.466800 | \n", "signal transduction | \n", "
| 7 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 8 | \n", "Locus_10037_Transcript_1/1_Confidence_1.000_Le... | \n", "0.578951 | \n", "other metabolic processes | \n", "
| 9 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 10 | \n", "Locus_1003_Transcript_1/1_Confidence_1.000_Len... | \n", "0.914415 | \n", "protein metabolism | \n", "
| 11 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 12 | \n", "Locus_10043_Transcript_1/1_Confidence_1.000_Le... | \n", "0.528460 | \n", "other metabolic processes | \n", "
| 13 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 14 | \n", "Locus_10045_Transcript_1/1_Confidence_1.000_Le... | \n", "0.206723 | \n", "protein metabolism | \n", "
| 15 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 16 | \n", "Locus_10059_Transcript_1/1_Confidence_1.000_Le... | \n", "0.384943 | \n", "transport | \n", "
| 17 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 18 | \n", "Locus_10063_Transcript_1/1_Confidence_1.000_Le... | \n", "0.499315 | \n", "other biological processes | \n", "
| 19 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 20 | \n", "Locus_10068_Transcript_1/1_Confidence_1.000_Le... | \n", "0.689883 | \n", "transport | \n", "
| 21 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 22 | \n", "Locus_10069_Transcript_1/1_Confidence_1.000_Le... | \n", "0.348955 | \n", "other biological processes | \n", "
| 23 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 24 | \n", "Locus_10069_Transcript_1/1_Confidence_1.000_Le... | \n", "0.348955 | \n", "other metabolic processes | \n", "
| 25 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 26 | \n", "Locus_10073_Transcript_1/1_Confidence_1.000_Le... | \n", "0.640578 | \n", "protein metabolism | \n", "
| 27 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 28 | \n", "Locus_10078_Transcript_1/1_Confidence_1.000_Le... | \n", "0.157409 | \n", "cell cycle and proliferation | \n", "
| 29 | \n", "Locus_10078_Transcript_1/1_Confidence_1.000_Le... | \n", "0.157409 | \n", "other metabolic processes | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 73380 | \n", "transcripts_v2_977 | \n", "1.236260 | \n", "signal transduction | \n", "
| 73381 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73382 | \n", "transcripts_v2_98 | \n", "0.341826 | \n", "other biological processes | \n", "
| 73383 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73384 | \n", "transcripts_v2_9821 | \n", "0.635342 | \n", "other biological processes | \n", "
| 73385 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73386 | \n", "transcripts_v2_9821 | \n", "0.635342 | \n", "other metabolic processes | \n", "
| 73387 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73388 | \n", "transcripts_v2_9821 | \n", "0.635342 | \n", "protein metabolism | \n", "
| 73389 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73390 | \n", "transcripts_v2_983 | \n", "0.508182 | \n", "other metabolic processes | \n", "
| 73391 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73392 | \n", "transcripts_v2_983 | \n", "0.508182 | \n", "transport | \n", "
| 73393 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73394 | \n", "transcripts_v2_9880 | \n", "0.891478 | \n", "protein metabolism | \n", "
| 73395 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73396 | \n", "transcripts_v2_9896 | \n", "0.394843 | \n", "other metabolic processes | \n", "
| 73397 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73398 | \n", "transcripts_v2_991 | \n", "0.638445 | \n", "RNA metabolism | \n", "
| 73399 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73400 | \n", "transcripts_v2_991 | \n", "0.638445 | \n", "developmental processes | \n", "
| 73401 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73402 | \n", "transcripts_v2_9910 | \n", "0.761721 | \n", "stress response | \n", "
| 73403 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73404 | \n", "transcripts_v2_9932 | \n", "0.386127 | \n", "other metabolic processes | \n", "
| 73405 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73406 | \n", "transcripts_v2_9932 | \n", "0.386127 | \n", "protein metabolism | \n", "
| 73407 | \n", "\n", " | NaN | \n", "NaN | \n", "
| 73408 | \n", "transcripts_v2_9936 | \n", "0.892228 | \n", "RNA metabolism | \n", "
| 73409 | \n", "\n", " | NaN | \n", "NaN | \n", "
73410 rows × 3 columns
\n", "| \n", " | 0 | \n", "
|---|---|
| 0 | \n", "0.761861 | \n", "
| 1 | \n", "0.635946 | \n", "
| 2 | \n", "0.695709 | \n", "
| 3 | \n", "0.124490 | \n", "
| 4 | \n", "0.230119 | \n", "
| 5 | \n", "0.530625 | \n", "
| 6 | \n", "1.344050 | \n", "
| 7 | \n", "0.746746 | \n", "
| 8 | \n", "0.758383 | \n", "
| 9 | \n", "0.266367 | \n", "
| 10 | \n", "0.158412 | \n", "
| 11 | \n", "0.736118 | \n", "
| 12 | \n", "0.978761 | \n", "
| 13 | \n", "1.466680 | \n", "
| 14 | \n", "0.364383 | \n", "
| 15 | \n", "0.368691 | \n", "
| 16 | \n", "0.779823 | \n", "
| 17 | \n", "0.110361 | \n", "
| 18 | \n", "0.560740 | \n", "
| 19 | \n", "0.898968 | \n", "
| 20 | \n", "0.281310 | \n", "
| 21 | \n", "0.082143 | \n", "
| 22 | \n", "0.687209 | \n", "
| 23 | \n", "0.219951 | \n", "
| 24 | \n", "1.466800 | \n", "
| 25 | \n", "0.847464 | \n", "
| 26 | \n", "0.676556 | \n", "
| 27 | \n", "0.853481 | \n", "
| 28 | \n", "0.661166 | \n", "
| 29 | \n", "0.162397 | \n", "
| ... | \n", "... | \n", "
| 69920 | \n", "0.000000 | \n", "
| 69921 | \n", "0.961957 | \n", "
| 69922 | \n", "0.635342 | \n", "
| 69923 | \n", "0.000000 | \n", "
| 69924 | \n", "0.508182 | \n", "
| 69925 | \n", "0.245346 | \n", "
| 69926 | \n", "0.850773 | \n", "
| 69927 | \n", "0.623037 | \n", "
| 69928 | \n", "0.066227 | \n", "
| 69929 | \n", "0.893366 | \n", "
| 69930 | \n", "0.891478 | \n", "
| 69931 | \n", "0.394843 | \n", "
| 69932 | \n", "0.638445 | \n", "
| 69933 | \n", "0.761721 | \n", "
| 69934 | \n", "1.173440 | \n", "
| 69935 | \n", "1.535910 | \n", "
| 69936 | \n", "0.386127 | \n", "
| 69937 | \n", "0.892228 | \n", "
| 69938 | \n", "0.674579 | \n", "
| 69939 | \n", "0.881767 | \n", "
| 69940 | \n", "1.053640 | \n", "
| 69941 | \n", "1.083160 | \n", "
| 69942 | \n", "0.850852 | \n", "
| 69943 | \n", "0.558495 | \n", "
| 69944 | \n", "0.480530 | \n", "
| 69945 | \n", "1.000290 | \n", "
| 69946 | \n", "0.804978 | \n", "
| 69947 | \n", "0.787074 | \n", "
| 69948 | \n", "0.000000 | \n", "
| 69949 | \n", "0.660990 | \n", "
69950 rows × 1 columns
\n", "