{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculating CpG ratio for the *Pocillopora damicornis* transcriptome" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This workflow calculates the CpG ratio (CpG observed/expected, or CpG O/E) for contigs in the *Pocillopora damicornis* [transcriptome](http://2ei.univ-perp.fr/telechargement/transcriptomes/blast2go_fasta_Pdamv2.zip). The CpG ratio is an estimate of germline DNA methylation: because methylated cytosines in CpG dinucleotides are hypermutable, a lower CpG O/E indicates heavier germline methylation over evolutionary time.\n", "\n", "This workflow extends another IPython notebook, `Pdam_blast_anno.ipynb`, which generates an annotation of the same transcriptome. It assumes that you have already created the directories and files specified in that annotation workflow.\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/data/Pdam\n" ] } ], "source": [ "cd ../data/Pdam" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1|spectrin alpha chain\n", 
"tatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctc
tccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagc
ttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttac
gagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\n", "\n", "number of seqs =\n", "72890\n" ] } ], "source": [ "# Preview the first FASTA record (header line + sequence line)\n", "!head -2 blast2go_fasta_Pdamv2.fasta\n", "!echo \n", "!echo number of seqs =\n", "# Count records: each FASTA header line starts with '>'\n", "!grep -c \"^>\" blast2go_fasta_Pdamv2.fasta" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1tspectrin\n", 
"tatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctc
tccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagc
ttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttac
gagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\n", ">Locus_1685_Transcript_2/2_Confidence_1.000_Length_7457_transcripts_v2_2tspectrin\n", 
"tatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttagtgagagactgatagagctcagcctcagtaacatagagacgttttccaccctcagtgagcgccttgaaggcgttgatgacttcctggctagagccaacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctc
tccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagc
ttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttac
gagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\n", ">Locus_177_Transcript_12/12_Confidence_0.500_Length_6585_transcripts_v2_3tvitellogenin\n", 
"gctcgagtttgtgtgatgcctctttacagagagcacagattccttggtttggctcggcctcaaacaaagcttgcttcccacaggtgtaacaaagacgattttcatatgacgcggatgcagtccgtcactcttctcagcattatgcgaaggcgaattcaagggcaagtcactgatatagcgccatttctcctcatatcccttctgtggttcaatatcttcgatcatgtgttggaataacgtgacttgatccgtgttcatagggacggttgcgcgggtccattccacagggttcttggatgcatctttgtgaatatacacaactgtcgcgaggcgaacatgacggcgaccattcacgaatgtgatcttgaagagcttggggtagcgattatcatgcacagtctctaaacacccttgggaatttggcctcaccttaacgaatccctttccggtgacaaaaagtttcacaaattgatgatttccgtcgataaagaagactttgaagacagtaggtagctcgcttaagacgagttcattgtgctcattcacagacaggtattttccatttcctgccttcagagtccagttggcataacgatgatgatgatgtcccctggcaatctcagtaacttctccgatctcaaatccgtccagctctgtgctcctgtagattgaagaccaactcttggaacctgtgacaggtttcctgtacatcagccaggtgttgctcgccttgttagttgataaaatctcctgtctctcttgtggcaagtccatttctgtggtgtagctttgatttctgatgttgtactggaatctgccacctaatccagtagtgatattggtgtggatcgttagcttgctaccagtctccaactgaggcagagatactttcatttttccccaaagagaatgaagaacactaatgttcaattttccactaagagtgacttgatcgggaagaggaatgaagaatcggggttgaacctcagctttgaccgtggtgttcaaacgcaggaacaaaacgctgtccacagagaaatccaggggcagaccggcacacgagggctggcgtatcttcacttcaattggacggaggattttggagatattccattttttacctttcttcagcaggtccttcagatcgttgctaactctgatgactccatcttccattaactctttgatgtcatcataggtgaaataattatagaacagttcttttccaaaaactttgaagtagaaggaacctttggctttctcctcttcatgttcagcggttggcagctggtgttggatttccttgatttctttcccattcagactgttgtctaagcttcgtttcggtctgagcaagccaaacagactcttatcttcgctgtagtgtccccttggtcccatgacacgttggagcagctcttggattccctcaccacggatgccagtctcgaatagattcagagattttccaagcataccggtgtgtagtttgactttagagttgatagctctcgggatgaagcttgatggcgtggaaatcatgcgaagatcaactgcacttcccatctggagaagatcggagaagtcgccaaagtgaagccatttggagttcactgggtttatcttgtatctcttaagcgctcgaagggcgattcggagagcgattcgtgcggtttgagccattttttcgtcacatgggtatctagagaaagccatccccttgatgtaagacaccacaaaagatcgcacttgattggtgcgctcgtaatagagctgcttgactaccaagttgaacaccgctggaccaggtttgcagtcgcagatgatcacaaaacaagccatgcgcagttcggcgtcgttgtttggttgacggaagatctctaacaacactggcaataccttttgtgaaattttcggagcgattcttcgtagagcatacacagctgtgacgcgtaattccagagagttcctgttgtccttgataatttccaaaagaggt
tggtaaaaatctctgtgaccgaaattaccgatgcccttcaagatgaagatcttttcacggtaactggcatactgcaaacgactcttaatctcagccacttggttggcaaggcaagtgacgtccttgtctttgcagaagtcatgcatcaaggcaccggcagtaaggtaacaggttttcttcagagtctcgtctgcttggacaacagaaccagaacagagatccacaacggtcaagcacatctttcctgttgggtttggagttaaggctagaccacgaataagtgagactgccctcccagagtccagttccaatgtcttgattttttctgagatgagttcaaagacctcctccttgtacacataaggcagagcttccagcagccattgcctgcgtttctcgtcattgtagcagttgtcccagatcttgataagagtttttttgcaggtcctgcgtaaggtttccaccaaacgactgaactgccaagacatcaaattattctcctccacgctcttgatcaggtcgtcgatcaattttctcaccatgtttgcagtctgggtacgttgttcctcagtctctgctttgtcctcatcctcatccacagtcatttccaatgtcctagctgtaacaccactaacggtatagctttgggattcagacctcacttccttgaatttcaggtattgcttggtcaaagttctggcatttcctttcatctgtgttggtgcaaaaatgtatcttcccatcgttctacaatccttcaccacaaattgttcctcggttccgaaaagatcacatgatgctgacacagagatctgtaggggctcatcaatgacgctgcagtcctcgctaagacagcgcactcctggtacgtttgaaaacacgcggggttttccaatgcagttgtgcaggtttaaggtctttgtaatttgcaattccttgattgtgttggtctccagtgtgttgtgacgcacatcatacaggttttcacacattccatgagtgctgagctcccagttcttgtacagtcttggttcacggtgatgtaatgaagactcaggcttggtgaagttgtggtggatcatggaaagaactcctcgtttgatgttcaaaatgtactcaggctcatcgtcattgcagaagatctgaccaacttttccatctttgtactcaaacttgattggtcgttccagcaacggcttcaattcattggagactggggagacagcgtgtgacacagacttctacttcgaaacagacactttggcttatttgaagataagaaacccaattctgtacgaagttaacgtaacatccttcgaccaatcagagtcacacactgtctccccagtctccaatgaattgaagccgttgctggaacgaccaatcaagtttgagtacaaagatggaaaagttggtcagatcttctgcaatgacgatgagcctgagtacattttgaacatcaaacgaggagttctttccatgatccaccacaacttcaccaagcctgagtcttcattacatcaccgtgaaccaagactgtacaagaactgggagctcagcactcatggaatgtgtgaaaacctgtatgatgtgcgtcacaacacactggagaccaacacaatcaaggaattgcaaattacaaagaccttaaacctgcacaactgcattggaaaaccccgcgtgttttcaaacgtaccaggagtgcgctgtcttagcgaggactgcagcgtcattgatgagcccctacagatctctgtgtcagcatcatgtgatcttttcggaaccgaggaacaatttgtggtgaaggattgtagaacgatgggaagatacatttttgcaccaacacagatgaaaggaaatgccagaactttgaccaagcaatacctgaaattcaaggaagtgaggtctgaatcccaaagctataccgttagtggtgttacagctaggacattggaaatgactgtggatgaggatgaggacaaagcagagactgaggaaca
acgtacccagactgcaaacatggtgagaaaattgatcgacgacctgatcaagagcgtggaggagaataatttgatgtcttggcagttcagtcgtttggtggaaaccttacgcaggacctgcaaaaaaactcttatcaagatctgggacaactgctacaatgacgagaaacgcaggcaatggctgctggaagctctgccttatgtgtacaaggaggaggtctttgaactcatctcagaaaaaatcaagacattggaactggactctgggagggcagtctcacttattcgtggtctagccttaactccaaacccaacaggaaagatgtgcttgaccgttgtggatctctgttctggttctgttgtccaagcagacgagactctgaagaaaacctgttaccttactgccggtgccttgatgcatgacttctgcaaagacaaggacgtcacttgccttgccaaccaagtggctgagattaagagtcgtttgcagtatgccagttaccgtgaaaagatcttcatcttgaagggcatcggtaatttcggtcacagagatttttaccaacctcttttggaaattatcaaggacaacaggaactctctggaattacgcgtcacagctgtgtatgctctacgaagaatcgctccgaaaatttcacaaaaggtattgccagtgttgttagagatcttccgtcaaccaaacaacgacgccgaactgcgcatggcttgttttgtgatcatctgcgactgcaaacctggtccagcggtgttcaacttggtagtcaagcagctctattacgagcgcaccaatcaagtgcgatcttttgtggtgtcttacatcaaggggatggctttctctagatacccatgtgacgaaaaaatggctcaaaccgcacgaatcgctctccgaatcgcccttcgagcgcttaagagatacaagataaacccagtgaactccaaatggcttcactttggcgacttctccgatcttctccagatgggaagtgcagttgatcttcgcatgatttccacgccatcaagcttcatcccgagagctatcaactctaaagtcaaactacacaccggtatgcttggaaaatctctgaatctattcgagactggcatccgtggtgagggaatccaagagctgctccaacgtgtcatgggaccaaggggacactacagcgaagataagagtctgtttggcttgctcagaccgaaacgaagcttagacaacagtctgaatgggaaagaaatcaaggaaatccaacaccagctgccaaccgctgaacatgaagaggagaaagccaaaggttccttctacttcaaagtttttggaaaagaactgttctataattatttcacctatgatgacatcaaagagttaatggaagatggagtcatcagagttagcaacgatctgaaggacctgctgaagaaaggtaaaaaatggaatatctccaaaatcctccgtccaattgaagtgaagatacgccagccctcgtgtgccggtctgcccctggatttctctgtggacagcgttttgttcctgcgtttgaacaccacggtcaaagctgaggttcaaccccgattcttcattcctcttcccgatcaagtcactcttagtggaaaattgaacattagtgttcttcattctctttggggaaaaatgaaagtatctctgcctcagttggagactggtagcaagctaacgatccacaccaatatcactactggattaggtggcagattccagtacaacatcagaaatcaaagctacaccacagaaatggacttgccacaagagagacaggagattttatcaactaacaaggcgagcaacacctggctgatgtacaggaaacctgtcacaggttccaagagttggtcttcaatctacaggagcacagagctggacggatttgagatcggagaagttactgagattgccaggggacatcatcatcatcgttatgccaactggactctgaaggcaggaaatg
gaaaatacctgtctgtgaatgagcacaatgaactcgtcttaagcgagctacctactgtcttcaaagtcttctttatcgacggaaatcatcaatttgtgaaactttttgtcaccggaaagggattcgttaaggtgaggccaaattcccaagggtgtttagagactgtgcatgataatcgctaccccaagctcttcaagatcacattcgtgaatggtcgccgtcatgttcgcctcgcgacagttgtgtatattcacaaagatgcatccaagaaccctgtggaatggacccgcgcaaccgtccctatgaacacggatcaagtcacgttattccaacacatgatcgaagatattgaaccacagaagggatatgaggagaaatggcgctatatcagtgacttgcccttgaattcgccttcgcataatgctgagaagagtgacggactgcatccgcgtcatatgaaaatcgtctttgttacacctgtgggaagcaagctttgtttgaggccgagccaaaccaaggaatctgtgctctctgtaaagaggcatcacacaaactcgagcgaactgtgttatggcaaacaaatgct\n", ">transcripts_v2_5t---NA---\n", "gtgaagatgaggaatctaaggaatggcctgtttgcaaattgagttactggatttctctggcaagtagttttggatcaatttttcatactatttttcttgtgtgataagcaaatgattgcattccctgagacagttttttccaggacggtgaaatcagtgaaacaagagcctgtatgtcactagtgactctgagtaaggccttcatttgagcccagcattgcagtggcacttgtttaagttcttggagaagtgactgtaatcaactgatgacctctgcacagcagcaagaactagatgcaggcaaaagttttaaagtccctcagctttgctctgtagcatgctacctaaagttttcagatagtgagaattcatctgatggaagaagctaaatctttggattactatgagagtcagaaagcagtgtgtaacatgggtattactcataatttcaaagtatgtcaaggatgtcattaaggtagtgaacattttcttgagcttttgagtcttttttttctcttgtttattctgaattcggttttcagcagttttgtggaggaatctgtcggcaatgatcagtggtgattttctgccatagtggtggtgttaaatttcaattggcaacttcttgaggtggcattccagcttttagcaaggtgcaagttccactatcacttttagatgttaaatcagcaggcaaacccaaccaatttggacatttcaccattcacgtggctttcagaaaaaggcttgcagcatatcagtaggactggtctgtatgggtctggcctcaggtagctgaatttaactgagataatcctgcagaaaaaaagaggacagaagtacacgtacaatgactgttggtggaacttcctcagaagtggtaatgttaggcaaccaaattcattcttccattgttttaaagaaagaatcagaaacactttgcaccatttcaattgatatgagcagtcctaattatctggtaaggaaatatttcaaacaagtttatgcatggactcagttattttgggtatattttttggatctcttttcaagtgtctttcagaatgtatacagtctgtgtacagtagcagtttcttggaaaagcatttttctcaattttgaacaagacttcttagcatccagcagctttccctggtccttgttaaaaaccttaaatgtgcatcaagcatgaaggaaggccctactaacccaaaggtgtgaacttaatgggtattaaacgttctccgccttacaatgggtgcagtttgtgtggaaactttttgccaggttttcagccccagcgtgtggttcacaattcactctattggtggaattagtgcattaggacaagggcaattttcagagtaatcctaggg
tcaggaggcacattttcagggtcaaatgtaagtgcgtgctgtaattcttctcaagtgagttttgtcctattggctaccctgtcagttttttttgtgagttgtgtgtgcatggtttttaatgaatttgttaggaatatgtatgcagagttagcaacgatggttttttaggactgtattgtacccaattcaagtccaaaatggaattgtcagaccataaagtggtttttcagaatttggcaagcacctcggaggcaaattcagtggtacacaaaagactgtattgtactcaattcaagtccaaaatggaattgtcagaccaccaagaaatttttcagaatttggcaagcacctcagaggcaaattcagtggtacacaaaagaagtgaatataccacatcccggagaacttaaaccttcttcagaaatgctgagtaaagttaaccactctgaagtccttcatcattttggaccaaagcaacgcttttacctgtttgatattaatatttacaatagtgcagaccatctggtaagggaaaccttcaaatcaaccatatatgagtatgtatttcctgtggatttcttttttaaggagtcttttttcaggggataagcctagttatttcagcagcactttaccaaaccaactgaaattatggagcagctttgaaaattatccaagtccttttgtgccaagttagcatcactgtggtaatatccagtccagttaagtggttaaaagttaacctcctagatcctcgaaagagttttgccatacctgtccaagttaagggccatgtgctgtcactcaagtgtagctgaacatgctgtgtcgtagagaaccacatccctcttgtgtatttatgtgccagtgcattgtctataccagtttcaggactgcgcaatgtaccgtgtattttgttgcaacaaatttatcagaacttgtcactctggggtaaaacaaccagcatattgggtgcattttcgacctattcctaaagagcaaattcatctacacacatcttgtattccatgtgttgtgccacctgccttttagatgtcaagtgtgtattgcttcatggaaagtgtctgtctgttcaacaaaaaacagcccactcatcaacaaacttttaagatgaacatttcaaaaaatctggttatgttactccttcaaagtctgtgattgtcttgctcccatgggcagtcccaagtcctagtcactgattctctaagtaggctgatcagaaatttaccagtacattcatggaggtgcaattcacatctgaatccttacttggtggttaaagattcttgtagaaggttaaattaaaggttggttttatgttttggttaatttgtagtatttgtaattagacaattcatatttctcaattaagttatccagggatgaggtaatataaagaccatactgtctctcaagaattgttcctatgtacttgattaagtgacttaattaataagatatagaattaagtttaacctagaagtatcatcaaaaccagtacatactttgtcagaccttacagatattgagtatcaccaagtaatggaagttttttggcttcggcaagtatttgttttggaatgttgccttttttgttgttaatatgaaaagtatgctttgtttgtctgtaattgcgtgtttttcatgtcatgagaatcagtggaagggtttactgtgaaataccatctactaagatttatttctcttagaaatgcatcattaactattgttaacatattagctccatacctatttatgcttgtgtataaagtcaaaactgtgaagagctttgtctgtgattggggggatatttttgttctcttttgattgttaccaggcaaatttgcattttttggatggaatgtcaagaatatgaaaaaggcaagcttaactttctgatctttagtttcattatagattactctttttttcattttgtgatctaacatgcattttgatagtggtagtttgtaattaac
ctattccacacgtttttttttttaaatttatttatttactgtaactgaaaatcttagttaggttggaatacttcattatgtaatctaggatttattatgggcaaacagtgcatatttttgtcaacctctaaagcaactaatgtgatagatgaagctcagtttcactcatttggtctgtgagcaaaagcagctgggaatactttttacaggataacaatcaaggcttaatcaaaaatttctgctctgaactgcaagttgtcatcatgcagcaatcatttggttgatgcaggtattgattttaactgttgccttttgaattgttaggtctttgaaatattttgcatactctgattaattcacagcgcttttttctggatcaacaattgttgccaggtcatctttgcccaaaatggttatggccttgacttgctgtttgctgtcggtcagttcagtagtttgtcttgttcaaaatttttatgcagaaatctgtgcatatatcctctctacacagtgttgtaatcttgttggaaatgaagggctttcaaatgcatcctcttcatccagttgatacaggtatcgatgtgaccatttggcatctgctttaattgtttctctgagatgcattacatactttgatcaattcatagtgatttctgttggaatatactagaattcaacttaattttgaagttttcttggcatcgactttcaaggctgagttggtcagtgattgaagttcaaaattgtcgcccagaatcatttcatttcatctcttcaagcagatttgtattttcactgggtgttgtcacccttccaagtgctgcctcgtcatttagtttacacaggtattcgcttgaccttttgccacctggagcattttttctttgaaatatattgcatactctgatctttttacagcattttttattgaatgtgctactgttgacagctcaacttggcctaagattttatggccttcactttccggcatgcataggccagttccatgatgtggcagttgaaatcgatgtgtagaaatctgtgtacatttcctctatatacagcttcgttaatcgattggatgcatatgtccttcaaatgcgatcttgtcattgtgttgattcaggtattcacttaacgttttgccttctgaactgtttttcttcgaaatgtatgacatactctggtcaattcacagcaatttctgtttgaatgtggaacggtggacagcccaacttggtcttggaacataatggccgtgacttcttttgtccatgaacgttagttttacgatttgtctagtttcaaactgttatgcagaaatccgtgcattatatcctctttacacagctttgaacatctcctctaagctagttggaatttgtgagcattttggacaaagccagatttttccattccatcggcacgctttcagaagccaattaccagtaaatcttgagatttgtcaagggaaaaaattctaagcaaactggatccgaaaagcagaggtacttcttcttgtcatctttttcaacttcagtgtttctctcgagaccatgtatgtttcgttagtatcttcaatagtttgtccagccaagcgcagcacaattaaggttgaggattaggtaccaacttttgaggtaattgcaatatgaattaccatatcattatttcaaattagtggcaatgtggtgcatgctttgttcagtcttgacagaacttttgttttttggtttgaacttgctataagcataactacacttgggcagctcaactccaatcatgaaayggtttccttgtacaaggtatttccagcatgaattatcacttcaaattattggcaatgcagtggatactttgcgaagtcgacagcatttttgtttctagttcattcatgtgttagtgattccttggttacctttatcatttgagatgtcgtcaaactttggctaaatcttggaaggctttcaagatatcaggtaccaaaatcaagtatcttt
caaattccaatttgagtagatgtggtgtatgctctgttcacttttcacagcagtttaatgtgccagatggaatttacattaattgaacttggtcaccttcaagctaaggcctggtgtgaacatcgtggtgtaagtcattacttgccaatctatggacagtcgacaagcactttctcattggtgaatgcaaattttttggacattgtacttgtcagttgtggcatttattccaaaattttttttgtatgtgcaaatgttggatattttgtcaggttccacagtcttctggtaatttttctgtcccactccgaaataaggtggattgtgtgcctgcagtatcgaaagcatcaaacatttcctcttgacagtcagctgtttgtagtctttctaatcacatttgttcagataaactctggtgaactggatacctaaaattcaggtattgaatcttattttttgccactattatggtttgatttcaacatgcatgtcaacactatgtgtgttccatcagtatcttgagcagtttctgctgtggcaagctccacttcaactcggatatacaaagtaagacatgattttagtctcatttgccatcacattcagtttgaaatggttgtctgggaaacagtgtatactttgtaagtctcggcagcgttgtgaacaactcctacaggatgaaggttagttttcagatatgtatttgtttctggtcgcaaaggttttgtatatttctttttcaagtgtagtgaagcattggatgaagg\n", ">Locus_180_Transcript_15/16_Confidence_0.327_Length_6143_transcripts_v2_6t---NA---\n", "gtgaagatgaggaatctaaggaatggcctgtttgcaaattgagttactggatttctctggcaagtagttttggatcaatttttcatactatttttcttgtgtgataagcaaatgattgcattccctgagacagttttttccaggacggtgaaatcagtgaaacaagagcctgtatgtcactagtgactctgagtaaggccttcatttgagcccagcattgcagtggcacttgtttaagttcttggagaagtgactgtaatcaactgatgacctctgcacagcagcaagaactagatgcaggcaaaagttttaaagtccctcagctttgctctgtagcatgctacctaaagttttcagatagtgagaattcatctgatggaagaagctaaatctttggattactatgagagtcagaaagcagtgtgtaacatgggtattactcataatttcaaagtatgtcaaggatgtcattaaggtagtgaacattttcttgagcttttgagtcttttttttctcttgtttattctgaattcggttttcagcagttttgtggaggaatctgtcggcaatgatcagtggtgattttctgccatagtggtggtgttaaatttcaattggcaacttcttgaggtggcattccagcttttagcaaggtgcaagttccactatcacttttagatgttaaatcagcaggcaaacccaaccaatttggacatttcaccattcacgtggctttcagaaaaaggcttgcagcatatcagtaggactggtctgtatgggtctggcctcaggtagctgaatttaactgagataatcctgcagaaaaaaagaggacagaagtacacgtacaatgactgttggtggaacttcctcagaagtggtaatgttaggcaaccaaattcattcttccattgttttaaagaaagaatcagaaacactttgcaccatttcaattgatatgagcagtcctaattatctggtaaggaaatatttcaaacaagtttatgcatggactcagttattttgggtatattttttggatctcttttcaagtgtctttcagaatgtatacagtctgtgtacagtagcagtttcttggaaaagcatttttctcaattttgaacaagacttcttagcat
ccagcagctttccctggtccttgttaaaaaccttaaatgtgcatcaagcatgaaggaaggccctactaacccaaaggtgtgaacttaatgggtattaaacgttctccgccttacaatgggtgcagtttgtgtggaaactttttgccaggttttcagccccagcgtgtggttcacaattcactctattggtggaattagtgcattaggacaagggcaattttcagagtaatcctagggtcaggaggcacattttcagggtcaaatgtaagtgcgtgctgtaattcttctcaagtgagttttgtcctattggctaccctgtcagttttttttgtgagttgtgtgtgcatggtttttaatgaatttgttaggaatatgtatgcagagttagcaacgatggttttttaggactgtattgtacccaattcaagtccaaaatggaattgtcagaccataaagtggtttttcagaatttggcaagcacctcggaggcaaattcagtggtacacaaaagactgtattgtactcaattcaagtccaaaatggaattgtcagaccaccaagaaatttttcagaatttggcaagcacctcagaggcaaattcagtggtacacaaaagaagtgaatataccacatcccggagaacttaaaccttcttcagaaatgctgagtaaagttaaccactctgaagtccttcatcattttggaccaaagcaacgcttttacctgtttgatattaatatttacaatagtgcagaccatctggtaagggaaaccttcaaatcaaccatatatgagtatgtatttcctgtggatttcttttttaaggagtcttttttcaggggataagcctagttatttcagcagcactttaccaaaccaactgaaattatggagcagctttgaaaattatccaagtccttttgtgccaagttagcatcactgtggtaatatccagtccagttaagtggttaaaagttaacctcctagatcctcgaaagagttttgccatacctgtccaagttaagggccatgtgctgtcactcaagtgtagctgaacatgctgtgtcgtagagaaccacatccctcttgtgtatttatgtgccagtgcattgtctataccagtttcaggactgcgcaatgtaccgtgtattttgttgcaacaaatttatcagaacttgtcactctggggtaaaacaaccagcatattgggtgcattttcgacctattcctaaagagcaaattcatctacacacatcttgtattccatgtgttgtgccacctgccttttagatgtcaagtgtgtattgcttcatggaaagtgtctgtctgttcaacaaaaaacagcccactcatcaacaaacttttaagatgaacatttcaaaaaatctggttatgttactccttcaaagtctgtgattgtcttgctcccatgggcagtcccaagtcctagtcactgattctctaagtaggctgatcagaaatttaccagtacattcatggaggtgcaattcacatctgaatccttacttggtggttaaagattcttgtagaaggttaaattaaaggttggttttatgttttggttaatttgtagtatttgtaattagacaattcatatttctcaattaagttatccagggatgaggtaatataaagaccatactgtctctcaagaattgttcctatgtacttgattaagtgacttaattaataagatatagaattaagtttaacctagaagtatcatcaaaaccagtacatactttgtcagaccttacagatattgagtatcaccaagtaatggaagttttttggcttcggcaagtatttgttttggaatgttgccttttttgttgttaatatgaaaagtatgctttgtttgtctgtaattgcgtgtttttcatgtcatgagaatcagtggaagggtttactgtgaaataccatctactaagatttatttctcttagaaatgcatcattaactattgttaacatattagc
tccatacctatttatgcttgtgtataaagtcaaaactgtgaagagctttgtctgtgattggggggatatttttgttctcttttgattgttaccaggcaaatttgcattttttggatggaatgtcaagaatatgaaaaaggcaagcttaactttctgatctttagtttcattatagattactctttttttcattttgtgatctaacatgcattttgatagtggtagtttgtaattaacctattccacacgtttttttttttaaatttatttatttactgtaactgaaaatcttagttaggttggaatacttcattatgtaatctaggatttattatgggcaaacagtgcatatttttgtcaacctctaaagcaactaatgtgatagatgaagctcagtttcactcatttggtctgtgagcaaaagcagctgggaatactttttacaggataacaatcaaggcttaatcaaaaatttctgctctgaactgcaagttgtcatcatgcagcaatcatttggttgatgcaggtattgattttaactgttgccttttgaattgttaggtctttgaaatattttgcatactctgattaattcacagcgcttttttctggatcaacaattgttgccaggtcatctttgcccaaaatggttatggccttgacttgctgtttgctgtcggtcagttcagtagtttgtcttgttcaaaatttttatgcagaaatctgtgcatatatcctctctacacagtgttgtaatcttgttggaaatgaagggctttcaaatgcatcctcttcatccagttgatacaggtatcgatgtgaccatttggcatctgctttaattgtttctctgagatgcattacatactttgatcaattcatagtgatttctgttggaatatactagaattcaacttaattttgaagttttcttggcatcgactttcaaggctgagttggtcagtgattgaagttcaaaattgtcgcccagaatcatttcatttcatctcttcaagcagatttgtattttcactgggtgttgtcacccttccaagtgctgcctcgtcatttagtttacacaggtattcgcttgaccttttgccacctggagcattttttctttgaaatatattgcatactctgatctttttacagcattttttattgaatgtgctactgttgacagctcaacttggcctaagattttatggccttcactttccggcatgcataggccagttccatgatgtggcagttgaaatcgatgtgtagaaatctgtgtacatttcctctatatacagcttcgttaatcgattggatgcatatgtccttcaaatgcgatcttgtcattgtgttgattcaggtattcacttaacgttttgccttctgaactgtttttcttcgaaatgtatgacatactctggtcaattcacagcaatttctgtttgaatgtggaacggtggacagcccaacttggtcttggaacataatggccgtgacttcttttgtccatgaacgttagttttacgatttgtctagtttcaaactgttatgcagaaatccgtgcattatatcctctttacacagctttgaacatctcctctaagctagttggaatttgtgagcattttggacaaagccagatttttccattccatcggcacgctttcagaagccaattaccagtaaatcttgagatttgtcaagggaaaaaattctaagcaaactggatccgaaaagcagaggtacttcttcttgtcatctttttcaacttcagtgtttctctcgagaccatgtatgtttcgttagtatcttcaatagtttgtccagccaagcgcagcacaattaaggttgaggattaggtaccaacttttgaggtaattgcaatatgaattaccatatcattatttcaaattagtggcaatgtggtgcatgctttgttcagtcttgacagaacttttgttttttggtttgaacttgctataagcataactacactt
gggcagctcaactccaatcatgaaacggtttccttgtacaaggtatttccagcatgaattatcacttcaaattattggcaatgcagtggatactttgcgaagtcgacagcatttttgtttctagttcattcatgtgttagtgattccttggttacctttatcatttgagatgtcgtcaaactttggctaaatcttggaaggctttcaagatatcaggtaccaaaatcaagtatctttcaaattccaatttgagtagatgtggtgtatgctctgttcacttttcacagcagtttaatgtgccagatggaatttacattaattgaacttggtcaccttcaagctaaggcctggtgtgaacatcgtggcgtaagtcattacttgccaatctatggacagtcgacaagctctttctcgttggtgaatgcaaattttttggacattgtacttgtcagttgtggcattcattccaaaattttttttgtatgtgaaaatgttggatattttgtcaggttccacagtcttctggtaatttttctgtcccactccgaaataaggtggattgtgtgcctgcagtatcgaaagcatcaaacatttcctcttgacagtcagctgtttgtagtctttctaatcacatttgttcagataaactctggtgaactggatacctaaaattcaggtattgaatcttattttttgccactattatggtttgatttcaacatgcatgtcaacactatgtgtgttccatcagtatcttgagcagtttctgctgtggcaagctccacttcaactcggatatacaaagtaagacatgattttagtctcatttgccatcacattcagtttgaaatggttgtctgggaaacagtgtatactttgtaagtctcggcagcgttgtgaacaactcctacaggatgaaggttagttttcagatatgtatttgtttctggtcgcaaaggttttgtatatttctttttcaagtgtagtgaagcattggatgaagg\n", ">Locus_9682_Transcript_1/1_Confidence_1.000_Length_116_transcripts_v2_72920t---NA---\n", "ataaaaataccaggaattgaggaagcaagcagcggcatgccggagcagacaggcaaaaaggaaaaccagaatacgaaaagaaaggaagacgggaatctgcaagacgctttggtgga\n", ">Locus_9570_Transcript_1/1_Confidence_1.000_Length_112_transcripts_v2_72921t---NA---\n", "gctagtttcaggtgtgcattcattgaataaatgtatttgtatttagtacgagtgtataataaagcagtaaatacaaatacatttattcaatgaatgcacacctgaaactagc\n", ">Locus_9787_Transcript_1/1_Confidence_0.714_Length_111_transcripts_v2_72922t---NA---\n", "tgcgtagctcggtggatgtatagagaatgggaattcagtttcagattaggtatgagaccatggatatttgtagnnnnnnnnnnncagcactctcagcacctgttgtagcag\n", ">transcripts_v2_72923t---NA---\n", "ggacgatgaggannnnnnnnnnnnnnctgatgacagtaacgatgatgatcttgatgatgatagcgttgacgagaacgacgaggatgaagactatgaagtga\n", ">Locus_9072_Transcript_1/1_Confidence_0.667_Length_101_transcripts_v2_72924t---NA---\n", "ttcttgaagatttttttaagacaatcgtgttcagttgtaataatttttacataagtaatctaaatattattttttnnnnnnnnnnnnnnnnnagtcaaggg\n" ] } ], "source": [ 
"#Removing pipes from fasta and replacing with tab, then printing first line w/out comments and looking at contig names\n", "!sed 's/|/\\t/g' blast2go_fasta_Pdamv2.fasta | awk '{print $1}' > Pdam.fasta\n", "!head -10 Pdam.fasta\n", "!tail -10 Pdam.fasta" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "Converted 72890 FASTA records in 145780 lines to tabular format\r\n", "Total sequence length: 28141387\r\n", "\r\n" ] } ], "source": [ "#Converting FASTA to tabular format and placing output file in analyses directory\n", "!perl -e '$count=0; $len=0; while(<>) {s/\\r?\\n//; s/\\t/ /g; if (s/^>//) { if ($. != 1) {print \"\\n\"} s/ |$/\\t/; $count++; $_ .= \"\\t\";} else {s/ //g; $len += length($_)} print $_;} print \"\\n\"; warn \"\\nConverted $count FASTA records in $. lines to tabular format\\nTotal sequence length: $len\\n\\n\";' \\\n", "Pdam.fasta > ../../analyses/Pdam/fasta2tab" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/analyses/Pdam\n" ] } ], "source": [ "cd ../../analyses/Pdam" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1tspectrin\t\ttatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccag
gattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgag
attagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgt
aagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\r\n", 
"Locus_1685_Transcript_2/2_Confidence_1.000_Length_7457_transcripts_v2_2tspectrin\t\ttatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttagtgagagactgatagagctcagcctcagtaacatagagacgttttccaccctcagtgagcgccttgaaggcgttgatgacttcctggctagagccaacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccag
gattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgag
attagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgt
aagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\r\n" ] } ], "source": [ "#Checking header on new tabular format file\n", "!head -2 fasta2tab" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "Added column with length of column 2 for 72890 lines.\r\n", "\r\n" ] } ], "source": [ "#Add column with length of sequence\n", "!perl -e '$col = 2;' -e 'while (<>) { s/\\r?\\n//; @F = split /\\t/, 
$_; $len = length($F[$col]); print \"$_\\t$len\\n\" } warn \"\\nAdded column with length of column $col for $. lines.\\n\\n\";' \\\n", "fasta2tab > tab_1" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 72890 218670 34198048 tab_1\r\n" ] } ], "source": [ "!wc tab_1" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#File used to count Cs and Gs will only include the sequence\n", "#The sequence is the third tab-separated column of tab_1 (ID, description, sequence, length); splitting on tabs keeps the empty description column in place\n", "!awk -F'\\t' '{print $3}' tab_1 > tab_2" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#This counts CGs - both cases\n", "#Splitting each sequence on the pattern gives (number of matches + 1) fields, so NF-1 is the count; the pattern is quoted so the shell cannot glob it\n", "!awk -F'[Cc][Gg]' '{print NF-1}' tab_2 > CG" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Counts Cs - both cases, same field-splitting approach as the CG count\n", "!awk -F'[Cc]' '{print NF-1}' tab_2 > C" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Counts Gs - both cases, same field-splitting approach as the CG count\n", "!awk -F'[Gg]' '{print NF-1}' tab_2 > G" ] }, { "cell_type": "code", "execution_count": 146, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1tspectrin\t\ttatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccag
gattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctctccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgag
attagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagcttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgt
aagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttacgagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\t7457\t185\t1914\t1444\n", "Locus_9072_Transcript_1/1_Confidence_0.667_Length_101_transcripts_v2_72924t---NA---\t\tttcttgaagatttttttaagacaatcgtgttcagttgtaataatttttacataagtaatctaaatattattttttnnnnnnnnnnnnnnnnnagtcaaggg\t101\t1\t7\t12\n" ] } ], "source": [ "#Combining counts\n", "!paste tab_1 \\\n", "CG \\\n", "C \\\n", "G \\\n", "> comb\n", "!head -1 comb\n", "!tail -1 comb" ] }, { "cell_type": "code", "execution_count": 147, "metadata": { "collapsed": 
false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_9419_Transcript_1/1_Confidence_1.000_Length_142_transcripts_v2_72905t---NA---\t\tttggctcatccttcttgtctgtcttcttggccctctttcctctggtctttggctcctgatctgcctcctcgcccttctcctcttcctttggctcatccttcttgtctgtcttcttggccctctttcctctggtctttggctt\t142\t1\t51\t23\r\n", "Locus_9418_Transcript_2/3_Confidence_0.286_Length_141_transcripts_v2_72906t---NA---\t\tgaaagagggccaagaagacagacaagaaggatgagccaaaggaagaggagaaggtcgagaagaatgaggatgaagggaaggaagatgagaagccaaagaccagaggaaagagggccaagaagacagacaacaaggatgagc\t141\t1\t17\t52\r\n", "transcripts_v2_72907t---NA---\t\tcaccttaggaaatgattagagaatagaagggagaatatacatactgatgttaggatttaatgggtcactttaaccctttaaaccctaacatcagtatgtatattctccnnnnnnnnnnnncttctattctctaatca\t137\t0\t23\t19\r\n", "transcripts_v2_72908t---NA---\t\tttgttggatgggtagagaatgggaattcagtttcagattaggtatgagaccatggatagttgtagctttctcagcacctgttgtagcagcttttgcagtatttgttgtttatnnnnnnnnnnggacaagctagcttttt\t139\t0\t16\t35\r\n", "transcripts_v2_72909thypothetical\t\tgaactacgtcacgtgatgacaaacttgggagagaaacttacagatgaggaagttgatgagatgatccgagaagcagatactgacagtgaagaggagatcaaggaagcctttagagtgtttgacaaagatggaaacg\t136\t4\t18\t41\r\n", "Locus_9529_Transcript_1/1_Confidence_1.000_Length_136_transcripts_v2_72910tp700\t\tccaggattggacgaatagatcatttggttcatttatataccctatcggtccaggggatttatatgtacatcatgcaatagcacttggcttacatgtaactgtcctcatcctactaaagggaggtcttgaagctcgt\t136\t3\t28\t29\r\n", "transcripts_v2_72911t---NA---\t\tgttaagtgtagctgatggagattaatcctttgtaatgttgagggtgaccataaatgaaaaaaaaagtgtgaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnna\t131\t0\t5\t19\r\n", "Locus_990_Transcript_3/3_Confidence_0.600_Length_134_transcripts_v2_72912t---NA---\t\tgctatttctgatgacaacatataagaattaccacaccacatagtaattagacacaatatgagcacaacacaaggacaagagtagatcatactaaacatagaagatgtacctaattgagaagatgtaatgatgaa\t134\t0\t22\t22\r\n", 
"transcripts_v2_72913t---NA---\t\tttccgatctagcggttaacctttccttttcctttcgtacaccatcaatctcatgttacacggttaataaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnna\t127\t4\t19\t8\r\n", "Locus_9106_Transcript_3/7_Confidence_0.200_Length_128_transcripts_v2_72914t---NA---\t\tttccgatctagcggttaacctttccttttcctttcgtacaccatcaatctcatgttacacggttaataaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnnnn\t128\t4\t19\t8\r\n", "Locus_9106_Transcript_7/7_Confidence_0.300_Length_128_transcripts_v2_72915t---NA---\t\tttccgatctagcggttaacctttccttttcctttcgtacaccatcaatctcatgttacacggttaataaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnnct\t128\t4\t20\t8\r\n", "Locus_9936_Transcript_1/1_Confidence_1.000_Length_124_transcripts_v2_72916t---NA---\t\ttagtcgattagttactggaataaaaagtgcaagtctgttgagggtttgccatgtaatttgctgcaaacacggcaaaccctcaacagacttgcactttttattccagtaactaatcgactagtaa\t124\t3\t24\t24\r\n", "Locus_9701_Transcript_1/1_Confidence_1.000_Length_123_transcripts_v2_72917t---NA---\t\tggtgcttttgcagctgctcctgctggcgctggcgccagagcaggtgctgctggagccgcctctggcggtggtgcttttgcagctgctcctgctggcgctggcgccagagcaggtgctgctgga\t123\t6\t36\t48\r\n", "Locus_9530_Transcript_1/1_Confidence_1.000_Length_118_transcripts_v2_72918t---NA---\t\tagacaggcagacaaacagacaggcagacaaacagacaggcagacaaacagacagacagacaaacagacaggcagacaaacagacaggcagacaaacagacaggcagacaaacagacag\t118\t0\t29\t29\r\n", "Locus_9226_Transcript_1/1_Confidence_1.000_Length_116_transcripts_v2_72919t---NA---\t\taataaatcatcttctgaagggtttgttattgggtgcacagagtcaatgaaatggggataattttgtttgactctgtgcacccaataacaaacccttcagaagatgatttattaaca\t116\t0\t18\t23\r\n", "Locus_9682_Transcript_1/1_Confidence_1.000_Length_116_transcripts_v2_72920t---NA---\t\tataaaaataccaggaattgaggaagcaagcagcggcatgccggagcagacaggcaaaaaggaaaaccagaatacgaaaagaaaggaagacgggaatctgcaagacgctttggtgga\t116\t5\t19\t35\r\n", 
"Locus_9570_Transcript_1/1_Confidence_1.000_Length_112_transcripts_v2_72921t---NA---\t\tgctagtttcaggtgtgcattcattgaataaatgtatttgtatttagtacgagtgtataataaagcagtaaatacaaatacatttattcaatgaatgcacacctgaaactagc\t112\t1\t15\t19\r\n", "Locus_9787_Transcript_1/1_Confidence_0.714_Length_111_transcripts_v2_72922t---NA---\t\ttgcgtagctcggtggatgtatagagaatgggaattcagtttcagattaggtatgagaccatggatatttgtagnnnnnnnnnnncagcactctcagcacctgttgtagcag\t111\t2\t16\t29\r\n", "transcripts_v2_72923t---NA---\t\tggacgatgaggannnnnnnnnnnnnnctgatgacagtaacgatgatgatcttgatgatgatagcgttgacgagaacgacgaggatgaagactatgaagtga\t101\t6\t10\t29\r\n", "Locus_9072_Transcript_1/1_Confidence_0.667_Length_101_transcripts_v2_72924t---NA---\t\tttcttgaagatttttttaagacaatcgtgttcagttgtaataatttttacataagtaatctaaatattattttttnnnnnnnnnnnnnnnnnagtcaaggg\t101\t1\t7\t12\r\n" ] } ], "source": [ "!tail -20 comb" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculating CpGo/e based on [Gavery and Roberts (2010)](http://www.biomedcentral.com/1471-2164/11/483)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "awk: division by zero\r\n", " input record number 9164, file comb\r\n", " source line number 1\r\n" ] } ], "source": [ "!awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))}' comb > ID_CpG #use ^ instead of ** for exponent\n" ] }, { "cell_type": "code", "execution_count": 145, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1tspectrin \t 0.499212\n", "Locus_1685_Transcript_2/2_Confidence_1.000_Length_7457_transcripts_v2_2tspectrin \t 0.494242\n", "Locus_177_Transcript_12/12_Confidence_0.500_Length_6585_transcripts_v2_3tvitellogenin \t 0.669218\n", "transcripts_v2_5t---NA--- \t 0.266393\n", 
"Locus_180_Transcript_15/16_Confidence_0.327_Length_6143_transcripts_v2_6t---NA--- \t 0.279666\n", "Locus_180_Transcript_14/16_Confidence_0.308_Length_6142_transcripts_v2_7t---NA--- \t 0.279907\n", "Locus_180_Transcript_13/16_Confidence_0.308_Length_6140_transcripts_v2_8t---NA--- \t 0.280334\n", "Locus_180_Transcript_7/16_Confidence_0.288_Length_6119_transcripts_v2_9t---NA--- \t 0.280651\n", "Locus_180_Transcript_11/16_Confidence_0.250_Length_5898_transcripts_v2_10t---NA--- \t 0.292432\n", "Locus_1199_Transcript_3/4_Confidence_0.750_Length_5569_transcripts_v2_11tserine \t 0.464167\n", "transcripts_v2_9163 \t 0.849883\n", "Locus_17783_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9164tmps \t 0\n", "transcripts_v2_9165t---NA--- \t 0.636027\n", "Locus_17917_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9166t---NA--- \t 0.410266\n", "Locus_18343_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9167t---NA--- \t 0.562175\n", "Locus_18403_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9168t---NA--- \t 0.814834\n", "Locus_18577_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9169t---NA--- \t 0\n", "Locus_18650_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9170t---NA--- \t 0\n", "transcripts_v2_9171t---NA--- \t 0.496778\n", "Locus_18916_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9172t---NA--- \t 9163 18327 729602 ID_CpG\n" ] } ], "source": [ "!head ID_CpG\n", "!tail ID_CpG\n", "!wc ID_CpG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now joining CpG to annotation, but first must sort files." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389\tnogo-b\tsp\tQ99LJ8\tNGBR_MOUSE\t35.48\t93\t57\t2\t283\t5\t155\t244\t6e-12\t63.5\r\n", "Locus_10015_Transcript_1/2_Confidence_1.000_Length_905_transcripts_v2_1500\tsp\tQ86UC2\tRSPH3_HUMAN\t71.43\t175\t50\t0\t17\t541\t297\t471\t3e-50\t 180\r\n", "Locus_10015_Transcript_2/2_Confidence_1.000_Length_896_transcripts_v2_1531\tsp\tQ86UC2\tRSPH3_HUMAN\t71.43\t175\t50\t0\t17\t541\t297\t471\t3e-50\t 180\r\n", "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529\tadp-ribosylation\tsp\tQ99PE9\tARL4D_MOUSE\t41.98\t131\t71\t2\t29\t409\t1\t130\t2e-31\t 116\r\n", "Locus_10027_Transcript_1/1_Confidence_1.000_Length_375_transcripts_v2_4989\tkelch\tsp\tQ5R8W1\tKLDC4_PONAB\t28.81\t118\t79\t4\t23\t364\t136\t252\t2e-06\t49.3\r\n", "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337\thypothetical\tsp\tC3YZ51\tUBA5_BRAFL\t62.24\t143\t47\t2\t2\t421\t239\t377\t1e-40\t 145\r\n", "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420\tprotein\tsp\tQ22A30\tRL15_TETTS\t70.09\t107\t32\t0\t321\t1\t1\t107\t2e-39\t 137\r\n", "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512\tphotosystem\tsp\tQ5ENP6\tPSAL_ISOGA\t57.58\t66\t27\t1\t139\t336\t10\t74\t7e-18\t79.0\r\n", "Locus_10044_Transcript_1/1_Confidence_1.000_Length_273_transcripts_v2_6833\ttransmembrane\tsp\tQ96HH6\tTMM19_HUMAN\t58.14\t86\t34\t1\t5\t262\t174\t257\t5e-14\t69.7\r\n", "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383\tsp\tQ9QY36\tNAA10_MOUSE\t70.28\t212\t44\t1\t1\t579\t2\t213\t5e-99\t 295\r\n" ] } ], "source": [ "#Sorting Pdam Uniprot/Swissprot annotation file. 
This file was the result of work done in another notebook: \n", "#Pdam_blast_anno.ipynb\n", "!sort Pdam_blastx_uniprot_sql.tab | tail -n +2 > Pdam_blastx_uniprot_sql.tab.sorted\n", "!head Pdam_blastx_uniprot_sql.tab.sorted" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976\ttransport\r", "\r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389\tdevelopmental processes\r", "\r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389\tother biological processes\r", "\r\n", "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529\tsignal transduction\r", "\r\n", "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337\tother metabolic processes\r", "\r\n", "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420\tprotein metabolism\r", "\r\n", "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512\tother metabolic processes\r", "\r\n", "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383\tprotein metabolism\r", "\r\n", "Locus_10059_Transcript_1/1_Confidence_1.000_Length_589_transcripts_v2_3011\ttransport\r", "\r\n", "Locus_10063_Transcript_1/1_Confidence_1.000_Length_177_transcripts_v2_10761\tother biological processes\r", "\r\n" ] } ], "source": [ "#Sorting GOSlim annotation file. 
This file was the result of work done in another notebook: Pdam_blast_anno.ipynb\n", "!sort Pdam_GOSlim.tab | tail -n +2 > Pdam_GOSlim.sorted\n", "!head Pdam_GOSlim.sorted" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_10000_Transcript_2/3_Confidence_0.667_Length_676_transcripts_v2_2486 \t 0.761861\r\n", "Locus_10001_Transcript_1/1_Confidence_1.000_Length_199_transcripts_v2_9515 \t 0.635946\r\n", "Locus_10002_Transcript_1/1_Confidence_1.000_Length_695_transcripts_v2_2386 \t 0.695709\r\n", "Locus_10003_Transcript_1/1_Confidence_1.000_Length_609_transcripts_v2_2870 \t 0.12449\r\n", "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976 \t 0.230119\r\n", "Locus_10005_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9134 \t 0.530625\r\n", "Locus_10006_Transcript_1/1_Confidence_1.000_Length_167_transcripts_v2_11475 \t 1.34405\r\n", "Locus_10007_Transcript_1/2_Confidence_0.857_Length_1261_transcripts_v2_788 \t 0.746746\r\n", "Locus_10007_Transcript_2/2_Confidence_0.857_Length_1272_transcripts_v2_779 \t 0.758383\r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367\r\n" ] } ], "source": [ "#Sorting CpG file\n", "!sort ID_CpG > ID_CpG.sorted\n", "!head ID_CpG.sorted" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# For this analysis, *Symbiodinium* sequences were removed. 
Using file generated from Pdam_zoox_removal.ipynb, ID_CpG.sorted2" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!join ID_CpG.sorted2 Pdam_blastx_uniprot_sql.tab.sorted | awk '{print $1, \"\\t\", $2}' > Pdam_cpg_anno" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367\n", "Locus_10015_Transcript_1/2_Confidence_1.000_Length_905_transcripts_v2_1500 \t 0.364383\n", "Locus_10015_Transcript_2/2_Confidence_1.000_Length_896_transcripts_v2_1531 \t 0.368691\n", "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529 \t 1.4668\n", "Locus_10027_Transcript_1/1_Confidence_1.000_Length_375_transcripts_v2_4989 \t 0.847464\n", "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337 \t 0.578951\n", "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420 \t 0.914415\n", "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512 \t 0.52846\n", "Locus_10044_Transcript_1/1_Confidence_1.000_Length_273_transcripts_v2_6833 \t 0.533378\n", "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383 \t 0.206723\n", " 19133 38266 1468482 Pdam_cpg_anno\n" ] } ], "source": [ "!head Pdam_cpg_anno\n", "!wc Pdam_cpg_anno" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!join ID_CpG.sorted2 Pdam_GOSlim.sorted > Pdam_cpg_GOslim" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976 0.230119 transport\r", "\r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 0.266367 developmental 
processes\r", "\r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 0.266367 other biological processes\r", "\r\n", "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529 1.4668 signal transduction\r", "\r\n", "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337 0.578951 other metabolic processes\r", "\r\n", "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420 0.914415 protein metabolism\r", "\r\n", "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512 0.52846 other metabolic processes\r", "\r\n", "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383 0.206723 protein metabolism\r", "\r\n", "Locus_10059_Transcript_1/1_Confidence_1.000_Length_589_transcripts_v2_3011 0.384943 transport\r", "\r\n", "Locus_10063_Transcript_1/1_Confidence_1.000_Length_177_transcripts_v2_10761 0.499315 other biological processes\r", "\r\n" ] } ], "source": [ "!head Pdam_cpg_GOslim" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976 \t 0.230119 \t transport\r", " \r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367 \t developmental processes\r", " \r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367 \t other biological processes\r", " \r\n", "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529 \t 1.4668 \t signal transduction\r", " \r\n", "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337 \t 0.578951 \t other metabolic processes\r", " \r\n", "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420 \t 0.914415 \t protein metabolism\r", " \r\n", "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512 \t 0.52846 \t 
other metabolic processes\r", " \r\n", "Locus_10045_Transcript_1/1_Confidence_1.000_Length_696_transcripts_v2_2383 \t 0.206723 \t protein metabolism\r", " \r\n", "Locus_10059_Transcript_1/1_Confidence_1.000_Length_589_transcripts_v2_3011 \t 0.384943 \t transport\r", " \r\n", "Locus_10063_Transcript_1/1_Confidence_1.000_Length_177_transcripts_v2_10761 \t 0.499315 \t other biological processes\r", " \r\n" ] } ], "source": [ "#Putting tabs in between columns\n", "#The joined file carries Windows (CRLF) line endings: strip the trailing carriage return and rebuild the GO Slim term\n", "#from field 3 through the last field, so short terms are not padded and no stray blank rows appear when the file is loaded\n", "!awk '{sub(/\\r$/, \"\"); term = $3; for (i = 4; i <= NF; i++) term = term \" \" $i; print $1, \"\\t\", $2, \"\\t\", term}' Pdam_cpg_GOslim > Pdam_cpg_GOslim.tab\n", "!head Pdam_cpg_GOslim.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now time to plot data using pandas and matplot" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "1 | \n", "2 | \n", "
---|---|---|---|
0 | \n", "Locus_10004_Transcript_1/1_Confidence_1.000_Le... | \n", "0.230119 | \n", "transport | \n", "
1 | \n", "\n", " | NaN | \n", "NaN | \n", "
2 | \n", "Locus_1000_Transcript_1/1_Confidence_1.000_Len... | \n", "0.266367 | \n", "developmental processes | \n", "
3 | \n", "\n", " | NaN | \n", "NaN | \n", "
4 | \n", "Locus_1000_Transcript_1/1_Confidence_1.000_Len... | \n", "0.266367 | \n", "other biological processes | \n", "
5 | \n", "\n", " | NaN | \n", "NaN | \n", "
6 | \n", "Locus_10024_Transcript_1/1_Confidence_1.000_Le... | \n", "1.466800 | \n", "signal transduction | \n", "
7 | \n", "\n", " | NaN | \n", "NaN | \n", "
8 | \n", "Locus_10037_Transcript_1/1_Confidence_1.000_Le... | \n", "0.578951 | \n", "other metabolic processes | \n", "
9 | \n", "\n", " | NaN | \n", "NaN | \n", "
10 | \n", "Locus_1003_Transcript_1/1_Confidence_1.000_Len... | \n", "0.914415 | \n", "protein metabolism | \n", "
11 | \n", "\n", " | NaN | \n", "NaN | \n", "
12 | \n", "Locus_10043_Transcript_1/1_Confidence_1.000_Le... | \n", "0.528460 | \n", "other metabolic processes | \n", "
13 | \n", "\n", " | NaN | \n", "NaN | \n", "
14 | \n", "Locus_10045_Transcript_1/1_Confidence_1.000_Le... | \n", "0.206723 | \n", "protein metabolism | \n", "
15 | \n", "\n", " | NaN | \n", "NaN | \n", "
16 | \n", "Locus_10059_Transcript_1/1_Confidence_1.000_Le... | \n", "0.384943 | \n", "transport | \n", "
17 | \n", "\n", " | NaN | \n", "NaN | \n", "
18 | \n", "Locus_10063_Transcript_1/1_Confidence_1.000_Le... | \n", "0.499315 | \n", "other biological processes | \n", "
19 | \n", "\n", " | NaN | \n", "NaN | \n", "
20 | \n", "Locus_10068_Transcript_1/1_Confidence_1.000_Le... | \n", "0.689883 | \n", "transport | \n", "
21 | \n", "\n", " | NaN | \n", "NaN | \n", "
22 | \n", "Locus_10069_Transcript_1/1_Confidence_1.000_Le... | \n", "0.348955 | \n", "other biological processes | \n", "
23 | \n", "\n", " | NaN | \n", "NaN | \n", "
24 | \n", "Locus_10069_Transcript_1/1_Confidence_1.000_Le... | \n", "0.348955 | \n", "other metabolic processes | \n", "
25 | \n", "\n", " | NaN | \n", "NaN | \n", "
26 | \n", "Locus_10073_Transcript_1/1_Confidence_1.000_Le... | \n", "0.640578 | \n", "protein metabolism | \n", "
27 | \n", "\n", " | NaN | \n", "NaN | \n", "
28 | \n", "Locus_10078_Transcript_1/1_Confidence_1.000_Le... | \n", "0.157409 | \n", "cell cycle and proliferation | \n", "
29 | \n", "Locus_10078_Transcript_1/1_Confidence_1.000_Le... | \n", "0.157409 | \n", "other metabolic processes | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
73380 | \n", "transcripts_v2_977 | \n", "1.236260 | \n", "signal transduction | \n", "
73381 | \n", "\n", " | NaN | \n", "NaN | \n", "
73382 | \n", "transcripts_v2_98 | \n", "0.341826 | \n", "other biological processes | \n", "
73383 | \n", "\n", " | NaN | \n", "NaN | \n", "
73384 | \n", "transcripts_v2_9821 | \n", "0.635342 | \n", "other biological processes | \n", "
73385 | \n", "\n", " | NaN | \n", "NaN | \n", "
73386 | \n", "transcripts_v2_9821 | \n", "0.635342 | \n", "other metabolic processes | \n", "
73387 | \n", "\n", " | NaN | \n", "NaN | \n", "
73388 | \n", "transcripts_v2_9821 | \n", "0.635342 | \n", "protein metabolism | \n", "
73389 | \n", "\n", " | NaN | \n", "NaN | \n", "
73390 | \n", "transcripts_v2_983 | \n", "0.508182 | \n", "other metabolic processes | \n", "
73391 | \n", "\n", " | NaN | \n", "NaN | \n", "
73392 | \n", "transcripts_v2_983 | \n", "0.508182 | \n", "transport | \n", "
73393 | \n", "\n", " | NaN | \n", "NaN | \n", "
73394 | \n", "transcripts_v2_9880 | \n", "0.891478 | \n", "protein metabolism | \n", "
73395 | \n", "\n", " | NaN | \n", "NaN | \n", "
73396 | \n", "transcripts_v2_9896 | \n", "0.394843 | \n", "other metabolic processes | \n", "
73397 | \n", "\n", " | NaN | \n", "NaN | \n", "
73398 | \n", "transcripts_v2_991 | \n", "0.638445 | \n", "RNA metabolism | \n", "
73399 | \n", "\n", " | NaN | \n", "NaN | \n", "
73400 | \n", "transcripts_v2_991 | \n", "0.638445 | \n", "developmental processes | \n", "
73401 | \n", "\n", " | NaN | \n", "NaN | \n", "
73402 | \n", "transcripts_v2_9910 | \n", "0.761721 | \n", "stress response | \n", "
73403 | \n", "\n", " | NaN | \n", "NaN | \n", "
73404 | \n", "transcripts_v2_9932 | \n", "0.386127 | \n", "other metabolic processes | \n", "
73405 | \n", "\n", " | NaN | \n", "NaN | \n", "
73406 | \n", "transcripts_v2_9932 | \n", "0.386127 | \n", "protein metabolism | \n", "
73407 | \n", "\n", " | NaN | \n", "NaN | \n", "
73408 | \n", "transcripts_v2_9936 | \n", "0.892228 | \n", "RNA metabolism | \n", "
73409 | \n", "\n", " | NaN | \n", "NaN | \n", "
73410 rows × 3 columns
\n", "\n", " | 0 | \n", "
---|---|
0 | \n", "0.761861 | \n", "
1 | \n", "0.635946 | \n", "
2 | \n", "0.695709 | \n", "
3 | \n", "0.124490 | \n", "
4 | \n", "0.230119 | \n", "
5 | \n", "0.530625 | \n", "
6 | \n", "1.344050 | \n", "
7 | \n", "0.746746 | \n", "
8 | \n", "0.758383 | \n", "
9 | \n", "0.266367 | \n", "
10 | \n", "0.158412 | \n", "
11 | \n", "0.736118 | \n", "
12 | \n", "0.978761 | \n", "
13 | \n", "1.466680 | \n", "
14 | \n", "0.364383 | \n", "
15 | \n", "0.368691 | \n", "
16 | \n", "0.779823 | \n", "
17 | \n", "0.110361 | \n", "
18 | \n", "0.560740 | \n", "
19 | \n", "0.898968 | \n", "
20 | \n", "0.281310 | \n", "
21 | \n", "0.082143 | \n", "
22 | \n", "0.687209 | \n", "
23 | \n", "0.219951 | \n", "
24 | \n", "1.466800 | \n", "
25 | \n", "0.847464 | \n", "
26 | \n", "0.676556 | \n", "
27 | \n", "0.853481 | \n", "
28 | \n", "0.661166 | \n", "
29 | \n", "0.162397 | \n", "
... | \n", "... | \n", "
69920 | \n", "0.000000 | \n", "
69921 | \n", "0.961957 | \n", "
69922 | \n", "0.635342 | \n", "
69923 | \n", "0.000000 | \n", "
69924 | \n", "0.508182 | \n", "
69925 | \n", "0.245346 | \n", "
69926 | \n", "0.850773 | \n", "
69927 | \n", "0.623037 | \n", "
69928 | \n", "0.066227 | \n", "
69929 | \n", "0.893366 | \n", "
69930 | \n", "0.891478 | \n", "
69931 | \n", "0.394843 | \n", "
69932 | \n", "0.638445 | \n", "
69933 | \n", "0.761721 | \n", "
69934 | \n", "1.173440 | \n", "
69935 | \n", "1.535910 | \n", "
69936 | \n", "0.386127 | \n", "
69937 | \n", "0.892228 | \n", "
69938 | \n", "0.674579 | \n", "
69939 | \n", "0.881767 | \n", "
69940 | \n", "1.053640 | \n", "
69941 | \n", "1.083160 | \n", "
69942 | \n", "0.850852 | \n", "
69943 | \n", "0.558495 | \n", "
69944 | \n", "0.480530 | \n", "
69945 | \n", "1.000290 | \n", "
69946 | \n", "0.804978 | \n", "
69947 | \n", "0.787074 | \n", "
69948 | \n", "0.000000 | \n", "
69949 | \n", "0.660990 | \n", "
69950 rows × 1 columns
\n", "