{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# *Pinctada fucata* genome v1.0: sequence summary statistics\n",
    "\n",
    "Length distribution, N25/N50/N75 and GC summaries (via `count_fasta.pl`) for three FASTA files in the `Oist-Pfuca` data directory, followed by a first look at the annotated-genes table.\n",
    "\n",
    "The outputs below were recorded before the paths were factored into the configuration cell; by default the commands expand to exactly the same paths.\n",
    "\n",
    "![Pinctada fucata genome v1.0 download page](http://eagle.fish.washington.edu/cnidarian/skitch/Download__Pinctada_Fucata_Genome_Ver_1_00_1AF29CD0.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Run this cell first. Every command below reads its paths from these variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Defaults are the original author's local paths; set the environment\n",
    "# variables below to re-run this notebook on another machine.\n",
    "DATA_DIR = os.environ.get(\n",
    "    \"PFUCA_DATA_DIR\", \"/Users/sr320/data-genomic/Oist-Pfuca\"\n",
    ")\n",
    "COUNT_FASTA = os.environ.get(\n",
    "    \"COUNT_FASTA_PL\",\n",
    "    \"/Users/sr320/git-repos/nb-2015/util/script-box/count_fasta.pl\"\n",
    ")\n",
    "\n",
    "# The annotation table is inspected by two cells, so name it once.\n",
    "ANNOTATED_GENES = os.path.join(DATA_DIR, \"piful.annotated_genes.latest\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `pfu_aug1.0_Nall.fasta`\n",
    "\n",
    "Sequence lengths are binned in 10,000 bp intervals (`-i 10000`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0:9999 \t72577\n",
      "10000:19999 \t16\n",
      "20000:29999 \t4\n",
      "\n",
      "Total length of sequence:\t44346368 bp\n",
      "Total number of sequences:\t72597\n",
      "N25 stats:\t\t\t25% of total sequence length is contained in the 4157 sequences >= 1656 bp\n",
      "N50 stats:\t\t\t50% of total sequence length is contained in the 13462 sequences >= 903 bp\n",
      "N75 stats:\t\t\t75% of total sequence length is contained in the 30455 sequences >= 477 bp\n",
      "Total GC count:\t\t\t18494771 bp\n",
      "GC %:\t\t\t\t41.71 %\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!perl \"{COUNT_FASTA}\" -i 10000 \"{DATA_DIR}/pfu_aug1.0_Nall.fasta\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `pfu_aug1.0_Pall.fasta`\n",
    "\n",
    "**NOTE (to confirm):** the total length here is roughly one third of the `Nall` total, which suggests this file holds amino-acid sequences. If so, `GC %` below only counts G and C letters and is not a nucleotide GC content, and the `bp` units are really residues."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r\n",
      "0:9999 \t72587\r\n",
      "\r\n",
      "Total length of sequence:\t14707499 bp\r\n",
      "Total number of sequences:\t72587\r\n",
      "N25 stats:\t\t\t25% of total sequence length is contained in the 4131 sequences >= 552 bp\r\n",
      "N50 stats:\t\t\t50% of total sequence length is contained in the 13383 sequences >= 302 bp\r\n",
      "N75 stats:\t\t\t75% of total sequence length is contained in the 30296 sequences >= 159 bp\r\n",
      "Total GC count:\t\t\t1052717 bp\r\n",
      "GC %:\t\t\t\t7.16 %\r\n",
      "\r\n"
     ]
    }
   ],
   "source": [
    "!perl \"{COUNT_FASTA}\" -i 10000 \"{DATA_DIR}/pfu_aug1.0_Pall.fasta\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `pfu_genome1.0.fasta`\n",
    "\n",
    "Sequence lengths are binned in 100,000 bp intervals (`-i 100000`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r\n",
      "0:99999 \t800353\r\n",
      "100000:199999 \t596\r\n",
      "200000:299999 \t20\r\n",
      "300000:399999 \t7\r\n",
      "400000:499999 \t3\r\n",
      "500000:599999 \t1\r\n",
      "600000:699999 \t2\r\n",
      "\r\n",
      "Total length of sequence:\t1413178538 bp\r\n",
      "Total number of sequences:\t800982\r\n",
      "N25 stats:\t\t\t25% of total sequence length is contained in the 5015 sequences >= 42768 bp\r\n",
      "N50 stats:\t\t\t50% of total sequence length is contained in the 19486 sequences >= 14455 bp\r\n",
      "N75 stats:\t\t\t75% of total sequence length is contained in the 78217 sequences >= 3540 bp\r\n",
      "Total GC count:\t\t\t361364099 bp\r\n",
      "GC %:\t\t\t\t25.57 %\r\n",
      "\r\n"
     ]
    }
   ],
   "source": [
    "!perl \"{COUNT_FASTA}\" -i 100000 \"{DATA_DIR}/pfu_genome1.0.fasta\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Annotated genes table\n",
    "\n",
    "Tab-separated with a header row. Some `description` values contain a carriage return followed by a literal `\\n`, so a single record can display across two lines in the preview below.\n",
    "\n",
    "`wc -l` counts newline characters, so its total includes the header row."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id\tgene_modelid\tcreated_at\tupdated_at\tsynonym1\tgene_name\tdescription\tproject_id\tmodel_completion\tmrna_evidence\r\n",
      "1389\tpfu_aug1.0_7187.1_09507.t1\t2012-01-17 02:39:15\t2012-01-17 02:39:15\t\telongation factor 1 alpha-like\temb|X55324.1| \t20\tNULL\tNULL\r\n",
      "1395\tpfu_aug1.0_229418.1_28682\t2012-01-17 04:26:35\t2012-01-18 07:22:28\t\tKRMP-1\treciprocal BLASTP hit to NCBI Accession AB507416\t20\tfull\tNULL\r\n",
      "1396\tpfu_aug1.0_89285.1_56763.t1\t2012-01-17 04:32:16\t2012-01-18 04:45:52\t\tMSI31\tdbj|D86074.1|Pinctada fucata mRNA for insoluble protein\r",
      "\\nencoding C-terminal 312 amino acid residues.\t20\tfull\tNULL\r\n",
      "1400\tpfu_aug1.0_328.1_36501\t2012-01-17 04:47:54\t2012-01-18 07:37:18\t\tribosomal protein S12\t\t20\tpartial\tNULL\r\n",
      "1402\tpfu_aug1.0_465.1_00315\t2012-01-17 04:48:21\t2012-01-19 02:07:34\t\tAspein\tBlast best hit to ddbj AB094512 \t20\tpartial\tNULL\r\n",
      "1403\tpfu_aug1.0_9833.1_68103\t2012-01-17 04:49:04\t2012-01-17 06:32:15\t\torexin receptor-like protein\torexin receptor-like protein-like [Saccoglossus kowalevskii] (2e-07)\r",
      "\\n7tm_1\t20\tpartial\tNULL\r\n",
      "1407\tpfu_aug1.0_59974.1_70902\t2012-01-17 04:52:12\t2012-01-20 09:59:16\t\tPFMG1 like\tgb|DQ104255.1| Pinctada fucata mantle gene 1 mRNA, complete cds\t20\tpartial\tNULL\r\n",
      "1409\tpfu_aug1.0_337281.1_07113\t2012-01-17 04:53:39\t2012-01-18 07:37:42\t\t60S ribosomal protein L23\tHigh homology to known gene\t20\tpartial\tNULL\r\n",
      "1410\tpfu_aug1.0_653.1_07612\t2012-01-17 04:53:42\t2012-01-18 05:06:20\t\tbombyxin C-1-like\tNCBI BLASTP best hit to NP_001119736\r",
      "\\nInsulin/IGF/Relaxin family\t20\tpartial\tNULL\r\n"
     ]
    }
   ],
   "source": [
    "!head \"{ANNOTATED_GENES}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 763 /Users/sr320/data-genomic/Oist-Pfuca/piful.annotated_genes.latest\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l \"{ANNOTATED_GENES}\""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}