{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Determining how complete the transcriptome is..." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 307964 ../data-results/Geoduck-transcriptome-v3.fa\r\n" ] } ], "source": [ "#fasta with non-euk removed\n", "!wc -l ../data-results/Geoduck-transcriptome-v3.fa" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compare with C. gigas" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 11.6M 100 11.6M 0 0 1317k 0 0:00:09 0:00:09 --:--:-- 2284k\n" ] } ], "source": [ "!curl ftp://ftp.ensemblgenomes.org/pub/metazoa/release-29/fasta/crassostrea_gigas/cdna/Crassostrea_gigas.GCA_000297895.1.29.cdna.all.fa.gz \\\n", "> analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna.all.fa.gz" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!gunzip analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna.all.fa.gz" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "27108\r\n" ] } ], "source": [ "!fgrep -c \">\" analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna.all.fa" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Building a new DB, current time: 11/17/2015 12:48:34\n", "New DB name: analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna\n", "New DB title: analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna.all.fa\n", "Sequence type: Nucleotide\n", "Keep Linkouts: T\n", "Keep MBits: T\n", "Maximum file size: 
1000000000B\n", "Adding sequences from FASTA; added 27108 sequences in 2.71984 seconds.\n" ] } ], "source": [ "# Build a nucleotide BLAST database from the C. gigas cDNA FASTA\n", "!makeblastdb \\\n", "-dbtype nucl \\\n", "-in analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna.all.fa \\\n", "-out analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Geoduck transcriptome vs C. gigas cDNA, E-value cutoff 1e-30.\n", "# -max_target_seqs 1 with -max_hsps 1 writes at most one tabular line per query,\n", "# so wc -l on the output is the number of query sequences with a hit.\n", "!blastn \\\n", "-task blastn \\\n", "-query ../data-results/Geoduck-transcriptome-v3.fa \\\n", "-db analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna \\\n", "-num_threads 4 \\\n", "-evalue 1e-30 \\\n", "-max_target_seqs 1 \\\n", "-max_hsps 1 \\\n", "-outfmt 6 \\\n", "-out analyses/Geoduck-v3_blastn_Cgigas_cdna.out\n", "!wc -l analyses/Geoduck-v3_blastn_Cgigas_cdna.out" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 8395 analyses/Geoduck-v3_blastn_Cgigas_cdna_e10.out\r\n" ] } ], "source": [ "# Same search with a relaxed E-value cutoff (1e-10), followed by the hit count\n", "!blastn \\\n", "-task blastn \\\n", "-query ../data-results/Geoduck-transcriptome-v3.fa \\\n", "-db analyses/Crassostrea_gigas.GCA_000297895.1.29.cdna \\\n", "-num_threads 8 \\\n", "-evalue 1e-10 \\\n", "-max_target_seqs 1 \\\n", "-max_hsps 1 \\\n", "-outfmt 6 \\\n", "-out analyses/Geoduck-v3_blastn_Cgigas_cdna_e10.out \n", "!wc -l analyses/Geoduck-v3_blastn_Cgigas_cdna_e10.out" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compare with RuphiBase" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "32606\r\n" ] } ], "source": [ "# Count the lines of the downloaded query.txt that contain rud\n", "# NOTE: absolute local path, not portable to another machine\n", "!grep -c \"rud\" /Users/sr320/Desktop/query.txt" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"ruditapes2_lrc7040\tATTCAAATCTCTAACACTGATTCATACATGTAATAACTTGGCATACTATACATTATCAACATGTACTGTTACTTTCCTGTAATTGTTCAAAATATCTCTGGAATATTTTACACTTTATCTGTGGTTTTTTACAGTTTTTTTTTAATTGAAATAGTGATAACTTTGATTGAACATTCTTTTATGTTTTAGCATCAAGATCTTCAAACTTGTAATACACACAATATCAATAACAAAATGTGACAGTTTTATTTTCATTCATCATACACATCTTCCTTATCACATACATACTGACATAGATTCTGGTGTCATAAGACGGTCTGCATCTTGGTCAGGTATTTCAAATCTAAATTCATCTTCCATTGCCATGATAACTTCTACAACATCTAAACTGTCCAATCCTAAATCATTCATAAAGTGTGAAGTCAATGACAGCTTTTCGGGATCAACTTTATCATAAAGTTGCAAAACGAGAATGACTCTTTCTTTAACATGAGATATTGTGAGAGCTGGCTTCTGACCATAATATCGAGGGTTTTGAATTGCAGTTTTCTGTGGTGCTAATAATTTTGTTGAATTTCTGCAATTGTGAATCGTTCTGTCTTCTTGTCTGCAATGGAGTAGTGAGACAGAAGATACACGTGGAAAGACGNTTCTAGCAACAGTTTGGAGGCAATTTGTTGCCCGGACCGTCTTCAGCAAGGTGG\r", "\r\n" ] } ], "source": [ "#Results from webform download\n", "!head -1 /Users/sr320/Desktop/query.txt" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!awk '{print \">\"$1\"\\n\"$2}' /Users/sr320/Desktop/query.txt \\\n", "> analyses/RuphiBase_32606.fa" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">ruditapes2_lrc7040\r\n", "ATTCAAATCTCTAACACTGATTCATACATGTAATAACTTGGCATACTATACATTATCAACATGTACTGTTACTTTCCTGTAATTGTTCAAAATATCTCTGGAATATTTTACACTTTATCTGTGGTTTTTTACAGTTTTTTTTTAATTGAAATAGTGATAACTTTGATTGAACATTCTTTTATGTTTTAGCATCAAGATCTTCAAACTTGTAATACACACAATATCAATAACAAAATGTGACAGTTTTATTTTCATTCATCATACACATCTTCCTTATCACATACATACTGACATAGATTCTGGTGTCATAAGACGGTCTGCATCTTGGTCAGGTATTTCAAATCTAAATTCATCTTCCATTGCCATGATAACTTCTACAACATCTAAACTGTCCAATCCTAAATCATTCATAAAGTGTGAAGTCAATGACAGCTTTTCGGGATCAACTTTATCATAAAGTTGCAAAACGAGAATGACTCTTTCTTTAACATGAGATATTGTGAGAGCTGGCTTCTGACCATAATATCGAGGGTTTTGAATTGCAGTTTTCTGTGGTGCTAATAATTTTGTTGAATTTCTGCAATTGTGAATCGTTCTGTCTTCTTGTCTGCAATGGAGTAGTGAGACAGAAGATACACGTGGAAAGACGNTTCTAGCAACAGTTTGGAGGCAATTTGTTGCCCGGACCGTCTTCAGCAAGGTGG\r", "\r\n", ">ruditapes2_c3688\r\n", 
"ATGTATTGTCGCATAAGAATAAATTTTCGGATAGTTAACAAGTGGAATATAGCAGACGTATTTGAACCGTGGGCTTGTTTCAAGTATCATGGCGAATGACTTCCTTATCTCAAAGTGGAAGCGATGGTTCAGTTTATTTGATGGCAATAATGATGGCAAAGTTTGCTTTGCAGACATGGAGAAAGCCAGAGATAAATTTACAAAGCAACATGAGTTGAAAGATGACGAAAAGAAGGAAATAAGTGAGAAATTCACGAAATGGTGGTGCGAATATGTGAACTGGGGAAAAGAAGAATTGACGGTGGACGAGTTTGCAGAATACCAAAATGCTGCATTTAAAGCCGACAAAGAAAAGTTTGTTGAGCGAATTAAAAAGTGTCAAGCTGAGATTGGTGACTTTGTAGACGTCGCGCACGATGGTTTGTTTCCGAAAGGAAGCTTTGGCAATCCTCAAGGCGGAGGGGCACGAGGACGAAGAGAA\r", "\r\n", ">ruditapes2_c1400\r\n", "GTCACTAGTGACGGAAATGGTTATGGGTATGTTGCCAGCGGAACAAGCTCGCCCTCTCCGAGTCTCTCAAGCCCACAGTACCCGGTCAACATTTCGTACTATGCAGACCCAAACTATATGAAAGAGGTGTCAGCAAGTCCATTACATGTGGCTGTAGGGGCACAAGTATACGTCAAGGTCTTTACAGAGACATACGACTGGAACGCTAGAATGAGAGTCGTTTCCTGTTACACTAAACCAACCTACAATGCGCCTGATCATACAAAATATTACCTTCTCGAGAACGGGTGTGAAGTTGACACGAACACGCATATTATTTCACAATCCTCCCACGAGACTAAATTTGTCTTCGACAATTTTGAGTACACAAGCAACCACGAGGGCATCTACATGTTTTGCGACACCATCTTTTGCAACACGCATGATTACTCGGCAAAATGTAGGCAGTCTTGCAACCCGGTAATTCGGAGGAAAATTTCTGTGGAAAATGTACCGTCTGATGATAAGGAAGCTGATAAGTCATCGGATGTTACTAATCAGGATTCTTAAAGAAAGCAGACGACTCTAGATGACCAAAAATGATTGAAATAGAAACATAATTGTTTTCTTTATTTGATTTATCAACTGGAAAATTTTGATTTTCTTGAGACTACTTGACAGTTACAATGAAAATTTATGGCATTTATCAGATTTAGTTCTTCTTCTGCACAGAATTTTTAGTATGGTTTATTAAAATTACACTTGTAT\r", "\r\n", ">ruditapes2_c3682\r\n", 
"CTTTATTTACACATTACATTTAAACTTTTAAAAATGAAATCAGTTAAACCAGATAAAGTACATATATACTAAAGTCACTCCACTGTTCTCTATAAACAATAAGTATCATTCATTACACATTTGCACAAGGAAGATAAAATCTTAATTGCTTCAGTAATTAAACAATTAGTCTCCAACCTATTACTACTACACAACTTTTGATGACACAATATTTTTCATCACAGAATTGTTTCCATTTCTCATTTAGCCTCGCCCTCTTGTTTACCTCGGAGATGAAATGGTACATATTCTGGTTTTCCTTTAAGCGCTCGTCTTTTCTTCATTTCATCTTTGAAATCTTCGTGTTCATCTCTAAATAAACCATGATCTCGAAGTCCATCCATAAGTTGTTTTAGCTCTACGTGACGAGGGTAGTAATCTACATTATAATAAGGATCCATATGTGGCGGTTTGCCAGAAAATATTCTTAGAACTTTTGTAGATTTCACATCTGTCTCTCTGGATAGCTCTCCAAATATTTTGGCAGAGAGTTTCGCCATACGCCGTGCATAATTTGATGATGACATCATTCACTGCTTTTGTTCCTTTCAAACATTTGGATGTTGTTAACGACTAAATTTATGATCTAAGATCACTTGGCAGAGGTGCCATTGGGGGGCAATACACAGTTTTCAAGGCTTCTCGGCAAACAACTGACTGCGCCACAATCAAAACCTCACACACAAGTCTGTAATCACACACTTTTTTCACTATTTTAAGTCTTCTGAATACCTTAAATGAAGAGCTCTCTAAAGT\r", "\r\n", ">ruditapes2_c4432\r\n", "CCATTGTGGAGCCGACAGATTCCATACACAATGATGAAGTTTGCCTGCTATAAACTGCTAAGTAGAAGCTCTGTACAAGTATGTTGTACCAAAACCAAGGTCAGAGTGCACAAAAGGAGAGCAGTTGGTTGTTACATTTATGGCTGGTTACATTGCTGGAGTATTCCGTGCTATTGTGTCCCATCCTGCTGATACAATCGTGTCCAAGCTGAACACAGATGCTGGAAGTACTGCCGTGGAAGCTGCAAAGGCTCTTGGATTCAAGGGATTGTGGAAGGGTCTGTTCCCCAGGATTATCATGATTGGTACACTGACCGCCCTGCAGTGGTTTATCTACGACTCTGTGAAGGTGTTCTTCCGTCTACCACGTCCACCACCGCCAGAGATGCCCGCCAGTCTCAAGAAAAAGCTTGGCATCACTGAATAATGACATTCTAACATAGGAAATAATAAATTGACTATACTTTACTTGTGATTTGACAGATTTTTATACAAAACATTGACGCATAAAGTTTACAAAAAGAATAAAAACGTATCAAAATGTTTGAAAAGAGATTTATTTAATGAAGCACTGTTACTGAACAAAGAT\r", "\r\n" ] } ], "source": [ "!head analyses/RuphiBase_32606.fa" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Building a new DB, current time: 11/17/2015 13:00:03\n", "New DB name: analyses/RuphiBase_32606\n", "New DB title: analyses/RuphiBase_32606.fa\n", "Sequence type: Nucleotide\n", "Keep Linkouts: T\n", "Keep MBits: T\n", "Maximum file size: 1000000000B\n", "Adding sequences from FASTA; added 32606 sequences in 0.991654 seconds.\n" ] } ], 
"source": [ "!makeblastdb \\\n", "-dbtype nucl \\\n", "-in analyses/RuphiBase_32606.fa \\\n", "-out analyses/RuphiBase_32606" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!blastn \\\n", "-task blastn \\\n", "-query ../data-results/Geoduck-transcriptome-v3.fa \\\n", "-db analyses/RuphiBase_32606 \\\n", "-num_threads 4 \\\n", "-evalue 1e-30 \\\n", "-max_target_seqs 1 \\\n", "-max_hsps 1 \\\n", "-outfmt 6 \\\n", "-out analyses/Geoduck-v3_blastn_RuphiBase.out" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1410 analyses/Geoduck-v3_blastn_RuphiBase.out\r\n" ] } ], "source": [ "!wc -l analyses/Geoduck-v3_blastn_RuphiBase.out" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!blastn \\\n", "-task blastn \\\n", "-query ../data-results/Geoduck-transcriptome-v3.fa \\\n", "-db analyses/RuphiBase_32606 \\\n", "-num_threads 4 \\\n", "-evalue 1e-60 \\\n", "-max_target_seqs 1 \\\n", "-max_hsps 1 \\\n", "-outfmt 6 \\\n", "-out analyses/Geoduck-v3_blastn_RuphiBase_e60.out" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2859 analyses/Geoduck-v3_blastn_RuphiBase_e10.out\r\n" ] } ], "source": [ "!blastn \\\n", "-task blastn \\\n", "-query ../data-results/Geoduck-transcriptome-v3.fa \\\n", "-db analyses/RuphiBase_32606 \\\n", "-num_threads 8 \\\n", "-evalue 1e-10 \\\n", "-max_target_seqs 1 \\\n", "-max_hsps 1 \\\n", "-outfmt 6 \\\n", "-out analyses/Geoduck-v3_blastn_RuphiBase_e10.out\n", "!wc -l analyses/Geoduck-v3_blastn_RuphiBase_e10.out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", 
"name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }