{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# BLAST Tutorial\n", "\n", "In this tutorial, I'll match a class sequence with the Uniprot Swiss-Prot Database." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 1: Set Variables and Working Directory" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Establish a path to the BLAST directory so I don't have to use the absolute path later\n", "blastDirectory = \"/Applications/blast/ncbi-blast-2.2.18+/bin/\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "USAGE\r\n", " blastx [-h] [-help] [-import_search_strategy filename]\r\n", " [-export_search_strategy filename] [-db database_name]\r\n", " [-dbsize num_letters] [-gilist filename] [-negative_gilist filename]\r\n", " [-mask_subjects filtering_algorithms] [-subject subject_input_file]\r\n", " [-subject_loc range] [-query input_file] [-out output_file]\r\n", " [-evalue evalue] [-word_size int_value] [-gapopen open_penalty]\r\n", " [-gapextend extend_penalty] [-xdrop_ungap float_value]\r\n", " [-xdrop_gap float_value] [-xdrop_gap_final float_value]\r\n", " [-searchsp int_value] [-frame_shift_penalty frameshift]\r\n", " [-max_intron_length length] [-seg SEG_options]\r\n", " [-soft_masking soft_masking] [-matrix matrix_name]\r\n", " [-threshold float_value] [-culling_limit int_value]\r\n", " [-window_size int_value] [-ungapped] [-lcase_masking] [-query_loc range]\r\n", " [-parse_deflines] [-strand strand] [-query_gencode int_value]\r\n", " [-outfmt format] [-show_gis] [-num_descriptions int_value]\r\n", " [-num_alignments int_value] [-html] [-max_target_seqs num_sequences]\r\n", " [-num_threads int_value] [-remote] [-version]\r\n", "\r\n", "DESCRIPTION\r\n", " Translated Query-Protein Subject BLAST 2.2.18+\r\n", "\r\n", "OPTIONAL ARGUMENTS\r\n", " -h\r\n", " Print USAGE and DESCRIPTION; ignore other arguments\r\n", " -help\r\n", " Print USAGE, DESCRIPTION and ARGUMENTS description; ignore other arguments\r\n", " -strand \r\n", " Query strand(s) to search against database/subject\r\n", " Default = `both'\r\n", " -version\r\n", " Print version number; ignore other arguments\r\n", "\r\n", " *** Input query options\r\n", " -query \r\n", " Input file name\r\n", " Default = `-'\r\n", " -query_loc \r\n", " Location on the query sequence (Format: start-stop)\r\n", " -query_gencode \r\n", " Genetic code to use to translate query\r\n", " Default = `1'\r\n", "\r\n", " *** General search options\r\n", " -db \r\n", " BLAST database name\r\n", " * Incompatible with: subject, subject_loc\r\n", " -out \r\n", " Output file name\r\n", " Default = `-'\r\n", " -evalue \r\n", " Expectation value (E) threshold for saving hits \r\n", " Default = `10'\r\n", " -word_size =2>\r", "\r\n", " Word size for wordfinder algorithm\r\n", " -gapopen \r\n", " Cost to open a gap\r\n", " -gapextend \r\n", " Cost to extend a gap\r\n", " -frame_shift_penalty =1>\r\n", " Frame shift penalty (for use with out-of-frame gapped alignment in blastx\r\n", " or tblastn, default ignored)\r\n", " -max_intron_length \r\n", " Length of the largest intron allowed in a translated nucleotide sequence\r\n", " when linking multiple distinct alignments (a negative value disables\r\n", " linking)\r\n", " Default = `0'\r\n", " -matrix \r\n", " Scoring matrix name\r\n", " Default = `BLOSUM62'\r\n", " -threshold =0>\r\n", " Minimum word score such that the word is added to the BLAST lookup table\r\n", "\r\n", " *** BLAST-2-Sequences options\r\n", " -subject \r\n", " Subject sequence(s) to search\r\n", " * Incompatible with: db, gilist, negative_gilist, mask_subjects\r\n", " -subject_loc \r\n", " Location on the subject sequence (Format: start-stop)\r\n", " * Incompatible with: db, gilist, negative_gilist, mask_subjects, remote\r\n", "\r\n", " *** Formatting options\r\n", " -outfmt \r\n", " alignment view options:\r\n", " 0 = pairwise,\r\n", " 1 = query-anchored showing identities,\r\n", " 2 = query-anchored no identities,\r\n", " 3 = flat query-anchored, show identities,\r\n", " 4 = flat query-anchored, no identities,\r\n", " 5 = XML Blast output,\r\n", " 6 = tabular,\r\n", " 7 = tabular with comment lines,\r\n", " 8 = Text ASN.1,\r\n", " 9 = Binary ASN.1\r\n", " 10 = Comma-separated values\r\n", " \r\n", " Options 6, 7, and 10 can be additionally configured to produce\r\n", " a custom format specified by space delimited format specifiers.\r\n", " The supported format specifiers are:\r\n", " \t qseqid means Query Seq-id\r\n", " \t qgi means Query GI\r\n", " \t qacc means Query accesion\r\n", " \t sseqid means Subject Seq-id\r\n", " \t sallseqid means All subject Seq-id(s), separated by a ';'\r\n", " \t sgi means Subject GI\r\n", " \t sallgi means All subject GIs\r\n", " \t sacc means Subject accession\r\n", " \t sallacc means All subject accessions\r\n", " \t qstart means Start of alignment in query\r\n", " \t qend means End of alignment in query\r\n", " \t sstart means Start of alignment in subject\r\n", " \t send means End of alignment in subject\r\n", " \t qseq means Aligned part of query sequence\r\n", " \t sseq means Aligned part of subject sequence\r\n", " \t evalue means Expect value\r\n", " \t bitscore means Bit score\r\n", " \t score means Raw score\r\n", " \t length means Alignment length\r\n", " \t pident means Percentage of identical matches\r\n", " \t nident means Number of identical matches\r\n", " \t mismatch means Number of mismatches\r\n", " \t positive means Number of positive-scoring matches\r\n", " \t gapopen means Number of gap openings\r\n", " \t gaps means Total number of gaps\r\n", " \t ppos means Percentage of positive-scoring matches\r\n", " \t frames means Query and subject frames separated by a '/'\r\n", " \t qframe means Query frame\r\n", " \t sframe means Subject frame\r\n", " When not provided, the default value is:\r\n", " 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send\r\n", " evalue bitscore', which is equivalent to the keyword 'std'\r\n", " Default = `0'\r\n", " -show_gis\r\n", " Show NCBI GIs in deflines?\r\n", " -num_descriptions =0>\r\n", " Number of database sequences to show one-line descriptions for\r\n", " Default = `500'\r\n", " -num_alignments =0>\r\n", " Number of database sequences to show alignments for\r\n", " Default = `250'\r\n", " -html\r\n", " Produce HTML output?\r\n", "\r\n", " *** Query filtering options\r\n", " -seg \r\n", " Filter query sequence with SEG (Format: 'yes', 'window locut hicut', or\r\n", " 'no' to disable)\r\n", " Default = `12 2.2 2.5'\r\n", " -soft_masking \r\n", " Apply filtering locations as soft masks\r\n", " Default = `false'\r\n", " -lcase_masking\r\n", " Use lower case filtering in query sequence(s)?\r\n", "\r\n", " *** Restrict search or results\r\n", " -gilist \r\n", " Restrict search of database to list of GI's\r\n", " * Incompatible with: negative_gilist, remote, subject, subject_loc\r\n", " -negative_gilist \r\n", " Restrict search of database to everything except the listed GIs\r\n", " * Incompatible with: gilist, remote, subject, subject_loc\r\n", " -mask_subjects \r\n", " Filtering algorithms IDs to apply from the database specified as soft\r\n", " masking for subjects (Format: N,M,...)\r\n", " * Incompatible with: subject, subject_loc\r\n", " -culling_limit =0>\r\n", " If the query range of a hit is enveloped by that of at least this many\r\n", " higher-scoring hits, delete the hit\r\n", " Default = `0'\r\n", " -max_target_seqs =1>\r\n", " Maximum number of aligned sequences to keep\r\n", "\r\n", " *** Statistical options\r\n", " -dbsize \r\n", " Effective length of the database \r\n", " -searchsp =0>\r\n", " Effective length of the search space\r\n", "\r\n", " *** Search strategy options\r\n", " -import_search_strategy \r\n", " Search strategy to use\r\n", " * Incompatible with: export_search_strategy\r\n", " -export_search_strategy \r\n", " File name to record the search strategy used\r\n", " * Incompatible with: import_search_strategy\r\n", "\r\n", " *** Extension options\r\n", " -xdrop_ungap \r\n", " X-dropoff value (in bits) for ungapped extensions\r\n", " -xdrop_gap \r\n", " X-dropoff value (in bits) for preliminary gapped extensions\r\n", " -xdrop_gap_final \r\n", " X-dropoff value (in bits) for final gapped alignment\r\n", " -window_size =0>\r\n", " Multiple hits window size, use 0 to specify 1-hit algorithm\r\n", " -ungapped\r\n", " Perform ungapped alignment only?\r\n", "\r\n", " *** Miscellaneous options\r\n", " -parse_deflines\r\n", " Should the query and subject defline(s) be parsed?\r\n", " -num_threads =1>\r\n", " Number of threads to use in the BLAST search\r\n", " Default = `1'\r\n", " * Incompatible with: remote\r\n", " -remote\r\n", " Execute search remotely?\r\n", " * Incompatible with: gilist, negative_gilist, subject_loc, num_threads\r\n", "\r\n" ] } ], "source": [ "#Confirm it works\n", "!{blastDirectory}blastx -help" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "'/Users/yaamini/Documents/yaamini-virginica/tutorials/2018-10-09-BLAST-Tutorial'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pwd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 2: Create a `blast` Database" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For my database, I'll use Uniprot Swiss-Prot." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 2a: Download Database" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 84.1M 100 84.1M 0 0 279k 0 0:05:08 0:05:08 --:--:-- 308k\n" ] } ], "source": [ "! curl \\\n", "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz \\\n", "> uniprot_sprot.fasta.gz" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Unzip the database, but keep the original file\n", "! gunzip -k uniprot_sprot.fasta" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2018-10-09-Blast-Tutorial.ipynb uniprot_sprot_20181010.pin\r\n", "uniprot_sprot.fasta uniprot_sprot_20181010.psd\r\n", "uniprot_sprot.fasta.gz uniprot_sprot_20181010.psi\r\n", "uniprot_sprot_20181010.phr uniprot_sprot_20181010.psq\r\n" ] } ], "source": [ "! ls -F" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 2b: Create the Database" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Building a new DB, current time: 10/10/2018 16:43:08\n", "New DB name: uniprot_sprot_20181010\n", "New DB title: uniprot_sprot.fasta\n", "Sequence type: Protein\n", "Keep Linkouts: T\n", "Keep MBits: T\n", "Maximum file size: 1073741824B\n", "Adding sequences from FASTA; added 558590 sequences in 33.1315 seconds.\n" ] } ], "source": [ "! {blastDirectory}makeblastdb \\\n", "-in uniprot_sprot.fasta \\\n", "-dbtype prot \\\n", "-out uniprot_sprot_20181010" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2018-10-09-Blast-Tutorial.ipynb uniprot_sprot_20181010.phr\r\n", "Ab_4-uniprot_blastx.tab uniprot_sprot_20181010.pin\r\n", "Ab_4denovo_CLC6_a.fa uniprot_sprot_20181010.psd\r\n", "uniprot_sprot.fasta uniprot_sprot_20181010.psi\r\n", "uniprot_sprot.fasta.gz uniprot_sprot_20181010.psq\r\n" ] } ], "source": [ "! ls -F" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 3: Obtain the Query Sequence" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 1982k 100 1982k 0 0 17.3M 0 --:--:-- --:--:-- --:--:-- 18.0M\n" ] } ], "source": [ "!curl http://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \\\n", "> Ab_4denovo_CLC6_a.fa" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1\r\n", "ACACCCCACCCCAACGCACCCTCACCCCCACCCCAACAATCCATGATTGAATACTTCATC\r\n", "TATCCAAGACAAACTCCTCCTACAATCCATGATAGAATTCCTCCAAAAATAATTTCACAC\r\n", "TGAAACTCCGGTATCCGAGTTATTTTGTTCCCAGTAAAATGGCATCAACAAAAGTAGGTC\r\n", "TGGATTAACGAACCAATGTTGCTGCGTAATATCCCATTGACATATCTTGTCGATTCCTAC\r\n", "CAGGATCCGGACTGACGAGATTTCACTGTACGTTTATGCAAGTCATTTCCATATATAAAA\r\n", "TTGGATCTTATTTGCACAGTTAAATGTCTCTATGCTTATTTATAAATCAATGCCCGTAAG\r\n", "CTCCTAATATTTCTCTTTTCGTCCGACGAGCAAACAGTGAGTTTACTGTGGCCTTCAGCA\r\n", "AAAGTATTGATGTTGTAAATCTCAGTTGTGATTGAACAATTTGCCTCACTAGAAGTAGCC\r\n", "TTC\r\n" ] } ], "source": [ "#Confirm download\n", "! head Ab_4denovo_CLC6_a.fa" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5490\r\n" ] } ], "source": [ "#Count number of sequences by counting \">\"\n", "! grep -c \">\" Ab_4denovo_CLC6_a.fa" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 4: Run `blastx`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To run `blastx` I need the following:\n", " \n", "1. Path to `blastx`\n", "2. Path to query\n", "3. Path to database\n", "4. Output file name\n", "5. Maximum e-value to use for hits\n", "6. Number of CPUs to use\n", "7. Only produce 1 hit per query (may not be the best hit, but it is the first hit the algorithm finds...this is different than the web interface that finds the best hit)\n", "8. Output format 6 specifies tabular format" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!{blastDirectory}blastx \\\n", "-query Ab_4denovo_CLC6_a.fa \\\n", "-db uniprot_sprot_20181010 \\\n", "-out Ab_4-uniprot_blastx.tab \\\n", "-evalue 1E-20 \\\n", "-num_threads 4 \\\n", "-max_target_seqs 1 \\\n", "-outfmt 6" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3\tsp|O42248|GBLP_DANRE\t82.46\t171\t30\t0\t1\t513\t35\t205\t7e-84\t 309\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5\tsp|Q08013|SSRG_RAT\t75.38\t65\t16\t0\t3\t197\t121\t185\t4e-23\t 106\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_6\tsp|P12234|MPCP_BOVIN\t76.62\t77\t18\t0\t2\t232\t286\t362\t9e-21\t99.4\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_9\tsp|Q41629|ADT1_WHEAT\t82.26\t62\t11\t0\t3\t188\t170\t231\t9e-24\t 108\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_13\tsp|Q3B7H1|GALD1_DANRE\t54.22\t83\t38\t0\t22\t270\t131\t213\t4e-22\t 103\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_23\tsp|Q9GNE2|RL23_AEDAE\t97.22\t72\t2\t0\t67\t282\t14\t85\t2e-33\t 140\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_31\tsp|B3EWZ9|HEPHL_ACRMI\t56.59\t129\t53\t1\t2\t379\t26\t154\t4e-39\t 159\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_31\tsp|B3EWZ9|HEPHL_ACRMI\t44.72\t123\t64\t1\t8\t364\t380\t502\t7e-26\t 115\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_31\tsp|B3EWZ9|HEPHL_ACRMI\t44.53\t128\t65\t3\t11\t376\t732\t859\t2e-24\t 111\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_32\tsp|Q641Y2|NDUS2_RAT\t88.03\t117\t14\t0\t2\t352\t334\t450\t5e-57\t 219\r\n" ] } ], "source": [ "!head Ab_4-uniprot_blastx.tab" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 659 Ab_4-uniprot_blastx.tab\r\n" ] } ], "source": [ "#Count number of lines in output\n", "! wc -l Ab_4-uniprot_blastx.tab" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }