{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# `blastx` and Uniprot File Merging\n", "\n", "In this notebook, we will merge `blastx` output for *Zostera marina* and *Labyrinthula zosterae* with information from the [Uniprot-SwissProt database](https://www.uniprot.org/uniprot/?query=reviewed:yes)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 0. Set working directory" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/yaaminivenkataraman/Documents/project-EWD-transcriptomics/scripts\r\n" ] } ], "source": [ "!pwd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/yaaminivenkataraman/Documents/project-EWD-transcriptomics/data\n" ] } ], "source": [ "cd ../data/" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m2019-07-09-Merging-Script-Troubleshooting\u001b[m\u001b[m/\r\n", "README.md\r\n", "Zostera-blast-sep.tab\r\n", "Zostera-blast-sort.tab\r\n", "\u001b[31mZostera_SwissProt_e5_output\u001b[m\u001b[m*\r\n", "\u001b[31mZostera_contigs.fasta\u001b[m\u001b[m*\r\n", "nonZostera-blast-sep.tab\r\n", "nonZostera-blast-sort.tab\r\n", "\u001b[31mnonZostera_SwissProt_e5_outputBOX.txt\u001b[m\u001b[m*\r\n", "\u001b[31mnonZostera_contigs.fasta\u001b[m\u001b[m*\r\n", "uniprot-SP-GO-sorted.tab\r\n", "uniprot-reviewed_yes.tab\r\n" ] } ], "source": [ "ls -F" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 1. Format `blastx` output" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 1a. *Z. 
marina*" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN31278_c0_g1_i1\tsp|Q54J75|RPB2_DICDI\t64.0\t111\t40\t0\t1\t333\t950\t1060\t6.9e-40\t164.5\r\n", "TRINITY_DN31239_c0_g1_i1\tsp|Q9SIT6|AB5G_ARATH\t29.6\t125\t88\t0\t58\t432\t440\t564\t1.4e-12\t74.3\r\n" ] } ], "source": [ "!head -n2 Zostera_SwissProt_e5_output" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Convert pipe delimiters to tab delimiters using tr (tr means translate)\n", "!tr '|' '\\t' < Zostera_SwissProt_e5_output \\\n", "> Zostera-blast-sep.tab" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN31278_c0_g1_i1\tsp\tQ54J75\tRPB2_DICDI\t64.0\t111\t40\t0\t1\t333\t950\t1060\t6.9e-40\t164.5\r\n", "TRINITY_DN31239_c0_g1_i1\tsp\tQ9SIT6\tAB5G_ARATH\t29.6\t125\t88\t0\t58\t432\t440\t564\t1.4e-12\t74.3\r\n" ] } ], "source": [ "!head -n2 Zostera-blast-sep.tab" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Reduce the number of columns using awk. 
Sort, and save as a new file.\n", "!awk -v OFS='\\t' '{print $3, $1, $13}' < Zostera-blast-sep.tab | sort \\\n", "> Zostera-blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A024B7I0\tTRINITY_DN278019_c0_g1_i1\t6.4e-133\r\n", "A0A067XMP1\tTRINITY_DN166310_c0_g1_i1\t2.0e-17\r\n", "A0A067XMP1\tTRINITY_DN17396_c0_g1_i1\t6.1e-17\r\n", "A0A067XMP1\tTRINITY_DN309320_c0_g7_i1\t3.2e-07\r\n", "A0A068FIK2\tTRINITY_DN241620_c0_g1_i1\t1.6e-37\r\n", "A0A068FIK2\tTRINITY_DN285385_c0_g3_i3\t2.4e-29\r\n", "A0A068FIK2\tTRINITY_DN308379_c0_g1_i1\t3.7e-207\r\n", "A0A068FIK2\tTRINITY_DN308379_c0_g1_i4\t0.0e+00\r\n", "A0A068FIK2\tTRINITY_DN308379_c0_g1_i5\t4.5e-226\r\n", "A0A068FIK2\tTRINITY_DN308379_c0_g1_i5\t7.7e-133\r\n" ] } ], "source": [ "!head Zostera-blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 270061 810183 11135448 Zostera-blast-sort.tab\n", "Zostera transcripts\n" ] } ], "source": [ "!wc Zostera-blast-sort.tab\n", "!echo \"Zostera transcripts\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 1b. *L. zosterae*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will assume that everything that is not *Z. marina* will be *L. zosterae*." 
] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN31224_c0_g1_i1\tsp|Q54T06|Y8206_DICDI\t52.1\t96\t40\t1\t4\t273\t458\t553\t4.1e-25\t115.2\r\n", "TRINITY_DN31259_c0_g1_i1\tsp|P15374|UCHL3_HUMAN\t49.3\t73\t37\t0\t8\t226\t144\t216\t1.3e-13\t76.6\r\n" ] } ], "source": [ "!head -n2 nonZostera_SwissProt_e5_outputBOX.txt" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Convert pipe delimiters to tab delimiters using tr (tr means translate)\n", "!tr '|' '\\t' < nonZostera_SwissProt_e5_outputBOX.txt \\\n", "> nonZostera-blast-sep.tab" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN31224_c0_g1_i1\tsp\tQ54T06\tY8206_DICDI\t52.1\t96\t40\t1\t4\t273\t458\t553\t4.1e-25\t115.2\r\n", "TRINITY_DN31259_c0_g1_i1\tsp\tP15374\tUCHL3_HUMAN\t49.3\t73\t37\t0\t8\t226\t144\t216\t1.3e-13\t76.6\r\n" ] } ], "source": [ "!head -n2 nonZostera-blast-sep.tab" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Reduce the number of columns using awk. 
Sort, and save as a new file.\n", "!awk -v OFS='\\t' '{print $3, $1, $13}' < nonZostera-blast-sep.tab | sort \\\n", "> nonZostera-blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A024RXP8\tTRINITY_DN416168_c0_g1_i1\t7.8e-07\r\n", "A0A024SMV2\tTRINITY_DN174741_c0_g1_i1\t8.9e-10\r\n", "A0A024SMV2\tTRINITY_DN192522_c0_g1_i1\t3.6e-11\r\n", "A0A060X6Z0\tTRINITY_DN123691_c0_g1_i1\t3.5e-19\r\n", "A0A067XMP1\tTRINITY_DN166166_c0_g1_i1\t1.1e-10\r\n", "A0A067XMP1\tTRINITY_DN166166_c0_g1_i2\t2.0e-10\r\n", "A0A067XMP1\tTRINITY_DN245901_c0_g1_i3\t4.4e-06\r\n", "A0A067XMP1\tTRINITY_DN336889_c0_g1_i1\t3.6e-06\r\n", "A0A067XMP1\tTRINITY_DN341637_c0_g1_i1\t1.8e-12\r\n", "A0A067XMP1\tTRINITY_DN393221_c0_g1_i1\t4.0e-06\r\n" ] } ], "source": [ "!head nonZostera-blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 87550 nonZostera-blast-sort.tab\n", "nonZostera transcripts\n" ] } ], "source": [ "!wc -l nonZostera-blast-sort.tab\n", "!echo \"nonZostera transcripts\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 2. Format Uniprot-SwissProt database" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Uniprot annotation file was downloaded from [this link](https://www.uniprot.org/uniprot/?query=reviewed:yes) on 2019-07-10. 
The following information was included as separate columns:\n", "\n", "- Entry (Uniprot Accession code)\n", "- Protein Names\n", "- Gene ontology (biological process)\n", "- Gene ontology (cellular component)\n", "- Gene ontology (molecular function)\n", "- Gene ontology IDs\n", "- Status (Reviewed or not reviewed)\n", "- Organism" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Entry\tProtein names\tGene ontology (biological process)\tGene ontology (cellular component)\tGene ontology (molecular function)\tGene ontology IDs\tStatus\tOrganism\r\n", "Q0ATK2\tAcetyl-coenzyme A carboxylase carboxyl transferase subunit beta (ACCase subunit beta) (Acetyl-CoA carboxylase carboxyltransferase subunit beta) (EC 2.1.3.15)\tfatty acid biosynthetic process [GO:0006633]; malonyl-CoA biosynthetic process [GO:2001295]\tacetyl-CoA carboxylase complex [GO:0009317]\tacetyl-CoA carboxylase activity [GO:0003989]; ATP binding [GO:0005524]; carboxyl- or carbamoyltransferase activity [GO:0016743]\tGO:0003989; GO:0005524; GO:0006633; GO:0009317; GO:0016743; GO:2001295\treviewed\tMaricaulis maris (strain MCS10)\r\n" ] } ], "source": [ "!head -n2 uniprot-reviewed_yes.tab" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Sort file by the first tab-delimited column only (-t $'\\t' -k1,1), which is the Uniprot Entry (Uniprot Accession Code).\n", "#A bare -k 1 would use the whole line as the sort key instead of just the column that join matches on later.\n", "!sort -t $'\\t' -k1,1 uniprot-reviewed_yes.tab > uniprot-SP-GO-sorted.tab" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Count the number of tab-delimited columns for reference.\n", "#-F'\\t' is required: awk splits on any whitespace by default, so it counted words (25) instead of the 8 columns in this file.\n", "!awk -F'\\t' '{print NF; exit}' uniprot-SP-GO-sorted.tab" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"A0A023GPI8\tLectin alpha chain (CboL) [Cleaved into: Lectin beta chain; Lectin gamma chain]\t\t\tmannose binding [GO:0005537]; metal ion binding [GO:0046872]\tGO:0005537; GO:0046872\treviewed\tCanavalia boliviana\r\n", "A0A023GPJ0\tImmunity protein CdiI\t\t\t\t\treviewed\tEnterobacter cloacae subsp. cloacae (strain ATCC 13047 / DSM 30054 / NBRC 13535 / NCDC 279-56)\r\n" ] } ], "source": [ "!head -n2 uniprot-SP-GO-sorted.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 3. Join `blastx` output with Uniprot annotation file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 3a. *Z. marina*" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Join the first column in the first file with the first column in the second file\n", "#The files are tab delimited, and the output should also be tab delimited (-t $'\\t')\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "Zostera-blast-sort.tab \\\n", "uniprot-SP-GO-sorted.tab \\\n", "> Zostera-blast-annot.tab" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A024B7I0\tTRINITY_DN278019_c0_g1_i1\t6.4e-133\tSCARECROW-LIKE protein 7 (PeSCL7)\t\tnucleus [GO:0005634]\tDNA binding [GO:0003677]\tGO:0003677; GO:0005634\treviewed\tPopulus euphratica (Euphrates poplar)\r\n", "A0A067XMP1\tTRINITY_DN166310_c0_g1_i1\t2.0e-17\tOxidoreductase ptaL (EC 1.-.-.-) (Pestheic acid biosynthesis cluster protein L)\t\t\toxidoreductase activity [GO:0016491]\tGO:0016491\treviewed\tPestalotiopsis fici (strain W106-1 / CGMCC3.15140)\r\n" ] } ], "source": [ "!head -n2 Zostera-blast-annot.tab" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 270014 Zostera-blast-annot.tab\n", "annotated Zostera transcripts\n" ] } ], "source": [ "!wc -l 
Zostera-blast-annot.tab\n", "!echo \"annotated Zostera transcripts\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 3b. *L. zosterae*" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Join the first column in the first file with the first column in the second file. \n", "#The files are tab delimited, and the output should also be tab delimited (-t $'\\t')\n", "!join -1 1 -2 1 -t $'\\t' \\\n", "nonZostera-blast-sort.tab \\\n", "uniprot-SP-GO-sorted.tab \\\n", "> nonZostera-blast-annot.tab" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A024RXP8\tTRINITY_DN416168_c0_g1_i1\t7.8e-07\tExoglucanase 1 (EC 3.2.1.91) (1,4-beta-cellobiohydrolase) (Cellobiohydrolase 7A) (Cel7A) (Exocellobiohydrolase I) (CBHI) (Exoglucanase I)\tcellulose catabolic process [GO:0030245]\textracellular region [GO:0005576]\tcellulose 1,4-beta-cellobiosidase activity [GO:0016162]; cellulose binding [GO:0030248]\tGO:0005576; GO:0016162; GO:0030245; GO:0030248\treviewed\tHypocrea jecorina (strain ATCC 56765 / BCRC 32924 / NRRL 11460 / Rut C-30) (Trichoderma reesei)\r\n", "A0A024SMV2\tTRINITY_DN174741_c0_g1_i1\t8.9e-10\tD-xylose 1-dehydrogenase (NADP(+)) (XDH) (EC 1.1.1.179) (D-xylose-NADP dehydrogenase) (NADP(+)-dependent D-xylose dehydrogenase)\t\t\tD-xylose 1-dehydrogenase (NADP+) activity [GO:0047837]\tGO:0047837\treviewed\tHypocrea jecorina (strain ATCC 56765 / BCRC 32924 / NRRL 11460 / Rut C-30) (Trichoderma reesei)\r\n" ] } ], "source": [ "!head -n2 nonZostera-blast-annot.tab" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 87532 nonZostera-blast-annot.tab\n", "annotated nonZostera transcripts\n" ] } ], "source": [ "!wc -l nonZostera-blast-annot.tab\n", "!echo \"annotated 
nonZostera transcripts\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 4. Isolate gene IDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`blastx` was performed using isoform data. Currently, each line in the annotated file is denoted by an isoform ID (ex. TRINITY_DN416168_c0_g1_i1). The gene IDs are similar to the isoform IDs, in that they have contig and gene information, but no isoform information (ex. TRINITY_DN416168_c0_g1). Differential expression analyses will be conducted in `edgeR` at the gene level, so gene IDs are needed on annotation files." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 4a. *Z. marina*" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Isolate the isoform ID column (column 2) with cut\n", "#Remove the trailing isoform suffix (_i followed by one or more digits) with sed\n", "#Matching the suffix, rather than deleting a fixed three characters, also handles isoform numbers above 9 (ex. _i10)\n", "#The gene IDs are added to the annotated table as a new column with paste in a later cell\n", "\n", "!cut -f2 Zostera-blast-annot.tab \\\n", "| sed -E 's/_i[0-9]+$//' \\\n", "> Zostera-blast-annot-geneIDOnly.tab" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN278019_c0_g1\r\n", "TRINITY_DN166310_c0_g1\r\n", "TRINITY_DN17396_c0_g1\r\n", "TRINITY_DN309320_c0_g7\r\n", "TRINITY_DN241620_c0_g1\r\n", "TRINITY_DN285385_c0_g3\r\n", "TRINITY_DN308379_c0_g1\r\n", "TRINITY_DN308379_c0_g1\r\n", "TRINITY_DN308379_c0_g1\r\n", "TRINITY_DN308379_c0_g1\r\n" ] } ], "source": [ "!head Zostera-blast-annot-geneIDOnly.tab" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 270014 Zostera-blast-annot-geneIDOnly.tab\r\n" ] } ], "source": [ "#Line count matches line count of original file\n", "!wc -l 
Zostera-blast-annot-geneIDOnly.tab" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!paste Zostera-blast-annot-geneIDOnly.tab Zostera-blast-annot.tab \\\n", "> Zostera-blast-annot-withGeneID.tab" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN278019_c0_g1\tA0A024B7I0\tTRINITY_DN278019_c0_g1_i1\t6.4e-133\tSCARECROW-LIKE protein 7 (PeSCL7)\t\tnucleus [GO:0005634]\tDNA binding [GO:0003677]\tGO:0003677; GO:0005634\treviewed\tPopulus euphratica (Euphrates poplar)\r\n", "TRINITY_DN166310_c0_g1\tA0A067XMP1\tTRINITY_DN166310_c0_g1_i1\t2.0e-17\tOxidoreductase ptaL (EC 1.-.-.-) (Pestheic acid biosynthesis cluster protein L)\t\t\toxidoreductase activity [GO:0016491]\tGO:0016491\treviewed\tPestalotiopsis fici (strain W106-1 / CGMCC3.15140)\r\n" ] } ], "source": [ "!head -n2 Zostera-blast-annot-withGeneID.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 4b. *L. 
zosterae*" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Isolate the isoform ID column (column 2) with cut\n", "#Remove the trailing isoform suffix (_i followed by one or more digits) with sed\n", "#Matching the suffix, rather than deleting a fixed three characters, also handles isoform numbers above 9 (ex. _i10)\n", "#The gene IDs are added to the annotated table as a new column with paste in a later cell\n", "\n", "!cut -f2 nonZostera-blast-annot.tab \\\n", "| sed -E 's/_i[0-9]+$//' \\\n", "> nonZostera-blast-annot-geneIDOnly.tab" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN416168_c0_g1\r\n", "TRINITY_DN174741_c0_g1\r\n", "TRINITY_DN192522_c0_g1\r\n", "TRINITY_DN123691_c0_g1\r\n", "TRINITY_DN166166_c0_g1\r\n", "TRINITY_DN166166_c0_g1\r\n", "TRINITY_DN245901_c0_g1\r\n", "TRINITY_DN336889_c0_g1\r\n", "TRINITY_DN341637_c0_g1\r\n", "TRINITY_DN393221_c0_g1\r\n" ] } ], "source": [ "!head nonZostera-blast-annot-geneIDOnly.tab" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 87532 nonZostera-blast-annot-geneIDOnly.tab\r\n" ] } ], "source": [ "#Line count matches line count of original file\n", "!wc -l nonZostera-blast-annot-geneIDOnly.tab" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!paste nonZostera-blast-annot-geneIDOnly.tab nonZostera-blast-annot.tab \\\n", "> nonZostera-blast-annot-withGeneID.tab" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN416168_c0_g1\tA0A024RXP8\tTRINITY_DN416168_c0_g1_i1\t7.8e-07\tExoglucanase 1 (EC 3.2.1.91) (1,4-beta-cellobiohydrolase) (Cellobiohydrolase 7A) (Cel7A) (Exocellobiohydrolase I) (CBHI) (Exoglucanase I)\tcellulose catabolic process 
[GO:0030245]\textracellular region [GO:0005576]\tcellulose 1,4-beta-cellobiosidase activity [GO:0016162]; cellulose binding [GO:0030248]\tGO:0005576; GO:0016162; GO:0030245; GO:0030248\treviewed\tHypocrea jecorina (strain ATCC 56765 / BCRC 32924 / NRRL 11460 / Rut C-30) (Trichoderma reesei)\r\n", "TRINITY_DN174741_c0_g1\tA0A024SMV2\tTRINITY_DN174741_c0_g1_i1\t8.9e-10\tD-xylose 1-dehydrogenase (NADP(+)) (XDH) (EC 1.1.1.179) (D-xylose-NADP dehydrogenase) (NADP(+)-dependent D-xylose dehydrogenase)\t\t\tD-xylose 1-dehydrogenase (NADP+) activity [GO:0047837]\tGO:0047837\treviewed\tHypocrea jecorina (strain ATCC 56765 / BCRC 32924 / NRRL 11460 / Rut C-30) (Trichoderma reesei)\r\n" ] } ], "source": [ "!head -n2 nonZostera-blast-annot-withGeneID.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 5. Retain one line per gene ID" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since differential expression analysis will be conducted at the gene level, each gene should only have one associated Uniprot annotation. This is complicated by the fact that some genes have multiple isoforms. Each isoform should map to the same protein (otherwise it wouldn't be an isoform, but a different gene). We will trim the list such that we retain annotations from the first listed isoform for each gene." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 5a. *Z. marina*" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Sort file by the first column (--key = 1,1) and only retain unique IDs (--unique). 
Save output to a new file.\n", "!sort --unique --key=1,1 Zostera-blast-annot-withGeneID.tab \\\n", "> Zostera-blast-annot-withGeneID-noIsoforms.tab" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN100001_c0_g1\tQ54EW8\tTRINITY_DN100001_c0_g1_i1\t1.2e-19\tDihydrolipoyl dehydrogenase, mitochondrial (EC 1.8.1.4) (Dihydrolipoamide dehydrogenase) (Glycine cleavage system L protein)\tcell redox homeostasis [GO:0045454]; glycine catabolic process [GO:0006546]; isoleucine catabolic process [GO:0006550]; leucine catabolic process [GO:0006552]; L-serine biosynthetic process [GO:0006564]; valine catabolic process [GO:0006574]\textracellular matrix [GO:0031012]; mitochondrial matrix [GO:0005759]; mitochondrial pyruvate dehydrogenase complex [GO:0005967]; phagocytic vesicle [GO:0045335]\tdihydrolipoyl dehydrogenase activity [GO:0004148]; electron transfer activity [GO:0009055]; flavin adenine dinucleotide binding [GO:0050660]\tGO:0004148; GO:0005759; GO:0005967; GO:0006546; GO:0006550; GO:0006552; GO:0006564; GO:0006574; GO:0009055; GO:0031012; GO:0045335; GO:0045454; GO:0050660\treviewed\tDictyostelium discoideum (Slime mold)\r\n", "TRINITY_DN100015_c0_g1\tP16894\tTRINITY_DN100015_c0_g1_i1\t1.2e-21\tGuanine nucleotide-binding protein alpha-1 subunit (G alpha-1)\tadenylate cyclase-modulating G protein-coupled receptor signaling pathway [GO:0007188]; cell differentiation [GO:0030154]; chemotaxis to cAMP [GO:0043327]; G protein-coupled receptor signaling pathway [GO:0007186]; negative regulation of phospholipase C activity [GO:1900275]; signal transduction [GO:0007165]\theterotrimeric G-protein complex [GO:0005834]\tG-protein beta/gamma-subunit complex binding [GO:0031683]; G protein-coupled receptor binding [GO:0001664]; GTPase activity [GO:0003924]; GTP binding [GO:0005525]; metal ion binding [GO:0046872]\tGO:0001664; GO:0003924; GO:0005525; GO:0005834; 
GO:0007165; GO:0007186; GO:0007188; GO:0030154; GO:0031683; GO:0043327; GO:0046872; GO:1900275\treviewed\tDictyostelium discoideum (Slime mold)\r\n" ] } ], "source": [ "!head -n2 Zostera-blast-annot-withGeneID-noIsoforms.tab" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 138394 Zostera-blast-annot-withGeneID-noIsoforms.tab\n", "unique Zostera gene IDs\n" ] } ], "source": [ "!wc -l Zostera-blast-annot-withGeneID-noIsoforms.tab\n", "!echo \"unique Zostera gene IDs\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 5b. *L. zosterae*" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Sort file by the first column (--key = 1,1) and only retain unique IDs (--unique). Save output to a new file.\n", "!sort --unique --key=1,1 nonZostera-blast-annot-withGeneID.tab \\\n", "> nonZostera-blast-annot-withGeneID-noIsoforms.tab" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN100016_c0_g1\tQ8GYA6\tTRINITY_DN100016_c0_g1_i1\t1.2e-08\t26S proteasome non-ATPase regulatory subunit 13 homolog B (26S proteasome regulatory subunit RPN9b) (AtRNP9b) (26S proteasome regulatory subunit S11 homolog B)\tproteasome assembly [GO:0043248]; protein catabolic process [GO:0030163]; ubiquitin-dependent protein catabolic process [GO:0006511]\tcytosol [GO:0005829]; nucleus [GO:0005634]; proteasome complex [GO:0000502]; proteasome regulatory particle, lid subcomplex [GO:0008541]\tstructural molecule activity [GO:0005198]\tGO:0000502; GO:0005198; GO:0005634; GO:0005829; GO:0006511; GO:0008541; GO:0030163; GO:0043248\treviewed\tArabidopsis thaliana (Mouse-ear cress)\r\n", "TRINITY_DN100076_c0_g1\tQ59118\tTRINITY_DN100076_c0_g1_i1\t2.3e-09\tHistamine oxidase (EC 1.4.3.22) (Copper amine 
oxidase)\tamine metabolic process [GO:0009308]; cellular response to azide [GO:0097185]; oxidation-reduction process [GO:0055114]\tcytoplasm [GO:0005737]\tcopper ion binding [GO:0005507]; diamine oxidase activity [GO:0052597]; histamine oxidase activity [GO:0052598]; methylputrescine oxidase activity [GO:0052599]; primary amine oxidase activity [GO:0008131]; propane-1,3-diamine oxidase activity [GO:0052600]; quinone binding [GO:0048038]\tGO:0005507; GO:0005737; GO:0008131; GO:0009308; GO:0048038; GO:0052597; GO:0052598; GO:0052599; GO:0052600; GO:0055114; GO:0097185\treviewed\tArthrobacter globiformis\r\n" ] } ], "source": [ "!head -n2 nonZostera-blast-annot-withGeneID-noIsoforms.tab" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 71191 nonZostera-blast-annot-withGeneID-noIsoforms.tab\n", "unique nonZostera gene IDs\n" ] } ], "source": [ "!wc -l nonZostera-blast-annot-withGeneID-noIsoforms.tab\n", "!echo \"unique nonZostera gene IDs\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 6. Join with `trinity` gene matrix" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The last step is to join the annotated genes for each species with the gene matrix output from `trinity`. This will allow us to conduct individual `edgeR` analyses for host and pathogen." 
] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GRASS_GENE\tS_10B\tS_9A\tS_13A\tS_42A\tS_46B\tS_47B\tS_48B\tS_2A\tS_2B\tS_7B\tS_8B\tS_33A\tS_36A\tS_38A\tS_40A\r", "\r\n", "TRINITY_DN0_c0_g1\t0\t1\t0\t0\t0\t0\t0\t1\t0\t0\t0\t0\t0\t0\t0\r", "\r\n", "TRINITY_DN100_c0_g1\t0\t1\t0\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\r", "\r\n", "TRINITY_DN10000_c0_g1\t0\t2\t0\t0\t6\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\r", "\r\n", "TRINITY_DN100000_c0_g1\t0\t2\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\r", "\r\n", "TRINITY_DN100001_c0_g1\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t1\t0\t0\r", "\r\n", "TRINITY_DN100003_c0_g1\t0\t0\t0\t0\t1\t0\t0\t0\t0\t0\t0\t0\t5\t0\t0\r", "\r\n", "TRINITY_DN100006_c0_g1\t0\t1\t0\t0\t0\t0\t0\t1\t0\t3\t0\t0\t0\t0\t0\r", "\r\n", "TRINITY_DN100007_c0_g1\t0\t0\t0\t0\t0\t0\t0\t3\t0\t0\t0\t1\t0\t0\t0\r", "\r\n", "TRINITY_DN10001_c0_g1\t0\t5\t2\t1\t9\t0\t1\t1\t0\t1\t0\t0\t1\t0\t0\r", "\r\n" ] } ], "source": [ "#Although the first column header is \"GRASS_GENE,\" the list includes both Zostera and nonZostera genes.\n", "!head genes.counts.matrix.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 6a. *Z. marina*" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Join the first column in the first file with the first column in the second file. 
\n", "#The files are tab delimited, and the output should also be tab delimited (-t $'\\t')\n", "#join only pairs lines when both inputs are sorted on the join field in the same byte order.\n", "#genes.counts.matrix.txt has a header row, carriage returns, and a different sort order, so it is cleaned and re-sorted first.\n", "#Without this, genes present in both files (ex. TRINITY_DN100001_c0_g1) are silently dropped from the output.\n", "#NOTE: re-run the head and wc cells below after this cell; their saved output predates the sorting fix.\n", "!tr -d '\\r' < genes.counts.matrix.txt | tail -n +2 | LC_ALL=C sort -t $'\\t' -k1,1 \\\n", "> genes.counts.matrix.sorted.tab\n", "!LC_ALL=C sort -t $'\\t' -k1,1 Zostera-blast-annot-withGeneID-noIsoforms.tab \\\n", "| LC_ALL=C join -1 1 -2 1 -t $'\\t' - genes.counts.matrix.sorted.tab \\\n", "> Zostera-blast-annot-withGeneID-noIsoforms-geneCounts.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The columns are as follows:\n", "\n", "- Gene ID\n", "- Uniprot Accession code\n", "- Isoform ID \n", "- e-value\n", "- Protein Names\n", "- Gene ontology (biological process)\n", "- Gene ontology (cellular component)\n", "- Gene ontology (molecular function)\n", "- Gene ontology IDs\n", "- Status (Reviewed or not reviewed)\n", "- Organism\n", "- S_10B\tcounts\n", "- S_9A counts \n", "- S_13A\tcounts\n", "- S_42A\tcounts\n", "- S_46B\tcounts\n", "- S_47B\tcounts\n", "- S_48B\tcounts\n", "- S_2A counts\n", "- S_2B counts\n", "- S_7B counts\n", "- S_8B counts \n", "- S_33A counts\n", "- S_36A counts\n", "- S_38A counts\n", "- S_40A counts" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN102005_c0_g1\tQ1DZQ0\tTRINITY_DN102005_c0_g1_i1\t1.3e-37\tProtein transport protein SEC13\tCOPII-coated vesicle budding [GO:0090114]; mRNA transport [GO:0051028]; positive regulation of TORC1 signaling [GO:1904263]; protein transport [GO:0015031]\tCOPII vesicle coat [GO:0030127]; endoplasmic reticulum membrane [GO:0005789]; Golgi membrane [GO:0000139]; nuclear pore [GO:0005643]\tstructural molecule activity [GO:0005198]\tGO:0000139; GO:0005198; GO:0005643; GO:0005789; GO:0015031; GO:0030127; GO:0051028; GO:0090114; GO:1904263\treviewed\tCoccidioides immitis (strain RS) (Valley fever fungus)\t0\t0\t1\t1\t0\t0\t0\t0\t0\t4\t0\t1\t3\t0\t0\r", "\r\n", "TRINITY_DN102006_c0_g1\tQ54NS9\tTRINITY_DN102006_c0_g1_i1\t3.0e-21\tApoptosis-inducing factor homolog A (EC 1.-.-.-)\t\tcytoplasm [GO:0005737]; lipid droplet 
[GO:0005811]\telectron-transferring-flavoprotein dehydrogenase activity [GO:0004174]; flavin adenine dinucleotide binding [GO:0050660]\tGO:0004174; GO:0005737; GO:0005811; GO:0050660\treviewed\tDictyostelium discoideum (Slime mold)\t0\t1\t0\t0\t0\t1\t0\t3\t1\t44\t0\t0\t4\t1\t2\r", "\r\n" ] } ], "source": [ "!head -n2 Zostera-blast-annot-withGeneID-noIsoforms-geneCounts.tab" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5159 Zostera-blast-annot-withGeneID-noIsoforms-geneCounts.tab\r\n" ] } ], "source": [ "!wc -l Zostera-blast-annot-withGeneID-noIsoforms-geneCounts.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 6b. *L. zosterae*" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Join the first column in the first file with the first column in the second file. \n", "#The files are tab delimited, and the output should also be tab delimited (-t $'\\t')\n", "#join only pairs lines when both inputs are sorted on the join field in the same byte order.\n", "#genes.counts.matrix.txt has a header row, carriage returns, and a different sort order, so it is cleaned and re-sorted first.\n", "#Without this, genes present in both files are silently dropped from the output.\n", "#NOTE: re-run the head and wc cells below after this cell; their saved output predates the sorting fix.\n", "!tr -d '\\r' < genes.counts.matrix.txt | tail -n +2 | LC_ALL=C sort -t $'\\t' -k1,1 \\\n", "> genes.counts.matrix.sorted.tab\n", "!LC_ALL=C sort -t $'\\t' -k1,1 nonZostera-blast-annot-withGeneID-noIsoforms.tab \\\n", "| LC_ALL=C join -1 1 -2 1 -t $'\\t' - genes.counts.matrix.sorted.tab \\\n", "> nonZostera-blast-annot-withGeneID-noIsoforms-geneCounts.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The columns are as follows:\n", "\n", "- Gene ID\n", "- Uniprot Accession code\n", "- Isoform ID \n", "- e-value\n", "- Protein Names\n", "- Gene ontology (biological process)\n", "- Gene ontology (cellular component)\n", "- Gene ontology (molecular function)\n", "- Gene ontology IDs\n", "- Status (Reviewed or not reviewed)\n", "- Organism\n", "- S_10B\tcounts\n", "- S_9A counts \n", "- S_13A\tcounts\n", "- S_42A\tcounts\n", "- S_46B\tcounts\n", "- S_47B\tcounts\n", "- S_48B\tcounts\n", "- S_2A counts\n", "- S_2B counts\n", "- S_7B counts\n", "- S_8B counts \n", "- S_33A counts\n", "- S_36A counts\n", "- S_38A counts\n", "- S_40A counts" ] }, { "cell_type": "code", 
"execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN102004_c0_g1\tQ54BM8\tTRINITY_DN102004_c0_g1_i1\t6.5e-19\tUPF0652 protein\t\t\tzinc ion binding [GO:0008270]\tGO:0008270\treviewed\tDictyostelium discoideum (Slime mold)\t0\t1\t0\t0\t0\t0\t0\t0\t0\t4\t0\t0\t6\t0\t0\r", "\r\n", "TRINITY_DN102011_c0_g1\tQ84M24\tTRINITY_DN102011_c0_g1_i1\t2.5e-14\tABC transporter A family member 1 (ABC transporter ABCA.1) (AtABCA1) (ABC one homolog protein 1) (AtAOH1)\tlipid transport [GO:0006869]\tintegral component of membrane [GO:0016021]; intracellular membrane-bounded organelle [GO:0043231]; vacuolar membrane [GO:0005774]\tATPase activity [GO:0016887]; ATPase-coupled transmembrane transporter activity [GO:0042626]; ATP binding [GO:0005524]; lipid transporter activity [GO:0005319]\tGO:0005319; GO:0005524; GO:0005774; GO:0006869; GO:0016021; GO:0016887; GO:0042626; GO:0043231\treviewed\tArabidopsis thaliana (Mouse-ear cress)\t0\t11\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\r", "\r\n" ] } ], "source": [ "!head -n2 nonZostera-blast-annot-withGeneID-noIsoforms-geneCounts.tab" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2402 nonZostera-blast-annot-withGeneID-noIsoforms-geneCounts.tab\r\n" ] } ], "source": [ "!wc -l nonZostera-blast-annot-withGeneID-noIsoforms-geneCounts.tab" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }