{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/sr320/Documents/GitHub/nb-2021/O_lurida/data\n"
     ]
    }
   ],
   "source": [
    "%cd ../data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "wget -r \\\n",
    "--no-directories --no-parent \\\n",
    "--no-check-certificate \\\n",
    "-P . \\\n",
    "-A .fastq https://gannet.fish.washington.edu/generosa/O.lurida_QuantSeq2020/Trimmed/\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "137.trim.fastq 156.trim.fastq 34.trim.fastq  41.trim.fastq  46.trim.fastq\n",
      "139.trim.fastq 159.trim.fastq 35.trim.fastq  43.trim.fastq  47.trim.fastq\n",
      "140.trim.fastq 161.trim.fastq 37.trim.fastq  44.trim.fastq\n",
      "141.trim.fastq 162.trim.fastq 39.trim.fastq  45.trim.fastq\n"
     ]
    }
   ],
   "source": [
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bowtie 2 version 2.3.5 by Ben Langmead (langmea@cs.jhu.edu, www.cs.jhu.edu/~langmea)\n",
      "Usage: \n",
      "  bowtie2 [options]* -x <bt2-idx> {-1 <m1> -2 <m2> | -U <r> | --interleaved <i>} [-S <sam>]\n",
      "\n",
      "  <bt2-idx>  Index filename prefix (minus trailing .X.bt2).\n",
      "             NOTE: Bowtie 1 and Bowtie 2 indexes are not compatible.\n",
      "  <m1>       Files with #1 mates, paired with files in <m2>.\n",
      "             Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2).\n",
      "  <m2>       Files with #2 mates, paired with files in <m1>.\n",
      "             Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2).\n",
      "  <r>        Files with unpaired reads.\n",
      "             Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2).\n",
      "  <i>        Files with interleaved paired-end FASTQ/FASTA reads\n",
      "             Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2).\n",
      "  <sam>      File for SAM output (default: stdout)\n",
      "\n",
      "  <m1>, <m2>, <r> can be comma-separated lists (no whitespace) and can be\n",
      "  specified many times.  E.g. '-U file1.fq,file2.fq -U file3.fq'.\n",
      "\n",
      "Options (defaults in parentheses):\n",
      "\n",
      " Input:\n",
      "  -q                 query input files are FASTQ .fq/.fastq (default)\n",
      "  --tab5             query input files are TAB5 .tab5\n",
      "  --tab6             query input files are TAB6 .tab6\n",
      "  --qseq             query input files are in Illumina's qseq format\n",
      "  -f                 query input files are (multi-)FASTA .fa/.mfa\n",
      "  -r                 query input files are raw one-sequence-per-line\n",
      "  -F k:<int>,i:<int> query input files are continuous FASTA where reads\n",
      "                     are substrings (k-mers) extracted from a FASTA file <s>\n",
      "                     and aligned at offsets 1, 1+i, 1+2i ... end of reference\n",
      "  -c                 <m1>, <m2>, <r> are sequences themselves, not files\n",
      "  -s/--skip <int>    skip the first <int> reads/pairs in the input (none)\n",
      "  -u/--upto <int>    stop after first <int> reads/pairs (no limit)\n",
      "  -5/--trim5 <int>   trim <int> bases from 5'/left end of reads (0)\n",
      "  -3/--trim3 <int>   trim <int> bases from 3'/right end of reads (0)\n",
      "  --trim-to [3:|5:]<int> trim reads exceeding <int> bases from either 3' or 5' end\n",
      "                     If the read end is not specified then it defaults to 3 (0)\n",
      "  --phred33          qualities are Phred+33 (default)\n",
      "  --phred64          qualities are Phred+64\n",
      "  --int-quals        qualities encoded as space-delimited integers\n",
      "\n",
      " Presets:                 Same as:\n",
      "  For --end-to-end:\n",
      "   --very-fast            -D 5 -R 1 -N 0 -L 22 -i S,0,2.50\n",
      "   --fast                 -D 10 -R 2 -N 0 -L 22 -i S,0,2.50\n",
      "   --sensitive            -D 15 -R 2 -N 0 -L 22 -i S,1,1.15 (default)\n",
      "   --very-sensitive       -D 20 -R 3 -N 0 -L 20 -i S,1,0.50\n",
      "\n",
      "  For --local:\n",
      "   --very-fast-local      -D 5 -R 1 -N 0 -L 25 -i S,1,2.00\n",
      "   --fast-local           -D 10 -R 2 -N 0 -L 22 -i S,1,1.75\n",
      "   --sensitive-local      -D 15 -R 2 -N 0 -L 20 -i S,1,0.75 (default)\n",
      "   --very-sensitive-local -D 20 -R 3 -N 0 -L 20 -i S,1,0.50\n",
      "\n",
      " Alignment:\n",
      "  -N <int>           max # mismatches in seed alignment; can be 0 or 1 (0)\n",
      "  -L <int>           length of seed substrings; must be >3, <32 (22)\n",
      "  -i <func>          interval between seed substrings w/r/t read len (S,1,1.15)\n",
      "  --n-ceil <func>    func for max # non-A/C/G/Ts permitted in aln (L,0,0.15)\n",
      "  --dpad <int>       include <int> extra ref chars on sides of DP table (15)\n",
      "  --gbar <int>       disallow gaps within <int> nucs of read extremes (4)\n",
      "  --ignore-quals     treat all quality values as 30 on Phred scale (off)\n",
      "  --nofw             do not align forward (original) version of read (off)\n",
      "  --norc             do not align reverse-complement version of read (off)\n",
      "  --no-1mm-upfront   do not allow 1 mismatch alignments before attempting to\n",
      "                     scan for the optimal seeded alignments\n",
      "  --end-to-end       entire read must align; no clipping (on)\n",
      "   OR\n",
      "  --local            local alignment; ends might be soft clipped (off)\n",
      "\n",
      " Scoring:\n",
      "  --ma <int>         match bonus (0 for --end-to-end, 2 for --local) \n",
      "  --mp <int>         max penalty for mismatch; lower qual = lower penalty (6)\n",
      "  --np <int>         penalty for non-A/C/G/Ts in read/ref (1)\n",
      "  --rdg <int>,<int>  read gap open, extend penalties (5,3)\n",
      "  --rfg <int>,<int>  reference gap open, extend penalties (5,3)\n",
      "  --score-min <func> min acceptable alignment score w/r/t read length\n",
      "                     (G,20,8 for local, L,-0.6,-0.6 for end-to-end)\n",
      "\n",
      " Reporting:\n",
      "  (default)          look for multiple alignments, report best, with MAPQ\n",
      "   OR\n",
      "  -k <int>           report up to <int> alns per read; MAPQ not meaningful\n",
      "   OR\n",
      "  -a/--all           report all alignments; very slow, MAPQ not meaningful\n",
      "\n",
      " Effort:\n",
      "  -D <int>           give up extending after <int> failed extends in a row (15)\n",
      "  -R <int>           for reads w/ repetitive seeds, try <int> sets of seeds (2)\n",
      "\n",
      " Paired-end:\n",
      "  -I/--minins <int>  minimum fragment length (0)\n",
      "  -X/--maxins <int>  maximum fragment length (500)\n",
      "  --fr/--rf/--ff     -1, -2 mates align fw/rev, rev/fw, fw/fw (--fr)\n",
      "  --no-mixed         suppress unpaired alignments for paired reads\n",
      "  --no-discordant    suppress discordant alignments for paired reads\n",
      "  --dovetail         concordant when mates extend past each other\n",
      "  --no-contain       not concordant when one mate alignment contains other\n",
      "  --no-overlap       not concordant when mates overlap at all\n",
      "\n",
      " Output:\n",
      "  -t/--time          print wall-clock time taken by search phases\n",
      "  --un <path>        write unpaired reads that didn't align to <path>\n",
      "  --al <path>        write unpaired reads that aligned at least once to <path>\n",
      "  --un-conc <path>   write pairs that didn't align concordantly to <path>\n",
      "  --al-conc <path>   write pairs that aligned concordantly at least once to <path>\n",
      "    (Note: for --un, --al, --un-conc, or --al-conc, add '-gz' to the option name, e.g.\n",
      "    --un-gz <path>, to gzip compress output, or add '-bz2' to bzip2 compress output.)\n",
      "  --quiet            print nothing to stderr except serious errors\n",
      "  --met-file <path>  send metrics to file at <path> (off)\n",
      "  --met-stderr       send metrics to stderr (off)\n",
      "  --met <int>        report internal counters & metrics every <int> secs (1)\n",
      "  --no-unal          suppress SAM records for unaligned reads\n",
      "  --no-head          suppress header lines, i.e. lines starting with @\n",
      "  --no-sq            suppress @SQ header lines\n",
      "  --rg-id <text>     set read group id, reflected in @RG line and RG:Z: opt field\n",
      "  --rg <text>        add <text> (\"lab:value\") to @RG line of SAM header.\n",
      "                     Note: @RG line only printed when --rg-id is set.\n",
      "  --omit-sec-seq     put '*' in SEQ and QUAL fields for secondary alignments.\n",
      "  --sam-no-qname-trunc Suppress standard behavior of truncating readname at first whitespace \n",
      "                      at the expense of generating non-standard SAM.\n",
      "  --xeq              Use '='/'X', instead of 'M,' to specify matches/mismatches in SAM record.\n",
      "  --soft-clipped-unmapped-tlen Exclude soft-clipped bases when reporting TLEN\n",
      "\n",
      " Performance:\n",
      "  -p/--threads <int> number of alignment threads to launch (1)\n",
      "  --reorder          force SAM output order to match order of input reads\n",
      "  --mm               use memory-mapped I/O for index; many 'bowtie's can share\n",
      "\n",
      " Other:\n",
      "  --qc-filter        filter out reads that are bad according to QSEQ filter\n",
      "  --seed <int>       seed for random number generator (0)\n",
      "  --non-deterministic seed rand. gen. arbitrarily instead of using read attributes\n",
      "  --version          print version information and quit\n",
      "  -h/--help          print this usage message\n"
     ]
    }
   ],
   "source": [
    "!/Applications/bioinfo/bowtie2-235/bowtie2 --help"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/Users/sr320/Documents/GitHub/nb-2021/O_lurida/code'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100 1090M  100 1090M    0     0  2226k      0  0:08:21  0:08:21 --:--:-- 2313k 0  2024k      0  0:09:11  0:00:15  0:08:56 2395k1:29  0:06:47 1934k0:06:43 1513k     0  2156k      0  0:08:37  0:01:58  0:06:39 1730k   0  0:08:38  0:02:02  0:06:36 2114k 0:08:38  0:02:04  0:06:34 2047k0  0:08:40  0:02:09  0:06:31 1989k81k      0  0:08:56  0:03:16  0:05:40 2185kk     0  0:08:32  0:04:13  0:04:19 2765k  2199k      0  0:08:27  0:04:29  0:03:58 2518k   0  0:08:26  0:04:35  0:03:51 2367k   0  2214k      0  0:08:24  0:06:01  0:02:23 2166k 2213k      0  0:08:24  0:06:13  0:02:11 2113k 0     0  2218k      0  0:08:23  0:06:22  0:02:01 2372k    0  0:08:23  0:06:23  0:02:00 2258k  2202k      0  0:08:26  0:07:01  0:01:25 2637kM   90  983M    0     0  2221k      0  0:08:22  0:07:33  0:00:49 2546k\n"
     ]
    }
   ],
   "source": [
    "!curl --fail --location --output ../data/Olurida_v081.fa http://owl.fish.washington.edu/halfshell/genomic-databank/Olurida_v081.fa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "!curl           156.trim.fastq  34.trim.fastq   43.trim.fastq   Olurida_v081.fa\n",
      "137.trim.fastq  159.trim.fastq  35.trim.fastq   44.trim.fastq\n",
      "139.trim.fastq  161.trim.fastq  37.trim.fastq   45.trim.fastq\n",
      "140.trim.fastq  162.trim.fastq  39.trim.fastq   46.trim.fastq\n",
      "141.trim.fastq  168.trim.fastq  41.trim.fastq   47.trim.fastq\n"
     ]
    }
   ],
   "source": [
    "!ls ../data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Settings:\n",
      "  Output files: \"../data/Olurida_v081-bowtie-index.*.bt2\"\n",
      "  Line rate: 6 (line is 64 bytes)\n",
      "  Lines per side: 1 (side is 64 bytes)\n",
      "  Offset rate: 4 (one in 16)\n",
      "  FTable chars: 10\n",
      "  Strings: unpacked\n",
      "  Max bucket size: default\n",
      "  Max bucket size, sqrt multiplier: default\n",
      "  Max bucket size, len divisor: 4\n",
      "  Difference-cover sample period: 1024\n",
      "  Endianness: little\n",
      "  Actual local endianness: little\n",
      "  Sanity checking: disabled\n",
      "  Assertions: disabled\n",
      "  Random seed: 0\n",
      "  Sizeofs: void*:8, int:4, long:8, size_t:8\n",
      "Input files DNA, FASTA:\n",
      "  ../data/Olurida_v081.fa\n",
      "Building a SMALL index\n",
      "Reading reference sizes\n",
      "  Time reading reference sizes: 00:00:21\n",
      "Calculating joined length\n",
      "Writing header\n",
      "Reserving space for joined string\n",
      "Joining reference sequences\n",
      "  Time to join reference sequences: 00:00:13\n",
      "bmax according to bmaxDivN setting: 269313774\n",
      "Using parameters --bmax 201985331 --dcv 1024\n",
      "  Doing ahead-of-time memory usage test\n",
      "  Passed!  Constructing with these parameters: --bmax 201985331 --dcv 1024\n",
      "Constructing suffix-array element generator\n",
      "Building DifferenceCoverSample\n",
      "  Building sPrime\n",
      "  Building sPrimeOrder\n",
      "  V-Sorting samples\n",
      "  V-Sorting samples time: 00:00:40\n",
      "  Allocating rank array\n",
      "  Ranking v-sort output\n",
      "  Ranking v-sort output time: 00:00:15\n",
      "  Invoking Larsson-Sadakane on ranks\n",
      "  Invoking Larsson-Sadakane on ranks time: 00:00:18\n",
      "  Sanity-checking and returning\n",
      "Building samples\n",
      "Reserving space for 12 sample suffixes\n",
      "Generating random suffixes\n",
      "QSorting 12 sample offsets, eliminating duplicates\n",
      "QSorting sample offsets, eliminating duplicates time: 00:00:00\n",
      "Multikey QSorting 12 samples\n",
      "  (Using difference cover)\n",
      "  Multikey QSorting samples time: 00:00:00\n",
      "Calculating bucket sizes\n",
      "Splitting and merging\n",
      "  Splitting and merging time: 00:00:00\n",
      "Avg bucket size: 1.07726e+09 (target: 201985330)\n",
      "Converting suffix-array elements to index image\n",
      "Allocating ftab, absorbFtab\n",
      "Entering Ebwt loop\n",
      "Getting block 1 of 1\n",
      "  No samples; assembling all-inclusive block\n",
      "  Sorting block of length 1077255099 for bucket 1\n",
      "  (Using difference cover)\n",
      "  Sorting block time: 00:20:49\n",
      "Returning block of 1077255100 for bucket 1\n",
      "Exited Ebwt loop\n",
      "fchr[A]: 0\n",
      "fchr[C]: 341519526\n",
      "fchr[G]: 538577822\n",
      "fchr[T]: 735626959\n",
      "fchr[$]: 1077255099\n",
      "Exiting Ebwt::buildToDisk()\n",
      "Returning from initFromVector\n",
      "Wrote 371984998 bytes to primary EBWT file: ../data/Olurida_v081-bowtie-index.1.bt2\n",
      "Wrote 269313780 bytes to secondary EBWT file: ../data/Olurida_v081-bowtie-index.2.bt2\n",
      "Re-opening _in1 and _in2 as input streams\n",
      "Returning from Ebwt constructor\n",
      "Headers:\n",
      "    len: 1077255099\n",
      "    bwtLen: 1077255100\n",
      "    sz: 269313775\n",
      "    bwtSz: 269313775\n",
      "    lineRate: 6\n",
      "    offRate: 4\n",
      "    offMask: 0xfffffff0\n",
      "    ftabChars: 10\n",
      "    eftabLen: 20\n",
      "    eftabSz: 80\n",
      "    ftabLen: 1048577\n",
      "    ftabSz: 4194308\n",
      "    offsLen: 67328444\n",
      "    offsSz: 269313776\n",
      "    lineSz: 64\n",
      "    sideSz: 64\n",
      "    sideBwtSz: 48\n",
      "    sideBwtLen: 192\n",
      "    numSides: 5610704\n",
      "    numLines: 5610704\n",
      "    ebwtTotLen: 359085056\n",
      "    ebwtTotSz: 359085056\n",
      "    color: 0\n",
      "    reverse: 0\n",
      "Total time for call to driver() for forward index: 00:27:26\n",
      "Reading reference sizes\n",
      "  Time reading reference sizes: 00:00:07\n",
      "Calculating joined length\n",
      "Writing header\n",
      "Reserving space for joined string\n",
      "Joining reference sequences\n",
      "  Time to join reference sequences: 00:00:09\n",
      "  Time to reverse reference sequence: 00:00:00\n",
      "bmax according to bmaxDivN setting: 269313774\n",
      "Using parameters --bmax 201985331 --dcv 1024\n",
      "  Doing ahead-of-time memory usage test\n",
      "  Passed!  Constructing with these parameters: --bmax 201985331 --dcv 1024\n",
      "Constructing suffix-array element generator\n",
      "Building DifferenceCoverSample\n",
      "  Building sPrime\n",
      "  Building sPrimeOrder\n",
      "  V-Sorting samples\n",
      "  V-Sorting samples time: 00:00:27\n",
      "  Allocating rank array\n",
      "  Ranking v-sort output\n",
      "  Ranking v-sort output time: 00:00:10\n",
      "  Invoking Larsson-Sadakane on ranks\n",
      "  Invoking Larsson-Sadakane on ranks time: 00:00:15\n",
      "  Sanity-checking and returning\n",
      "Building samples\n",
      "Reserving space for 12 sample suffixes\n",
      "Generating random suffixes\n",
      "QSorting 12 sample offsets, eliminating duplicates\n",
      "QSorting sample offsets, eliminating duplicates time: 00:00:00\n",
      "Multikey QSorting 12 samples\n",
      "  (Using difference cover)\n",
      "  Multikey QSorting samples time: 00:00:00\n",
      "Calculating bucket sizes\n",
      "Splitting and merging\n",
      "  Splitting and merging time: 00:00:00\n",
      "Avg bucket size: 1.07726e+09 (target: 201985330)\n",
      "Converting suffix-array elements to index image\n",
      "Allocating ftab, absorbFtab\n",
      "Entering Ebwt loop\n",
      "Getting block 1 of 1\n",
      "  No samples; assembling all-inclusive block\n",
      "  Sorting block of length 1077255099 for bucket 1\n",
      "  (Using difference cover)\n",
      "  Sorting block time: 00:25:48\n",
      "Returning block of 1077255100 for bucket 1\n",
      "Exited Ebwt loop\n",
      "fchr[A]: 0\n",
      "fchr[C]: 341519526\n",
      "fchr[G]: 538577822\n",
      "fchr[T]: 735626959\n",
      "fchr[$]: 1077255099\n",
      "Exiting Ebwt::buildToDisk()\n",
      "Returning from initFromVector\n",
      "Wrote 371984998 bytes to primary EBWT file: ../data/Olurida_v081-bowtie-index.rev.1.bt2\n",
      "Wrote 269313780 bytes to secondary EBWT file: ../data/Olurida_v081-bowtie-index.rev.2.bt2\n",
      "Re-opening _in1 and _in2 as input streams\n",
      "Returning from Ebwt constructor\n",
      "Headers:\n",
      "    len: 1077255099\n",
      "    bwtLen: 1077255100\n",
      "    sz: 269313775\n",
      "    bwtSz: 269313775\n",
      "    lineRate: 6\n",
      "    offRate: 4\n",
      "    offMask: 0xfffffff0\n",
      "    ftabChars: 10\n",
      "    eftabLen: 20\n",
      "    eftabSz: 80\n",
      "    ftabLen: 1048577\n",
      "    ftabSz: 4194308\n",
      "    offsLen: 67328444\n",
      "    offsSz: 269313776\n",
      "    lineSz: 64\n",
      "    sideSz: 64\n",
      "    sideBwtSz: 48\n",
      "    sideBwtLen: 192\n",
      "    numSides: 5610704\n",
      "    numLines: 5610704\n",
      "    ebwtTotLen: 359085056\n",
      "    ebwtTotSz: 359085056\n",
      "    color: 0\n",
      "    reverse: 1\n",
      "Total time for backward call to driver() for mirror index: 00:31:59\n"
     ]
    }
   ],
   "source": [
    "!/Applications/bioinfo/bowtie2-235/bowtie2-build \\\n",
    "../data/Olurida_v081.fa \\\n",
    "../data/Olurida_v081-bowtie-index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reference notebook: [02.6-Oly-Bowtie.ipynb](https://nbviewer.jupyter.org/github/sr320/student-fish546-2016/blob/1cfbc742fe2811f0b1ac8558a4579fb788f0a0b0/jupyter/02.6-Oly-Bowtie.ipynb)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}