{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get the data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/sr320/Documents/GitHub/nb-2021/O_lurida/data\n" ] } ], "source": [ "cd ../data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "ls" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "wget -r \\\n", "--no-directories --no-parent \\\n", "--no-check-certificate \\\n", "-P . \\\n", "-A .fastq https://gannet.fish.washington.edu/generosa/O.lurida_QuantSeq2020/Trimmed/\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "137.trim.fastq 156.trim.fastq 34.trim.fastq 41.trim.fastq 46.trim.fastq\n", "139.trim.fastq 159.trim.fastq 35.trim.fastq 43.trim.fastq 47.trim.fastq\n", "140.trim.fastq 161.trim.fastq 37.trim.fastq 44.trim.fastq\n", "141.trim.fastq 162.trim.fastq 39.trim.fastq 45.trim.fastq\n" ] } ], "source": [ "!ls" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bowtie 2 version 2.3.5 by Ben Langmead (langmea@cs.jhu.edu, www.cs.jhu.edu/~langmea)\n", "Usage: \n", " bowtie2 [options]* -x {-1 -2 | -U | --interleaved } [-S ]\n", "\n", " Index filename prefix (minus trailing .X.bt2).\n", " NOTE: Bowtie 1 and Bowtie 2 indexes are not compatible.\n", " Files with #1 mates, paired with files in .\n", " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2).\n", " Files with #2 mates, paired with files in .\n", " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2).\n", " Files with unpaired reads.\n", " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2).\n", " Files with interleaved paired-end FASTQ/FASTA reads\n", " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2).\n", " File for SAM 
output (default: stdout)\n", "\n", " , , can be comma-separated lists (no whitespace) and can be\n", " specified many times. E.g. '-U file1.fq,file2.fq -U file3.fq'.\n", "\n", "Options (defaults in parentheses):\n", "\n", " Input:\n", " -q query input files are FASTQ .fq/.fastq (default)\n", " --tab5 query input files are TAB5 .tab5\n", " --tab6 query input files are TAB6 .tab6\n", " --qseq query input files are in Illumina's qseq format\n", " -f query input files are (multi-)FASTA .fa/.mfa\n", " -r query input files are raw one-sequence-per-line\n", " -F k:,i: query input files are continuous FASTA where reads\n", " are substrings (k-mers) extracted from a FASTA file \n", " and aligned at offsets 1, 1+i, 1+2i ... end of reference\n", " -c , , are sequences themselves, not files\n", " -s/--skip skip the first reads/pairs in the input (none)\n", " -u/--upto stop after first reads/pairs (no limit)\n", " -5/--trim5 trim bases from 5'/left end of reads (0)\n", " -3/--trim3 trim bases from 3'/right end of reads (0)\n", " --trim-to [3:|5:] trim reads exceeding bases from either 3' or 5' end\n", " If the read end is not specified then it defaults to 3 (0)\n", " --phred33 qualities are Phred+33 (default)\n", " --phred64 qualities are Phred+64\n", " --int-quals qualities encoded as space-delimited integers\n", "\n", " Presets: Same as:\n", " For --end-to-end:\n", " --very-fast -D 5 -R 1 -N 0 -L 22 -i S,0,2.50\n", " --fast -D 10 -R 2 -N 0 -L 22 -i S,0,2.50\n", " --sensitive -D 15 -R 2 -N 0 -L 22 -i S,1,1.15 (default)\n", " --very-sensitive -D 20 -R 3 -N 0 -L 20 -i S,1,0.50\n", "\n", " For --local:\n", " --very-fast-local -D 5 -R 1 -N 0 -L 25 -i S,1,2.00\n", " --fast-local -D 10 -R 2 -N 0 -L 22 -i S,1,1.75\n", " --sensitive-local -D 15 -R 2 -N 0 -L 20 -i S,1,0.75 (default)\n", " --very-sensitive-local -D 20 -R 3 -N 0 -L 20 -i S,1,0.50\n", "\n", " Alignment:\n", " -N max # mismatches in seed alignment; can be 0 or 1 (0)\n", " -L length of seed substrings; must be >3, <32 
(22)\n", " -i interval between seed substrings w/r/t read len (S,1,1.15)\n", " --n-ceil func for max # non-A/C/G/Ts permitted in aln (L,0,0.15)\n", " --dpad include extra ref chars on sides of DP table (15)\n", " --gbar disallow gaps within nucs of read extremes (4)\n", " --ignore-quals treat all quality values as 30 on Phred scale (off)\n", " --nofw do not align forward (original) version of read (off)\n", " --norc do not align reverse-complement version of read (off)\n", " --no-1mm-upfront do not allow 1 mismatch alignments before attempting to\n", " scan for the optimal seeded alignments\n", " --end-to-end entire read must align; no clipping (on)\n", " OR\n", " --local local alignment; ends might be soft clipped (off)\n", "\n", " Scoring:\n", " --ma match bonus (0 for --end-to-end, 2 for --local) \n", " --mp max penalty for mismatch; lower qual = lower penalty (6)\n", " --np penalty for non-A/C/G/Ts in read/ref (1)\n", " --rdg , read gap open, extend penalties (5,3)\n", " --rfg , reference gap open, extend penalties (5,3)\n", " --score-min min acceptable alignment score w/r/t read length\n", " (G,20,8 for local, L,-0.6,-0.6 for end-to-end)\n", "\n", " Reporting:\n", " (default) look for multiple alignments, report best, with MAPQ\n", " OR\n", " -k report up to alns per read; MAPQ not meaningful\n", " OR\n", " -a/--all report all alignments; very slow, MAPQ not meaningful\n", "\n", " Effort:\n", " -D give up extending after failed extends in a row (15)\n", " -R for reads w/ repetitive seeds, try sets of seeds (2)\n", "\n", " Paired-end:\n", " -I/--minins minimum fragment length (0)\n", " -X/--maxins maximum fragment length (500)\n", " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (--fr)\n", " --no-mixed suppress unpaired alignments for paired reads\n", " --no-discordant suppress discordant alignments for paired reads\n", " --dovetail concordant when mates extend past each other\n", " --no-contain not concordant when one mate alignment contains other\n", 
" --no-overlap not concordant when mates overlap at all\n", "\n", " Output:\n", " -t/--time print wall-clock time taken by search phases\n", " --un write unpaired reads that didn't align to \n", " --al write unpaired reads that aligned at least once to \n", " --un-conc write pairs that didn't align concordantly to \n", " --al-conc write pairs that aligned concordantly at least once to \n", " (Note: for --un, --al, --un-conc, or --al-conc, add '-gz' to the option name, e.g.\n", " --un-gz , to gzip compress output, or add '-bz2' to bzip2 compress output.)\n", " --quiet print nothing to stderr except serious errors\n", " --met-file send metrics to file at (off)\n", " --met-stderr send metrics to stderr (off)\n", " --met report internal counters & metrics every secs (1)\n", " --no-unal suppress SAM records for unaligned reads\n", " --no-head suppress header lines, i.e. lines starting with @\n", " --no-sq suppress @SQ header lines\n", " --rg-id set read group id, reflected in @RG line and RG:Z: opt field\n", " --rg add (\"lab:value\") to @RG line of SAM header.\n", " Note: @RG line only printed when --rg-id is set.\n", " --omit-sec-seq put '*' in SEQ and QUAL fields for secondary alignments.\n", " --sam-no-qname-trunc Suppress standard behavior of truncating readname at first whitespace \n", " at the expense of generating non-standard SAM.\n", " --xeq Use '='/'X', instead of 'M,' to specify matches/mismatches in SAM record.\n", " --soft-clipped-unmapped-tlen Exclude soft-clipped bases when reporting TLEN\n", "\n", " Performance:\n", " -p/--threads number of alignment threads to launch (1)\n", " --reorder force SAM output order to match order of input reads\n", " --mm use memory-mapped I/O for index; many 'bowtie's can share\n", "\n", " Other:\n", " --qc-filter filter out reads that are bad according to QSEQ filter\n", " --seed seed for random number generator (0)\n", " --non-deterministic seed rand. gen. 
arbitrarily instead of using read attributes\n", " --version print version information and quit\n", " -h/--help print this usage message\n" ] } ], "source": [ "!/Applications/bioinfo/bowtie2-235/bowtie2 -h\n", "\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/Users/sr320/Documents/GitHub/nb-2021/O_lurida/code'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pwd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Download the Olurida_v081 genome FASTA into ../data (about 1 GB).\n", "# --fail: exit non-zero on an HTTP error instead of saving the error page as the FASTA.\n", "# --location: follow redirects rather than saving the redirect response.\n", "# --output: let curl write the file itself instead of redirecting stdout.\n", "!curl --fail --location \\\n", "  --output ../data/Olurida_v081.fa \\\n", "  http://owl.fish.washington.edu/halfshell/genomic-databank/Olurida_v081.fa" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "!curl 156.trim.fastq 34.trim.fastq 43.trim.fastq Olurida_v081.fa\n", "137.trim.fastq 159.trim.fastq 35.trim.fastq 44.trim.fastq\n", "139.trim.fastq 161.trim.fastq 37.trim.fastq 45.trim.fastq\n", "140.trim.fastq 162.trim.fastq 39.trim.fastq 46.trim.fastq\n", "141.trim.fastq 168.trim.fastq 41.trim.fastq 47.trim.fastq\n" ] } ], "source": [ "!ls ../data/" ] }, {
"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Settings:\n", " Output files: \"../data/Olurida_v081-bowtie-index.*.bt2\"\n", " Line rate: 6 (line is 64 bytes)\n", " Lines per side: 1 (side is 64 bytes)\n", " Offset rate: 4 (one in 16)\n", " FTable chars: 10\n", " Strings: unpacked\n", " Max bucket size: default\n", " Max bucket size, sqrt multiplier: default\n", " Max bucket size, len divisor: 4\n", " Difference-cover sample period: 1024\n", " Endianness: little\n", " Actual local endianness: little\n", " Sanity checking: disabled\n", " Assertions: disabled\n", " Random seed: 0\n", " Sizeofs: void*:8, int:4, long:8, size_t:8\n", "Input files DNA, FASTA:\n", " ../data/Olurida_v081.fa\n", "Building a SMALL index\n", "Reading reference sizes\n", " Time reading reference sizes: 00:00:21\n", "Calculating joined length\n", "Writing header\n", "Reserving space for joined string\n", "Joining reference sequences\n", " Time to join reference sequences: 00:00:13\n", "bmax according to bmaxDivN setting: 269313774\n", "Using parameters --bmax 201985331 --dcv 1024\n", " Doing ahead-of-time memory usage test\n", " Passed! 
Constructing with these parameters: --bmax 201985331 --dcv 1024\n", "Constructing suffix-array element generator\n", "Building DifferenceCoverSample\n", " Building sPrime\n", " Building sPrimeOrder\n", " V-Sorting samples\n", " V-Sorting samples time: 00:00:40\n", " Allocating rank array\n", " Ranking v-sort output\n", " Ranking v-sort output time: 00:00:15\n", " Invoking Larsson-Sadakane on ranks\n", " Invoking Larsson-Sadakane on ranks time: 00:00:18\n", " Sanity-checking and returning\n", "Building samples\n", "Reserving space for 12 sample suffixes\n", "Generating random suffixes\n", "QSorting 12 sample offsets, eliminating duplicates\n", "QSorting sample offsets, eliminating duplicates time: 00:00:00\n", "Multikey QSorting 12 samples\n", " (Using difference cover)\n", " Multikey QSorting samples time: 00:00:00\n", "Calculating bucket sizes\n", "Splitting and merging\n", " Splitting and merging time: 00:00:00\n", "Avg bucket size: 1.07726e+09 (target: 201985330)\n", "Converting suffix-array elements to index image\n", "Allocating ftab, absorbFtab\n", "Entering Ebwt loop\n", "Getting block 1 of 1\n", " No samples; assembling all-inclusive block\n", " Sorting block of length 1077255099 for bucket 1\n", " (Using difference cover)\n", " Sorting block time: 00:20:49\n", "Returning block of 1077255100 for bucket 1\n", "Exited Ebwt loop\n", "fchr[A]: 0\n", "fchr[C]: 341519526\n", "fchr[G]: 538577822\n", "fchr[T]: 735626959\n", "fchr[$]: 1077255099\n", "Exiting Ebwt::buildToDisk()\n", "Returning from initFromVector\n", "Wrote 371984998 bytes to primary EBWT file: ../data/Olurida_v081-bowtie-index.1.bt2\n", "Wrote 269313780 bytes to secondary EBWT file: ../data/Olurida_v081-bowtie-index.2.bt2\n", "Re-opening _in1 and _in2 as input streams\n", "Returning from Ebwt constructor\n", "Headers:\n", " len: 1077255099\n", " bwtLen: 1077255100\n", " sz: 269313775\n", " bwtSz: 269313775\n", " lineRate: 6\n", " offRate: 4\n", " offMask: 0xfffffff0\n", " ftabChars: 10\n", " 
eftabLen: 20\n", " eftabSz: 80\n", " ftabLen: 1048577\n", " ftabSz: 4194308\n", " offsLen: 67328444\n", " offsSz: 269313776\n", " lineSz: 64\n", " sideSz: 64\n", " sideBwtSz: 48\n", " sideBwtLen: 192\n", " numSides: 5610704\n", " numLines: 5610704\n", " ebwtTotLen: 359085056\n", " ebwtTotSz: 359085056\n", " color: 0\n", " reverse: 0\n", "Total time for call to driver() for forward index: 00:27:26\n", "Reading reference sizes\n", " Time reading reference sizes: 00:00:07\n", "Calculating joined length\n", "Writing header\n", "Reserving space for joined string\n", "Joining reference sequences\n", " Time to join reference sequences: 00:00:09\n", " Time to reverse reference sequence: 00:00:00\n", "bmax according to bmaxDivN setting: 269313774\n", "Using parameters --bmax 201985331 --dcv 1024\n", " Doing ahead-of-time memory usage test\n", " Passed! Constructing with these parameters: --bmax 201985331 --dcv 1024\n", "Constructing suffix-array element generator\n", "Building DifferenceCoverSample\n", " Building sPrime\n", " Building sPrimeOrder\n", " V-Sorting samples\n", " V-Sorting samples time: 00:00:27\n", " Allocating rank array\n", " Ranking v-sort output\n", " Ranking v-sort output time: 00:00:10\n", " Invoking Larsson-Sadakane on ranks\n", " Invoking Larsson-Sadakane on ranks time: 00:00:15\n", " Sanity-checking and returning\n", "Building samples\n", "Reserving space for 12 sample suffixes\n", "Generating random suffixes\n", "QSorting 12 sample offsets, eliminating duplicates\n", "QSorting sample offsets, eliminating duplicates time: 00:00:00\n", "Multikey QSorting 12 samples\n", " (Using difference cover)\n", " Multikey QSorting samples time: 00:00:00\n", "Calculating bucket sizes\n", "Splitting and merging\n", " Splitting and merging time: 00:00:00\n", "Avg bucket size: 1.07726e+09 (target: 201985330)\n", "Converting suffix-array elements to index image\n", "Allocating ftab, absorbFtab\n", "Entering Ebwt loop\n", "Getting block 1 of 1\n", " No samples; 
assembling all-inclusive block\n", " Sorting block of length 1077255099 for bucket 1\n", " (Using difference cover)\n", " Sorting block time: 00:25:48\n", "Returning block of 1077255100 for bucket 1\n", "Exited Ebwt loop\n", "fchr[A]: 0\n", "fchr[C]: 341519526\n", "fchr[G]: 538577822\n", "fchr[T]: 735626959\n", "fchr[$]: 1077255099\n", "Exiting Ebwt::buildToDisk()\n", "Returning from initFromVector\n", "Wrote 371984998 bytes to primary EBWT file: ../data/Olurida_v081-bowtie-index.rev.1.bt2\n", "Wrote 269313780 bytes to secondary EBWT file: ../data/Olurida_v081-bowtie-index.rev.2.bt2\n", "Re-opening _in1 and _in2 as input streams\n", "Returning from Ebwt constructor\n", "Headers:\n", " len: 1077255099\n", " bwtLen: 1077255100\n", " sz: 269313775\n", " bwtSz: 269313775\n", " lineRate: 6\n", " offRate: 4\n", " offMask: 0xfffffff0\n", " ftabChars: 10\n", " eftabLen: 20\n", " eftabSz: 80\n", " ftabLen: 1048577\n", " ftabSz: 4194308\n", " offsLen: 67328444\n", " offsSz: 269313776\n", " lineSz: 64\n", " sideSz: 64\n", " sideBwtSz: 48\n", " sideBwtLen: 192\n", " numSides: 5610704\n", " numLines: 5610704\n", " ebwtTotLen: 359085056\n", " ebwtTotSz: 359085056\n", " color: 0\n", " reverse: 1\n", "Total time for backward call to driver() for mirror index: 00:31:59\n" ] } ], "source": [ "!/Applications/bioinfo/bowtie2-235/bowtie2-build \\\n", "../data/Olurida_v081.fa \\\n", "../data/Olurida_v081-bowtie-index" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reference notebook: [02.6-Oly-Bowtie.ipynb](https://nbviewer.jupyter.org/github/sr320/student-fish546-2016/blob/1cfbc742fe2811f0b1ac8558a4579fb788f0a0b0/jupyter/02.6-Oly-Bowtie.ipynb)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } },
"nbformat": 4, "nbformat_minor": 4 }