In this script, we will be taking genetic sequence reads and giving
them taxonomic assignments using BLAST. This was originally written
using Jupyter
Set Up
here are the packages that are used in this script
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
knitr::opts_chunk$set(
echo = TRUE, # Display code chunks
eval = FALSE, # Evaluate code chunks
warning = FALSE, # Hide warnings
message = FALSE, # Hide messages
fig.width = 6, # Set plot width in inches
fig.height = 4, # Set plot height in inches
fig.align = "center" # Align plots to the center
)
Making the Blast
Database
download the
file
This is downloading the NCBI Blast software. I put it in a folder
called applications. cd is indicating a new director and curl is
downloading from the given website.
cd /home/jovyan/applications
curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz
This unzips the file
tar -xf ncbi-blast-2.13.0+-x64-linux.tar.gz
Check that the software is working by using the help function
(-h)
~/applications/ncbi-blast-2.13.0+/bin/blastx -h
Make a database
This is putting our sequences in a folder named Data. using ls will
return all a list of files in the folder so we know that they went to
the right place
cd ../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz
gunzip -k uniprot_sprot_r2023_01.fasta.gz
ls ../data
~/applications/ncbi-blast-2.13.0+/bin/makeblastdb \
-in ../data/uniprot_sprot_r2023_01.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2023_01
Get the query
sequence
This code downloads a query sequence and saves it in the data
folder
curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \
-k \
> ../data/Ab_4denovo_CLC6_a.fa
The head function allows us to look at the first few lines of the
data. This code will also return the number of sequences in our
data.
head ../data/Ab_4denovo_CLC6_a.fa
echo "How many sequences are there?"
grep -c ">" ../data/Ab_4denovo_CLC6_a.fa
Run Blast
This section uses the NCBI Blast software using the query sequence
and database we downloaded. It creates an output file (.tab) with the
results.
~/applications/ncbi-blast-2.13.0+/bin/blastx \
-query ../data/Ab_4denovo_CLC6_a.fa \
-db ../blastdb/uniprot_sprot_r2023_01 \
-out ../output/Ab_4-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 20 \
-max_target_seqs 1 \
-outfmt 6
LS0tCnRpdGxlOiAiQkxBU1QsIGJ1dCBtYWtlIGl0IGxvb2sgbmljZSIKYXV0aG9yOiAiU2FyYWggWWVycmFjZSIKZGF0ZTogIjIwMjMtMDQtMTMiCm91dHB1dDogCiAgaHRtbF9kb2N1bWVudDoKICAgIHRoZW1lOiByZWFkYWJsZQogICAgaGlnaGxpZ2h0OiB0YW5nbwogICAgdG9jOiB0cnVlCiAgICB0b2NfZmxvYXQ6IHRydWUKICAgIG51bWJlcl9zZWN0aW9uczogdHJ1ZQogICAgY29kZV9mb2xkaW5nOiBzaG93CiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCi0tLQoKSW4gdGhpcyBzY3JpcHQsIHdlIHdpbGwgYmUgdGFraW5nIGdlbmV0aWMgc2VxdWVuY2UgcmVhZHMgYW5kIGdpdmluZyB0aGVtIHRheG9ub21pYyBhc3NpZ25tZW50cyB1c2luZyBCTEFTVC4gVGhpcyB3YXMgb3JpZ2luYWxseSB3cml0dGVuIHVzaW5nIEp1cHl0ZXIKCiMgU2V0IFVwCmhlcmUgYXJlIHRoZSBwYWNrYWdlcyB0aGF0IGFyZSB1c2VkIGluIHRoaXMgc2NyaXB0CgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1UUlVFLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpsaWJyYXJ5KGtuaXRyKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShrYWJsZUV4dHJhKQpsaWJyYXJ5KERUKQpsaWJyYXJ5KEJpb3N0cmluZ3MpCmxpYnJhcnkodG0pCmtuaXRyOjpvcHRzX2NodW5rJHNldCgKICBlY2hvID0gVFJVRSwgICAgICAgICAjIERpc3BsYXkgY29kZSBjaHVua3MKICBldmFsID0gRkFMU0UsICAgICAgICAjIEV2YWx1YXRlIGNvZGUgY2h1bmtzCiAgd2FybmluZyA9IEZBTFNFLCAgICAgIyBIaWRlIHdhcm5pbmdzCiAgbWVzc2FnZSA9IEZBTFNFLCAgICAgIyBIaWRlIG1lc3NhZ2VzCiAgZmlnLndpZHRoID0gNiwgICAgICAgIyBTZXQgcGxvdCB3aWR0aCBpbiBpbmNoZXMKICBmaWcuaGVpZ2h0ID0gNCwgICAgICAjIFNldCBwbG90IGhlaWdodCBpbiBpbmNoZXMKICBmaWcuYWxpZ24gPSAiY2VudGVyIiAjIEFsaWduIHBsb3RzIHRvIHRoZSBjZW50ZXIKKQpgYGAKCiMgTWFraW5nIHRoZSBCbGFzdCBEYXRhYmFzZQojIyBkb3dubG9hZCB0aGUgZmlsZQoKVGhpcyBpcyBkb3dubG9hZGluZyB0aGUgTkNCSSBCbGFzdCBzb2Z0d2FyZS4gSSBwdXQgaXQgaW4gYSBmb2xkZXIgY2FsbGVkIGFwcGxpY2F0aW9ucy4KY2QgaXMgaW5kaWNhdGluZyBhIG5ldyBkaXJlY3RvciBhbmQgY3VybCBpcyBkb3dubG9hZGluZyBmcm9tIHRoZSBnaXZlbiB3ZWJzaXRlLgoKYGBge3IsIGVuZ2luZSA9ICdiYXNoJ30KY2QgL2hvbWUvam92eWFuL2FwcGxpY2F0aW9ucwpjdXJsIC1PIGh0dHBzOi8vZnRwLm5jYmkubmxtLm5paC5nb3YvYmxhc3QvZXhlY3V0YWJsZXMvYmxhc3QrL0xBVEVTVC9uY2JpLWJsYXN0LTIuMTMuMCsteDY0LWxpbnV4LnRhci5negpgYGAKClRoaXMgdW56aXBzIHRoZSBmaWxlCgpgYGB7ciwgZW5naW5lID0gJ2Jhc2gnfQp0YXIgLXhmIG5jYmktYmxhc3QtMi4xMy4wKy14NjQtbGludXgudGFyLmd6CmBgYAoKQ2hlY2sgdGhhdCB0aGUgc29mdHdhcmUgaXMgd29ya2luZyBieSB1c2luZyB0aGUgaGVscCBmdW5jdGlvbiAoLWgpCgpgYGB7ciwgZW5naW5lID0gJ2Jhc2gnfQp+L2FwcGxpY2F0aW9ucy9uY2JpLWJsYXN0LTIuMTMuMCsvYmluL2JsYXN0eCAtaApgYGAKCiMjIE1ha2UgYSBkYXRhYmFzZQpUaGlzIGlzIHB1dHRpbmcgb3VyIHNlcXVlbmNlcyBpbiBhIGZvbGRlciBuYW1lZCBEYXRhLgp1c2luZyBscyB3aWxsIHJldHVybiBhbGwgYSBsaXN0IG9mIGZpbGVzIGluIHRoZSBmb2xkZXIgc28gd2Uga25vdyB0aGF0IHRoZXkgd2VudCB0byB0aGUgcmlnaHQgcGxhY2UKCmBgYHtyLCBlbmdpbmUgPSAnYmFzaCd9CmNkIC4uL2RhdGEKY3VybCAtTyBodHRwczovL2Z0cC51bmlwcm90Lm9yZy9wdWIvZGF0YWJhc2VzL3VuaXByb3QvY3VycmVudF9yZWxlYXNlL2tub3dsZWRnZWJhc2UvY29tcGxldGUvdW5pcHJvdF9zcHJvdC5mYXN0YS5negptdiB1bmlwcm90X3Nwcm90LmZhc3RhLmd6IHVuaXByb3Rfc3Byb3RfcjIwMjNfMDEuZmFzdGEuZ3oKZ3VuemlwIC1rIHVuaXByb3Rfc3Byb3RfcjIwMjNfMDEuZmFzdGEuZ3oKbHMgLi4vZGF0YQpgYGAKCmBgYHtyLCBlbmdpbmUgPSAnYmFzaCd9Cn4vYXBwbGljYXRpb25zL25jYmktYmxhc3QtMi4xMy4wKy9iaW4vbWFrZWJsYXN0ZGIgXAotaW4gLi4vZGF0YS91bmlwcm90X3Nwcm90X3IyMDIzXzAxLmZhc3RhIFwKLWRidHlwZSBwcm90IFwKLW91dCAuLi9ibGFzdGRiL3VuaXByb3Rfc3Byb3RfcjIwMjNfMDEKYGBgCgojIyBHZXQgdGhlIHF1ZXJ5IHNlcXVlbmNlClRoaXMgY29kZSBkb3dubG9hZHMgYSBxdWVyeSBzZXF1ZW5jZSBhbmQgc2F2ZXMgaXQgaW4gdGhlIGRhdGEgZm9sZGVyCgpgYGB7ciwgZW5naW5lID0gJ2Jhc2gnfQpjdXJsIGh0dHBzOi8vZWFnbGUuZmlzaC53YXNoaW5ndG9uLmVkdS9jbmlkYXJpYW4vQWJfNGRlbm92b19DTEM2X2EuZmEgXAotayBcCj4gLi4vZGF0YS9BYl80ZGVub3ZvX0NMQzZfYS5mYQpgYGAKClRoZSBoZWFkIGZ1bmN0aW9uIGFsbG93cyB1cyB0byBsb29rIGF0IHRoZSBmaXJzdCBmZXcgbGluZXMgb2YgdGhlIGRhdGEuIFRoaXMgY29kZSB3aWxsIGFsc28gcmV0dXJuIHRoZSBudW1iZXIgb2Ygc2VxdWVuY2VzIGluIG91ciBkYXRhLgoKYGBge3IsIGVuZ2luZSA9ICdiYXNoJ30KaGVhZCAuLi9kYXRhL0FiXzRkZW5vdm9fQ0xDNl9hLmZhCmVjaG8gIkhvdyBtYW55IHNlcXVlbmNlcyBhcmUgdGhlcmU/IgpncmVwIC1jICI+IiAuLi9kYXRhL0FiXzRkZW5vdm9fQ0xDNl9hLmZhCmBgYAoKIyBSdW4gQmxhc3QKClRoaXMgc2VjdGlvbiB1c2VzIHRoZSBOQ0JJIEJsYXN0IHNvZnR3YXJlIHVzaW5nIHRoZSBxdWVyeSBzZXF1ZW5jZSBhbmQgZGF0YWJhc2Ugd2UgZG93bmxvYWRlZC4gSXQgY3JlYXRlcyBhbiBvdXRwdXQgZmlsZSAoLnRhYikgd2l0aCB0aGUgcmVzdWx0cy4KCmBgYHtyLCBlbmdpbmUgPSAnYmFzaCd9Cn4vYXBwbGljYXRpb25zL25jYmktYmxhc3QtMi4xMy4wKy9iaW4vYmxhc3R4IFwKLXF1ZXJ5IC4uL2RhdGEvQWJfNGRlbm92b19DTEM2X2EuZmEgXAotZGIgLi4vYmxhc3RkYi91bmlwcm90X3Nwcm90X3IyMDIzXzAxIFwKLW91dCAuLi9vdXRwdXQvQWJfNC11bmlwcm90X2JsYXN0eC50YWIgXAotZXZhbHVlIDFFLTIwIFwKLW51bV90aHJlYWRzIDIwIFwKLW1heF90YXJnZXRfc2VxcyAxIFwKLW91dGZtdCA2CmBgYAoKIyMgQWRkIEFubm90YXRpb24gaW5mb3JtYXRpb24KU3RhcnQgYnkgY2hhbmdpbmcgdGhlIGZvcm1hdCBvZiBibGFzdCBvdXRwdXQuIFRoaXMgbmV4dCBzZWN0aW9uIHJlcGxhY2VzICJ8IiB3aXRoICJcdCIgYW5kIHNhdmVzIGEgbmV3IG91dHB1dCBmaWxlLiBUaGlzIGNoYW5nZSB3aWxsIG1ha2UgaXQgZWFzaWVyIHRvIGpvaW4gd2l0aCBvdXQgc2Vjb25kIHRhYmxlLgoKYGBge3IsIGVuZ2luZSA9ICdiYXNoJ30KdHIgJ3wnICdcdCcgPCAuLi9vdXRwdXQvQWJfNC11bmlwcm90X2JsYXN0eC50YWIgXAo+IC4uL291dHB1dC9BYl80LXVuaXByb3RfYmxhc3R4X3NlcC50YWIKYGBgCgpUaGlzIGNvZGUgcmVhZHMgdGhlIENTVnMgYW5kIHN0b3JlcyB0aGVtIGFzIG9iamVjdHMKCmBgYHtyLCBlbmdpbmUgPSAnYmFzaCd9CmJsdGFibCA8LSByZWFkLmNzdigiLi4vb3V0cHV0L0FiXzQtdW5pcHJvdF9ibGFzdHhfc2VwLnRhYiIsIHNlcCA9ICdcdCcsIGhlYWRlciA9IEZBTFNFKQpzcGdvIDwtIHJlYWQuY3N2KCJodHRwczovL2dhbm5ldC5maXNoLndhc2hpbmd0b24uZWR1L3NlYXNoZWxsL3NuYXBzL3VuaXByb3RfdGFibGVfcjIwMjNfMDEudGFiIiwgc2VwID0gJ1x0JywgaGVhZGVyID0gVFJVRSkKYGBgCgpUaGlzIGNodW5rIGpvaW5zIHRoZSB0d28gQ1NWcyB0b2dldGhlciBpbiBvbmUgdGFibGUuIFRoZXJlIGFyZSBtYW55IG9wdGlvbnMgZm9yIGV4cGxvcmluZyB0aGlzIGRhdGEgc2V0LCB0aGlzIGlzIGp1c3Qgb25lIG9mIHRoZW0uCgpgYGB7ciwgZW5naW5lID0gJ2Jhc2gnfQpsZWZ0X2pvaW4oYmx0YWJsLCBzcGdvLCAgYnkgPSBjKCJWMyIgPSAiRW50cnkiKSkgJT4lCiAgc2VsZWN0KFYxLCBWMywgVjEzLCBQcm90ZWluLm5hbWVzLCBPcmdhbmlzbSwgR2VuZS5PbnRvbG9neS4uYmlvbG9naWNhbC5wcm9jZXNzLiwgR2VuZS5PbnRvbG9neS5JRHMpICU+JSBtdXRhdGUoVjEgPSBzdHJfcmVwbGFjZV9hbGwoVjEsIAogICAgICAgICAgICBwYXR0ZXJuID0gInNvbGlkMDA3OF8yMDExMDQxMl9GUkFHX0JDX1dISVRFX1dISVRFX0YzX1FWX1NFX3RyaW1tZWQiLCByZXBsYWNlbWVudCA9ICJBYiIpKSAlPiUKICB3cml0ZV9kZWxpbSgiLi4vb3V0cHV0L2JsYXN0X2Fubm90X2dvLnRhYiIsIGRlbGltID0gJ1x0JykKYGBg