---
title: "FISH546 Week 1: Blast Assignment"
author: "Sam Cryan"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    theme: readable
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
---

This document builds a BLAST protein database from UniProt/Swiss-Prot, runs
`blastx` on a set of unknown sequences against it, and joins the resulting hits
to UniProt annotations.

```{r setup, include=FALSE}
# Global chunk defaults for the whole document.
knitr::opts_chunk$set(
  echo = TRUE,          # Display code chunks
  eval = FALSE,         # Do NOT evaluate chunks when knitting (code is shown only)
  warning = FALSE,      # Hide warnings
  message = FALSE,      # Hide messages
  fig.width = 6,        # Set plot width in inches
  fig.height = 4,       # Set plot height in inches
  fig.align = "center"  # Align plots to the center
)
```

## Setting Up Blast Scripts

**ONLY RUN ON JUPYTER**

```{bash}
# Download the BLAST+ 2.13.0 Linux binaries.
# The versioned directory is used instead of LATEST/, which only ever holds the
# newest release and therefore stops containing this file after an upgrade.
mkdir -p ~/applications
cd ~/applications
curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.13.0/ncbi-blast-2.13.0+-x64-linux.tar.gz
```

```{bash}
# Unpack the BLAST+ archive next to where it was downloaded.
cd ~/applications
tar -xf ncbi-blast-2.13.0+-x64-linux.tar.gz
```

Check that blast is installed:

```{bash}
~/applications/ncbi-blast-2.13.0+/bin/blastx -h
```

```{bash}
# Download Swiss-Prot into ./data, the directory every later chunk reads from.
# NOTE(review): this fetches the *current* UniProt release but stores it under an
# "r2023_01" name; confirm the label still matches the release being downloaded.
mkdir -p ./data
cd ./data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz
gunzip -k uniprot_sprot_r2023_01.fasta.gz
```

```{bash}
# Build a protein BLAST database from the Swiss-Prot FASTA.
mkdir -p ./blastdb
~/applications/ncbi-blast-2.13.0+/bin/makeblastdb \
  -in ./data/uniprot_sprot_r2023_01.fasta \
  -dbtype prot \
  -out ./blastdb/uniprot_sprot_r2023_01
```

## Download unknown BLAST sequences

```{bash}
# NOTE(review): -k disables TLS certificate verification for this host; drop it
# once the server's certificate validates.
mkdir -p ./data
curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \
  -k \
  > ./data/Ab_4denovo_CLC6_a.fa
```

```{bash}
# Translated search of the unknown nucleotide sequences against Swiss-Prot,
# keeping one target per query in tabular (outfmt 6) form.
mkdir -p ./output
~/applications/ncbi-blast-2.13.0+/bin/blastx \
  -query ./data/Ab_4denovo_CLC6_a.fa \
  -db ./blastdb/uniprot_sprot_r2023_01 \
  -out ./output/Ab_4-uniprot_blastx.tab \
  -evalue 1E-20 \
  -num_threads 20 \
  -max_target_seqs 1 \
  -outfmt 6
```

```{bash}
# Fetch the reviewed-entry annotation table from UniProt.
# `compressed=true` makes the server send gzip data, so it is decompressed on the
# fly and written to ./data, which is where the `head` chunk below looks for it.
mkdir -p ./data
curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Ccc_induction%2Cgo%2Cgo_p%2Cgo_id%2Cxref_unipathway&format=tsv&query=%28%2A%29%20AND%20%28reviewed%3Atrue%29&sort=gene%20asc" \
  | gunzip > ./data/uniprot_table_r2023_01.tab
```

```{bash}
# Split the "sp|ACCESSION|NAME" subject field into separate tab-delimited columns
# so the accession can be used as a join key.
tr '|' '\t' < ./output/Ab_4-uniprot_blastx.tab \
  > ./output/Ab_4-uniprot_blastx_sep.tab
```

```{bash}
head -2 ./data/uniprot_table_r2023_01.tab
# wc -l ./data/uniprot_table_r2023_01.tab
```

```{r load-libraries}
library(tidyverse)
library(kableExtra)
```

```{r read-data}
# BLAST hits (no header row, so columns are named V1, V2, ...).
bltabl <- read.csv(
  "./output/Ab_4-uniprot_blastx_sep.tab",
  sep = "\t",
  header = FALSE
)

# UniProt annotation table; read.csv() converts the header names to syntactic
# R names (e.g. "Protein names" -> Protein.names), which the join chunk relies on.
spgo <- read.csv(
  "https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab",
  sep = "\t",
  header = TRUE
)
```

```{r test-join}
# Attach UniProt annotations to each BLAST hit (V3 holds the accession), keep the
# columns of interest, shorten the long query-name prefix to "Ab", and render
# the result as a styled HTML table.
bltabl %>%
  left_join(spgo, by = c("V3" = "Entry")) %>%
  select(
    V1, V3, V13, Protein.names, Organism,
    Gene.Ontology..biological.process., Gene.Ontology.IDs
  ) %>%
  mutate(
    V1 = str_replace_all(
      V1,
      pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed",
      replacement = "Ab"
    )
  ) %>%
  kbl() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
```