--- title: "NCBI BLAST" date: "2023-04-11" output: html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show --- ```{r setup, include=F} library(knitr) library(tidyverse) library(Biostrings) library(DT) library(stringr) library(viridis) opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Don't evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` This document goes through the workflow to take an unknown multi-fasta file and annotate it using blast. # Download software For my Mac, I downloaded the x64-macosx version of blast. The latest version available during the generation of this report was 2.13. Download and unzip the new blast software into the right folder. ```{r, engine='bash'} # Download code in location you want cd /Applications/ curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-macosx.tar.gz # Unzip file tar -xf ncbi-blast-2.13.0+-x64-macosx.tar.gz ``` Before continuing, check that blast is in the right folder and it is functional by calling the help menu. ```{r, engine='bash'} # Check for location of blast ls /Applications/ # Look at help menu /Applications/ncbi-blast-2.13.0+/bin/blastx -h ``` # Make blast database Download and unzip the latest uniprot database. I put it in my repo but ignored it before pushing to GitHub because the file is very large. ```{r, engine='bash'} # Go to folder you want to download database in cd ../data/ # Download latest version of uniprot using curl curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz # Rename the file to indicate the version mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz # Unzip the file and make sure it is where you expect it to be gunzip -k uniprot_sprot_r2023_01.fasta.gz ls ``` Make blast database using the uniprot fasta and put the output in a blast database folder. ```{r, engine='bash'} # Run makeblastdb /Applications/ncbi-blast-2.13.0+/bin/makeblastdb \ -in uniprot_sprot_r2023_01.fasta \ -dbtype prot \ -out ../blastdb/uniprot_sprot_r2023_01 ``` # Get the query sequence Download unknown fasta file. ```{r, engine='bash'} # Use curl to download fasta file curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \ -k \ > Ab_4denovo_CLC6_a.fa ``` Check data is correctly downloaded by looking at the first few lines of code ```{r, engine='bash', eval=T} cd ../data/ head -3 Ab_4denovo_CLC6_a.fa ``` Get the amount of sequences in the fasta file ```{r, engine='bash', eval=T} echo "How many sequences are there?" grep -c ">" ../data/Ab_4denovo_CLC6_a.fa ``` Visualize GC content of sequences ```{r, eval=T} # Import fasta file fasta <- readDNAStringSet("../data/Ab_4denovo_CLC6_a.fa") # Create a data frame with GC content gc_df <- data.frame(GC_content = letterFrequency(fasta, "GC", as.prob = TRUE)) # Plot histogram using ggplot2 ggplot(gc_df, aes(x = G.C)) + geom_histogram(binwidth = 0.01, color = "black", fill = "blue", alpha = 0.75) + labs(title = "Histogram of GC content", x = "GC content", y = "Frequency") + theme_bw() ``` # Run blast Annotate the unknown sequences using blast. ```{r, engine='bash'} /Applications/ncbi-blast-2.13.0+/bin/blastx \ -query Ab_4denovo_CLC6_a.fa \ -db ../blastdb/uniprot_sprot_r2023_01 \ -out ../output/Ab_4-uniprot_blastx.tab \ -evalue 1E-20 \ -num_threads 8 \ -max_target_seqs 1 \ -outfmt 6 ``` Check the output file by looking at the first few lines and the number of lines. ```{r, engine='bash', eval=T} head -1 ../output/Ab_4-uniprot_blastx.tab ``` ```{r, engine='bash', eval=T} echo "Number of lines" wc -l ../output/Ab_4-uniprot_blastx.tab ``` # Joining blast table with annotation table Create tables with all the uniprot data in R and look at them separately. ```{r, eval=T} # Read in data bltabl <- read.csv("../output/Ab_4-uniprot_blastx_sep.tab", sep = '\t', header = FALSE) # Create readable table datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE)) ``` ```{r, eval=T} # Read in data spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE) # Create readable table datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE)) ``` Make the blast table format easy to join with the annotation table and check output. ```{r, engine='bash', eval=T} # Edit format of file tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab \ > ../output/Ab_4-uniprot_blastx_sep.tab # Look at first line head -1 ../output/Ab_4-uniprot_blastx_sep.tab ``` Combine blast and annotation tables using a left join and clean up the names. Write the file to repo. ```{r} left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) %>% write_delim("../output/blast_annot_go.tab", delim = '\t') ``` Visualize data. ```{r, eval=T} annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE) # Count the occurrences of each protein string_counts <- table(annot_tab[["Gene.Ontology..biological.process."]]) # Convert to a data frame, sort by count, and select the top 10 string_counts_df <- as.data.frame(string_counts) colnames(string_counts_df) <- c("Process", "Count") string_counts_df <- string_counts_df[order(string_counts_df$Count, decreasing = TRUE), ] string_counts_df$Process <- as.character(string_counts_df$Process) top_10_strings <- head(subset(string_counts_df, nchar(string_counts_df$Process) > 0), n = 10) # Clean up string names top_10_strings$Process <- str_split_fixed(top_10_strings$Process, "GO", 2)[,1] top_10_strings$Process <- str_sub(top_10_strings$Process, 1, str_length(top_10_strings$Process)-2) # Plot the top 10 most common strings using ggplot2 ggplot(top_10_strings, aes(x = reorder(Process, -Count), y = Count, fill = Process)) + geom_bar(stat = "identity", position = "dodge", color = "black") + labs(title = "Top 10 Process Hits", x = "Process", y = "Count") + theme_minimal() + theme(legend.position = "none") + scale_fill_brewer(palette="Set1") + coord_flip() ```