This document goes through the workflow to take an unknown multi-fasta file and annotate it using blast.

1 Download software

For my Mac, I downloaded the x64-macosx version of blast. The latest version available during the generation of this report was 2.13.

Download and unzip the new blast software into the right folder.

# Download code in location you want
cd /Applications/
curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-macosx.tar.gz

# Unzip file
tar -xf ncbi-blast-2.13.0+-x64-macosx.tar.gz

Before continuing, check that blast is in the right folder and it is functional by calling the help menu.

# Check for location of blast
ls /Applications/

# Look at help menu
/Applications/ncbi-blast-2.13.0+/bin/blastx -h

2 Make blast database

Download and unzip the latest uniprot database. I put it in my repo but ignored it before pushing to GitHub because the file is very large.

# Go to folder you want to download database in
cd ../data/

# Download latest version of uniprot using curl
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

# Rename the file to indicate the version
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz

# Unzip the file and make sure it is where you expect it to be
gunzip -k uniprot_sprot_r2023_01.fasta.gz
ls

Make blast database using the uniprot fasta and put the output in a blast database folder.

# Run makeblastdb

/Applications/ncbi-blast-2.13.0+/bin/makeblastdb \
-in uniprot_sprot_r2023_01.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2023_01

3 Get the query sequence

Download unknown fasta file.

# Use curl to download fasta file

curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \
-k \
> Ab_4denovo_CLC6_a.fa

Check data is correctly downloaded by looking at the first few lines of code

cd ../data/
head -3 Ab_4denovo_CLC6_a.fa
## >solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1
## ACACCCCACCCCAACGCACCCTCACCCCCACCCCAACAATCCATGATTGAATACTTCATC
## TATCCAAGACAAACTCCTCCTACAATCCATGATAGAATTCCTCCAAAAATAATTTCACAC

Get the amount of sequences in the fasta file

echo "How many sequences are there?"
grep -c ">" ../data/Ab_4denovo_CLC6_a.fa
## How many sequences are there?
## 5490

Visualize GC content of sequences

# Import fasta file
fasta <- readDNAStringSet("../data/Ab_4denovo_CLC6_a.fa")

# Create a data frame with GC content
gc_df <- data.frame(GC_content = letterFrequency(fasta, "GC", as.prob = TRUE))

# Plot histogram using ggplot2
ggplot(gc_df, aes(x = G.C)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "blue", alpha = 0.75) +
  labs(title = "Histogram of GC content",
       x = "GC content",
       y = "Frequency") +
  theme_bw()

4 Run blast

Annotate the unknown sequences using blast.

/Applications/ncbi-blast-2.13.0+/bin/blastx \
-query Ab_4denovo_CLC6_a.fa \
-db ../blastdb/uniprot_sprot_r2023_01 \
-out ../output/Ab_4-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 8 \
-max_target_seqs 1 \
-outfmt 6

Check the output file by looking at the first few lines and the number of lines.

head -1 ../output/Ab_4-uniprot_blastx.tab
## solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3 sp|O42248|GBLP_DANRE    82.456  171 30  0   1   513 35  205 2.81e-103   301
echo "Number of lines"
wc -l ../output/Ab_4-uniprot_blastx.tab
## Number of lines
##      337 ../output/Ab_4-uniprot_blastx.tab

5 Joining blast table with annotation table

Create tables with all the uniprot data in R and look at them separately.

# Read in data
bltabl <- read.csv("../output/Ab_4-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)

# Create readable table
datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
# Read in data
spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)

# Create readable table
datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))

Make the blast table format easy to join with the annotation table and check output.

# Edit format of file
tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab \
> ../output/Ab_4-uniprot_blastx_sep.tab

# Look at first line
head -1 ../output/Ab_4-uniprot_blastx_sep.tab
## solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3 sp  O42248  GBLP_DANRE  82.456  171 30  0   1   513 35  205 2.81e-103   301

Combine blast and annotation tables using a left join and clean up the names. Write the file to repo.

left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, 
            pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) %>%
  write_delim("../output/blast_annot_go.tab", delim = '\t')

Visualize data.

annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE)

# Count the occurrences of each protein
string_counts <- table(annot_tab[["Gene.Ontology..biological.process."]])

# Convert to a data frame, sort by count, and select the top 10
string_counts_df <- as.data.frame(string_counts)
colnames(string_counts_df) <- c("Process", "Count")
string_counts_df <- string_counts_df[order(string_counts_df$Count, decreasing = TRUE), ]
string_counts_df$Process <- as.character(string_counts_df$Process)
top_10_strings <- head(subset(string_counts_df, nchar(string_counts_df$Process) > 0), n = 10)

# Clean up string names
top_10_strings$Process <- str_split_fixed(top_10_strings$Process, "GO", 2)[,1]
top_10_strings$Process <- str_sub(top_10_strings$Process, 1, str_length(top_10_strings$Process)-2)

# Plot the top 10 most common strings using ggplot2
ggplot(top_10_strings, aes(x = reorder(Process, -Count), y = Count, fill = Process)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Top 10 Process Hits",
       x = "Process",
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_brewer(palette="Set1") +
  coord_flip()