This document goes through the workflow to take an unknown multi-fasta file and annotate it using blast.
For my Mac, I downloaded the x64-macosx version of blast. The latest version available during the generation of this report was 2.13.
Download and unzip the new blast software into the right folder.
# Download code in location you want
cd /Applications/
curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-macosx.tar.gz
# Unzip file
tar -xf ncbi-blast-2.13.0+-x64-macosx.tar.gz
Before continuing, check that blast is in the right folder and it is functional by calling the help menu.
# Check for location of blast
ls /Applications/
# Look at help menu
/Applications/ncbi-blast-2.13.0+/bin/blastx -h
Download and unzip the latest uniprot database. I put it in my repo but ignored it before pushing to GitHub because the file is very large.
# Go to folder you want to download database in
cd ../data/
# Download latest version of uniprot using curl
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
# Rename the file to indicate the version
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz
# Unzip the file and make sure it is where you expect it to be
gunzip -k uniprot_sprot_r2023_01.fasta.gz
ls
Make blast database using the uniprot fasta and put the output in a blast database folder.
# Run makeblastdb
/Applications/ncbi-blast-2.13.0+/bin/makeblastdb \
-in uniprot_sprot_r2023_01.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2023_01
Download unknown fasta file.
# Use curl to download fasta file
curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \
-k \
> Ab_4denovo_CLC6_a.fa
Check data is correctly downloaded by looking at the first few lines of code
cd ../data/
head -3 Ab_4denovo_CLC6_a.fa
## >solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1
## ACACCCCACCCCAACGCACCCTCACCCCCACCCCAACAATCCATGATTGAATACTTCATC
## TATCCAAGACAAACTCCTCCTACAATCCATGATAGAATTCCTCCAAAAATAATTTCACAC
Get the amount of sequences in the fasta file
echo "How many sequences are there?"
grep -c ">" ../data/Ab_4denovo_CLC6_a.fa
## How many sequences are there?
## 5490
Visualize GC content of sequences
# Import fasta file
fasta <- readDNAStringSet("../data/Ab_4denovo_CLC6_a.fa")
# Create a data frame with GC content
gc_df <- data.frame(GC_content = letterFrequency(fasta, "GC", as.prob = TRUE))
# Plot histogram using ggplot2
ggplot(gc_df, aes(x = G.C)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "blue", alpha = 0.75) +
labs(title = "Histogram of GC content",
x = "GC content",
y = "Frequency") +
theme_bw()
Annotate the unknown sequences using blast.
/Applications/ncbi-blast-2.13.0+/bin/blastx \
-query Ab_4denovo_CLC6_a.fa \
-db ../blastdb/uniprot_sprot_r2023_01 \
-out ../output/Ab_4-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 8 \
-max_target_seqs 1 \
-outfmt 6
Check the output file by looking at the first few lines and the number of lines.
head -1 ../output/Ab_4-uniprot_blastx.tab
## solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3 sp|O42248|GBLP_DANRE 82.456 171 30 0 1 513 35 205 2.81e-103 301
echo "Number of lines"
wc -l ../output/Ab_4-uniprot_blastx.tab
## Number of lines
## 337 ../output/Ab_4-uniprot_blastx.tab
Create tables with all the uniprot data in R and look at them separately.
# Read in data
bltabl <- read.csv("../output/Ab_4-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)
# Create readable table
datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
# Read in data
spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)
# Create readable table
datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
Make the blast table format easy to join with the annotation table and check output.
# Edit format of file
tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab \
> ../output/Ab_4-uniprot_blastx_sep.tab
# Look at first line
head -1 ../output/Ab_4-uniprot_blastx_sep.tab
## solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3 sp O42248 GBLP_DANRE 82.456 171 30 0 1 513 35 205 2.81e-103 301
Combine blast and annotation tables using a left join and clean up the names. Write the file to repo.
left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1,
pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) %>%
write_delim("../output/blast_annot_go.tab", delim = '\t')
Visualize data.
annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE)
# Count the occurrences of each protein
string_counts <- table(annot_tab[["Gene.Ontology..biological.process."]])
# Convert to a data frame, sort by count, and select the top 10
string_counts_df <- as.data.frame(string_counts)
colnames(string_counts_df) <- c("Process", "Count")
string_counts_df <- string_counts_df[order(string_counts_df$Count, decreasing = TRUE), ]
string_counts_df$Process <- as.character(string_counts_df$Process)
top_10_strings <- head(subset(string_counts_df, nchar(string_counts_df$Process) > 0), n = 10)
# Clean up string names
top_10_strings$Process <- str_split_fixed(top_10_strings$Process, "GO", 2)[,1]
top_10_strings$Process <- str_sub(top_10_strings$Process, 1, str_length(top_10_strings$Process)-2)
# Plot the top 10 most common strings using ggplot2
ggplot(top_10_strings, aes(x = reorder(Process, -Count), y = Count, fill = Process)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = "Top 10 Process Hits",
x = "Process",
y = "Count") +
theme_minimal() +
theme(legend.position = "none") +
scale_fill_brewer(palette="Set1") +
coord_flip()