This code uses NCBI Blast to identify and annotate unidentified sequences.

Load necessary packages

library(tidyverse)
library(DT)
library(splitstackshape)
library(reshape2)

Download Software

Move to the directory you prefer to work in

cd /home/jovyan/applications

Download Blast software from NCBI

#download from url
curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz

#unzip file
tar -xf ncbi-blast-2.13.0+-x64-linux.tar.gz

Check if software is working

~/applications/ncbi-blast-2.13.0+/bin/blastx -h

Make blast database

Download uniprot database

#change directory using a relative file path
cd ../data

#download from url
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

#rename the file
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz

#unzip data
gunzip -k uniprot_sprot_r2023_01.fasta.gz

#list the files in the data directory
ls ../data

Make the database using blast

~/applications/ncbi-blast-2.13.0+/bin/makeblastdb \
-in ../data/uniprot_sprot_r2023_01.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2023_01

Get the query sequence

#download frojm url
curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \
-k \
> ../data/Ab_4denovo_CLC6_a.fa

Explore the query file

head ../data/Ab_4denovo_CLC6_a.fa
echo "How many sequences are there?"
grep -c ">" ../data/Ab_4denovo_CLC6_a.fa
## >solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1
## ACACCCCACCCCAACGCACCCTCACCCCCACCCCAACAATCCATGATTGAATACTTCATC
## TATCCAAGACAAACTCCTCCTACAATCCATGATAGAATTCCTCCAAAAATAATTTCACAC
## TGAAACTCCGGTATCCGAGTTATTTTGTTCCCAGTAAAATGGCATCAACAAAAGTAGGTC
## TGGATTAACGAACCAATGTTGCTGCGTAATATCCCATTGACATATCTTGTCGATTCCTAC
## CAGGATCCGGACTGACGAGATTTCACTGTACGTTTATGCAAGTCATTTCCATATATAAAA
## TTGGATCTTATTTGCACAGTTAAATGTCTCTATGCTTATTTATAAATCAATGCCCGTAAG
## CTCCTAATATTTCTCTTTTCGTCCGACGAGCAAACAGTGAGTTTACTGTGGCCTTCAGCA
## AAAGTATTGATGTTGTAAATCTCAGTTGTGATTGAACAATTTGCCTCACTAGAAGTAGCC
## TTC
## How many sequences are there?
## 5490

Run Blast

~/applications/ncbi-blast-2.13.0+/bin/blastx \
-query ../data/Ab_4denovo_CLC6_a.fa \
-db ../blastdb/uniprot_sprot_r2023_01 \
-out ../output/Ab_4-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 8 \
-max_target_seqs 1 \
-outfmt 6
#evalue is the percent match

Explore blast output

head -2 ../output/Ab_4-uniprot_blastx.tab
wc -l ../output/Ab_4-uniprot_blastx.tab
## solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3 sp|O42248|GBLP_DANRE    82.456  171 30  0   1   513 35  205 2.81e-103   301
## solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5 sp|Q08013|SSRG_RAT  75.385  65  16  0   3   197 121 185 1.40e-28    104
## 765 ../output/Ab_4-uniprot_blastx.tab

Join blast table with annotation table

Change format of blast output

tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab \
> ../output/Ab_4-uniprot_blastx_sep.tab

Read in annotation table

#read file in from url
spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)

Read data table into rstudio

bltabl <- read.csv("../output/Ab_4-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)

Join annotation table and data table

left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, 
            pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) %>%
  write_delim("../output/blast_annot_go.tab", delim = '\t')

Visualize combined table

#read in combined table
annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE)

#visualize table
datatable(annot_tab)

Visualize data

make frequency table of biological processes

#split gene ontology into multiple columns
gene_ontology<-cSplit(data.frame(annot_tab), 'Gene.Ontology..biological.process.', ";")

#remove first six columns
gene_ontology1 <- gene_ontology[, -c(1:6)] 

#re-format to put each occurrence of each process in one column
gene_ontology2<-melt(gene_ontology1,0)

#count number of occurences of each process
gene_ontology3 <- as.data.frame(table(gene_ontology2$value))

#get the top 10 processes
top_process<-top_n(gene_ontology3, 15,Freq)
#add x-value column
top_process$Process<-"Process"

make stacked bar plot

ggplot(top_process, aes(y=Freq, x=reorder(Var1, Freq))) + 
    geom_bar(position="dodge", stat="identity")+ theme_bw()+coord_flip()+ylab("Frequency")+xlab("Biological Process")