This code uses NCBI Blast to identify and annotate unidentified sequences.
library(tidyverse)
library(DT)
library(splitstackshape)
library(reshape2)
Move to the directory you prefer to work in
cd /home/jovyan/applications
Download Blast software from NCBI
#download from url
curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz
#unzip file
tar -xf ncbi-blast-2.13.0+-x64-linux.tar.gz
Check if software is working
~/applications/ncbi-blast-2.13.0+/bin/blastx -h
Download uniprot database
#change directory using a relative file path
cd ../data
#download from url
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
#rename the file
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz
#unzip data
gunzip -k uniprot_sprot_r2023_01.fasta.gz
#list the files in the data directory
ls ../data
Make the database using blast
~/applications/ncbi-blast-2.13.0+/bin/makeblastdb \
-in ../data/uniprot_sprot_r2023_01.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2023_01
Get the query sequence
#download frojm url
curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \
-k \
> ../data/Ab_4denovo_CLC6_a.fa
Explore the query file
head ../data/Ab_4denovo_CLC6_a.fa
echo "How many sequences are there?"
grep -c ">" ../data/Ab_4denovo_CLC6_a.fa
## >solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1
## ACACCCCACCCCAACGCACCCTCACCCCCACCCCAACAATCCATGATTGAATACTTCATC
## TATCCAAGACAAACTCCTCCTACAATCCATGATAGAATTCCTCCAAAAATAATTTCACAC
## TGAAACTCCGGTATCCGAGTTATTTTGTTCCCAGTAAAATGGCATCAACAAAAGTAGGTC
## TGGATTAACGAACCAATGTTGCTGCGTAATATCCCATTGACATATCTTGTCGATTCCTAC
## CAGGATCCGGACTGACGAGATTTCACTGTACGTTTATGCAAGTCATTTCCATATATAAAA
## TTGGATCTTATTTGCACAGTTAAATGTCTCTATGCTTATTTATAAATCAATGCCCGTAAG
## CTCCTAATATTTCTCTTTTCGTCCGACGAGCAAACAGTGAGTTTACTGTGGCCTTCAGCA
## AAAGTATTGATGTTGTAAATCTCAGTTGTGATTGAACAATTTGCCTCACTAGAAGTAGCC
## TTC
## How many sequences are there?
## 5490
~/applications/ncbi-blast-2.13.0+/bin/blastx \
-query ../data/Ab_4denovo_CLC6_a.fa \
-db ../blastdb/uniprot_sprot_r2023_01 \
-out ../output/Ab_4-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 8 \
-max_target_seqs 1 \
-outfmt 6
#evalue is the percent match
Explore blast output
head -2 ../output/Ab_4-uniprot_blastx.tab
wc -l ../output/Ab_4-uniprot_blastx.tab
## solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3 sp|O42248|GBLP_DANRE 82.456 171 30 0 1 513 35 205 2.81e-103 301
## solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5 sp|Q08013|SSRG_RAT 75.385 65 16 0 3 197 121 185 1.40e-28 104
## 765 ../output/Ab_4-uniprot_blastx.tab
Change format of blast output
tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab \
> ../output/Ab_4-uniprot_blastx_sep.tab
Read in annotation table
#read file in from url
spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)
Read data table into rstudio
bltabl <- read.csv("../output/Ab_4-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)
Join annotation table and data table
left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1,
pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) %>%
write_delim("../output/blast_annot_go.tab", delim = '\t')
Visualize combined table
#read in combined table
annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE)
#visualize table
datatable(annot_tab)
make frequency table of biological processes
#split gene ontology into multiple columns
gene_ontology<-cSplit(data.frame(annot_tab), 'Gene.Ontology..biological.process.', ";")
#remove first six columns
gene_ontology1 <- gene_ontology[, -c(1:6)]
#re-format to put each occurrence of each process in one column
gene_ontology2<-melt(gene_ontology1,0)
#count number of occurences of each process
gene_ontology3 <- as.data.frame(table(gene_ontology2$value))
#get the top 10 processes
top_process<-top_n(gene_ontology3, 15,Freq)
#add x-value column
top_process$Process<-"Process"
make stacked bar plot
ggplot(top_process, aes(y=Freq, x=reorder(Var1, Freq))) +
geom_bar(position="dodge", stat="identity")+ theme_bw()+coord_flip()+ylab("Frequency")+xlab("Biological Process")