---
title: "Appending Gene Symbols and Protein Descriptions with blastx"
output: html_document
date: "2025-03-26"
---

```{r, message=FALSE}
# Packages used by the annotation chunk at the end of this document:
# dplyr supplies left_join(), select() and the %>% pipe; readr supplies
# write_delim(). Without these the document fails in a fresh session.
library(dplyr)
library(readr)
```

```{r, engine='bash'}
# Download the current Swiss-Prot FASTA, rename it so the release is part of
# the file name, and decompress it while keeping the .gz copy (-k).
cd ../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2025_03.fasta.gz
gunzip -k uniprot_sprot_r2025_03.fasta.gz
ls ../data
```

```{r, engine='bash'}
# Build a protein BLAST database from the decompressed Swiss-Prot FASTA.
# NOTE(review): these are machine-specific absolute paths, while every other
# chunk uses paths relative to this document -- confirm they point at the same
# ../data and ../output directories.

# Make sure the database directory exists before makeblastdb writes into it.
mkdir -p /home/shared/8TB_HDD_02/cvaldi/Botryllus-Nickel-Tox/output/blastdb

/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in /home/shared/8TB_HDD_02/cvaldi/Botryllus-Nickel-Tox/data/uniprot_sprot_r2025_03.fasta \
-dbtype prot \
-out /home/shared/8TB_HDD_02/cvaldi/Botryllus-Nickel-Tox/output/blastdb/uniprot_sprot_r2025_03
```

```{r, engine='bash'}
# View the first 20 lines of the query file to ensure it's correct
head -n 20 ../data/noname.fasta

echo "How many sequences are there?"
# Count the number of sequences (header lines) in the FASTA file
grep -c ">" ../data/noname.fasta
```

```{r, engine='bash'}
# Translate the nucleotide queries and search them against the Swiss-Prot
# database. -outfmt 6 writes a headerless tab-separated table;
# -max_target_seqs 1 and -evalue 1E-20 restrict what is reported.
/home/shared/ncbi-blast-2.11.0+/bin/blastx \
-query ../data/noname.fasta \
-db ../output/blastdb/uniprot_sprot_r2025_03 \
-out ../output/Bsc-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 20 \
-max_target_seqs 1 \
-outfmt 6
```

```{r, engine='bash'}
# View the first two lines of the BLAST output
head -2 ../output/Bsc-uniprot_blastx.tab
# Count the number of results
wc -l ../output/Bsc-uniprot_blastx.tab
```

```{r, engine='bash'}
# Preview: replace every "|" with a tab and show the first two lines
tr '|' '\t' < ../output/Bsc-uniprot_blastx.tab | head -2

# Replace the "|" with "\t" in the file "Bsc-uniprot_blastx.tab"
# and save the output to the file "Bsc-uniprot_blastx_sep.tab"
tr '|' '\t' < ../output/Bsc-uniprot_blastx.tab \
> ../output/Bsc-uniprot_blastx_sep.tab
```

```{r}
# Read the pipe-split BLAST table. It has no header row, so the columns are
# auto-named V1, V2, ...
bltabl <- read.csv(
  "../output/Bsc-uniprot_blastx_sep.tab",
  sep = "\t",
  header = FALSE
)
```

```{r}
# Reading the Uniprot table
# NOTE(review): this "_cleaned.tab" file is not produced anywhere in the
# chunks above -- confirm where it comes from.
spgo <- read.csv(
  "../data/uniprot_sprot_r2025_03_cleaned.tab",
  sep = "\t",
  header = TRUE
)

# Merge the BLAST results with the Uniprot annotations, keeping every BLAST
# row (left join) and matching V3 against the UniProt "Entry" column.
# Assumes subject ids look like "sp|accession|entry_name", so that after the
# "|" split V1 = query id, V3 = accession, V4 = entry name and V13 = e-value
# -- TODO confirm against the preview above.
annotated_blast <- bltabl %>%
  left_join(spgo, by = c("V3" = "Entry")) %>%
  select(V1, V3, V4, V13, Protein.names, Organism)

# Write the annotated table as tab-delimited text.
write_delim(annotated_blast, "../output/blast_annot_go.tab", delim = "\t")
```