--- title: "3.0 Blasting!" author: Susan Garcia date: "2023-04-11" output: html_document: theme: cosmo highlight: tango toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(Biostrings) library(tidyverse) library(kableExtra) library(Biostrings) library(DT) library(tm) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` Assignment to place our code into a more user friendly document. --- #Downloading Software ##BLAST acquisition Dowloaded unto my mac the x64-macosx version of BLAST. ```{r, engine='bash'} #Downloading the code into my desired location. This may change for you. cd /home/joyvan/applications curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz #Unzipping file tar -xf ncbi-blast-2.13.0+-x64-linux.tar.gz ``` ```{r, engine='bash'} #Looking at the location of BLAST and this looks at the help menu ~/applications/ncbi-blast-2.13.0+/bin/blastx -h ``` ##Creating the DataBase Downloads and unzips the latest uniprot database. ```{r, engine='bash'} #Locating yourself to where you want to dowload, make sure you gitignore. File is large. cd ~/github/susan-coursework/assignments/data # Using curl, download the latest version of uniprot. curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz # Renaming the file to showcase the new version mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz # Unzip the file gunzip -k uniprot_sprot_r2023_01.fasta.gz #Check the file is where you want it ls ../data #Create BLAST database using the uniprot fasta then place the output in a BLAST database folder # Run makeblastdb ~/applications/ncbi-blast-2.13.0+/bin/makeblastdb \ -in ../data/uniprot_sprot_r2023_01.fasta \ -dbtype prot \ -out ../blastdb/uniprot_sprot_r2023_01 ``` #Obtaining the Query Sequence Download unknown fasta file. ```{r, engine='bash'} # Use curl to download fasta file curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \ -k \ > ../data/Ab_4denovo_CLC6_a.fa ``` What is this fasta file? ```{r, engine='bash'} head ../data/Ab_4denovo_CLC6_a.fa #Find out how many sequences there are. echo "How many sequences are there?" grep -c ">" ../data/Ab_4denovo_CLC6_a.fa ``` Visualizing the Sequence Lengths of the sequences ```{r} # Read out FASTA file f_file <- "../data/Ab_4denovo_CLC6_a.fa" seq <- readDNAStringSet(fasta_file) # Calculate the sequence lengths seq_lengths <- width(sequences) # Create a data frame seq_lengths_df <- data.frame(Length = seq_lengths) # Plot histogram using ggplot2 ggplot(seq_lengths_df, aes(x = Length)) + geom_histogram(binwidth = 0.01, color = "purple", fill = "blue", alpha = 0.75) + labs(title = "Histogram of Sequence Lengths", x = "Sequence Length", y = "Frequency") + theme_minimal() ``` #Running BLAST Annotation of the unknown sequences of the BLAST. ```{r, engine='bash'} ~/applications/ncbi-blast-2.13.0+/bin/blastx \ -query ../data/Ab_4denovo_CLC6_a.fa \ -db ../blastdb/uniprot_sprot_r2023_01 \ -out ../output/Ab_4-uniprot_blastx.tab \ -evalue 1E-20 \ -num_threads 20 \ -max_target_seqs 1 \ -outfmt 6 ``` Look at the first couple lines and number of line by checking the output file. ```{r, engine='bash'} head -2 ../output/Ab_4-uniprot_blastx.tab wc -l ../output/Ab_4-uniprot_blastx.tab ``` ```{r, engine='bash'} curl -O "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/search?query=reviewed:true+AND+organism_id:9606" ``` ```{r, engine='bash'} curl -O -H "Accept: text/plain; format=tsv" "https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab" ``` #Joining blast table with annotation table Create the tables with all the uniprot data in R. ```{r, engine='bash'} head -2 ../output/Ab_4-uniprot_blastx.tab wc -l ../output/Ab_4-uniprot_blastx.tab ``` ```{r, engine='bash'} tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab | head -2 ``` ```{r, engine='bash'} tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab \ > ../output/Ab_4-uniprot_blastx_sep.tab ``` ```{r, engine='bash'} head -2 ../data/uniprot_table_r2023_01.tab wc -l ../data/uniprot_table_r2023_01.tab ``` ```{r, engine='bash'} curl -O -H "Accept: text/plain; format=tsv" "https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab" ``` ```{r} # Read in data bltabl <- read.csv("../output/Ab_4-uniprot_blastx_sep.tab", sep = '\t', header = FALSE) spgo <- read.csv("../data/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE) str(spgo) ``` Combine both blast and annotation tables using a left join and clean up the names. ```{r} kbl( head( left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) ) ) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) ``` ```{r} left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) %>% write_delim("../output/blast_annot_go.tab", delim = '\t') ``` ##Write the file into the repository. ```{r} annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE) ``` #Visualize data. ```{r, eval=T} #Read Dataset annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE) # Count the occurrences of each protein strg_counts <- table(annot_tab[["Gene.Ontology..biological.process."]]) # Make a data frame and select the top 10 strg_counts_df <- as.data.frame(strg_counts) colnames(strg_counts_df) <- c("Process", "Count") strg_counts_df <- strg_counts_df[order(strg_counts_df$Count, decreasing = TRUE), ] strg_counts_df$Process <- as.character(strg_counts_df$Process) top_10_strgs <- head(subset(strg_counts_df, nchar(strg_counts_df$Process) > 0), n = 10) # Clean up string names top_10_strgs$Process <- str_split_fixed(top_10_strgs$Process, "GO", 2)[,1] top_10_strgs$Process <- str_sub(top_10_strgs$Process, 1, str_length(top_10_strgs$Process)-2) # Plot the top 10 most common strings using ggplot ggplot(top_10_strgs, aes(x = reorder(Process, -Count), y = Count, fill = Process)) + geom_bar(stat = "identity", position = "dodge", color = "black") + labs(title = "Top 10 Process Hits", x = "Process", y = "Count") + theme_minimal() + theme(legend.position = "none") + scale_fill_brewer(palette="Set1") + coord_flip() ```