---
title: "3.0 Blasting!"
author: Susan Garcia
date: "2023-04-11" 
output: 
  html_document:
    theme: cosmo
    highlight: tango
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true

---

```{r setup, include=FALSE}
library(knitr)
library(Biostrings)
library(tidyverse)
library(kableExtra)
library(Biostrings)
library(DT)
library(tm)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```


Assignment to place our code into a more user friendly document.


---

#Downloading Software

##BLAST acquisition


Dowloaded unto my mac the x64-macosx version of BLAST. 


```{r, engine='bash'}
#Downloading the code into my desired location. This may change for you. 
cd /home/joyvan/applications
curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz

#Unzipping file
tar -xf ncbi-blast-2.13.0+-x64-linux.tar.gz
```

```{r, engine='bash'}
#Looking at the location of BLAST and this looks at the help menu

~/applications/ncbi-blast-2.13.0+/bin/blastx -h
```

##Creating the DataBase

Downloads and unzips the latest uniprot database.

```{r, engine='bash'}
#Locating yourself to where you want to dowload, make sure you gitignore. File is large.
cd ~/github/susan-coursework/assignments/data

# Using curl, download the latest version of uniprot.
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

# Renaming the file to showcase the new version
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz

# Unzip the file
gunzip -k uniprot_sprot_r2023_01.fasta.gz

#Check the file is where you want it
ls ../data

#Create BLAST database using the uniprot fasta then place the output in a BLAST database folder


# Run makeblastdb

~/applications/ncbi-blast-2.13.0+/bin/makeblastdb \
-in ../data/uniprot_sprot_r2023_01.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2023_01

```


#Obtaining the Query Sequence

Download unknown fasta file.

```{r, engine='bash'}
# Use curl to download fasta file
curl https://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa \
-k \
> ../data/Ab_4denovo_CLC6_a.fa
```

What is this fasta file?

```{r, engine='bash'}
head ../data/Ab_4denovo_CLC6_a.fa

#Find out how many sequences there are.
echo "How many sequences are there?"
grep -c ">" ../data/Ab_4denovo_CLC6_a.fa
```

Visualizing the Sequence Lengths of the sequences
```{r}
# Read out FASTA file
f_file <- "../data/Ab_4denovo_CLC6_a.fa"  
seq <- readDNAStringSet(fasta_file)

# Calculate the sequence lengths
seq_lengths <- width(sequences)

# Create a data frame
seq_lengths_df <- data.frame(Length = seq_lengths)

# Plot histogram using ggplot2
ggplot(seq_lengths_df, aes(x = Length)) +
  geom_histogram(binwidth = 0.01, color = "purple", fill = "blue", alpha = 0.75) +
  labs(title = "Histogram of Sequence Lengths",
       x = "Sequence Length",
       y = "Frequency") +
  theme_minimal()
```

#Running BLAST
Annotation of the unknown sequences of the BLAST.
```{r, engine='bash'}
~/applications/ncbi-blast-2.13.0+/bin/blastx \
-query ../data/Ab_4denovo_CLC6_a.fa \
-db ../blastdb/uniprot_sprot_r2023_01 \
-out ../output/Ab_4-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 20 \
-max_target_seqs 1 \
-outfmt 6
```

Look at the first couple lines and number of line by checking the output file.
```{r, engine='bash'}
head -2 ../output/Ab_4-uniprot_blastx.tab
wc -l ../output/Ab_4-uniprot_blastx.tab
```

```{r, engine='bash'}
curl -O "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/search?query=reviewed:true+AND+organism_id:9606"
```

```{r, engine='bash'}
curl -O -H "Accept: text/plain; format=tsv" "https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab"
```


#Joining blast table with annotation table

Create the tables with all the uniprot data in R.

```{r, engine='bash'}
head -2 ../output/Ab_4-uniprot_blastx.tab
wc -l ../output/Ab_4-uniprot_blastx.tab
```

```{r, engine='bash'}
tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab | head -2
```

```{r, engine='bash'}
tr '|' '\t' < ../output/Ab_4-uniprot_blastx.tab \
> ../output/Ab_4-uniprot_blastx_sep.tab
```


```{r, engine='bash'}
head -2 ../data/uniprot_table_r2023_01.tab
wc -l ../data/uniprot_table_r2023_01.tab
```

```{r, engine='bash'}
curl -O -H "Accept: text/plain; format=tsv" "https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab"
```

```{r}
# Read in data
bltabl <- read.csv("../output/Ab_4-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)
spgo <- read.csv("../data/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)
str(spgo)
```

Combine both blast and annotation tables using a left join and clean up the names. 

```{r}
kbl(
head(
  left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, 
            pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab"))
)
) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) 
  
```


```{r}
 left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, 
            pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) %>%
write_delim("../output/blast_annot_go.tab", delim = '\t')
```

##Write the file into the repository.
```{r}
annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE)
```

#Visualize data.

```{r, eval=T}
#Read Dataset
annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE)

# Count the occurrences of each protein

strg_counts <- table(annot_tab[["Gene.Ontology..biological.process."]])

# Make a data frame and select the top 10

strg_counts_df <- as.data.frame(strg_counts)
colnames(strg_counts_df) <- c("Process", "Count")
strg_counts_df <- strg_counts_df[order(strg_counts_df$Count, decreasing = TRUE), ]
strg_counts_df$Process <- as.character(strg_counts_df$Process)
top_10_strgs <- head(subset(strg_counts_df, nchar(strg_counts_df$Process) > 0), n = 10)

# Clean up string names

top_10_strgs$Process <- str_split_fixed(top_10_strgs$Process, "GO", 2)[,1]
top_10_strgs$Process <- str_sub(top_10_strgs$Process, 1, str_length(top_10_strgs$Process)-2)


# Plot the top 10 most common strings using ggplot

ggplot(top_10_strgs, aes(x = reorder(Process, -Count), y = Count, fill = Process)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Top 10 Process Hits",
       x = "Process",
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_brewer(palette="Set1") +
  coord_flip()
```