---
title: "Gene annotation"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"  
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
editor_options: 
  markdown: 
    wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
# Global chunk defaults for this document. Evaluation is switched off here, so
# only chunks that set `eval = TRUE` in their own header are run when knitting.
knitr::opts_chunk$set(
  # What is shown and what is run
  echo = TRUE,          # show the source of every chunk
  eval = FALSE,         # do not run chunks unless they opt in
  # Console noise
  warning = FALSE,      # leave warnings out of the rendered output
  message = FALSE,      # leave messages out of the rendered output
  comment = "",         # no '##' prefix on printed output lines
  # Figures
  fig.width = 6,        # plot width in inches
  fig.height = 4,       # plot height in inches
  fig.align = "center"  # centre plots on the page
)
```

## Summary
Here I will try to annotate the Manila clam genes. NCBI already provides an annotation, and I will also BLAST the genes against Swiss-Prot (SP) to get GO information.


# What does NCBI have?

![](http://gannet.fish.washington.edu/seashell/snaps/2024-08-30_13-59-11.png)
<https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_026571515.1/>


Grabbing the command shown after clicking the 'datasets' button in the screenshot above.

```{r, engine='bash'}
# Download the NCBI Datasets package for this assembly, running from ../data.
cd ../data

accession="GCF_026571515.1"
/home/shared/datasets download genome accession "$accession" \
  --include gff3,rna,cds,protein,genome,seq-report
```

```{r, engine='bash'}
# Unpack the NCBI archive inside ../data.
archive="ncbi_dataset.zip"
cd ../data
unzip "$archive"

```
```{r, engine='bash', eval = TRUE}
# List everything in the unpacked assembly directory.
ncbi_dir="../data/ncbi_dataset/data/GCF_026571515.1/"
cd "$ncbi_dir"
ls *
```


```{r, engine='bash', eval = TRUE}
# Print the first lines of every file in the assembly directory.
ncbi_dir="../data/ncbi_dataset/data/GCF_026571515.1"
head "$ncbi_dir"/*
```
```{r, engine='bash', eval = TRUE}
# Print the last lines of the file(s) whose name ends in "gff".
ncbi_dir="../data/ncbi_dataset/data/GCF_026571515.1"
tail "$ncbi_dir"/*gff
```

# Swiss-Prot Blast

## Create Blast Database


```{r, engine = 'bash'}
# Download the current Swiss-Prot FASTA, give it a release-stamped name, and
# decompress it while keeping the .gz copy (-k).
cd ../data/blast_dbs

sp_url="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
stamped_gz="uniprot_sprot_r2024_04.fasta.gz"

curl -O "$sp_url"
mv uniprot_sprot.fasta.gz "$stamped_gz"
gunzip -k "$stamped_gz"
```

```{r, engine = 'bash', eval = TRUE}
# Show the start of the Swiss-Prot FASTA, then count the lines containing ">"
# (one per sequence header).
sp_fasta="../data/blast_dbs/uniprot_sprot_r2024_04.fasta"

head "$sp_fasta"
echo "Number of Sequences"
grep -c ">" "$sp_fasta"
```

```{r, engine = 'bash'}
# Build a protein BLAST database from the Swiss-Prot FASTA. The database
# prefix is the FASTA path without its ".fasta" extension.
sp_db="../data/blast_dbs/uniprot_sprot_r2024_04"

/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
  -dbtype prot \
  -in "${sp_db}.fasta" \
  -out "$sp_db"
```

# Blast

```{r, engine = 'bash'}
# blastx the NCBI CDS sequences against the Swiss-Prot database and write
# tabular (outfmt 6) results.
# NOTE(review): -max_target_seqs 1 is commonly read as "best hit only" --
# confirm against the BLAST+ manual that this is the intended behaviour.
query_fasta="../data/ncbi_dataset/data/GCF_026571515.1/cds_from_genomic.fna"
sp_db="../data/blast_dbs/uniprot_sprot_r2024_04"
hits_tab="../output/14-gene-annotation/cds_blastx_sp.tab"

/home/shared/ncbi-blast-2.15.0+/bin/blastx \
  -query "$query_fasta" \
  -db "$sp_db" \
  -out "$hits_tab" \
  -outfmt 6 \
  -evalue 1E-05 \
  -max_target_seqs 1 \
  -max_hsps 1 \
  -num_threads 48
```


```{r, engine='bash', eval = TRUE}        
# Count the lines in the blastx output table.
hits_tab="../output/14-gene-annotation/cds_blastx_sp.tab"
wc -l "$hits_tab"
```


```{r, engine='bash', eval = TRUE}         
# Turn every '|' in the blastx table into a tab, so the pipe-delimited subject
# ID is spread over separate columns, then show the first converted line.
out_dir="../output/14-gene-annotation"
raw_tab="$out_dir/cds_blastx_sp.tab"
sep_tab="$out_dir/cds_blastx_sp_sep.tab"

tr '|' '\t' < "$raw_tab" > "$sep_tab"

head -n 1 "$sep_tab"
```


![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap__in_UniProtKB_search_571864__UniProt_2024-08-23_16-18-03.png)


# **Download Swiss-Prot Information**

```
sr320@raven:/home/shared/8TB_HDD_01/sr320/github/deep-dive/data$ curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_082324.tsv
```

```{bash}
# Count the lines in the downloaded Swiss-Prot annotation table.
# NOTE(review): this path is two levels up, while the other chunks use ../data -- confirm.
sp_annot="../../data/SwissProt-Annot-GO_082324.tsv"
wc -l "$sp_annot"
```


# **Join blast with GO info**

```{r}
# Load the two tables that are joined in the next chunk.

# blastx hits written by this notebook, after '|' was converted to tabs.
# There is no header row, so the columns are named V1, V2, ...
# (Previously this read a blastp table from another project's output folder.)
bltabl <- read.csv(
  "../output/14-gene-annotation/cds_blastx_sp_sep.tab",
  sep = "\t",
  header = FALSE
)

# Swiss-Prot annotation / GO table; the first row supplies the column names.
# NOTE(review): this path is two levels up, while the other chunks use
# ../data -- confirm where the file actually lives.
spgo <- read.csv(
  "../../data/SwissProt-Annot-GO_082324.tsv",
  sep = "\t",
  header = TRUE
)
```

```{r}
# Attach the Swiss-Prot annotation to every BLAST hit (hits without a match
# keep NA annotation), then keep and rename only the columns of interest.
annot_tab <- bltabl %>%
  left_join(spgo, by = c("V3" = "Entry")) %>%
  select(
    query             = V1,
    blast_hit         = V3,
    evalue            = V13,
    ProteinNames      = Protein.names,
    BiologicalProcess = Gene.Ontology..biological.process.,
    GeneOntologyIDs   = Gene.Ontology.IDs
  )
```

```{r}
# Preview the first six rows of the joined annotation table.
head(annot_tab, n = 6L)
```
```{r}
# Save the annotated hits as a tab-separated file in this notebook's own
# output folder (previously written into another project's directory under
# another species' name). No row names and no quoting.
write.table(
  annot_tab,
  file = "../output/14-gene-annotation/cds_blastx_sp_GO.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
```

```{bash}
# Preview the file written by the chunk directly above.
head ../output/14-gene-annotation/cds_blastx_sp_GO.tsv

```