---
title: "01-repro-annot"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"  
analyses: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
editor_options: 
  markdown: 
    wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
library(pheatmap)
library(DESeq2)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code analyses
)
```
Lets take proteins and characterize all those involved in reproduction

```{r, engine='bash', eval=TRUE}
head ../data/PO2457_Ostrea_lurida.protein.fasta

```

```{r, engine='bash', eval=TRUE}
grep ">" -c  ../data/PO2457_Ostrea_lurida.protein.fasta

```

# Make BlastDB
 
```{r, engine='bash'}
cd ../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_03.fasta.gz
gunzip -k uniprot_sprot_r2024_03.fasta.gz
```

```{r, engine='bash'}
mkdir ../blastdb
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/uniprot_sprot_r2024_03.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2024_03
```

# Blast

```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/blastp \
-query ../data/PO2457_Ostrea_lurida.protein.fasta \
-db ../blastdb/uniprot_sprot_r2024_03 \
-out ../output/01-repro-annot/Olur-uniprot_blastp.tab \
-evalue 1E-20 \
-num_threads 30 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```


```{r, engine='bash', eval=TRUE}
head ../output/01-repro-annot/Olur-uniprot_blastp.tab
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/01-repro-annot/Olur-uniprot_blastp.tab
```


```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/blastp \
-query ../data/PO2457_Ostrea_lurida.protein.fasta \
-db ../blastdb/uniprot_sprot_r2024_03 \
-out ../output/01-repro-annot/Olur-uniprot_blastp_e10.tab \
-evalue 1E-10 \
-num_threads 30 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```


```{r, engine='bash', eval=TRUE}
head ../output/01-repro-annot/Olur-uniprot_blastp_e10.tab
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/01-repro-annot/Olur-uniprot_blastp_10.tab
```


```{r, engine='bash', eval=TRUE}
tr '|' '\t' < ../output/01-repro-annot/Olur-uniprot_blastp.tab \
> ../output/01-repro-annot/Olur-uniprot_blastp_sep.tab

head -1 ../output/01-repro-annot/Olur-uniprot_blastp_sep.tab
```


# Download SP GO INFORMATION 

```{bash}
curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o ../data/SPannot-GO.tsv
```


bltabl <- read.csv("output/01-repro-annot/Olur-uniprot_blastp_sep.tab", sep = '\t', header = FALSE)

spgo <- read.csv("data/uniprot_table_go_r2024_03.tab", sep = '\t', header = TRUE)


```{r read-data}
bltabl <- read.csv("../output/01-repro-annot/Olur-uniprot_blastp_sep.tab", sep = '\t', header = FALSE)

spgo <- read.csv("../data/uniprot_table_go_r2024_03.tab", sep = '\t', header = TRUE)
```

```{r}
datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
```

```{r spgo-table}
datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
```


Annot <- left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs)


```{r see}
datatable(
  left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, 
            pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab"))
)
```

```{r}
#library(dplyr)
#library(AnnotationDbi)
#library(GO.db)


# Function to check if any term in a list is a descendant of 'GO:0019953 reproduction'
check_reproduction_descendant <- function(go_terms) {
  reproduction <- "GO:0019953"
  terms <- unlist(strsplit(go_terms, "; "))  # Split the terms
  ancestor_list <- lapply(terms, function(term) {
    tryCatch({
      ans <- select(GO.db, keys = term, columns = "ANCESTOR", keytype = "GOID")
      as.character(ans$ANCESTOR)
    }, error = function(e) { return(NULL) })  # Handle missing terms gracefully
  })
  any(sapply(ancestor_list, function(x) reproduction %in% x))
}

# Apply the function to each cell in the dataframe
Annot2 <- Annot %>%
  mutate(Is_Reproduction = sapply(Gene.Ontology.IDs, check_reproduction_descendant))

# Output the updated dataframe
print(Annot2)

```

GO:0019953 - sexual reproduction


```{r}
# Function to check if any term in a list is a descendant of 'GO:0019953 reproduction'
check_reproduction_descendant <- function(go_terms) {
  reproduction <- "GO:0019953"
  terms <- unlist(strsplit(go_terms, ";"))  # Split the terms
  ancestor_list <- lapply(terms, function(term) {
    tryCatch({
      ans <- select(GO.db, keys = term, columns = "ANCESTOR", keytype = "GOID")
      as.character(ans$ANCESTOR)
    }, error = function(e) { 
      cat("Error with term:", term, "\nError Message:", e$message, "\n")
      return(NULL)  # Handle missing terms gracefully
    })
  })
  result <- any(sapply(ancestor_list, function(x) reproduction %in% x))
  cat("Terms:", terms, "Result:", result, "\n")  # Debug output
  return(result)
}

# Apply the function to each cell in the dataframe
Annot2 <- Annot %>%
  dplyr::mutate(Is_Reproduction = sapply(Gene.Ontology.IDs, check_reproduction_descendant))

# Output the updated dataframe
print(Annot2)

```


333
```{r}
library(dplyr)
library(GO.db)


# Function to check if a term is a descendant of 'GO:0000003 reproduction'
check_reproduction_descendant <- function(go_term) {
  reproduction <- "GO:0000003"
  ancestors <- as.character(GOTERM[go_term]@ancestors)
  return(reproduction %in% ancestors)
}
```


```{r}
# Function to check if any term in a list is a descendant of 'GO:0000003 reproduction'
check_reproduction_descendant <- function(go_terms) {
  reproduction <- "GO:0000003"
  terms <- unlist(strsplit(go_terms, ";"))  # Split the terms
  # Check each term and return TRUE if any term is a descendant
  any(sapply(terms, function(term) reproduction %in% as.character(GOTERM[[term]]@ancestors)))
}

# Apply the function to each cell in the dataframe
Annot2 <- Annot %>%
  mutate(Is_Reproduction = sapply(Gene.Ontology.IDs, check_reproduction_descendant))

# Output the updated dataframe
print(Annot2)
```


```{r}


# Apply the function to each GO term in the dataframe
Annot2 <- Annot %>%
  mutate(Is_Reproduction = sapply(Gene.Ontology.IDs, check_reproduction_descendant))

# Output the updated dataframe
print(Annot2)
```

# Apply the function to each GO term in the dataframe
data <- data %>%
  mutate(Is_Reproduction = sapply(GO_Term, check_reproduction_descendant))

# Output the updated dataframe
print(data)

```


333


```{r}
marts <- listMarts()
print(marts)
```


```{r}
ensembl <- useDataset("cintestinalis_gene_ensembl", mart = ensembl)
```
```{r}
ensembl <- useMart("uniprot", dataset = "uniprot")
```
```{r}
gene2go <- getBM(
    attributes = c("uniprotswissprot", "external_gene_name", "go_id", "namespace_1003"),
    filters = "uniprotswissprot", 
    values = genes, 
    mart = ensembl
)
print(gene2go)

```


```{r}
gene2go <- getBM(
    attributes = c("uniprotswissprot", "external_gene_name", "go_id", "namespace_1003"),
    filters = "uniprotswissprot", 
    values = genes, 
    mart = ensembl
)
```


```{r join}
annot_tab <-
  left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, 
            pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab"))
```


if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install(c("clusterProfiler", "GO.db", "org.Hs.eg.db", "dplyr", "GOSemSim"))

library(clusterProfiler)
library(GO.db)
library(org.Hs.eg.db)
library(dplyr)
library(GOSemSim)


```{r}
# Read dataset
dataset <- read.csv("../output/blast_annot_go.tab", sep = '\t')  # Replace with the path to your dataset

# Select the column of interest
column_name <- "Organism"  # Replace with the name of the column of interest
column_data <- dataset[[column_name]]

# Count the occurrences of the strings in the column
string_counts <- table(column_data)

# Convert to a data frame, sort by count, and select the top 10
string_counts_df <- as.data.frame(string_counts)
colnames(string_counts_df) <- c("String", "Count")
string_counts_df <- string_counts_df[order(string_counts_df$Count, decreasing = TRUE), ]
top_10_strings <- head(string_counts_df, n = 10)