---
title: "GO Annoations"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"  
format:
  html:
    toc: true
    toc-depth: 2
    html-math-method: katex
    css: styles.css
    theme: sandstone
editor: 
  markdown: 
    wrap: 72
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```


Want to start with grabbing GOs 

see also https://www.ebi.ac.uk/QuickGO/annotations

:GO:0009060
:aerobic respiration

:GO:0061621
:canonical glycolysis

:GO:0006119
:oxidative phosphorylation


#Variables
```{r setup, include=FALSE}
# Global R options
knitr::opts_chunk$set(echo = TRUE)

# Define key paths and tool directories
 
OUT_DIR <- "../output/27-Apul-pheno-annot/"


# Export these as environment variables for bash chunks.
Sys.setenv(
  OUT_DIR = OUT_DIR
)

```


# Aerobic respiration (GO:0009060)
aerobic respiration

```{bash}
curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A0009060%29%29+AND+%28reviewed%3Atrue%29" -o ../output/27-Apul-pheno-annot/SwissProt-GO:0009060.fa
```


```{bash}
head ../output/27-Apul-pheno-annot/SwissProt-GO:0009060.fa
grep -c ">" ../output/27-Apul-pheno-annot/SwissProt-GO:0009060.fa
```


Lets Apul as query
and SwissProt-GO:0009060.fa is the database. 

```{r, engine='bash'}
echo "${OUT_DIR}"
```


```{r, engine='bash'}
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:0009060.fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:0009060
```


```{bash}
fasta="../data/Apulchra-genome.pep.faa"

/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query $fasta \
-db "${OUT_DIR}"SwissProt-GO:0009060 \
-out "${OUT_DIR}"Apul_blastp-GO:0009060_out.tab \
-evalue 1E-05 \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```


```{bash}
head "${OUT_DIR}"Apul_blastp-GO:0009060_out.tab
wc -l "${OUT_DIR}"Apul_blastp-GO:0009060_out.tab
```

# Oxidative phosphorylation (GO:0006119)
:oxidative phosphorylation

```{r, engine='bash'}
GO="0006119"
fasta="../data/Apulchra-genome.pep.faa"

curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A"${GO}"%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}"SwissProt-GO:"${GO}".fa

head "${OUT_DIR}"SwissProt-GO:"${GO}".fa

echo "Number of Proteins"
grep -c ">" "${OUT_DIR}"SwissProt-GO:"${GO}".fa

/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:"${GO}".fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:"${GO}"


/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query $fasta \
-db "${OUT_DIR}"SwissProt-GO:"${GO}" \
-out "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab \
-evalue 1E-05 \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}"blast_warnings"${GO}".txt

head "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab

echo "Number of hits"

wc -l "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab
```


# Canonical glycolysis (GO:0061621)
GO:0061621
:canonical glycolysis
```{bash}
curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A0061621%29%29+AND+%28reviewed%3Atrue%29" -o ../output/27-Apul-pheno-annot/SwissProt-GO:0061621.fa
```


```{bash}
head "${OUT_DIR}"SwissProt-GO:0061621.fa
grep -c ">" "${OUT_DIR}"SwissProt-GO:0061621.fa
```


Lets Apul as query
and SwissProt-GO:0061621.fa is the database. 


```{r, engine='bash'}
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:0061621.fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:0061621
```


```{bash}
fasta="../data/Apulchra-genome.pep.faa"

/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query $fasta \
-db "${OUT_DIR}"SwissProt-GO:0061621 \
-out "${OUT_DIR}"Apul_blastp-GO:0061621_out.tab \
-evalue 1E-05 \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```


```{bash}
head "${OUT_DIR}"Apul_blastp-GO:0061621_out.tab
wc -l "${OUT_DIR}"Apul_blastp-GO:0061621_out.tab
```


# Tricarboxylic Acid Cycle (GO:0006099)


```{r, engine='bash'}
GO="0006099"
fasta="../data/Apulchra-genome.pep.faa"

curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A"${GO}"%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}"SwissProt-GO:"${GO}".fa

head "${OUT_DIR}"SwissProt-GO:"${GO}".fa

echo "Number of Proteins"
grep -c ">" "${OUT_DIR}"SwissProt-GO:"${GO}".fa

/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:"${GO}".fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:"${GO}"


/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query $fasta \
-db "${OUT_DIR}"SwissProt-GO:"${GO}" \
-out "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab \
-evalue 1E-05 \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}"blast_warnings"${GO}".txt

head "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab

echo "Number of hits"

wc -l "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab
```


# Oxidative phosphorylation (GO:0006119)

```{r, engine='bash'}
GO="0006119"
fasta="../data/Apulchra-genome.pep.faa"

curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A"${GO}"%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}"SwissProt-GO:"${GO}".fa

head "${OUT_DIR}"SwissProt-GO:"${GO}".fa

echo "Number of Proteins"
grep -c ">" "${OUT_DIR}"SwissProt-GO:"${GO}".fa

/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:"${GO}".fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:"${GO}"


/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query $fasta \
-db "${OUT_DIR}"SwissProt-GO:"${GO}" \
-out "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab \
-evalue 1E-05 \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}"blast_warnings"${GO}".txt

head "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab

echo "Number of hits"

wc -l "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab
```


old stuff

## Vizualize in R using heatmaps

```{r}
# Load necessary libraries
library(readr)
library(readxl)
library(tidyverse)
library(genefilter) #for pOverA filtering

#install.packages("ComplexHeatmap")
library(ComplexHeatmap)
 
count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv")
all_genes <- count_mat$gene_id
#count_mat <- count_mat %>% select(-gene_id)
#rownames(count_mat) <- all_genes

# Clean up the column names
colnames(count_mat) <- gsub("_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf", "", colnames(count_mat))

dim(count_mat)
```

## pOverA filtering

```{r}
ffun<-filterfun(pOverA(0.25,10))  #set up filtering parameters
counts <- count_mat %>% select(-gene_id)
count_mat_poa <- genefilter((counts), ffun) #apply filter
sum(count_mat_poa) #count number of genes left

count_mat_poa <- count_mat[count_mat_poa,] #keep only rows that passed filter

count_mat <- count_mat_poa
```

The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the [Pverr genome paper](https://academic.oup.com/gbe/article/12/10/1911/5898631?login=false#supplementary-data). They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df 

Read in file with gene name iterations 
```{r}
names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>%
  select(Query, Gene)

count_mat <- count_mat %>% full_join(names, by = c("gene_id" = "Gene"))
```

##  Glycolysis (GO:0006096)

```{r}
go_term = "0006096"

# Read the tab file for the specified GO term
go_file_path <- paste0("../output/06-moreGO/Pver_blastp-GO:", go_term, "_out.tab")
GO_data <- read_delim(go_file_path, delim = "\t", col_names = FALSE)

colnames(GO_data) <- c("query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_openings", 
                       "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score")

head(GO_data)
dim(GO_data)

count_GO <- GO_data %>%
  left_join(count_mat, by = c("query_id" = "Query")) %>%
  na.omit()

# Check the filtered result
head(count_GO)
dim(count_GO)
```