---
title: "06 more go"
output: html_document
date: 
---

new GO terms to try: Glycolysis (GO:0006096), Tricarboxylic Acid Cycle (GO:0006099), Oxidative phosphorylation (GO:0006119)


#  Glycolysis (GO:0006096)

```{bash}
# Download every reviewed (Swiss-Prot) UniProtKB entry annotated with GO:0006096 as FASTA.
# The URL-encoded query is: ((go:0006096)) AND (reviewed:true)
# Create the output directory first so curl has somewhere to write.
mkdir -p ../output/06-moreGO
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error response as if it were the FASTA file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A0006096%29%29+AND+%28reviewed%3Atrue%29" -o "../output/06-moreGO/SwissProt-GO:0006096.fa"
```


```{bash}
# Sanity-check the download: show the first lines of the FASTA file,
# then count the lines containing ">" (one per sequence header).
go_fasta=../output/06-moreGO/SwissProt-GO:0006096.fa
head "${go_fasta}"
grep -c ">" "${go_fasta}"
```


Let's use the Pver proteins as the query and SwissProt-GO:0006096.fa as the database.


```{bash}
# Build a protein BLAST database from the GO:0006096 Swiss-Prot FASTA.
# The database files share the FASTA's path prefix (without the .fa extension).
db_prefix=../output/06-moreGO/SwissProt-GO:0006096
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb -in "${db_prefix}.fa" -dbtype prot -out "${db_prefix}"
```


```{bash}
# blastp: Pver proteins (query) against the GO:0006096 Swiss-Prot database.
#   -evalue 1E-05       report only hits at or below this e-value
#   -max_target_seqs 1  keep one subject sequence per query
#   -max_hsps 1         keep one alignment per query/subject pair
#   -outfmt 6           tab-separated output
fasta="../data/Pver_proteins_names_v1.0.faa"
blast_bin=/home/shared/ncbi-blast-2.15.0+/bin
go_dir=../output/06-moreGO

"${blast_bin}/blastp" \
  -query "${fasta}" \
  -db "${go_dir}/SwissProt-GO:0006096" \
  -out "${go_dir}/Pver_blastp-GO:0006096_out.tab" \
  -evalue 1E-05 \
  -num_threads 48 \
  -max_target_seqs 1 \
  -max_hsps 1 \
  -outfmt 6
```


```{bash}
# Preview the tabular blastp results and count the hit lines.
blast_tab=../output/06-moreGO/Pver_blastp-GO:0006096_out.tab
head "${blast_tab}"
wc -l "${blast_tab}"
```
---


# Tricarboxylic Acid Cycle (GO:0006099)


```{bash}
# Download the Swiss-Prot sequences for one GO term, build a BLAST database
# from them, and blastp the Pver proteome against that database.

# Set the variable for the GO term
GO_TERM="0006099"

# Paths derived from the GO term
out_dir="../output/06-moreGO"
go_db="${out_dir}/SwissProt-GO:${GO_TERM}"
blast_out="${out_dir}/Pver_blastp-GO:${GO_TERM}_out.tab"

# Make sure the output directory exists before anything writes to it
mkdir -p "${out_dir}"

# Fetch reviewed UniProtKB entries for the GO term as FASTA.
# The URL-encoded query is: ((go:<GO_TERM>)) AND (reviewed:true)
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# response as the FASTA; stop here in that case so the later steps do not run
# on a missing or stale file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO_TERM}%29%29+AND+%28reviewed%3Atrue%29" -o "${go_db}.fa" || exit 1

# Preview the download and count the sequence headers
head "${go_db}.fa"
grep -c ">" "${go_db}.fa"

# Build the protein BLAST database next to the FASTA file
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${go_db}.fa" \
-dbtype prot \
-out "${go_db}"

fasta="../data/Pver_proteins_names_v1.0.faa"

# One best hit (single HSP) per query at e-value <= 1E-05, tabular output.
# NOTE(review): 2>/dev/null discards everything blastp writes to stderr,
# including real errors; presumably it is there to silence a warning. Confirm.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${go_db}" \
-out "${blast_out}" \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2>/dev/null

# Preview the hits and count them
head "${blast_out}"
wc -l "${blast_out}"
```


# Oxidative phosphorylation (GO:0006119)


```{bash}
# Download the Swiss-Prot sequences for one GO term, build a BLAST database
# from them, and blastp the Pver proteome against that database.

# Set the variable for the GO term
GO_TERM="0006119"

# Paths derived from the GO term
out_dir="../output/06-moreGO"
go_db="${out_dir}/SwissProt-GO:${GO_TERM}"
blast_out="${out_dir}/Pver_blastp-GO:${GO_TERM}_out.tab"

# Make sure the output directory exists before anything writes to it
mkdir -p "${out_dir}"

# Fetch reviewed UniProtKB entries for the GO term as FASTA.
# The URL-encoded query is: ((go:<GO_TERM>)) AND (reviewed:true)
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# response as the FASTA; stop here in that case so the later steps do not run
# on a missing or stale file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO_TERM}%29%29+AND+%28reviewed%3Atrue%29" -o "${go_db}.fa" || exit 1

# Preview the download and count the sequence headers
head "${go_db}.fa"
grep -c ">" "${go_db}.fa"

# Build the protein BLAST database next to the FASTA file
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${go_db}.fa" \
-dbtype prot \
-out "${go_db}"

fasta="../data/Pver_proteins_names_v1.0.faa"

# One best hit (single HSP) per query at e-value <= 1E-05, tabular output.
# NOTE(review): 2>/dev/null discards everything blastp writes to stderr,
# including real errors; presumably it is there to silence a warning. Confirm.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${go_db}" \
-out "${blast_out}" \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2>/dev/null

# Preview the hits and count them
head "${blast_out}"
wc -l "${blast_out}"
```


## Visualize in R using heatmaps

```{r}
# Load necessary libraries
library(readr)
library(readxl)
library(tidyverse)
library(genefilter) # for pOverA filtering

# ComplexHeatmap is distributed through Bioconductor, not CRAN:
# BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)

# Read the gene count matrix; the gene_id column holds the gene identifiers
# and the remaining columns are used downstream as per-sample counts.
count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv")
all_genes <- count_mat$gene_id
# count_mat <- count_mat %>% select(-gene_id)
# rownames(count_mat) <- all_genes

# Clean up the column names by stripping the pipeline suffix.
# fixed = TRUE treats the suffix as a literal string; as a regular expression
# every "." in it would match any character.
colnames(count_mat) <- gsub(
  "_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf",
  "",
  colnames(count_mat),
  fixed = TRUE
)

dim(count_mat)
```

## pOverA filtering

```{r}
# pOverA filter: keep a gene when at least a proportion p = 0.25 of the
# samples have a count above A = 10.
ffun <- filterfun(pOverA(p = 0.25, A = 10))

# The filter runs on the numeric count columns only
counts <- select(count_mat, -gene_id)
passes_filter <- genefilter(counts, ffun)

# Number of genes that pass the filter
sum(passes_filter)

# Keep only the rows that passed, then continue with the filtered matrix
count_mat_poa <- count_mat[passes_filter, ]
count_mat <- count_mat_poa
```

The gene names in the count matrix and in the BLAST result data frames are not the same; this is due to naming errors made by the authors of the [Pver genome paper](https://academic.oup.com/gbe/article/12/10/1911/5898631?login=false#supplementary-data). They provide a supplementary file that lists both naming iterations, so it is used here to make sure the gene names match in each data frame.

Read in file with gene name iterations 
```{r}
# Lookup table between the two gene-naming schemes. The first 4 rows of the
# sheet are skipped; only the Query and Gene columns are kept.
names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4)
names <- select(names, Query, Gene)

# Attach the Query name to every count row by matching gene_id to Gene.
# The result keeps the key as gene_id only (no separate Gene column), and the
# full join also keeps rows that appear in just one of the two tables.
count_mat <- full_join(count_mat, names, by = c("gene_id" = "Gene"))
```

##  Glycolysis (GO:0006096)

```{r}
go_term <- "0006096"

# Locate and read the tabular blastp output for this GO term (no header row)
go_file_path <- paste0("../output/06-moreGO/Pver_blastp-GO:", go_term, "_out.tab")
GO_data <- read_delim(go_file_path, delim = "\t", col_names = FALSE)

# Name the 12 columns of the tabular BLAST output
blast_columns <- c(
  "query_id", "subject_id", "percent_identity", "alignment_length",
  "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end",
  "e_value", "bit_score"
)
names(GO_data) <- blast_columns

head(GO_data)
dim(GO_data)

# Attach the counts to each BLAST hit via the Query gene name, then drop
# every row that contains an NA in any column
count_GO <- na.omit(
  left_join(GO_data, count_mat, by = c("query_id" = "Query"))
)

# Check the filtered result
head(count_GO)
dim(count_GO)
```

```{r}
# Keep only the per-sample count columns by dropping the first 13 columns:
# the 12 BLAST columns plus the one that follows them.
# NOTE(review): assumes that 13th column is gene_id, i.e. gene_id is the first
# column of the count matrix. Adjust the selection if that is not the case.
expression_data <- count_GO %>% select(-c(1:13))

# Convert expression data to a matrix
expression_matrix <- as.matrix(expression_data)

# Label the rows with gene identifiers. The join on gene_id = Gene keeps only
# gene_id, so count_GO has no Gene column: count_GO$Gene is NULL and would
# leave the rows unnamed. Naming the matrix also avoids setting row names on
# a tibble, which is deprecated.
rownames(expression_matrix) <- count_GO$gene_id

# Calculate row-wise Z-scores: scale() works on columns, so transpose,
# scale, and transpose back
z_score_matrix <- t(scale(t(expression_matrix)))

# Rows with zero variance give NaN (0 / 0); set those to 0
z_score_matrix[is.nan(z_score_matrix)] <- 0
```

### heatmap

```{r}
# Heatmap 1: row-wise Z-scores with both genes (rows) and samples (columns)
# clustered. The legend is titled "Z-score" because the plotted matrix holds
# Z-scores, not raw gene counts.
Heatmap(
  z_score_matrix,
  na_col = "black",                       # colour used for NA cells
  name = "Z-score",                       # heatmap name / colour bar label
  row_title = paste("Genes in", go_term), # title for the row side
  column_title = "Samples",               # title for the column side
  show_row_names = TRUE,                  # show gene names
  show_column_names = TRUE,               # show sample names
  cluster_rows = TRUE,                    # cluster genes
  cluster_columns = TRUE,                 # cluster samples
  row_dend_reorder = TRUE,                # reorder the row dendrogram
  column_dend_reorder = TRUE,             # reorder the column dendrogram
  column_names_gp = gpar(fontsize = 6),   # small font for sample labels
  heatmap_legend_param = list(title = "Z-score")
)

# Heatmap 2: same data, but samples stay in their original column order
# (no column clustering); only genes are clustered.
Heatmap(
  z_score_matrix,
  na_col = "black",                       # colour used for NA cells
  name = "Z-score",                       # heatmap name / colour bar label
  row_title = paste("Genes in", go_term), # title for the row side
  column_title = "Samples",               # title for the column side
  show_row_names = TRUE,                  # show gene names
  show_column_names = TRUE,               # show sample names
  cluster_rows = TRUE,                    # cluster genes
  cluster_columns = FALSE,                # do not cluster samples
  row_dend_reorder = TRUE,                # reorder the row dendrogram
  column_dend_reorder = FALSE,            # no column dendrogram to reorder
  column_names_gp = gpar(fontsize = 6),   # small font for sample labels
  heatmap_legend_param = list(title = "Z-score")
)
```

## Tricarboxylic Acid Cycle (GO:0006099)

```{r}
go_term <- "0006099"

# Locate and read the tabular blastp output for this GO term (no header row)
go_file_path <- paste0("../output/06-moreGO/Pver_blastp-GO:", go_term, "_out.tab")
GO_data <- read_delim(go_file_path, delim = "\t", col_names = FALSE)

# Name the 12 columns of the tabular BLAST output
blast_columns <- c(
  "query_id", "subject_id", "percent_identity", "alignment_length",
  "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end",
  "e_value", "bit_score"
)
names(GO_data) <- blast_columns

head(GO_data)
dim(GO_data)

# Attach the counts to each BLAST hit via the Query gene name, then drop
# every row that contains an NA in any column
count_GO <- na.omit(
  left_join(GO_data, count_mat, by = c("query_id" = "Query"))
)

# Check the filtered result
head(count_GO)
dim(count_GO)
```

```{r}
# Keep only the per-sample count columns by dropping the first 13 columns:
# the 12 BLAST columns plus the one that follows them.
# NOTE(review): assumes that 13th column is gene_id, i.e. gene_id is the first
# column of the count matrix. Adjust the selection if that is not the case.
expression_data <- count_GO %>% select(-c(1:13))

# Convert expression data to a matrix
expression_matrix <- as.matrix(expression_data)

# Label the rows with gene identifiers. The join on gene_id = Gene keeps only
# gene_id, so count_GO has no Gene column: count_GO$Gene is NULL and would
# leave the rows unnamed. Naming the matrix also avoids setting row names on
# a tibble, which is deprecated.
rownames(expression_matrix) <- count_GO$gene_id

# Calculate row-wise Z-scores: scale() works on columns, so transpose,
# scale, and transpose back
z_score_matrix <- t(scale(t(expression_matrix)))

# Rows with zero variance give NaN (0 / 0); set those to 0
z_score_matrix[is.nan(z_score_matrix)] <- 0
```

### heatmap

```{r}
# Heatmap 1: row-wise Z-scores with both genes (rows) and samples (columns)
# clustered. The legend is titled "Z-score" because the plotted matrix holds
# Z-scores, not raw gene counts.
Heatmap(
  z_score_matrix,
  na_col = "black",                       # colour used for NA cells
  name = "Z-score",                       # heatmap name / colour bar label
  row_title = paste("Genes in", go_term), # title for the row side
  column_title = "Samples",               # title for the column side
  show_row_names = TRUE,                  # show gene names
  show_column_names = TRUE,               # show sample names
  cluster_rows = TRUE,                    # cluster genes
  cluster_columns = TRUE,                 # cluster samples
  row_dend_reorder = TRUE,                # reorder the row dendrogram
  column_dend_reorder = TRUE,             # reorder the column dendrogram
  column_names_gp = gpar(fontsize = 6),   # small font for sample labels
  heatmap_legend_param = list(title = "Z-score")
)

# Heatmap 2: same data, but samples stay in their original column order
# (no column clustering); only genes are clustered.
Heatmap(
  z_score_matrix,
  na_col = "black",                       # colour used for NA cells
  name = "Z-score",                       # heatmap name / colour bar label
  row_title = paste("Genes in", go_term), # title for the row side
  column_title = "Samples",               # title for the column side
  show_row_names = TRUE,                  # show gene names
  show_column_names = TRUE,               # show sample names
  cluster_rows = TRUE,                    # cluster genes
  cluster_columns = FALSE,                # do not cluster samples
  row_dend_reorder = TRUE,                # reorder the row dendrogram
  column_dend_reorder = FALSE,            # no column dendrogram to reorder
  column_names_gp = gpar(fontsize = 6),   # small font for sample labels
  heatmap_legend_param = list(title = "Z-score")
)
```

## Oxidative phosphorylation (GO:0006119)

```{r}
go_term <- "0006119"

# Locate and read the tabular blastp output for this GO term (no header row)
go_file_path <- paste0("../output/06-moreGO/Pver_blastp-GO:", go_term, "_out.tab")
GO_data <- read_delim(go_file_path, delim = "\t", col_names = FALSE)

# Name the 12 columns of the tabular BLAST output
blast_columns <- c(
  "query_id", "subject_id", "percent_identity", "alignment_length",
  "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end",
  "e_value", "bit_score"
)
names(GO_data) <- blast_columns

head(GO_data)
dim(GO_data)

# Attach the counts to each BLAST hit via the Query gene name, then drop
# every row that contains an NA in any column
count_GO <- na.omit(
  left_join(GO_data, count_mat, by = c("query_id" = "Query"))
)

# Check the filtered result
head(count_GO)
dim(count_GO)
```


```{r}
# Keep only the per-sample count columns by dropping the first 13 columns:
# the 12 BLAST columns plus the one that follows them.
# NOTE(review): assumes that 13th column is gene_id, i.e. gene_id is the first
# column of the count matrix. Adjust the selection if that is not the case.
expression_data <- count_GO %>% select(-c(1:13))

# Convert expression data to a matrix
expression_matrix <- as.matrix(expression_data)

# Label the rows with gene identifiers. The join on gene_id = Gene keeps only
# gene_id, so count_GO has no Gene column: count_GO$Gene is NULL and would
# leave the rows unnamed. Naming the matrix also avoids setting row names on
# a tibble, which is deprecated.
rownames(expression_matrix) <- count_GO$gene_id

# Calculate row-wise Z-scores: scale() works on columns, so transpose,
# scale, and transpose back
z_score_matrix <- t(scale(t(expression_matrix)))

# Rows with zero variance give NaN (0 / 0); set those to 0
z_score_matrix[is.nan(z_score_matrix)] <- 0
```

### heatmap

```{r}
# Heatmap 1: row-wise Z-scores with both genes (rows) and samples (columns)
# clustered. The legend is titled "Z-score" because the plotted matrix holds
# Z-scores, not raw gene counts.
Heatmap(
  z_score_matrix,
  na_col = "black",                       # colour used for NA cells
  name = "Z-score",                       # heatmap name / colour bar label
  row_title = paste("Genes in", go_term), # title for the row side
  column_title = "Samples",               # title for the column side
  show_row_names = TRUE,                  # show gene names
  show_column_names = TRUE,               # show sample names
  cluster_rows = TRUE,                    # cluster genes
  cluster_columns = TRUE,                 # cluster samples
  row_dend_reorder = TRUE,                # reorder the row dendrogram
  column_dend_reorder = TRUE,             # reorder the column dendrogram
  column_names_gp = gpar(fontsize = 6),   # small font for sample labels
  heatmap_legend_param = list(title = "Z-score")
)

# Heatmap 2: same data, but samples stay in their original column order
# (no column clustering); only genes are clustered.
Heatmap(
  z_score_matrix,
  na_col = "black",                       # colour used for NA cells
  name = "Z-score",                       # heatmap name / colour bar label
  row_title = paste("Genes in", go_term), # title for the row side
  column_title = "Samples",               # title for the column side
  show_row_names = TRUE,                  # show gene names
  show_column_names = TRUE,               # show sample names
  cluster_rows = TRUE,                    # cluster genes
  cluster_columns = FALSE,                # do not cluster samples
  row_dend_reorder = TRUE,                # reorder the row dendrogram
  column_dend_reorder = FALSE,            # no column dendrogram to reorder
  column_names_gp = gpar(fontsize = 6),   # small font for sample labels
  heatmap_legend_param = list(title = "Z-score")
)
```


![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_QuickGOTerm_GO0040029_2024-10-03_16-29-35.png)

# Grabbing Epigenetic Regulators of Gene Expression

GO:0040029  
epigenetic regulation of gene expression


```{bash}
# Download the Swiss-Prot sequences for one GO term, build a BLAST database
# from them, and blastp the Pver proteome against that database.

# Set the variable for the GO term
GO_TERM="0040029"

# Paths derived from the GO term
out_dir="../output/06-moreGO"
go_db="${out_dir}/SwissProt-GO:${GO_TERM}"
blast_out="${out_dir}/Pver_blastp-GO:${GO_TERM}_out.tab"

# Make sure the output directory exists before anything writes to it
mkdir -p "${out_dir}"

# Fetch reviewed UniProtKB entries for the GO term as FASTA.
# The URL-encoded query is: ((go:<GO_TERM>)) AND (reviewed:true)
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# response as the FASTA; stop here in that case so the later steps do not run
# on a missing or stale file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO_TERM}%29%29+AND+%28reviewed%3Atrue%29" -o "${go_db}.fa" || exit 1

# Preview the download and count the sequence headers
head "${go_db}.fa"
grep -c ">" "${go_db}.fa"

# Build the protein BLAST database next to the FASTA file
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${go_db}.fa" \
-dbtype prot \
-out "${go_db}"

fasta="../data/Pver_proteins_names_v1.0.faa"

# One best hit (single HSP) per query at e-value <= 1E-05, tabular output.
# NOTE(review): 2>/dev/null discards everything blastp writes to stderr,
# including real errors; presumably it is there to silence a warning. Confirm.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${go_db}" \
-out "${blast_out}" \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2>/dev/null

# Preview the hits and count them
head "${blast_out}"
wc -l "${blast_out}"
```