---
title: "04-Carbon-transport"
output: html_document
date: "2024-10-02"
---

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_QuickGOTerm_GO0034219_2024-10-02_11-46-38.png)


```{bash}
curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A0034219%29%29+AND+%28reviewed%3Atrue%29" -o ../output/04-carbon-transport/SwissProt-GO:0034219.fa
```


```{bash}
head ../output/04-carbon-transport/SwissProt-GO:0034219.fa
grep -c ">" ../output/04-carbon-transport/SwissProt-GO:0034219.fa
```


Lets Pver as query
and SwissProt-GO:0034219 is the database. 


```{bash}
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in ../output/04-carbon-transport/SwissProt-GO:0034219.fa \
-dbtype prot \
-out ../output/04-carbon-transport/SwissProt-GO:0034219
```


```{bash}
fasta="../data/Pver_proteins_names_v1.0.faa"

/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query $fasta \
-db ../output/04-carbon-transport/SwissProt-GO:0034219 \
-out ../output/04-carbon-transport/Pver_blastp-GO:0034219_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```


```{bash}
head ../output/04-carbon-transport/Pver_blastp-GO:0034219_out.tab
wc -l ../output/04-carbon-transport/Pver_blastp-GO:0034219_out.tab
```


## Vizualize in R using heatmaps

```{r}
# Load necessary libraries
library(readr)
library(readxl)
library(tidyverse)

#install.packages("ComplexHeatmap")
library(ComplexHeatmap)
 
count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv")
all_genes <- count_mat$gene_id
#count_mat <- count_mat %>% select(-gene_id)
rownames(count_mat) <- all_genes

# Clean up the column names
colnames(count_mat) <- gsub("_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf", "", colnames(count_mat))

head(count_mat)
dim(count_mat)
```

The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the [Pverr genome paper](https://academic.oup.com/gbe/article/12/10/1911/5898631?login=false#supplementary-data). They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df 

Read in file with gene name iterations 
```{r}
names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>%
  select(Query, Gene)

count_mat <- count_mat %>% full_join(names, by = c("gene_id" = "Gene"))
count_mat
```

```{r}
# Read the tab file and extract row names
GO_0034219 <- read_delim("../output/04-carbon-transport/Pver_blastp-GO:0034219_out.tab", delim = "\t", col_names = FALSE)

colnames(GO_0034219) <- c("query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_openings", "q_start", "q_end", "s_start", 
                     "s_end", "e_value", "bit_score")
head(GO_0034219)

count_GO <- GO_0034219 %>%
  left_join(count_mat, by = c("query_id" = "Query" )) %>%
  na.omit()

# Check the filtered result
head(count_GO)
dim(count_GO)
colnames(count_GO)
```

```{r}
expression_data <- count_GO %>% select(-c(1:13))  # Adjust the column selection if necessary

# Set row names to gene identifiers
rownames(expression_data) <- count_GO$Gene

# Convert expression data to a matrix
expression_matrix <- as.matrix(expression_data)

# Calculate Z-scores for the expression matrix
z_score_matrix <- t(scale(t(expression_matrix)))

# Replace NA/NaN with 0
z_score_matrix[is.nan(z_score_matrix)] = 0

# Create the heatmap using ComplexHeatmap
Heatmap(z_score_matrix,
         na_col = "black",
         name = "Gene Count",         # Name for the heatmap color bar
         row_title = "Genes in GO term",               # Title for the row side
         column_title = "Samples",           # Title for the column side
         show_row_names = TRUE,              # Show gene names
         show_column_names = TRUE,           # Show sample names
         cluster_rows = TRUE,                # Cluster rows
         cluster_columns = TRUE,             # Cluster columns
         row_dend_reorder = TRUE,            # Reorder dendrogram based on clustering
         column_dend_reorder = TRUE, column_names_gp = gpar(fontsize = 6),
            # Reorder dendrogram based on clustering
         heatmap_legend_param = list(title = "Gene Count"))

# Create the heatmap using ComplexHeatmap
Heatmap(z_score_matrix,
         na_col = "black",
         name = "Gene Count",         # Name for the heatmap color bar
         row_title = "Genes in GO term",               # Title for the row side
         column_title = "Samples",           # Title for the column side
         show_row_names = TRUE,              # Show gene names
         show_column_names = TRUE,           # Show sample names
         cluster_rows = TRUE,                # Cluster rows
         cluster_columns = FALSE,             # Cluster columns
         row_dend_reorder = TRUE,            # Reorder dendrogram based on clustering
         column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6),
            # Reorder dendrogram based on clustering
         heatmap_legend_param = list(title = "Gene Count"))
```