---
title: "12-deg_annot_from_hisat"
author: "Megan Ewing"
date: "2024-05-13"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
# do once
# install.packages("R.utils")
library(tidyverse)
library(R.utils)
library(dplyr)

```

## Annotating our top DEGs using Blast and Uniprot Gene Ontology (GO) ids

```{r}

#read in DEG results
DEG <- read.csv("../output/0513-DEG.tab", sep = " ")
head(DEG)

```

```{r}
# make column with LOC names

LOCnames <- rownames(DEG)

DEG$LOC <- LOCnames

head(DEG)

```

```{r}
# read in BLAST results
blast <- read.delim("../output/rphil_blast_cds.tab", sep = "")
head(blast)
```

```{r}
# make sure the column names are the same for the merge
# not needed if you specify "by.x" and "by.y" in the merge function tho 

colnames(blast)[2] <- "LOC"
head(blast)
```

```{r}
# join by gene name

DEG_Blast <- left_join(x = DEG, y = blast, by = "LOC")
head(DEG_Blast)

write_delim(DEG_Blast, "../output/DEG_blast_cds_full.tab")

```

```{r}
# make a short file of just the DEG names and other desired data

DEG_annot <- data.frame( LOC = DEG_Blast$LOC, geneName = DEG_Blast$saccver, baseMean = DEG_Blast$baseMean, log2FoldChange = DEG_Blast$log2FoldChange, pvalue = DEG_Blast$pvalue, padj = DEG_Blast$padj, protein = DEG_Blast$protein, cds = DEG_Blast$cds  )

head(DEG_annot)


```

```{r}
# retrieve gene names for uniprot id lookup

# this is to get just the numbers from the geneName column for uniprot
# so sp|S8FGV1|LAC... becomes just S8FGV1
uniprot_id <- substr(DEG_annot$geneName, 4, 9)

head(uniprot_id)

# add this list of uniprot id names to the dataframe
DEG_annot$uniprot_id <- uniprot_id
head(DEG_annot)

# write to table for uniprot import (or could just copy and paste but i imported a text file to uniprot just for my own santity to make sure everything was included)
write.table(uniprot_id, "../output/uniprot_id_cds.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

uniprot_id_no_NA <- na.omit(uniprot_id)
write.table(uniprot_id_no_NA, "../output/uniprot_id_cds_no_NA.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)


```

```{r}

# retrieved our unirpot id GO file from the web interface https://www.uniprot.org/
# importing and unzipping file here

gunzip("../data/idmapping_2024_05_14.tsv.gz")

GO_id <- read.csv("../data/idmapping_2024_05_14.tsv", sep = '\t', header = TRUE, row.names=NULL)
head(GO_id)
```

```{r}
# join GO id info to our DEG_annot dataframe from earlier by the uniprot ids

clam_GO_annotations <- merge(DEG_annot, GO_id, by.x = "uniprot_id", by.y = "Entry")
head(clam_GO_annotations)

```

```{r}
# siic it looks all good so let's write to file

write.csv(clam_GO_annotations, "../output/clam_GO_annotations_cds_0514.csv")
```

```{r}
# getting a full list of *all* of the accession numbers for gene enrichment
# this is to get just the numbers from the geneName column for uniprot
# so sp|S8FGV1|LAC... becomes just S8FGV1
uniprot_id_full <- substr(rphil_blast_cds$saccver, 4, 9)

head(uniprot_id_full)

# write to table for uniprot import (or could just copy and paste but i imported a text file to uniprot just for my own santity to make sure everything was included)
write.table(uniprot_id_full, "../output/uniprot_id_cds_full.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)


```