---
title: "R Notebook"
author: "Megan Ewing"
date: "2024-11-22"
output:
  html_document: default
  word_document: default
---

```{r}

# load installed packages
library(dplyr)
library(readr)
library(tidyr)
```

### read in DEG and BLAST data

```{r}

# DEG statistics table. sep = "" makes read.csv() split fields on runs of
# whitespace rather than on commas.
stats <- read.csv(
  file = "../output/1020-STAR-DEGstats_ToC_8ind27r.tab",
  sep = ""
)
head(stats)

# Copy the row names into a regular LOC column so they can serve as a join
# key later on.
stats[["LOC"]] <- rownames(stats)
head(stats)

# BLAST results table, read with read.delim()'s defaults (tab-separated,
# header row).
blast <- read.delim(file = "../output/1125-blastout.tab")
head(blast)
```

### gene level

Select the hit with the lowest E-value for each gene to get gene-level BLAST results (as opposed to isoform-level).

```{r}

# Build the gene-level BLAST table: one row per LOC, keeping the hit with the
# smallest Evalue, then split Subject_ID into its components.

blast_gene <- blast %>%
  group_by(LOC) %>%
  # One group per gene ID.

  dplyr::slice(which.min(Evalue)) %>%
  # Keep the row with the smallest Evalue in each gene. which.min() returns
  # the first minimum, so ties keep the earliest row as ordered in `blast`.

  ungroup() %>%
  # Drop the LOC grouping so the result is a plain tibble and later verbs do
  # not silently operate per gene.

  separate(
    Subject_ID,
    sep = "\\|",
    into = c(NA, "SPID", "gene.Uni"),
    remove = FALSE
  ) %>%
  # Split 'Subject_ID' on "|" into three pieces; the NA in `into` discards
  # the first piece, the other two become 'SPID' and 'gene.Uni'. The original
  # 'Subject_ID' column is kept.

  separate(
    gene.Uni,
    sep = "_",
    into = c("gene.Uni", "species"),
    remove = TRUE
  )
  # Split 'gene.Uni' on "_" into 'gene.Uni' and 'species'.

head(blast_gene)
```

### join deg info to blast info by LOC

```{r}

# Attach the gene-level BLAST annotation to the DEG statistics. A left join
# keeps every row of `stats`; rows whose LOC is absent from `blast_gene` get
# NA in the annotation columns.
DEG_annot <- stats %>%
  left_join(blast_gene, by = "LOC")
head(DEG_annot)

# Use the LOC identifiers as row names of the joined table.
rownames(DEG_annot) <- DEG_annot[["LOC"]]
head(DEG_annot)

```

### getting uniprot info

first need to extract SwissProt IDs

```{r}

# SwissProt accessions for the DEGs, in the row order of DEG_annot.
SPID <- DEG_annot$SPID
head(SPID)

# Write one accession per line for upload to UniProt. Genes without a BLAST
# hit carry NA in SPID, which write() would emit as the literal text "NA",
# and several genes can share the same accession; drop both so the file
# holds only distinct, real IDs. `SPID` itself is left untouched.
write(unique(SPID[!is.na(SPID)]), "../output/1126-DEG_SPID_star.txt")


```

Mapped the SPIDs on the UniProt website, saved the results as a TSV, and unzipped the file. Importing it here.

```{r}

# Import the UniProt ID-mapping export. read_delim() lives in readr, so it is
# called through its namespace and works even if readr is not attached; the
# delimiter is stated explicitly because the file is a TSV.
uniprot <- readr::read_delim(
  "../output/idmapping_2024_11_27.tsv",
  delim = "\t"
)
head(uniprot)

# Rename the "From" column to "SPID" so it matches the join key used in
# DEG_annot. If no column is named "From", nothing is changed.
names(uniprot)[names(uniprot) == "From"] <- "SPID"
head(uniprot)
```

### join blast and uniprot info

```{r}

# Add the UniProt annotation to each DEG by SwissProt accession. A left join
# keeps every row of DEG_annot; rows with no matching SPID in `uniprot` get
# NA in the UniProt columns.
DEG_annot2 <- left_join(DEG_annot, uniprot, by = "SPID")
head(DEG_annot2)

# Export the annotated table with a header row. write_csv() lives in readr,
# so it is called through its namespace and works even if readr is not
# attached.
readr::write_csv(
  DEG_annot2,
  "../output/1126-DEG_annot_star.csv",
  col_names = TRUE
)

```