---
title: "03-RNAseqSamples"
author: "Celeste Valdivia"
date: "2024-04-15"
output: html_document
---

```{r, eval =TRUE}
# Attach the packages used in this document. library(tidyverse) already
# attaches tidyr, dplyr, ggplot2 and stringr; the explicit calls are kept so
# each dependency stays visible.
library(knitr)
library(tidyverse)
library(tidyr)
library(dplyr)
library(hrbrthemes)
library(ggplot2)
library(RColorBrewer)
library(ggpubr)
library(stringr)
# mdy() is used further down to parse the date columns. library(tidyverse)
# only attaches lubridate from tidyverse 2.0.0 onward, so load it explicitly.
library(lubridate)
# Defaults for every later chunk: show the code, but do not run a chunk
# unless it sets eval = TRUE itself.
knitr::opts_chunk$set(echo = TRUE,
                      eval = FALSE)
```

# Objective

Determine which 12 of the 39 RNA samples available from the Autumn 2023 nickel trials to send off for RNAseq. We are targeting all late stage (C2-D) and high (45 mg/L nickel) vs control.

# Retrieve Data
We will need both the morphometric data and the nanodrop readings. 

```{r, engine='bash', eval=FALSE}
# Step up one directory so the relative data/ path below resolves.
# NOTE(review): assumes this notebook sits one level below the project root — confirm.
cd ..

# Download the sheet as CSV (presumably the morphometric data, given the
# output name). The URL is quoted because it contains `?`, which the shell
# would otherwise treat as a glob character. tee saves the file and also
# echoes the content to the chunk output.
curl -L "https://docs.google.com/spreadsheets/d/10uM3N3PD9xIP4yUnadfhkcXa8TPMRMD-adOIKbmkYzY/export?exportFormat=csv" | tee data/morph.csv
```

Download the NanoDrop readings:
```{r, engine='bash', eval=FALSE}

# Step up one directory so the relative data/ path below resolves.
# NOTE(review): assumes this notebook sits one level below the project root — confirm.
cd ..

# Download the NanoDrop sheet as CSV. The URL is quoted because it contains
# `?`, which the shell would otherwise treat as a glob character. tee saves
# the file and also echoes the content to the chunk output.
curl -L "https://docs.google.com/spreadsheets/d/1A8RAq72h2FLstFS7FcUE5hDpYsaJg9ql0M7iGxuN2xA/export?exportFormat=csv" | tee data/nanodrop.csv
```

Read in the data to your local R environment.

```{r, eval=TRUE}
# Read the morphometric table. The path is given relative to this document's
# folder (one level below data/'s parent) instead of calling setwd(), which
# knitr restores after the chunk and warns about.
morph <- read.csv(file = file.path("..", "data", "morph.csv"))
```

``` {r}
# Read the NanoDrop table. The path is given relative to this document's
# folder (one level below data/'s parent) instead of calling setwd(), which
# knitr restores after the chunk and warns about.
nano <- read.csv(file = file.path("..", "data", "nanodrop.csv"))
```

# Cleaning up Data for Morph

```{r, eval=TRUE}
# Parse the month-day-year text in `date` into real Date values.
morph <- morph %>%
  mutate(date = mdy(date))
```

```{r, eval=TRUE}
# Split jar_id at the hyphen into two new columns: treatment and replicate.
morph <- separate(
  morph,
  col = jar_id,
  into = c("treatment", "replicate"),
  sep = "-"
)
```

```{r, eval=TRUE}
# Store the grouping columns as factors. `date` is converted too, so it is a
# factor (no longer a Date) from here on.
morph <- mutate(
  morph,
  across(c(treatment, stage, animal_id, date), as.factor)
)
```

```{r}
# Add `simple_stage`: the detailed stage label collapsed to a single letter
# (A1/A2 -> A, B1/B2 -> B, C1/C2 -> C, TO -> D). Any other label, or a
# missing stage, has no entry in the lookup and therefore becomes NA.
morph <- mutate(
  morph,
  simple_stage = unname(
    c(
      A1 = "A", A2 = "A",
      B1 = "B", B2 = "B",
      C1 = "C", C2 = "C",
      TO = "D"
    )[as.character(stage)]
  )
)
```

New data frame with only the rows after 24 hours of exposure when the RNA samples were saved.

```{r}
# Keep only the rows recorded at hpe == 24. which() drops rows whose hpe is
# missing; a bare logical index would turn every NA into an all-NA row.
morph_24 <- morph[which(morph$hpe == 24), ]
```


# Cleaning up Nano Data

Extract identifiers from the sample_id column and make new columns.

```{r}
# Pull two identifiers out of sample_id:
#   jar_id     - 0, 5 or 45 followed by "-R" and a digit 1-4
#   experiment - "E" followed by a digit 1-4
# str_extract() returns NA where the pattern is not found.
nano <- mutate(
  nano,
  jar_id = str_extract(sample_id, "(0|5|45)-R[1-4]"),
  experiment = str_extract(sample_id, "E[1-4]")
)
```

```{r, eval=TRUE}
# Split jar_id at the hyphen into two new columns: treatment and replicate.
nano <- separate(
  nano,
  col = jar_id,
  into = c("treatment", "replicate"),
  sep = "-"
)
```


```{r, eval=TRUE}
# Parse the month-day-year text in `date` into real Date values.
nano <- nano %>%
  mutate(date = mdy(date))
```

```{r, eval=TRUE}
# Store the grouping columns as factors. `date` is converted too, so it is a
# factor (no longer a Date) from here on.
nano <- mutate(
  nano,
  across(c(treatment, experiment, date), as.factor)
)
```

# Join Data Frames

```{r}
# Inner join: keep only the combinations of date, experiment, replicate and
# treatment that occur in both the NanoDrop and the morphometric tables.
merged_data <- nano %>%
  inner_join(
    morph,
    by = c("date", "experiment", "replicate", "treatment")
  )
```

# Reduce data frame to only columns of interest
Major note: The system that was actually saved at the 24-hour mark for the RNA work is system 1. System 2 was set aside for the proteomic work. If only one of the two systems was tied, it was the one set aside for proteomics.

RNAseq animal requirements, in descending order of importance:

-   Stage C2 or TO/D
-   45 mg/L or 0 mg/L (although I wrote out so the csv file also contains the 5 mg/L option just in case we go with that instead)
-   Minimum 200 ng RNA available
-   A260/A280 > 1.8 (ideal; the filter below keeps samples above 1.5, so borderline samples may need a clean-up step)
-   Both Attached

```{r}
# Keep only the 24-hour rows, since that is the day everything was saved in
# liquid nitrogen, and only the columns of interest (selected by position).
# which() skips rows with a missing hpe; a bare logical index would turn each
# NA into an all-NA row.
# NOTE(review): the positions depend on the column order of the joined
# table — re-check them if either input sheet changes.
merged <- merged_data[
  which(merged_data$hpe == 24),
  c(1, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 20)
]

# No proteomic samples are available from E1 or E2 because those animals
# were sacrificed for dissections, so their count is overwritten with 0.
merged <- merged %>%
  mutate(no_prim_sys2_Prot = case_when(
    experiment %in% c("E1", "E2") ~ 0,
    TRUE ~ no_prim_sys2_Prot
  ))

# Derive the attachment state of each system once, for every row, so both
# output tables share a single definition:
#   attachment_sys2_prot - system intended for proteomics; counted as tied
#                          whenever anything was tied at freezing
#   attachment_sys1_RNA  - system intended for RNA; "one tied" still counts
#                          as attached here
# Any other value of `attachment` (including NA) becomes NA in both columns.
merged_annotated <- merged %>%
  mutate(
    attachment_sys2_prot = case_when(
      attachment == "attached" ~ "attached",
      attachment %in% c("both tied", "one tied", "tied") ~ "tied",
      TRUE ~ NA_character_
    ),
    attachment_sys1_RNA = case_when(
      attachment %in% c("attached", "one tied") ~ "attached",
      attachment %in% c("both tied", "tied") ~ "tied",
      TRUE ~ NA_character_
    )
  )

# Late-stage animals (C2 or D) whose RNA quality ratio is above 1.5. Ideally
# the ratio is greater than 1.8, so these samples may need a clean-up step.
# which() drops rows where stage_2 or A260.A280 is missing instead of
# emitting all-NA rows into the exported table.
late_stage_rows <- which(
  merged_annotated$stage_2 == "late" & merged_annotated$A260.A280 > 1.5
)

# Column 15 is the original `attachment` column; it is dropped because the
# two derived columns replace it.
merged_all_rows <- merged_annotated[, -15]

merged_late_stage <- merged_annotated[late_stage_rows, -15]
```

Only have four 45 mg/L samples available for this RNAseq study.

```{r}
# Write both tables to the output/ folder one level above this document.
# Relative paths replace setwd(), which knitr restores after the chunk and
# warns about.
write.csv(
  merged_late_stage,
  file = file.path("..", "output", "RNAquality_sampleID_latestage_RNAseq.csv")
)

write.csv(
  merged_all_rows,
  file = file.path("..", "output", "RNAquality_sampleID_allsamples.csv")
)
```