---
title: "Comparing gene tables"
output: html_document
date: "2024-02-20"
---
### Load libraries
```{r load_libraries, inlcude = TRUE}
## clear
rm(list=ls())
# List of packages we want to install (run every time)
load.lib<-c("RColorBrewer","readxl","ggpubr","tidyverse","tibble","stringr","beepr","gplots")
# Select only the packages that aren't currently installed (run every time)
install.lib <- load.lib[!load.lib %in% installed.packages()]
# And finally we install the missing packages, including their dependency.
for(lib in install.lib) install.packages(lib,dependencies=TRUE)
# After the installation process completes, we load all packages.
sapply(load.lib,require,character=TRUE)
#invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE))
library(tidyverse)
library(dplyr)
```
```{r}
library(tidyr)
# Load output of blastx, uniprot, and GO into a single masterID table
masterID <- read.delim("output/LOC_GO_list.txt", sep= "\t", header = TRUE)
#Cut out extras
masterID <- masterID[, -((ncol(masterID)-2):ncol(masterID))]
# Rename remaining columns
colnames(masterID) <- c("transcript", "database","uniprot_accession","geneID","species", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "reviewed", "entryname", "protein_name", "gene_name", "organism", "prot_length", "GO_ID", "GO_BP", "GO_MF", "GO_CC", "Genename_orderedlocus", "Induction", "Pathwaycommons", "Reactome", "Unipathway", "Interacts_with", "LOC_ID")
```
### FDO merge
```{r}
#FDO_LC_siggene <- read.delim("output/DEG_lists/Foot/FDO_LC_siggene.csv", sep = " ", header = TRUE)
FDO_TC_siggenes_apeglm <- read.delim("output/DEG_lists/Foot/FDO_TC_siggene_apeglm.csv", sep = " ", header = TRUE)
FDO_TC_siggenes_apeglm <- FDO_TC_siggenes_apeglm %>%
mutate(LOC_ID = str_extract(gene, "LOC\\d+"))
# FDO_sigs <- inner_join(FDO_TC_siggene, FDO_LC_siggene, by = 'gene')
# #basemean x is TC, basemean y is LC
#
# FDO_sigs$log2FoldChange <- FDO_sigs$log2FoldChange.x-FDO_sigs$log2FoldChange.y
# FDO_sigs_inter <- FDO_sigs[,c(1,12)]
#
# # Merge dataframes based on the shared column 'ID' and replace values from df2 into df1
# FDO_sigs_merged <- merge(FDO_TC_siggene, FDO_sigs_inter, by = "gene", all.x = TRUE)
#
# # Replace values in df_y with values from df_x based on a condition
# FDO_sigs_merged$log2FoldChange.x <- ifelse(!is.na(FDO_sigs_merged$log2FoldChange.y) , FDO_sigs_merged$log2FoldChange.y, FDO_sigs_merged$log2FoldChange.x)
#
# # Drop the extra column "Value_x" if needed
# FDO_sigs_merged <- FDO_sigs_merged[, !(names(FDO_sigs_merged) %in% c("log2FoldChange.y"))]
# # change gene column to be called transcript
# colnames(FDO_sigs_merged) <- c("transcript", "baseMean", "log2FoldChange","lfcSE","pvalue", "padj")
FDO_TC_siggene <- left_join(FDO_TC_siggenes_apeglm, masterID, by = "LOC_ID")
FDO_sigs_unID <- FDO_TC_siggene[is.na(FDO_TC_siggene$geneID), ]
FDO_sigs_ID <- FDO_TC_siggene[!is.na(FDO_TC_siggene$geneID), ]
nrow(FDO_sigs_ID) # 361 id'ed genes
nrow(FDO_sigs_unID) # 99 unID'd genes
# 460 total expressed
#Write file of merged files
write_csv(FDO_TC_siggene, file = "output/DEG_lists/GOterms_genome/FDO_sigs_merged.csv")
write_csv(FDO_sigs_unID, file = "output/DEG_lists/GOterms_genome/FDO_sigs_unID.csv")
write_csv(FDO_sigs_ID, file = "output/DEG_lists/GOterms_genome/FDO_sigs_ID.csv")
```
### GDO merge
```{r}
#GDO_LC_siggene <- read.delim("output/DEG_lists/Gill/GDO_LC_siggene.csv", sep = " ", header = TRUE)
GDO_TC_siggene <- read.delim("output/DEG_lists/Gill/GDO_TC_siggene_apeglm.csv", sep = " ", header = TRUE)
GDO_TC_siggene <- GDO_TC_siggene %>%
mutate(LOC_ID = str_extract(gene, "LOC\\d+"))
# GDO_sigs <- inner_join(GDO_TC_siggene, GDO_LC_siggene, by = 'gene')
# #basemean x is TC, basemean y is LC
#
# GDO_sigs$log2FoldChange <- GDO_sigs$log2FoldChange.x-GDO_sigs$log2FoldChange.y
# GDO_sigs_inter <- GDO_sigs[,c(1,12)]
#
# # Merge dataframes based on the shared column 'ID' and replace values from df2 into df1
# GDO_sigs_merged <- merge(GDO_TC_siggene, GDO_sigs_inter, by = "gene", all.x = TRUE)
#
# # Replace values in df_y with values from df_x based on a condition
# GDO_sigs_merged$log2FoldChange.x <- ifelse(!is.na(GDO_sigs_merged$log2FoldChange.y) , GDO_sigs_merged$log2FoldChange.y, GDO_sigs_merged$log2FoldChange.x)
#
# # Drop the extra column "Value_x" if needed
# GDO_sigs_merged <- GDO_sigs_merged[, !(names(GDO_sigs_merged) %in% c("log2FoldChange.y"))]
#
# # change gene column to be called transcript
# colnames(GDO_sigs_merged) <- c("transcript", "baseMean", "log2FoldChange","lfcSE","pvalue", "padj")
GDO_sigs_merged <- left_join(GDO_TC_siggene, masterID, by = "LOC_ID")
GDO_sigs_unID <- GDO_sigs_merged[is.na(GDO_sigs_merged$geneID), ]
GDO_sigs_ID <- GDO_sigs_merged[!is.na(GDO_sigs_merged$geneID), ]
nrow(GDO_sigs_ID) # 301 id'ed genes
nrow(GDO_sigs_unID) # 108 unID'd genes
# 409 genes total diff expressed between GDO and GTC
#Write file of merged files
write_csv(GDO_sigs_merged, file = "output/DEG_lists/GOterms_genome/GDO_sigs_merged.csv")
write_csv(GDO_sigs_unID, file = "output/DEG_lists/GOterms_genome/GDO_sigs_unID.csv")
write_csv(GDO_sigs_ID, file = "output/DEG_lists/GOterms_genome/GDO_sigs_ID.csv")
```
### FOA merge
```{r}
#FOA_LC_siggene <- read.delim("output/DEG_lists/Foot/FOA_LC_siggene.csv", sep = " ", header = TRUE)
FOA_TC_siggene <- read.delim("output/DEG_lists/Foot/FOA_TC_siggene.csv", sep = " ", header = TRUE)
FOA_TC_siggene <- FOA_TC_siggene %>%
mutate(LOC_ID = str_extract(gene, "LOC\\d+"))
# FOA_sigs <- inner_join(FOA_TC_siggene, FOA_LC_siggene, by = 'gene')
# #basemean x is TC, basemean y is LC
#
# FOA_sigs$log2FoldChange <- FOA_sigs$log2FoldChange.x-FOA_sigs$log2FoldChange.y
# FOA_sigs_inter <- FOA_sigs[,c(1,12)]
#
# # Merge dataframes based on the shared column 'ID' and replace values from df2 into df1
# FOA_sigs_merged <- merge(FOA_TC_siggene, FOA_sigs_inter, by = "gene", all.x = TRUE)
#
# # Replace values in df_y with values from df_x based on a condition
# FOA_sigs_merged$log2FoldChange.x <- ifelse(!is.na(FOA_sigs_merged$log2FoldChange.y) , FOA_sigs_merged$log2FoldChange.y, FOA_sigs_merged$log2FoldChange.x)
#
# # Drop the extra column "Value_y" if needed
# FOA_sigs_merged <- FOA_sigs_merged[, !(names(FOA_sigs_merged) %in% c("log2FoldChange.y"))]
#
# # change gene column to be called transcript
# colnames(FOA_sigs_merged) <- c("transcript", "baseMean", "log2FoldChange","lfcSE","pvalue", "padj")
FOA_sigs_merged <- left_join(FOA_TC_siggene, masterID, by = "LOC_ID")
FOA_sigs_unID <- FOA_sigs_merged[is.na(FOA_sigs_merged$geneID), ]
FOA_sigs_ID <- FOA_sigs_merged[!is.na(FOA_sigs_merged$geneID), ]
nrow(FOA_sigs_ID) # 72 id'ed genes
nrow(FOA_sigs_unID) # 20 unID'd genes
#92 genes total diff expressed between FOA and FTC
#Write file of merged files
write_csv(FOA_sigs_merged, file = "output/DEG_lists/GOterms_genome/FOA_sigs_merged.csv")
write_csv(FOA_sigs_unID, file = "output/DEG_lists/GOterms_genome/FOA_sigs_unID.csv")
write_csv(FOA_sigs_ID, file = "output/DEG_lists/GOterms_genome/FOA_sigs_ID.csv")
```
### GOA merge
```{r}
#GOA_LC_siggene <- read.delim("output/DEG_lists/Gill/GOA_LC_siggene.csv", sep = " ", header = TRUE)
GOA_TC_siggene <- read.delim("output/DEG_lists/Gill/GOA_TC_siggene.csv", sep = " ", header = TRUE)
GOA_TC_siggene <- GOA_TC_siggene %>%
mutate(LOC_ID = str_extract(gene, "LOC\\d+"))
#GOA_sigs <- inner_join(GOA_TC_siggene, GOA_LC_siggene, by = 'gene')
#basemean x is TC, basemean y is LC
#
# GOA_sigs$log2FoldChange <- GOA_sigs$log2FoldChange.x-GOA_sigs$log2FoldChange.y
# GOA_sigs_inter <- GOA_sigs[,c(1,12)]
#
# # Merge dataframes based on the shared column 'ID' and replace values from df2 into df1
# GOA_sigs_merged <- merge(GOA_TC_siggene, GOA_sigs_inter, by = "gene", all.x = TRUE)
#
# # Replace values in df_y with values from df_x based on a condition
# GOA_sigs_merged$log2FoldChange.x <- ifelse(!is.na(GOA_sigs_merged$log2FoldChange.y) , GOA_sigs_merged$log2FoldChange.y, GOA_sigs_merged$log2FoldChange.x)
#
# # Drop the extra column "Value_y" if needed
# GOA_sigs_merged <- GOA_sigs_merged[, !(names(GOA_sigs_merged) %in% c("log2FoldChange.y"))]
#
# # change gene column to be called transcript
# colnames(GOA_sigs_merged) <- c("transcript", "baseMean", "log2FoldChange","lfcSE","pvalue", "padj")
#
GOA_TC_siggene <- GOA_TC_siggene %>%
filter(!is.na(LOC_ID))
GOA_sigs_merged <- left_join(GOA_TC_siggene, masterID, by = "LOC_ID")
GOA_sigs_unID <- GOA_sigs_merged[is.na(GOA_sigs_merged$geneID), ]
GOA_sigs_ID <- GOA_sigs_merged[!is.na(GOA_sigs_merged$geneID), ]
nrow(GOA_sigs_ID) # 592 id'ed genes
nrow(GOA_sigs_unID) # 248 unID'd genes
# 830 genes total diff expressed between GOA and GTC
#Write file of merged files
write_csv(GOA_sigs_merged, file = "output/DEG_lists/GOterms_genome/GOA_sigs_merged.csv")
write_csv(GOA_sigs_unID, file = "output/DEG_lists/GOterms_genome/GOA_sigs_unID.csv")
write_csv(GOA_sigs_ID, file = "output/DEG_lists/GOterms_genome/GOA_sigs_ID.csv")
```
### FOW merge
```{r}
# FOW_LC_siggene <- read.delim("output/DEG_lists/Foot/FOW_LC_siggene.csv", sep = " ", header = TRUE)
FOW_TC_siggene <- read.delim("output/DEG_lists/Foot/FOW_TC_siggene.csv", sep = " ", header = TRUE)
FOW_TC_siggene <- FOW_TC_siggene %>%
mutate(LOC_ID = str_extract(gene, "LOC\\d+"))
#
# FOW_sigs <- inner_join(FOW_TC_siggene, FOW_LC_siggene, by = 'gene')
# #basemean x is TC, basemean y is LC
#
# FOW_sigs$log2FoldChange <- FOW_sigs$log2FoldChange.x-FOW_sigs$log2FoldChange.y
# FOW_sigs_inter <- FOW_sigs[,c(1,12)]
#
# # Merge dataframes based on the shared column 'ID' and replace values from df2 into df1
# FOW_sigs_merged <- merge(FOW_TC_siggene, FOW_sigs_inter, by = "gene", all.x = TRUE)
#
# # Replace values in df_y with values from df_x based on a condition
# FOW_sigs_merged$log2FoldChange.x <- ifelse(!is.na(FOW_sigs_merged$log2FoldChange.y) , FOW_sigs_merged$log2FoldChange.y, FOW_sigs_merged$log2FoldChange.x)
#
# # Drop the extra column "Value_y" if needed
# FOW_sigs_merged <- FOW_sigs_merged[, !(names(FOW_sigs_merged) %in% c("log2FoldChange.y"))]
#
# # change gene column to be called transcript
# colnames(FOW_sigs_merged) <- c("transcript", "baseMean", "log2FoldChange","lfcSE","pvalue", "padj")
FOW_sigs_merged <- left_join(FOW_TC_siggene, masterID, by = "LOC_ID")
FOW_sigs_unID <- FOW_sigs_merged[is.na(FOW_sigs_merged$geneID), ]
FOW_sigs_ID <- FOW_sigs_merged[!is.na(FOW_sigs_merged$geneID), ]
nrow(FOW_sigs_ID) # 152 id'ed genes
nrow(FOW_sigs_unID) # 49 unID'd genes
# 201 genes total diff expressed between FOW and FTC
#Write file of merged files
write_csv(FOW_sigs_merged, file = "output/DEG_lists/GOterms_genome/FOW_sigs_merged.csv")
write_csv(FOW_sigs_unID, file = "output/DEG_lists/GOterms_genome/FOW_sigs_unID.csv")
write_csv(FOW_sigs_ID, file = "output/DEG_lists/GOterms_genome/FOW_sigs_ID.csv")
```
### GOW merge
```{r}
#GOW_LC_siggene <- read.delim("output/DEG_lists/Gill/GOW_LC_siggene.csv", sep = " ", header = TRUE)
GOW_TC_siggene <- read.delim("output/DEG_lists/Gill/GOW_TC_siggene_apeglm.csv", sep = " ", header = TRUE)
GOW_TC_siggene <- GOW_TC_siggene %>%
mutate(LOC_ID = str_extract(gene, "LOC\\d+"))
# GOW_sigs <- inner_join(GOW_TC_siggene, GOW_LC_siggene, by = 'gene')
# #basemean x is TC, basemean y is LC
#
# GOW_sigs$log2FoldChange <- GOW_sigs$log2FoldChange.x-GOW_sigs$log2FoldChange.y
# GOW_sigs_inter <- GOW_sigs[,c(1,12)]
#
# # Merge dataframes based on the shared column 'ID' and replace values from df2 into df1
# GOW_sigs_merged <- merge(GOW_TC_siggene, GOW_sigs_inter, by = "gene", all.x = TRUE)
#
# # Replace values in df_y with values from df_x based on a condition
# GOW_sigs_merged$log2FoldChange.x <- ifelse(!is.na(GOW_sigs_merged$log2FoldChange.y) , GOW_sigs_merged$log2FoldChange.y, GOW_sigs_merged$log2FoldChange.x)
#
# # Drop the extra column "Value_y" if needed
# GOW_sigs_merged <- GOW_sigs_merged[, !(names(GOW_sigs_merged) %in% c("log2FoldChange.y"))]
#
# # change gene column to be called transcript
# colnames(GOW_sigs_merged) <- c("transcript", "baseMean", "log2FoldChange","lfcSE","pvalue", "padj")
GOW_sigs_merged <- left_join(GOW_TC_siggene, masterID, by = "LOC_ID")
GOW_sigs_unID <- GOW_sigs_merged[is.na(GOW_sigs_merged$geneID), ]
GOW_sigs_ID <- GOW_sigs_merged[!is.na(GOW_sigs_merged$geneID), ]
nrow(GOW_sigs_ID) # 198 id'ed genes
nrow(GOW_sigs_unID) # 53 unID'd genes
# 251 genes total diff expressed between GOW and GTC
#Write file of merged files
write_csv(GOW_sigs_merged, file = "output/DEG_lists/GOterms_genome/GOW_sigs_merged.csv")
write_csv(GOW_sigs_unID, file = "output/DEG_lists/GOterms_genome/GOW_sigs_unID.csv")
write_csv(GOW_sigs_ID, file = "output/DEG_lists/GOterms_genome/GOW_sigs_ID.csv")
```