---
title: "Comparing gene tables"
output: html_document
date: "2024-02-20"
---
### Load libraries
```{r load_libraries, include = TRUE}
## Clear the global environment.
# NOTE(review): rm(list = ls()) only removes objects from the global
# environment; it does not detach packages or reset options. Kept so the
# interactive behaviour of this chunk is unchanged.
rm(list = ls())
# Packages this analysis needs (checked on every run)
load.lib <- c("RColorBrewer", "readxl", "ggpubr", "tidyverse", "tibble", "stringr", "beepr", "gplots")
# Keep only the packages that are not installed yet. Compare against the row
# names (the package names) of installed.packages(): matching against the whole
# matrix would also match cells of its other columns (e.g. Suggests), so a
# missing package could be mistaken for an installed one.
install.lib <- load.lib[!load.lib %in% rownames(installed.packages())]
# Install the missing packages, including their dependencies.
for (lib in install.lib) install.packages(lib, dependencies = TRUE)
# Attach every package. `character.only` is spelled out in full instead of
# relying on partial argument matching, and vapply() guarantees a named
# logical vector (TRUE = attached) whatever the input.
loaded.lib <- vapply(load.lib, require, logical(1), character.only = TRUE)
loaded.lib
# require() only returns FALSE when a package cannot be attached, so report
# all failures in one explicit warning instead of leaving them unnoticed.
if (!all(loaded.lib)) {
  warning(
    "Packages that failed to load: ",
    paste(names(loaded.lib)[!loaded.lib], collapse = ", "),
    call. = FALSE
  )
}
#invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE))
library(tidyverse)
library(dplyr)
```
```{r}
library(tidyr)
# Load the combined blastx / UniProt / GO output into a single masterID table.
# The file is read as space-delimited text with a header row.
masterID <- read.delim("data/Mtros_GO.txt", sep = " ", header = TRUE)
# Remove the "V11" column (its content is unknown). A logical mask is used
# instead of -which(...): if "V11" were absent, which() would return
# integer(0) and the negative index would silently drop every column.
masterID <- masterID[, names(masterID) != "V11", drop = FALSE]
# Rename the remaining columns. This is positional, so exactly five columns
# must be left at this point (otherwise the assignment errors).
colnames(masterID) <- c("transcript", "V2", "protein.names", "GOterm.BP", "GO.IDs")
# Separate V2 into its constituent parts by splitting on "|" and "_":
# sp, the UniProt accession, geneID, and species.
# NOTE(review): presumably V2 looks like "sp|<accession>|<gene>_<species>" -
# confirm against the input file.
masterID <- separate(
  masterID, V2,
  into = c("sp", "uniprot_accession", "geneID", "species"),
  sep = "[\\|_]"
)
# Keep only the columns we need (this drops "sp") in a reader-friendly order.
masterID <- masterID[, c("transcript", "species", "geneID", "uniprot_accession", "protein.names", "GO.IDs", "GOterm.BP")]
```
### FDO merge
```{r}
# Read the significant-gene tables of the LC and TC contrasts
# (space-delimited text with a header row).
FDO_LC_siggene <- read.delim("output/DEG_lists/Foot/FDO_LC_siggene.csv", sep = " ", header = TRUE)
FDO_TC_siggene <- read.delim("output/DEG_lists/Foot/FDO_TC_siggene_apeglm.csv", sep = " ", header = TRUE)
# Fail early if a column the steps below rely on is missing.
stopifnot(
  all(c("gene", "log2FoldChange") %in% names(FDO_LC_siggene)),
  all(c("gene", "log2FoldChange") %in% names(FDO_TC_siggene))
)
# Genes present in both tables. Columns shared by both inputs get a suffix:
# .x comes from TC, .y comes from LC.
FDO_sigs <- inner_join(FDO_TC_siggene, FDO_LC_siggene, by = "gene")
# Difference of the fold changes (TC minus LC) for the shared genes.
FDO_sigs$log2FoldChange <- FDO_sigs$log2FoldChange.x - FDO_sigs$log2FoldChange.y
# Keep the gene and the difference. Selected by name (previously c(1, 12)) so
# the right columns are picked even if an input carries extra columns.
FDO_sigs_inter <- FDO_sigs[, c("gene", "log2FoldChange")]
# Left-merge the TC table with the differences on 'gene'. merge() suffixes the
# clashing column: log2FoldChange.x is the TC value, log2FoldChange.y the
# difference (NA for genes that are only in the TC table).
FDO_sigs_merged <- merge(FDO_TC_siggene, FDO_sigs_inter, by = "gene", all.x = TRUE)
# Where a difference exists it replaces the TC fold change.
FDO_sigs_merged$log2FoldChange.x <- ifelse(
  !is.na(FDO_sigs_merged$log2FoldChange.y),
  FDO_sigs_merged$log2FoldChange.y,
  FDO_sigs_merged$log2FoldChange.x
)
# Drop the helper column now that its values have been copied over.
FDO_sigs_merged$log2FoldChange.y <- NULL
# Rename by name rather than by position: gene -> transcript (the join key of
# masterID) and log2FoldChange.x -> log2FoldChange.
names(FDO_sigs_merged)[names(FDO_sigs_merged) == "gene"] <- "transcript"
names(FDO_sigs_merged)[names(FDO_sigs_merged) == "log2FoldChange.x"] <- "log2FoldChange"
# Attach the annotation; transcripts without a match get NA annotation columns.
FDO_sigs_merged <- left_join(FDO_sigs_merged, masterID, by = "transcript")
# Split into rows with and without a geneID.
FDO_sigs_unID <- FDO_sigs_merged[is.na(FDO_sigs_merged$geneID), ]
FDO_sigs_ID <- FDO_sigs_merged[!is.na(FDO_sigs_merged$geneID), ]
nrow(FDO_sigs_ID) # 7 id'ed genes
nrow(FDO_sigs_unID) # 8 unID'd genes
# 15 genes total diff expressed between FDO and FTC
# Write the merged table and its two subsets (write.table default separator,
# i.e. space-delimited despite the .csv extension).
write.table(FDO_sigs_merged, file = "output/DEG_lists/Foot/FDO_sigs_merged.csv", row.names = FALSE)
write.table(FDO_sigs_unID, file = "output/DEG_lists/Foot/FDO_sigs_unID.csv", row.names = FALSE)
# NOTE(review): this table goes to with_GO_terms/ while the two writes above
# go to Foot/ - confirm that the directory is intended.
write.table(FDO_sigs_ID, file = "output/DEG_lists/with_GO_terms/FDO_sigs_ID.csv", row.names = FALSE)
```
### GDO merge
```{r}
# Read the significant-gene tables of the LC and TC contrasts
# (space-delimited text with a header row).
GDO_LC_siggene <- read.delim("output/DEG_lists/Gill/GDO_LC_siggene.csv", sep = " ", header = TRUE)
GDO_TC_siggene <- read.delim("output/DEG_lists/Gill/GDO_TC_siggene_apeglm.csv", sep = " ", header = TRUE)
# Fail early if a column the steps below rely on is missing.
stopifnot(
  all(c("gene", "log2FoldChange") %in% names(GDO_LC_siggene)),
  all(c("gene", "log2FoldChange") %in% names(GDO_TC_siggene))
)
# Genes present in both tables. Columns shared by both inputs get a suffix:
# .x comes from TC, .y comes from LC.
GDO_sigs <- inner_join(GDO_TC_siggene, GDO_LC_siggene, by = "gene")
# Difference of the fold changes (TC minus LC) for the shared genes.
GDO_sigs$log2FoldChange <- GDO_sigs$log2FoldChange.x - GDO_sigs$log2FoldChange.y
# Keep the gene and the difference. Selected by name (previously c(1, 12)) so
# the right columns are picked even if an input carries extra columns.
GDO_sigs_inter <- GDO_sigs[, c("gene", "log2FoldChange")]
# Left-merge the TC table with the differences on 'gene'. merge() suffixes the
# clashing column: log2FoldChange.x is the TC value, log2FoldChange.y the
# difference (NA for genes that are only in the TC table).
GDO_sigs_merged <- merge(GDO_TC_siggene, GDO_sigs_inter, by = "gene", all.x = TRUE)
# Where a difference exists it replaces the TC fold change.
GDO_sigs_merged$log2FoldChange.x <- ifelse(
  !is.na(GDO_sigs_merged$log2FoldChange.y),
  GDO_sigs_merged$log2FoldChange.y,
  GDO_sigs_merged$log2FoldChange.x
)
# Drop the helper column now that its values have been copied over.
GDO_sigs_merged$log2FoldChange.y <- NULL
# Rename by name rather than by position: gene -> transcript (the join key of
# masterID) and log2FoldChange.x -> log2FoldChange.
names(GDO_sigs_merged)[names(GDO_sigs_merged) == "gene"] <- "transcript"
names(GDO_sigs_merged)[names(GDO_sigs_merged) == "log2FoldChange.x"] <- "log2FoldChange"
# Attach the annotation; transcripts without a match get NA annotation columns.
GDO_sigs_merged <- left_join(GDO_sigs_merged, masterID, by = "transcript")
# Split into rows with and without a geneID.
GDO_sigs_unID <- GDO_sigs_merged[is.na(GDO_sigs_merged$geneID), ]
GDO_sigs_ID <- GDO_sigs_merged[!is.na(GDO_sigs_merged$geneID), ]
nrow(GDO_sigs_ID) # 7 id'ed genes
nrow(GDO_sigs_unID) # 43 unID'd genes
# 50 genes total diff expressed between GDO and GTC
# Write the merged table and its two subsets (write.table default separator,
# i.e. space-delimited despite the .csv extension).
write.table(GDO_sigs_merged, file = "output/DEG_lists/Gill/GDO_sigs_merged.csv", row.names = FALSE)
write.table(GDO_sigs_unID, file = "output/DEG_lists/Gill/GDO_sigs_unID.csv", row.names = FALSE)
write.table(GDO_sigs_ID, file = "output/DEG_lists/Gill/GDO_sigs_ID.csv", row.names = FALSE)
```
### FOA merge
```{r}
# Read the significant-gene tables of the LC and TC contrasts
# (space-delimited text with a header row).
FOA_LC_siggene <- read.delim("output/DEG_lists/Foot/FOA_LC_siggene.csv", sep = " ", header = TRUE)
FOA_TC_siggene <- read.delim("output/DEG_lists/Foot/FOA_TC_siggene.csv", sep = " ", header = TRUE)
# Fail early if a column the steps below rely on is missing.
stopifnot(
  all(c("gene", "log2FoldChange") %in% names(FOA_LC_siggene)),
  all(c("gene", "log2FoldChange") %in% names(FOA_TC_siggene))
)
# Genes present in both tables. Columns shared by both inputs get a suffix:
# .x comes from TC, .y comes from LC.
FOA_sigs <- inner_join(FOA_TC_siggene, FOA_LC_siggene, by = "gene")
# Difference of the fold changes (TC minus LC) for the shared genes.
FOA_sigs$log2FoldChange <- FOA_sigs$log2FoldChange.x - FOA_sigs$log2FoldChange.y
# Keep the gene and the difference. Selected by name (previously c(1, 12)) so
# the right columns are picked even if an input carries extra columns.
FOA_sigs_inter <- FOA_sigs[, c("gene", "log2FoldChange")]
# Left-merge the TC table with the differences on 'gene'. merge() suffixes the
# clashing column: log2FoldChange.x is the TC value, log2FoldChange.y the
# difference (NA for genes that are only in the TC table).
FOA_sigs_merged <- merge(FOA_TC_siggene, FOA_sigs_inter, by = "gene", all.x = TRUE)
# Where a difference exists it replaces the TC fold change.
FOA_sigs_merged$log2FoldChange.x <- ifelse(
  !is.na(FOA_sigs_merged$log2FoldChange.y),
  FOA_sigs_merged$log2FoldChange.y,
  FOA_sigs_merged$log2FoldChange.x
)
# Drop the helper column now that its values have been copied over.
FOA_sigs_merged$log2FoldChange.y <- NULL
# Rename by name rather than by position: gene -> transcript (the join key of
# masterID) and log2FoldChange.x -> log2FoldChange.
names(FOA_sigs_merged)[names(FOA_sigs_merged) == "gene"] <- "transcript"
names(FOA_sigs_merged)[names(FOA_sigs_merged) == "log2FoldChange.x"] <- "log2FoldChange"
# Attach the annotation; transcripts without a match get NA annotation columns.
FOA_sigs_merged <- left_join(FOA_sigs_merged, masterID, by = "transcript")
# Split into rows with and without a geneID.
FOA_sigs_unID <- FOA_sigs_merged[is.na(FOA_sigs_merged$geneID), ]
FOA_sigs_ID <- FOA_sigs_merged[!is.na(FOA_sigs_merged$geneID), ]
nrow(FOA_sigs_ID) # 3 id'ed genes
nrow(FOA_sigs_unID) # 11 unID'd genes
# 14 genes total diff expressed between FOA and FTC
# Write the merged table and its two subsets (write.table default separator,
# i.e. space-delimited despite the .csv extension).
write.table(FOA_sigs_merged, file = "output/DEG_lists/Foot/FOA_sigs_merged.csv", row.names = FALSE)
write.table(FOA_sigs_unID, file = "output/DEG_lists/Foot/FOA_sigs_unID.csv", row.names = FALSE)
write.table(FOA_sigs_ID, file = "output/DEG_lists/Foot/FOA_sigs_ID.csv", row.names = FALSE)
```
### GOA merge
```{r}
# Read the significant-gene tables of the LC and TC contrasts
# (space-delimited text with a header row).
GOA_LC_siggene <- read.delim("output/DEG_lists/Gill/GOA_LC_siggene.csv", sep = " ", header = TRUE)
GOA_TC_siggene <- read.delim("output/DEG_lists/Gill/GOA_TC_siggene.csv", sep = " ", header = TRUE)
# Fail early if a column the steps below rely on is missing.
stopifnot(
  all(c("gene", "log2FoldChange") %in% names(GOA_LC_siggene)),
  all(c("gene", "log2FoldChange") %in% names(GOA_TC_siggene))
)
# Genes present in both tables. Columns shared by both inputs get a suffix:
# .x comes from TC, .y comes from LC.
GOA_sigs <- inner_join(GOA_TC_siggene, GOA_LC_siggene, by = "gene")
# Difference of the fold changes (TC minus LC) for the shared genes.
GOA_sigs$log2FoldChange <- GOA_sigs$log2FoldChange.x - GOA_sigs$log2FoldChange.y
# Keep the gene and the difference. Selected by name (previously c(1, 12)) so
# the right columns are picked even if an input carries extra columns.
GOA_sigs_inter <- GOA_sigs[, c("gene", "log2FoldChange")]
# Left-merge the TC table with the differences on 'gene'. merge() suffixes the
# clashing column: log2FoldChange.x is the TC value, log2FoldChange.y the
# difference (NA for genes that are only in the TC table).
GOA_sigs_merged <- merge(GOA_TC_siggene, GOA_sigs_inter, by = "gene", all.x = TRUE)
# Where a difference exists it replaces the TC fold change.
GOA_sigs_merged$log2FoldChange.x <- ifelse(
  !is.na(GOA_sigs_merged$log2FoldChange.y),
  GOA_sigs_merged$log2FoldChange.y,
  GOA_sigs_merged$log2FoldChange.x
)
# Drop the helper column now that its values have been copied over.
GOA_sigs_merged$log2FoldChange.y <- NULL
# Rename by name rather than by position: gene -> transcript (the join key of
# masterID) and log2FoldChange.x -> log2FoldChange.
names(GOA_sigs_merged)[names(GOA_sigs_merged) == "gene"] <- "transcript"
names(GOA_sigs_merged)[names(GOA_sigs_merged) == "log2FoldChange.x"] <- "log2FoldChange"
# Attach the annotation; transcripts without a match get NA annotation columns.
GOA_sigs_merged <- left_join(GOA_sigs_merged, masterID, by = "transcript")
# Split into rows with and without a geneID.
GOA_sigs_unID <- GOA_sigs_merged[is.na(GOA_sigs_merged$geneID), ]
GOA_sigs_ID <- GOA_sigs_merged[!is.na(GOA_sigs_merged$geneID), ]
nrow(GOA_sigs_ID) # 23 id'ed genes
nrow(GOA_sigs_unID) # 76 unID'd genes
# 99 genes total diff expressed between GOA and GTC
# Write the merged table and its two subsets (write.table default separator,
# i.e. space-delimited despite the .csv extension).
write.table(GOA_sigs_merged, file = "output/DEG_lists/Gill/GOA_sigs_merged.csv", row.names = FALSE)
write.table(GOA_sigs_unID, file = "output/DEG_lists/Gill/GOA_sigs_unID.csv", row.names = FALSE)
write.table(GOA_sigs_ID, file = "output/DEG_lists/Gill/GOA_sigs_ID.csv", row.names = FALSE)
```
### FOW merge
```{r}
# Read the significant-gene tables of the LC and TC contrasts
# (space-delimited text with a header row).
FOW_LC_siggene <- read.delim("output/DEG_lists/Foot/FOW_LC_siggene.csv", sep = " ", header = TRUE)
FOW_TC_siggene <- read.delim("output/DEG_lists/Foot/FOW_TC_siggene.csv", sep = " ", header = TRUE)
# Fail early if a column the steps below rely on is missing.
stopifnot(
  all(c("gene", "log2FoldChange") %in% names(FOW_LC_siggene)),
  all(c("gene", "log2FoldChange") %in% names(FOW_TC_siggene))
)
# Genes present in both tables. Columns shared by both inputs get a suffix:
# .x comes from TC, .y comes from LC.
FOW_sigs <- inner_join(FOW_TC_siggene, FOW_LC_siggene, by = "gene")
# Difference of the fold changes (TC minus LC) for the shared genes.
FOW_sigs$log2FoldChange <- FOW_sigs$log2FoldChange.x - FOW_sigs$log2FoldChange.y
# Keep the gene and the difference. Selected by name (previously c(1, 12)) so
# the right columns are picked even if an input carries extra columns.
FOW_sigs_inter <- FOW_sigs[, c("gene", "log2FoldChange")]
# Left-merge the TC table with the differences on 'gene'. merge() suffixes the
# clashing column: log2FoldChange.x is the TC value, log2FoldChange.y the
# difference (NA for genes that are only in the TC table).
FOW_sigs_merged <- merge(FOW_TC_siggene, FOW_sigs_inter, by = "gene", all.x = TRUE)
# Where a difference exists it replaces the TC fold change.
FOW_sigs_merged$log2FoldChange.x <- ifelse(
  !is.na(FOW_sigs_merged$log2FoldChange.y),
  FOW_sigs_merged$log2FoldChange.y,
  FOW_sigs_merged$log2FoldChange.x
)
# Drop the helper column now that its values have been copied over.
FOW_sigs_merged$log2FoldChange.y <- NULL
# Rename by name rather than by position: gene -> transcript (the join key of
# masterID) and log2FoldChange.x -> log2FoldChange.
names(FOW_sigs_merged)[names(FOW_sigs_merged) == "gene"] <- "transcript"
names(FOW_sigs_merged)[names(FOW_sigs_merged) == "log2FoldChange.x"] <- "log2FoldChange"
# Attach the annotation; transcripts without a match get NA annotation columns.
FOW_sigs_merged <- left_join(FOW_sigs_merged, masterID, by = "transcript")
# Split into rows with and without a geneID.
FOW_sigs_unID <- FOW_sigs_merged[is.na(FOW_sigs_merged$geneID), ]
FOW_sigs_ID <- FOW_sigs_merged[!is.na(FOW_sigs_merged$geneID), ]
nrow(FOW_sigs_ID) # 9 id'ed genes
nrow(FOW_sigs_unID) # 23 unID'd genes
# 32 genes total diff expressed between FOW and FTC
# Write the merged table and its two subsets (write.table default separator,
# i.e. space-delimited despite the .csv extension).
write.table(FOW_sigs_merged, file = "output/DEG_lists/Foot/FOW_sigs_merged.csv", row.names = FALSE)
write.table(FOW_sigs_unID, file = "output/DEG_lists/Foot/FOW_sigs_unID.csv", row.names = FALSE)
write.table(FOW_sigs_ID, file = "output/DEG_lists/Foot/FOW_sigs_ID.csv", row.names = FALSE)
```
### GOW merge
```{r}
# Read the significant-gene tables of the LC and TC contrasts
# (space-delimited text with a header row).
GOW_LC_siggene <- read.delim("output/DEG_lists/Gill/GOW_LC_siggene.csv", sep = " ", header = TRUE)
GOW_TC_siggene <- read.delim("output/DEG_lists/Gill/GOW_TC_siggene_apeglm.csv", sep = " ", header = TRUE)
# Fail early if a column the steps below rely on is missing.
stopifnot(
  all(c("gene", "log2FoldChange") %in% names(GOW_LC_siggene)),
  all(c("gene", "log2FoldChange") %in% names(GOW_TC_siggene))
)
# Genes present in both tables. Columns shared by both inputs get a suffix:
# .x comes from TC, .y comes from LC.
GOW_sigs <- inner_join(GOW_TC_siggene, GOW_LC_siggene, by = "gene")
# Difference of the fold changes (TC minus LC) for the shared genes.
GOW_sigs$log2FoldChange <- GOW_sigs$log2FoldChange.x - GOW_sigs$log2FoldChange.y
# Keep the gene and the difference. Selected by name (previously c(1, 12)) so
# the right columns are picked even if an input carries extra columns.
GOW_sigs_inter <- GOW_sigs[, c("gene", "log2FoldChange")]
# Left-merge the TC table with the differences on 'gene'. merge() suffixes the
# clashing column: log2FoldChange.x is the TC value, log2FoldChange.y the
# difference (NA for genes that are only in the TC table).
GOW_sigs_merged <- merge(GOW_TC_siggene, GOW_sigs_inter, by = "gene", all.x = TRUE)
# Where a difference exists it replaces the TC fold change.
GOW_sigs_merged$log2FoldChange.x <- ifelse(
  !is.na(GOW_sigs_merged$log2FoldChange.y),
  GOW_sigs_merged$log2FoldChange.y,
  GOW_sigs_merged$log2FoldChange.x
)
# Drop the helper column now that its values have been copied over.
GOW_sigs_merged$log2FoldChange.y <- NULL
# Rename by name rather than by position: gene -> transcript (the join key of
# masterID) and log2FoldChange.x -> log2FoldChange.
names(GOW_sigs_merged)[names(GOW_sigs_merged) == "gene"] <- "transcript"
names(GOW_sigs_merged)[names(GOW_sigs_merged) == "log2FoldChange.x"] <- "log2FoldChange"
# Attach the annotation; transcripts without a match get NA annotation columns.
GOW_sigs_merged <- left_join(GOW_sigs_merged, masterID, by = "transcript")
# Split into rows with and without a geneID.
GOW_sigs_unID <- GOW_sigs_merged[is.na(GOW_sigs_merged$geneID), ]
GOW_sigs_ID <- GOW_sigs_merged[!is.na(GOW_sigs_merged$geneID), ]
nrow(GOW_sigs_ID) # 7 id'ed genes
nrow(GOW_sigs_unID) # 18 unID'd genes
# 25 genes total diff expressed between GOW and GTC
# Write the merged table and its two subsets (write.table default separator,
# i.e. space-delimited despite the .csv extension).
write.table(GOW_sigs_merged, file = "output/DEG_lists/Gill/GOW_sigs_merged.csv", row.names = FALSE)
write.table(GOW_sigs_unID, file = "output/DEG_lists/Gill/GOW_sigs_unID.csv", row.names = FALSE)
write.table(GOW_sigs_ID, file = "output/DEG_lists/Gill/GOW_sigs_ID.csv", row.names = FALSE)
```