Aidan Coyle,

Originally written 2021/02/25

Roberts Lab, UW-SAFS

Takes the results from our BLASTs in 22_DEG_blast.ipynb and examines the e-values to determine whether sequences are more likely to have originated from C. bairdi or Hematodinium sp.

library(tidyverse)
outfmt6 <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore")

amb2_elev2_alveolata <- read.table(file = "../output/BLASTs/alveolata_publicseqs/cbaihemat_v2.0_amb2_vs_elev2_DEGs.tab",
                                   col.names = outfmt6)
elev0_elev2_alveolata <- read.table(file = "../output/BLASTs/alveolata_publicseqs/cbaihemat_v2.0_elev0_vs_elev2_DEGs.tab",
                                    col.names = outfmt6)

amb2_elev2_arthropoda <- read.table(file = "../output/BLASTs/arthropoda_publicseqs/cbaihemat_v2.0_amb2_vs_elev2_DEGs.tab",
                                    col.names = outfmt6)
elev0_elev2_arthropoda <- read.table(file = "../output/BLASTs/arthropoda_publicseqs/cbaihemat_v2.0_elev0_vs_elev2_DEGs.tab",
                                     col.names = outfmt6)

# Filter by e-value (bar = 10)
eval <- 10^(-4)

sum(amb2_elev2_alveolata$evalue <= eval)
## [1] 569
sum(amb2_elev2_arthropoda$evalue <= eval)
## [1] 1137
sum(elev0_elev2_alveolata$evalue <= eval)
## [1] 60
sum(elev0_elev2_arthropoda$evalue <= eval)
## [1] 181
# Create table with transcript IDs and e-values for both alveolata and arthropoda

# First, remove duplicate transcript IDs, keeping the one with the highest e-value

amb2_elev2_alv <- amb2_elev2_alveolata %>%
  select(qseqid, sseqid, evalue)

amb2_elev2_alv <- amb2_elev2_alv[order(amb2_elev2_alv[ ,"qseqid"], -amb2_elev2_alv[ ,"evalue"]),]

amb2_elev2_alv <- amb2_elev2_alv[!duplicated(amb2_elev2_alv$qseqid),]

# Rename evalue column for ID post-join

colnames(amb2_elev2_alv)[3] <- "alv_eval"

# Doing the same to arthropoda
amb2_elev2_arth <- amb2_elev2_arthropoda %>%
  select(qseqid, sseqid, evalue)

amb2_elev2_arth <- amb2_elev2_arth[order(amb2_elev2_arth[ ,"qseqid"], -amb2_elev2_arth[ ,"evalue"]),]

amb2_elev2_arth <- amb2_elev2_arth[!duplicated(amb2_elev2_arth$qseqid),]

# Rename evalue column for ID post-join

colnames(amb2_elev2_arth)[3] <- "arth_eval"

# Left join the two tables
amb2_elev2 <- left_join(amb2_elev2_alv, amb2_elev2_arth, by = "qseqid")

# Number of DEGs more likely to be C. bairdi

sum(as.numeric(amb2_elev2$alv_eval > amb2_elev2$arth_eval), na.rm = TRUE)
## [1] 1175
# Number of DEGs more likely to be Hematodinium

sum(as.numeric(amb2_elev2$alv_eval < amb2_elev2$arth_eval), na.rm = TRUE)
## [1] 741
#####################################
# Do the same for comparing Elevated Day 0/2
#####################################

elev0_elev2_alv <- elev0_elev2_alveolata %>%
  select(qseqid, sseqid, evalue)

elev0_elev2_alv <- elev0_elev2_alv[order(elev0_elev2_alv[ ,"qseqid"], -elev0_elev2_alv[ ,"evalue"]),]

elev0_elev2_alv <- elev0_elev2_alv[!duplicated(elev0_elev2_alv$qseqid),]

# Rename evalue column for ID post-join

colnames(elev0_elev2_alv)[3] <- "alv_eval"

# Doing the same to arthropoda
elev0_elev2_arth <- elev0_elev2_arthropoda %>%
  select(qseqid, sseqid, evalue)

elev0_elev2_arth <- elev0_elev2_arth[order(elev0_elev2_arth[ ,"qseqid"], -elev0_elev2_arth[ ,"evalue"]),]

elev0_elev2_arth <- elev0_elev2_arth[!duplicated(elev0_elev2_arth$qseqid),]

# Rename evalue column for ID post-join

colnames(elev0_elev2_arth)[3] <- "arth_eval"

# Left join the two tables
elev0_elev2 <- left_join(elev0_elev2_alv, elev0_elev2_arth, by = "qseqid")

# Number of DEGs more likely to be C. bairdi

sum(as.numeric(elev0_elev2$alv_eval > elev0_elev2$arth_eval), na.rm = TRUE)
## [1] 217
# Number of DEGs more likely to be Hematodinium

sum(as.numeric(elev0_elev2$alv_eval < elev0_elev2$arth_eval), na.rm = TRUE)
## [1] 66