library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
pwd
## /Users/sr320/Documents/GitHub/nb-2021/C_gigas/code

Bringing in the list of 46753 CDS IDs

# Read the CDS ID list. The file has no header row, so read.csv() assigns
# default column names (V1, ...). NOTE(review): the file is read as
# comma-separated despite its .tab extension -- presumably it holds a single
# ID column; confirm.
cdslist <- read.csv(
  file = "../analyses/GCF_000297895_cdslist.tab",
  header = FALSE
)

UniProt (NOTE: needs to be redone — the run ended prematurely)

# Read the tab-separated, header-less table (columns are named V1, V2, ...)
# and keep only the first row for each distinct value of V1, retaining all
# other columns of that row.
uniprot <- distinct(
  read.delim("../analyses/Cg-uniprot_blastx_02.tab", header = FALSE),
  V1,
  .keep_all = TRUE
)

SPUR Protein hits

# Read the tab-separated, header-less table (columns are named V1, V2, ...)
# and keep only the first row for each distinct value of V1, retaining all
# other columns of that row.
spur <- distinct(
  read.delim("../analyses/Cg-ProteinsSpur5.0_blastx.tab", header = FALSE),
  V1,
  .keep_all = TRUE
)

SPUR nucleotide

# Read the tab-separated, header-less table (columns are named V1, V2, ...)
# and keep only the first row for each distinct value of V1, retaining all
# other columns of that row.
spurn <- distinct(
  read.delim("../analyses/Cg-SpurCDS_blastn.tab", header = FALSE),
  V1,
  .keep_all = TRUE
)

Drosophila

!{bldir}blastn \
-task blastn \
-query ../data/GCF_000297895.1_oyster_v9_cds_from_genomic.fna \
-db ../blastdb/dmel-all-gene-r6.37  \
-out ../analyses/Cg-dmel-gene_blastn_02.tab \
-evalue 1E-05 \
-num_threads 1 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt "6 qaccver saccver evalue"
# Read the tab-separated, header-less table written to
# ../analyses/Cg-dmel-gene_blastn_02.tab (columns are named V1, V2, ...) and
# keep only the first row for each distinct value of V1, retaining all other
# columns of that row.
dmel <- distinct(
  read.delim("../analyses/Cg-dmel-gene_blastn_02.tab", header = FALSE),
  V1,
  .keep_all = TRUE
)

Celg Gene symbols

# Read the tab-separated, header-less table; columns are named V1, V2, ...
# No de-duplication is applied to this table.
celsym <- read.delim("../analyses/Cg_Ce-genesym.tab", header = FALSE)

JOIN

# Build one annotation table with a row per entry of `cdslist`, keyed by the
# CDS ID.
#
# Each hit table is trimmed to the columns that are kept and given its final
# column names *before* it is joined. This removes the dependence on the
# suffixes (.x, .y, .x.x, .y.y) that left_join() generates for clashing
# V-numbered columns, which silently shift whenever a join is added, removed
# or reordered.
#
# Output columns and their order are unchanged:
#   cdsID, uniprot, u_ev, spur, s_ev, spurN, spurN_ev, dmel, dmel_ev,
#   cel_ID, cel_ev, cel_sym
#
# Assumes `cdslist` has the single column V1 (the suffix pattern used
# previously only works in that case) -- TODO confirm.
comb <- cdslist %>%
  select(cdsID = V1) %>%
  left_join(
    select(uniprot, cdsID = V1, uniprot = V2, u_ev = V11),
    by = "cdsID"
  ) %>%
  left_join(
    select(spur, cdsID = V1, spur = V2, s_ev = V11),
    by = "cdsID"
  ) %>%
  left_join(
    select(spurn, cdsID = V1, spurN = V2, spurN_ev = V3),
    by = "cdsID"
  ) %>%
  left_join(
    select(dmel, cdsID = V1, dmel = V2, dmel_ev = V3),
    by = "cdsID"
  ) %>%
  # celsym is matched on its second column (V2), not V1; its V1 becomes cel_ID.
  left_join(
    select(celsym, cdsID = V2, cel_ID = V1, cel_ev = V3, cel_sym = V4),
    by = "cdsID"
  )

Going back to Giles' BLAST results

../data/bestblast_/bestblast.tab

# Read the tab-separated table (first row is the header) and keep only the
# four columns that are used downstream.
giles <- select(
  read.delim("../data/bestblast_/bestblast.tab", header = TRUE),
  ID, HitAcc, HitDesc, E.Value
)

# Append those columns to the combined table, matching comb$cdsID to giles$ID.
comb_g <- comb %>%
  left_join(giles, by = c("cdsID" = "ID"))
# Read the tab-separated, header-less key table; columns are named V1, V2, ...
loc <- read.delim("../data/bestblast_/LOC_seqID_key.tab", header = FALSE)

# Keep every row of `loc` and attach the annotation columns from `comb_g`
# where loc$V1 matches comb_g$cdsID.
loc_dt <- loc %>%
  left_join(comb_g, by = c("V1" = "cdsID"))
# Write the joined table as a tab-separated file. The output path is passed
# positionally on purpose: readr renamed this argument between releases.
write_tsv(loc_dt, "../analyses/Cg-annot-join-V1.1.tab")