---
title: "main"
author: "Samuel Cryan"
date: "2023-04-13"
output: html_document
---

List all the taxa that I'm interested in
```{r}

```

Taxa Run So Far:
acidobacteriota
aquificota
Deltaproteobacteria
Bacteroidota
Chloroflexota 
Bacillota - dominates currently
Cyanobacteriota
Mycoplasmatota

Downloading the files
```{bash}
cd ../data

rm -r ncbi_dataset
rm README.md

/home/shared/datasets download genome taxon "bacteria" --annotated --reference --include gff3
unzip ncbi_dataset.zip
```

Create a list of all genes present
```{r Presence}
library(data.table)
library(tidyverse)
accession_list = dir(path="../data/ncbi_dataset/data",pattern="GCF*")

gene_presence_list <- NULL

gene_presence_list <- lapply(accession_list, function(accession_number_i){
  filename <- paste0("../data/ncbi_dataset/data/", accession_number_i, "/genomic.gff")
  # Apparently the files have different length headers so we need to skip them dynamically
  # from https://stackoverflow.com/q/18920777
  # This makes the reader much slower but I can't figure out how to avoid it otherwise
  lineskip_command <- paste("grep -v '^#'", filename)
  # fread is data.table's fast table reader function
  # we also index for column 3 equaling "gene" at the end here
  tab_i <- fread(cmd=lineskip_command, sep = "\t", fill = TRUE, )[V3=="gene"]
  
  
  # Approach number 1 - downloading the GeneID name
  # This regex is a lookbehind (?<=) and grabs all digits (\\d) following 
  # the lookbehind match
  # gene_ids <- as.numeric(str_extract(tab_i$V9, pattern = "(?<=Dbxref=GeneID:)\\d+"))
  
  # Approach number 2 - using the gene name technique
  gene_names <- str_extract(tab_i$V9, pattern="(?<=gene=)[^;]*(?=;)")
  
  # Remove all but 1 NA
  gene_names <- unique(gene_names, incomparables=FALSE, fromLast=FALSE, by=seq_along(gene_names))
  
  # Rename to avoid name collision on merging
  output_table <- data.table(1, gene_names)
  names(output_table) <- c(accession_number_i, "gene_names")
  return(output_table)
})
```

```{r full} 
genes_merged <- purrr::reduce(gene_presence_list, dplyr::full_join, by="gene_names")
genes_merged[is.na(genes_merged)] <- 0
```

``` {r save}
write.csv(genes_merged, "../data/merged_genes_full.csv", row.names=TRUE)
```

```{r merge_with_current}
# Load in the current 
genes_merged_old <- read.csv("../data/merged_genes.csv",row.names = 1, header= TRUE)

# Reduce using the merge function, keeping all rows, accumulating genes each time
genes_merged <- purrr::reduce(gene_presence_list, dplyr::full_join, by="gene_names")

# Combine the Two
genes_merged_new <- merge(genes_merged_old, genes_merged, by="gene_names",all = TRUE)
genes_merged_new[is.na(genes_merged_new)] <- 0


write.csv(genes_merged_new, "../data/merged_genes.csv", row.names=TRUE)

```