*This document goes over the steps to check the metagenome bins for completeness and contamination.*

Load required packages.

```{r}
library(tidyverse)
library(GEOquery)
library(Biostrings)
library(dplyr)
library(ggalluvial)
```

Import data. Note that this data is not publicly available yet, so I'm getting it from my Google Drive.

```{r}
# set path to all data
path <- "~/Library/CloudStorage/GoogleDrive-jwinter2@uw.edu/Shared drives/Rocap Lab/Project_ODZ_Marinimicrobia_Jordan/"
```

Find bins that are Sulfitobacter These bins are MAGs (metagenome assembled genomes), and I'll find which are Sulfitobacter by using the CATBAT consensus results of what each contig is. Since I know what microbe each contig is, I'll look for bins where most contigs are Sulfitobacter.

```{r}
# open file
classification <- read.csv(paste0(path, "CAT-BAT/1058_P1_2018_585_0.2um/1058_P1_2018_585_0.2um_assembly_plus.contig2classification_names.txt"), sep = "\t")

# get only those labeled as Marinimicrobia
classification <- classification[str_detect(classification$genus, "Sulfitobacter"),]

# add this classification file to the repo
write.csv(classification, "../output/CATBAT_sulfitobacter.csv")
```

Now that I have a classification file and my bins, I can find bins where the most reads and contigs are Sulfitobacter.

```{r}
# get Sulfitobacter bins
classification <- read.csv("../data/CATBAT_all.txt", sep = "\t")

# run through each binning method
bin_path <- list.dirs("../data/Bins")
sulf_bins <- NULL
a <- 0

for (i in 2:7){
  
  pathname <- paste0(bin_path[i], "/")
  bins <- list.files(pathname)
  
  for (binname in bins){
    print(binname)
    bin_pathname <- paste0(pathname, binname)
    contig_names <- read.csv(bin_pathname, header=F)
    all_contigs <- NULL
    
    for (i in 1:nrow(contig_names)){
      contigname <- contig_names[i,1]
      find_contig <- classification %>%
        filter(str_detect(X..contig, contigname))
      all_contigs <- rbind(all_contigs, find_contig)
    }
    
    # majority of ORFs have to be Marinimicrobia for me to look at the bin
    all_contigs$ORFs <- str_split_fixed(all_contigs$reason, "[ ]", 4)[,3]
    all_contigs$ORFs_true <- as.numeric(str_split_fixed(all_contigs$ORFs, "[/]", 4)[,1])
    consensus <- all_contigs %>%
      group_by(genus) %>%
      summarize(support = sum(ORFs_true)) %>%
      filter(genus != "no support") # only to get all possible Mar bins
    index <- which(str_detect(consensus$genus, "Sulfitobacter"))
    consensus$genus[index] <- "Sulfitobacter"
    consensus <- consensus$genus[which(consensus$support == max(consensus$support))]
    print(consensus)
    
    if (identical(consensus[str_detect(consensus, "Sulfitobacter")], character(0)) == F){
      a <- a + 1
      sulf_bins$bin[a] <- binname
      sulf_bins$sample[a] <- basename(pathname)
    }
  }
}

sulf_bins <- as.data.frame(sulf_bins)
write.csv(sulf_bins, "../output/all_mar_bins.csv")
```

Use BUSCO (which is similar to CheckM to get completeness, contamination, etc of bins). This is run on my own computer, hence the Google Drive files.

```{r}
# Get fasta files for each bin
mar_bins <- read.csv("../output/all_mar_bins.csv")
all <- readDNAStringSet("~/Library/CloudStorage/GoogleDrive-jwinter2@uw.edu/Shared drives/Rocap Lab/Project_ODZ_Marinimicrobia_Jordan/Assembly/1058_P1_2018_585_0.2um_assembly_plus.fa")
path <- "~/Library/CloudStorage/GoogleDrive-jwinter2@uw.edu/Shared drives/Rocap Lab/Project_ODZ_Marinimicrobia_Jordan/Bins/"

for (i in 1:nrow(mar_bins)){
  bin <- mar_bins$bin[i]
  bintype <- mar_bins$sample[i]
  bin_nms <- read.csv(paste0(path, bintype, "/", bin), header = F)
  small_fa <- str_detect(all@ranges@NAMES, paste(bin_nms$V1, collapse = "|"))
  index <- which(small_fa == T)
  small_fa <- all[index]
  writeXStringSet(small_fa, paste0("../data/Bin_fa/", bin), format = "fasta")
}
```

Run BUSCO on my own computer, has to be done in a dedicated environment. I am using the generic bacteria database to search for single copy marker genes.

```{bash}
conda activate busco
busco -i github/jordan-marinimicrobia/data/Bin_fa -l bacteria_odb10 \
-m geno -o github/jordan-marinimicrobia/output/busco_outputs -c 8
```

Combine BUSCO output table with bins summary

```{r}
busco <- read.delim("../output/batch_summary.txt")
colnames(busco)[1] <- "bin"
mar_bins <- merge(mar_bins, busco)
write.csv(mar_bins, "../output/all_mar_bins.csv", row.names = F)
```

Get sequence length of all the bins and add it to bin summary table

```{r}
bin_path <- list.files("../data/Bin_fa") # annotated fasta files for each bin
lengths <- NULL

for (i in 1:length(bin_path)){
  
  bin <- bin_path[i]
  bin_fullname <- paste0("../data/Bin_fa/", bin)
  
  lengths$bin[i] <- bin
  fa <- readDNAStringSet(bin_fullname, format = "fasta")
  lengths$sum_len[i] <- sum(nchar(fa))
  
}

lengths <- as.data.frame(lengths)
mar_bins <- merge(mar_bins, lengths)
write.csv(mar_bins, "../output/all_mar_bins.csv", row.names = F)
```

Get clade names for bins from previously identified 16S. Not all bins will have 16S, so for now some bins will remain unidentified.

```{r}
meta <- read.csv("../data/reads_to_clades.csv")
mar_bins <- read.csv("../output/all_mar_bins.csv")
path <- "../data/Bins/"

mar_bins$clade <- NA

for (i in 1:nrow(mar_bins)){
  
  bin_number <- mar_bins$bin[i]
  folder <- mar_bins$sample[i]
  bin_pathname <- paste0(path, folder, "/", bin_number)
  contig_names <- read.csv(bin_pathname, header=F)
  clade <- NA
  
  for (j in 1:nrow(contig_names)){
    contigname <- contig_names[j,1]
    find_contig <- meta %>% filter(str_detect(read_name, contigname))
    if (nrow(find_contig) > 0){
      print(find_contig$clade)
      clade <- find_contig$clade[1]
    }
    
  }
  
  mar_bins$clade[i] <- clade
  
}

write.csv(mar_bins, "../output/all_mar_bins.csv", row.names = F)
```

Alluvial plot to check what contigs/reads went where in each binning method.
47 Bin 508: P41300E03
47 Bin 81: Sulfitobacter
24 Bin 133: unknown, 25% complete

```{r}
# open file
alluvial <- read.csv("../output/for_alluvial_plot.csv")

# choose starting bin
alluvial <- subset(alluvial, X24_sample_bam_bins == "24_sample_bam_bin.133")

alluvial <- pivot_longer(alluvial, cols=c(colnames(alluvial)[2:7]), names_to="bin_method", values_to="bin")
alluvial <- subset(alluvial, is.na(bin) == F)
alluvial$bin_method <- factor(alluvial$bin_method, levels = c("X24_sample_bam_bins","X47_sample_bam_bins", "short_reads_bam_bins",
  "transcriptomes_bam_bins","assembly_plus_bins","assembly_bins"))

ggplot(alluvial,
       aes(x = bin_method, stratum = bin, alluvium = contig_name,
           fill = bin, label = bin)) +
  geom_flow(stat = "alluvium", lode.guidance = "frontback",
            color = "darkgray") +
  geom_stratum() +
  theme_bw() +
  ggtitle("24 Bin")
```