*This document goes over the steps to check the metagenome bins for completeness and contamination.* Load required packages. ```{r} library(tidyverse) library(GEOquery) library(Biostrings) library(dplyr) library(ggalluvial) ``` Import data. Note that this data is not publicly available yet, so I'm getting it from my Google Drive. ```{r} # set path to all data path <- "~/Library/CloudStorage/GoogleDrive-jwinter2@uw.edu/Shared drives/Rocap Lab/Project_ODZ_Marinimicrobia_Jordan/" ``` Find bins that are Sulfitobacter These bins are MAGs (metagenome assembled genomes), and I'll find which are Sulfitobacter by using the CATBAT consensus results of what each contig is. Since I know what microbe each contig is, I'll look for bins where most contigs are Sulfitobacter. ```{r} # open file classification <- read.csv(paste0(path, "CAT-BAT/1058_P1_2018_585_0.2um/1058_P1_2018_585_0.2um_assembly_plus.contig2classification_names.txt"), sep = "\t") # get only those labeled as Marinimicrobia classification <- classification[str_detect(classification$genus, "Sulfitobacter"),] # add this classification file to the repo write.csv(classification, "../output/CATBAT_sulfitobacter.csv") ``` Now that I have a classification file and my bins, I can find bins where the most reads and contigs are Sulfitobacter. ```{r} # get Sulfitobacter bins classification <- read.csv("../data/CATBAT_all.txt", sep = "\t") # run through each binning method bin_path <- list.dirs("../data/Bins") sulf_bins <- NULL a <- 0 for (i in 2:7){ pathname <- paste0(bin_path[i], "/") bins <- list.files(pathname) for (binname in bins){ print(binname) bin_pathname <- paste0(pathname, binname) contig_names <- read.csv(bin_pathname, header=F) all_contigs <- NULL for (i in 1:nrow(contig_names)){ contigname <- contig_names[i,1] find_contig <- classification %>% filter(str_detect(X..contig, contigname)) all_contigs <- rbind(all_contigs, find_contig) } # majority of ORFs have to be Marinimicrobia for me to look at the bin all_contigs$ORFs <- str_split_fixed(all_contigs$reason, "[ ]", 4)[,3] all_contigs$ORFs_true <- as.numeric(str_split_fixed(all_contigs$ORFs, "[/]", 4)[,1]) consensus <- all_contigs %>% group_by(genus) %>% summarize(support = sum(ORFs_true)) %>% filter(genus != "no support") # only to get all possible Mar bins index <- which(str_detect(consensus$genus, "Sulfitobacter")) consensus$genus[index] <- "Sulfitobacter" consensus <- consensus$genus[which(consensus$support == max(consensus$support))] print(consensus) if (identical(consensus[str_detect(consensus, "Sulfitobacter")], character(0)) == F){ a <- a + 1 sulf_bins$bin[a] <- binname sulf_bins$sample[a] <- basename(pathname) } } } sulf_bins <- as.data.frame(sulf_bins) write.csv(sulf_bins, "../output/all_mar_bins.csv") ``` Use BUSCO (which is similar to CheckM to get completeness, contamination, etc of bins). This is run on my own computer, hence the Google Drive files. ```{r} # Get fasta files for each bin mar_bins <- read.csv("../output/all_mar_bins.csv") all <- readDNAStringSet("~/Library/CloudStorage/GoogleDrive-jwinter2@uw.edu/Shared drives/Rocap Lab/Project_ODZ_Marinimicrobia_Jordan/Assembly/1058_P1_2018_585_0.2um_assembly_plus.fa") path <- "~/Library/CloudStorage/GoogleDrive-jwinter2@uw.edu/Shared drives/Rocap Lab/Project_ODZ_Marinimicrobia_Jordan/Bins/" for (i in 1:nrow(mar_bins)){ bin <- mar_bins$bin[i] bintype <- mar_bins$sample[i] bin_nms <- read.csv(paste0(path, bintype, "/", bin), header = F) small_fa <- str_detect(all@ranges@NAMES, paste(bin_nms$V1, collapse = "|")) index <- which(small_fa == T) small_fa <- all[index] writeXStringSet(small_fa, paste0("../data/Bin_fa/", bin), format = "fasta") } ``` Run BUSCO on my own computer, has to be done in a dedicated environment. I am using the generic bacteria database to search for single copy marker genes. ```{bash} conda activate busco busco -i github/jordan-marinimicrobia/data/Bin_fa -l bacteria_odb10 \ -m geno -o github/jordan-marinimicrobia/output/busco_outputs -c 8 ``` Combine BUSCO output table with bins summary ```{r} busco <- read.delim("../output/batch_summary.txt") colnames(busco)[1] <- "bin" mar_bins <- merge(mar_bins, busco) write.csv(mar_bins, "../output/all_mar_bins.csv", row.names = F) ``` Get sequence length of all the bins and add it to bin summary table ```{r} bin_path <- list.files("../data/Bin_fa") # annotated fasta files for each bin lengths <- NULL for (i in 1:length(bin_path)){ bin <- bin_path[i] bin_fullname <- paste0("../data/Bin_fa/", bin) lengths$bin[i] <- bin fa <- readDNAStringSet(bin_fullname, format = "fasta") lengths$sum_len[i] <- sum(nchar(fa)) } lengths <- as.data.frame(lengths) mar_bins <- merge(mar_bins, lengths) write.csv(mar_bins, "../output/all_mar_bins.csv", row.names = F) ``` Get clade names for bins from previously identified 16S. Not all bins will have 16S, so for now some bins will remain unidentified. ```{r} meta <- read.csv("../data/reads_to_clades.csv") mar_bins <- read.csv("../output/all_mar_bins.csv") path <- "../data/Bins/" mar_bins$clade <- NA for (i in 1:nrow(mar_bins)){ bin_number <- mar_bins$bin[i] folder <- mar_bins$sample[i] bin_pathname <- paste0(path, folder, "/", bin_number) contig_names <- read.csv(bin_pathname, header=F) clade <- NA for (j in 1:nrow(contig_names)){ contigname <- contig_names[j,1] find_contig <- meta %>% filter(str_detect(read_name, contigname)) if (nrow(find_contig) > 0){ print(find_contig$clade) clade <- find_contig$clade[1] } } mar_bins$clade[i] <- clade } write.csv(mar_bins, "../output/all_mar_bins.csv", row.names = F) ``` Alluvial plot to check what contigs/reads went where in each binning method. 47 Bin 508: P41300E03 47 Bin 81: Sulfitobacter 24 Bin 133: unknown, 25% complete ```{r} # open file alluvial <- read.csv("../output/for_alluvial_plot.csv") # choose starting bin alluvial <- subset(alluvial, X24_sample_bam_bins == "24_sample_bam_bin.133") alluvial <- pivot_longer(alluvial, cols=c(colnames(alluvial)[2:7]), names_to="bin_method", values_to="bin") alluvial <- subset(alluvial, is.na(bin) == F) alluvial$bin_method <- factor(alluvial$bin_method, levels = c("X24_sample_bam_bins","X47_sample_bam_bins", "short_reads_bam_bins", "transcriptomes_bam_bins","assembly_plus_bins","assembly_bins")) ggplot(alluvial, aes(x = bin_method, stratum = bin, alluvium = contig_name, fill = bin, label = bin)) + geom_flow(stat = "alluvium", lode.guidance = "frontback", color = "darkgray") + geom_stratum() + theme_bw() + ggtitle("24 Bin") ```