--- title: "FISH546 - Association Study Presentation" author: "Sam Cryan" format: revealjs editor: visual --- ## Within the grand array of microbial genes, which appear together most frequently? I'm conducting an association study across a range of microbial organisms to see the association between different genes. ## The Pipeline - Download genomes - Extract genes and make list for each genome - Merge list together - Choose reference gene - Compute similarity score of all genes compared to reference - Output list of most and least correlated based on similarity score ## Download Code ```{r download_genes, eval=FALSE, echo=TRUE} library(data.table) library(tidyverse) accession_list <- c("GCF_000006765.1","GCF_000011965.2","GCF_000157115.2","GCF_000012345.1") new_accession_list <- c("GCF_000012345.1") # Paste the accession IDs together with a URL-safe comma separating them accession_string <- paste(new_accession_list, collapse = "%2C") # Paste the string into the API's URL # Request the genome GFF with annotations accession_url <- paste0("https://api.ncbi.nlm.nih.gov/datasets/", "v2alpha/genome/accession/", accession_string, "/download?include_annotation_type=GENOME_GFF&", "filename=string_file.zip") # You can change where this is saved with the "destfile" argument, righ # now it's just the working directory download.file(accession_url, destfile = "./data/temp.zip") unzip("./data/temp.zip",exdir="./data") # Loop over each entry in accession list, read in the table from the # associated folder, and extract the gene IDs gene_presence_list <- lapply(accession_list, function(accession_number_i){ filename <- paste0("./data/ncbi_dataset/data/", accession_number_i, "/genomic.gff") # Apparently the files have different length headers so we need to skip them dynamically # from https://stackoverflow.com/q/18920777 # This makes the reader much slower but I can't figure out how to avoid it otherwise lineskip_command <- paste("grep -v '^#'", filename) # fread is data.table's fast table reader function # we also index for column 3 equaling "gene" at the end here tab_i <- fread(cmd=lineskip_command, sep = "\t", fill = TRUE, )[V3=="gene"] # Approach number 1 - downloading the GeneID name # This regex is a lookbehind (?<=) and grabs all digits (\\d) following # the lookbehind match # gene_ids <- as.numeric(str_extract(tab_i$V9, pattern = "(?<=Dbxref=GeneID:)\\d+")) # Approach number 2 - using the gene name technique gene_names <- str_extract(tab_i$V9, pattern="(?<=gene=)[^;]*(?=;)") # Remove all but 1 NA gene_names <- unique(gene_names, incomparables=FALSE, fromLast=FALSE, by=seq_along(gene_names)) # Rename to avoid name collision on merging output_table <- data.table(1, gene_names) names(output_table) <- c(accession_number_i, "gene_names") return(output_table) }) # Reduce using the merge function, keeping all rows, accumulating genes each time genes_merged <- purrr::reduce(gene_presence_list, dplyr::full_join, by="gene_names") ``` ## How to determine association? ![Jaccard Similarity Metric](images/jaccard.png){fig-align="center"} ## Simulated Data Setup ```{r create_simulated_data,cache=TRUE, echo=TRUE} #| code-line-numbers: "|11|16,17" perm_letters <- expand.grid(c(rep("A", 9), rep("B", 4), LETTERS), LETTERS, LETTERS) # paste0 (and the more general paste) combines the three columns into a single # character vector list_of_genes <- paste0(perm_letters$Var1, perm_letters$Var2, perm_letters$Var3) # Same method as above perm_genome_names <- head(expand.grid(LETTERS, 1:40), 1000) genome_names <- paste0(perm_genome_names$Var1, perm_genome_names$Var2) # Loop over each genome to be created and produce a data frame genome_list <- lapply(genome_names, function(genome_name){ # work from the inside out on this one, it's deeply nested # sample(100:200) to get a random number of genes in a genome # sample(list_of_genes) to figure out exactly which genes those are # Fill in the "present" column with 1s since they're all found in this case v <- data.frame(genes=unique(sample(list_of_genes, size = sample(100:200, size = 1))), genome_name=1) names(v) <- c("genes", genome_name) v }) ``` ## Create the presence/absence matrix The first 10x10 in the simulated data matrix: ```{r presence_absence, echo=TRUE,cache=TRUE} gene_genome_matrix_raw <- purrr::reduce(genome_list, dplyr::full_join, by="genes") gene_genome_matrix <- gene_genome_matrix_raw gene_genome_matrix[is.na(gene_genome_matrix)] <- 0 gene_genome_matrix <- gene_genome_matrix[order(gene_genome_matrix$genes),] rownames(gene_genome_matrix) <- gene_genome_matrix$genes gene_genome_matrix$genes <- NULL gene_genome_matrix[1:10, 1:10] ``` ## Example of a comparison between two genes Comparing gene 'AAA' to gene 'YVR' based on which organisms they appear in: ```{r comparison} table = table(as.numeric(gene_genome_matrix["AAA",]), as.numeric(gene_genome_matrix["YVR",])) table ``` ## Example distribution of randomly generated genes compared to gene 'AAA' Assuming around 17,000 possible genes, and that each genome contains 100-200 genes, with varying frequencies of different gene types. ```{r distr, eval=TRUE, cache=TRUE} library(tidyverse) ref_gene <- as.numeric(gene_genome_matrix["AAA",]) # Quick calculation of Jaccard by finding the two different values gene_matches <- colSums(t(gene_genome_matrix)+ref_gene==2) gene_totalhits <- colSums(t(gene_genome_matrix)+ref_gene>0) fave_genes <- sort(gene_matches/gene_totalhits, decreasing = TRUE) ``` ## ```{r, echo=FALSE} #| label: fig-sim-data #| fig-cap: "The Jaccard similarity score for all genes compared to AAA" #| fig-width: 6 #| fig-height: 3.5 library(data.table) library(tidyverse) x_steps = seq(1, length(fave_genes)-1, 1) y_steps = fave_genes[-1] df = data.frame(x=x_steps,y=y_steps) ggplot(data=df,mapping = aes(x, y))+geom_point()+labs(x="Rank Order",y="Similarity Score") ```