library(data.table)
library(tidyverse)

# Load the merged gene table. `row.names = 1` promotes the first CSV column to
# the row names, so individual rows can be looked up by name further down
# (e.g. genes_merged["acnA", ]).
# NOTE: do not also call column_to_rownames() on the result. It aborts on a
# data frame that already has row names, and the name column has already been
# consumed by `row.names = 1`.
genes_merged <- read.csv(
  "../data/merged_genes_full.csv",
  row.names = 1,
  header = TRUE
)
# Similarity of every row of genes_merged to the reference gene "acnA".
# Assumes genes_merged holds 0/1 presence flags (rows = genes) -- TODO confirm;
# the `== 2` / `> 0` tests below are only meaningful for binary data.

# Fail early: indexing a data frame by a missing row name returns a row of NAs
# instead of raising an error, which would silently empty the ranking below.
if (!("acnA" %in% rownames(genes_merged))) {
  stop("Reference gene 'acnA' not found in row names of genes_merged",
       call. = FALSE)
}
ref_gene <- as.numeric(genes_merged["acnA", ])

# gene_count =as.data.frame(rowSums(genes_merged))
# gene_count <- tibble::rownames_to_column(gene_count, "gene_name")


# Calculate Jaccard Similarity
# Transpose once so each column is one gene; ref_gene (one value per original
# column) is then recycled down every column of the transposed matrix.
combined_hits <- t(genes_merged) + ref_gene
# Positions where both the gene and the reference are 1 (intersection).
gene_matches <- colSums(combined_hits == 2)
# Positions where at least one of the two is 1 (union).
gene_totalhits <- colSums(combined_hits > 0)
# Highest similarity first; sort() drops NA/NaN scores by default.
fave_genes <- sort(gene_matches / gene_totalhits, decreasing = TRUE)
# Check the confusion matrix of the top 10 best matched genes.
# Each element of `test` is a 2-way contingency table of the "acnA" row
# against one top-ranked gene. The list is named by gene and the table axes
# are labelled, so each table can be identified when printed.
top_genes <- names(head(fave_genes, 10))
# Extract the reference row once instead of once per compared gene.
acnA_profile <- as.numeric(genes_merged["acnA", ])
test <- setNames(
  lapply(top_genes, function(gene_name) {
    table(
      acnA_profile,
      as.numeric(genes_merged[gene_name, ]),
      dnn = c("acnA", gene_name)
    )
  }),
  top_genes
)
# plot the ordered data based on similarity score
# The first (top-ranked) entry is dropped -- presumably the reference gene
# matching itself; verify if ties at the top are possible.
y_steps <- fave_genes[-1]
# seq_along() stays valid when nothing is left to plot, whereas
# seq(1, length(fave_genes) - 1, 1) errors for a single-element ranking.
x_steps <- seq_along(y_steps)
df <- data.frame(x = x_steps, y = y_steps)
# The axis labels must be added to the plot with `+`; as standalone statements
# they only print label objects and leave the plot unlabelled.
ggplot(data = df, mapping = aes(x, y)) +
  geom_point() +
  xlab("Rank Order") +
  ylab("Similarity Score")

nifH is particularly interesting: many metal-binding genes appear among its associated genes, alongside the other nif genes.

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KGRhdGEudGFibGUpCmxpYnJhcnkodGlkeXZlcnNlKQoKZ2VuZXNfbWVyZ2VkIDwtIHJlYWQuY3N2KCIuLi9kYXRhL21lcmdlZF9nZW5lc19mdWxsLmNzdiIscm93Lm5hbWVzID0gMSwgaGVhZGVyPSBUUlVFKQojIGdlbmVzX21lcmdlZCA8LSBjb2x1bW5fdG9fcm93bmFtZXMoZ2VuZXNfbWVyZ2VkLHZhcj0iZ2VuZV9uYW1lcyIpCmBgYAoKYGBge3IgSmFjY2FyZH0KcmVmX2dlbmUgPC0gYXMubnVtZXJpYyhnZW5lc19tZXJnZWRbImFjbkEiLF0pCgojIGdlbmVfY291bnQgPWFzLmRhdGEuZnJhbWUocm93U3VtcyhnZW5lc19tZXJnZWQpKQojIGdlbmVfY291bnQgPC0gdGliYmxlOjpyb3duYW1lc190b19jb2x1bW4oZ2VuZV9jb3VudCwgImdlbmVfbmFtZSIpCgoKIyBDYWxjdWxhdGUgSmFjY2FyZCBTaW1pbGFyaXR5CmdlbmVfbWF0Y2hlcyA8LSBjb2xTdW1zKHQoZ2VuZXNfbWVyZ2VkKStyZWZfZ2VuZT09MikKZ2VuZV90b3RhbGhpdHMgPC0gY29sU3Vtcyh0KGdlbmVzX21lcmdlZCkrcmVmX2dlbmU+MCkKZmF2ZV9nZW5lcyA8LSBzb3J0KGdlbmVfbWF0Y2hlcy9nZW5lX3RvdGFsaGl0cywgZGVjcmVhc2luZyA9IFRSVUUpCgpgYGAKCgpgYGB7ciBtYXRyaXh9CiMgQ2hlY2sgdGhlIGNvbmZ1c2lvbiBtYXRyaXggb2YgdGhlIHRvcCAxMCBiZXN0IG1hdGNoZWQgZ2VuZXMKdGVzdCA8LSBsYXBwbHkobmFtZXMoaGVhZChmYXZlX2dlbmVzLCAxMCkpLCBmdW5jdGlvbihnZW5lX25hbWUpewogIHRhYmxlKGFzLm51bWVyaWMoZ2VuZXNfbWVyZ2VkWyJhY25BIixdKSwgCiAgICAgICAgYXMubnVtZXJpYyhnZW5lc19tZXJnZWRbZ2VuZV9uYW1lLF0pKQp9KQpgYGAKCgpgYGAge3IgbmFyRyBwbG90fQojIHBsb3QgdGhlIG9yZGVyZWQgZGF0YSBiYXNlZCBvbiBzaW1pbGFyaXR5IHNjb3JlCnhfc3RlcHMgPSBzZXEoMSwgbGVuZ3RoKGZhdmVfZ2VuZXMpLTEsIDEpCnlfc3RlcHMgPSBmYXZlX2dlbmVzWy0xXQpkZiA9IGRhdGEuZnJhbWUoeD14X3N0ZXBzLHk9eV9zdGVwcykKZ2dwbG90KGRhdGE9ZGYsbWFwcGluZyA9IGFlcyh4LCB5KSkrZ2VvbV9wb2ludCgpCnhsYWIoIlJhbmsgT3JkZXIiKQp5bGFiKCJTaW1pbGFyaXR5IFNjb3JlIikKYGBgCgpuaWZIIGlzIGtpbmRhIGludGVyZXN0aW5nIC0gbG90cyBvZiBtZXRhbCBiaW5kaW5nIGdlbmVzIGFzc29jaWF0ZWQgYW1pZHN0IHRoZSBvdGhlciBuaWYgZ2VuZXMK