---
title: "R Notebook"
output: html_notebook
---
```{r}
library(data.table)
library(tidyverse)
# Load the merged gene table; the first CSV column supplies the row names,
# so rows can be looked up by name (e.g. genes_merged["acnA", ]).
genes_merged <- read.csv(
  file = "../data/merged_genes_full.csv",
  header = TRUE,
  row.names = 1
)
# Alternative way to set the row names, kept for reference:
# genes_merged <- column_to_rownames(genes_merged,var="gene_names")
```
```{r Jaccard}
# Profile of the reference gene: the "acnA" row as a plain numeric vector.
ref_gene <- as.numeric(genes_merged["acnA", ])
# Earlier exploration, kept for reference:
# gene_count =as.data.frame(rowSums(genes_merged))
# gene_count <- tibble::rownames_to_column(gene_count, "gene_name")

# Jaccard similarity of every gene against the reference gene.
# After transposing, each column holds one gene, so adding `ref_gene`
# pairs the reference with every gene's profile element by element.
# NOTE(review): the == 2 / > 0 tests assume 0/1 values -- confirm.
combined_hits <- t(genes_merged) + ref_gene
gene_matches <- colSums(combined_hits == 2) # sum of 2: hit in both
gene_totalhits <- colSums(combined_hits > 0) # sum above 0: hit in either
fave_genes <- sort(gene_matches / gene_totalhits, decreasing = TRUE)
```
```{r matrix}
# Cross-tabulate the reference gene (acnA) against each of the 10 genes with
# the highest similarity score, so agreement can be inspected cell by cell
# rather than through the single score alone.
#
# `test` is a list of contingency tables, one per gene. The list is named by
# gene and each table labels its dimensions, so every table can be traced
# back to the gene it describes (e.g. test[["acnA"]]).
top_genes <- names(head(fave_genes, 10))
# The reference profile does not change per gene, so extract it once.
ref_profile <- as.numeric(genes_merged["acnA", ])
test <- lapply(top_genes, function(gene_name) {
  table(
    ref_profile,
    as.numeric(genes_merged[gene_name, ]),
    dnn = c("acnA", gene_name)
  )
})
names(test) <- top_genes
```
``` {r narG plot}
# Plot the similarity scores in rank order (highest first).
# The top-ranked entry of `fave_genes` is dropped before plotting.
# NOTE(review): presumably this is the reference gene matched against
# itself -- confirm.
y_steps <- fave_genes[-1]
# seq_along() stays valid (empty) when nothing is left to plot, whereas
# seq(1, n - 1, 1) raises an error for n < 2.
x_steps <- seq_along(y_steps)
df <- data.frame(x = x_steps, y = y_steps)
# Every layer and label must be chained with `+`; a label call on its own
# line is evaluated separately and never reaches the plot.
ggplot(data = df, mapping = aes(x, y)) +
  geom_point() +
  xlab("Rank Order") +
  ylab("Similarity Score")
```
nifH is an interesting case: many metal-binding genes appear among its associated genes, alongside the other nif genes.