library(data.table)
library(tidyverse)
# Load the merged gene table; the first CSV column supplies the initial
# row names.
genes_merged <- read.csv(
  "../../data/merged_genes_full.csv",
  header = TRUE,
  row.names = 1
)

# Keep the gene names as a plain vector before re-keying the table.
gene_list <- genes_merged[, "gene_names"]

# Use gene_names as the row names so a gene can be looked up by name.
# NOTE(review): read.csv(row.names = 1) has already assigned row names at
# this point — confirm column_to_rownames() accepts that with the
# installed tibble version.
genes_merged <- column_to_rownames(genes_merged, var = "gene_names")
```r
# Score every gene against the reference gene (acnA) and rank the results.
# Assumes genes_merged holds 0/1 presence/absence values with one gene per
# row — TODO confirm against the input file.
if (!("acnA" %in% rownames(genes_merged))) {
  stop("Reference gene 'acnA' not found in genes_merged", call. = FALSE)
}
ref_gene <- as.numeric(genes_merged["acnA", ])
# gene_count <- as.data.frame(rowSums(genes_merged))
# gene_count <- tibble::rownames_to_column(gene_count, "gene_name")

# Calculate Jaccard Similarity
# After transposing, each column is one gene and has the same length as
# ref_gene, so the addition recycles ref_gene down every column.
gene_overlap <- t(genes_merged) + ref_gene
# Cells equal to 2 are hits in both the gene and the reference;
# cells greater than 0 are hits in at least one of them.
gene_matches <- colSums(gene_overlap == 2)
gene_totalhits <- colSums(gene_overlap > 0)
# Ratio of shared hits to combined hits, best match first.
fave_genes <- sort(gene_matches / gene_totalhits, decreasing = TRUE)
<!-- rnb-source-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
Generate a confusion matrix against acnA for each of the top 10 best hits.
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuYGBgclxuIyBDaGVjayB0aGUgY29uZnVzaW9uIG1hdHJpeCBvZiB0aGUgdG9wIDEwIGJlc3QgbWF0Y2hlZCBnZW5lc1xudGVzdCA8LSBsYXBwbHkobmFtZXMoaGVhZChmYXZlX2dlbmVzLCAxMCkpLCBmdW5jdGlvbihnZW5lX25hbWUpe1xuICB0YWJsZShhcy5udW1lcmljKGdlbmVzX21lcmdlZFtcXGFjbkFcXCxdKSwgXG4gICAgICAgIGFzLm51bWVyaWMoZ2VuZXNfbWVyZ2VkW2dlbmVfbmFtZSxdKSlcbn0pXG5gYGBcbmBgYCJ9 -->
```r
```r
# Check the confusion matrix of the top 10 best matched genes.
# Each element of `test` cross-tabulates the acnA row (table rows) against
# the row of one top-ranked gene (table columns); elements are named by gene.
top_gene_names <- names(head(fave_genes, 10))
# Build the reference profile once rather than on every iteration.
acnA_profile <- as.numeric(genes_merged["acnA", ])
test <- lapply(top_gene_names, function(gene_name) {
  table(
    acnA_profile,
    as.numeric(genes_merged[gene_name, ]),
    dnn = c("acnA", gene_name)
  )
})
names(test) <- top_gene_names
<!-- rnb-source-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuYGBgclxuIyBwbG90IHRoZSBvcmRlcmVkIGRhdGEgYmFzZWQgb24gc2ltaWxhcml0eSBzY29yZVxueF9zdGVwcyA9IHNlcSgxLCBsZW5ndGgoZmF2ZV9nZW5lcyktMSwgMSlcbnlfc3RlcHMgPSBmYXZlX2dlbmVzWy0xXVxuZGYgPSBkYXRhLmZyYW1lKHg9eF9zdGVwcyx5PXlfc3RlcHMpXG5nZ3Bsb3QoZGF0YT1kZixtYXBwaW5nID0gYWVzKHgsIHkpKStnZW9tX3BvaW50KClcbmBgYFxuYGBgIn0= -->
```r
```r
# Plot the ordered data based on similarity score.
# The first entry of fave_genes is dropped before plotting
# (presumably the reference gene matching itself — verify).
y_steps <- fave_genes[-1]
# seq_along() stays valid when fewer than two scores exist, where
# seq(1, length(fave_genes) - 1, 1) would raise an error.
x_steps <- seq_along(y_steps)
df <- data.frame(x = x_steps, y = y_steps)
# Axis labels must be added with `+`; evaluated as separate statements
# they only print a labels object and never reach the plot.
ggplot(data = df, mapping = aes(x, y)) +
  geom_point() +
  xlab("Rank Order") +
  ylab("Similarity Score")
<!-- rnb-source-end -->
<!-- rnb-plot-begin -->
<img src="" />
<!-- rnb-plot-end -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuYGBgclxueGxhYihcXFJhbmsgT3JkZXJcXClcbmBgYFxuYGBgIn0= -->
```r
```r
# NOTE(review): evaluated on its own this only prints a labels object; it
# has to be joined to the ggplot() call with `+` to label the x axis.
xlab("Rank Order")
<!-- rnb-source-end -->
<!-- rnb-output-begin eyJkYXRhIjoiJHhcblsxXSBcXFJhbmsgT3JkZXJcXFxuXG5hdHRyKCxcXGNsYXNzXFwpXG5bMV0gXFxsYWJlbHNcXFxuIn0= -->
$x [1] "Rank Order"
attr(,"class") [1] "labels"
<!-- rnb-output-end -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuYGBgclxueWxhYihcXFNpbWlsYXJpdHkgU2NvcmVcXClcbmBgYFxuYGBgIn0= -->
```r
```r
# NOTE(review): evaluated on its own this only prints a labels object; it
# has to be joined to the ggplot() call with `+` to label the y axis.
ylab("Similarity Score")
<!-- rnb-source-end -->
<!-- rnb-output-begin eyJkYXRhIjoiJHlcblsxXSBcXFNpbWlsYXJpdHkgU2NvcmVcXFxuXG5hdHRyKCxcXGNsYXNzXFwpXG5bMV0gXFxsYWJlbHNcXFxuIn0= -->
$y [1] "Similarity Score"
attr(,"class") [1] "labels"
<!-- rnb-output-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
nifH is an interesting case: many metal-binding genes are associated with it, in among the other nif genes.
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxucm93Lm5hbWVzKGphY2NhcmRfbWF0cml4KVxuXG5gYGAifQ== -->
```r
# List the row names of jaccard_matrix.
# NOTE(review): jaccard_matrix is not created anywhere in the visible part
# of this notebook, and the recorded output for this chunk is an
# "object not found" error — confirm where it is meant to be built.
row.names(jaccard_matrix)
Error in row.names(jaccard_matrix) : object 'jaccard_matrix' not found