---
title: "mirTarRnaSeq"
author: "Jill Ashey"
date: "2024-10-01"
output: html_document
---

11/4/24 - so confused. trying to run my data but it is very unhappy with me. truly no clue...I have been running the test data and that has worked fine. When I try to run my own data, it fails. 

Things to try next: 

- normalizing gene expression data 
- normalizing miRNA expression data 
- Making 'fake' data from a subset of my own data in the exact format of the test data 


This script will use the R package [mirTarRnaSeq](https://bioconductor.org/packages/3.13/bioc/html/mirTarRnaSeq.html), which an be used for interactive mRNA miRNA sequencing statistical analysis, to assess coexpression between mRNAs and miRNAs. 

I will be using Part 1 of their workflow (see [vignette](https://bioconductor.org/packages/3.13/bioc/vignettes/mirTarRnaSeq/inst/doc/mirTarRnaSeq.pdf)). Part 1 looks at the mRNA miRNA associations across sample cohorts using count matrices and miranda output as input. The data is then modeled and the best regression is selected based on the best AIC value. Using a specific regression model, the pvalue, FDR and coefficients are provided for mRNA-miRNA associations, which will answer the question: is there statistical evidence for mRNA-miRNA associations in the dataset? 

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

#BiocManager::install("SPONGE")

library(tidyverse)
library(mirTarRnaSeq)
library(reshape2)
library(SPONGE)
library(pheatmap)
```

Read in gene expression data 
```{r}
gene_counts <- read_csv("../../D-Apul/output/07-Apul-Hisat/gene_count_matrix.csv")
gene_counts <- as.data.frame(gene_counts)
rownames(gene_counts) <- gene_counts[,1] #set first column that contains gene names as rownames
gene_counts <- gene_counts[,-1] # remove column w/ gene names 
```

Read in miRNA expression data 
```{r}
miRNA_counts <- read.delim("../../D-Apul/output/03.1-Apul-sRNA-summary/Apul_counts_miRNA_normalized.txt")
```

Read in miranda info 
```{r}
miranda_apul <- read.delim("~/Desktop/PutnamLab/Repositories/deep-dive/D-Apul/output/21-Apul-miRNA-target-prediction/miranda_strict_all_1kb_parsed_apul.txt", header = F)

colnames(miranda_apul) <- c("miRNA", "gene_coord", "score", "energy", "query_start_end", "subject_start_end", "total_bp_shared", "query_similar", "subject_similar")

# Format miranda df
miranda_apul$miRNA <- sub(">", "", miranda_apul$miRNA)

# Separate gene_coord into 'gene' and the rest of the original column
miranda_apul <- miranda_apul %>%
  separate(gene_coord, into = c("gene", "gene_coord"), sep = "::")
  
# Select specific cols
miranda_apul <- miranda_apul %>%
  select(miRNA, gene, score, energy)
```

The gene and miRNA counts do not have the same number of samples - gene counts has only 4 (ACR-140, ACR-145, ACR-173, ACR-179) and miRNA counts has 5 (sample140, sample 145, sample 150, sample173, and sample178). Remove sample150 from the miRNA counts data and rename columns so they are the same in each df 
```{r}
# Remove sample150 column from mirna counts
miRNA_counts <- miRNA_counts %>%
  select(-sample150)

# Rename gene count cols
colnames(gene_counts) <- c("sample140.gene", "sample145.gene", "sample173.gene", "sample178.gene")
```

Select only genes that are present in interaction data 
```{r}
filtered_gene_counts <- gene_counts[rownames(gene_counts) %in% miranda_apul$gene, ]
```

Join miranda w/ mirna counts
```{r}
miRNA_counts$miRNA <- rownames(miRNA_counts)
miranda_mi <- miranda_apul %>%
  full_join(miRNA_counts, by = "miRNA")
```

Join miranda + miRNA info with gene count info 
```{r}
filtered_gene_counts$gene <- rownames(filtered_gene_counts)
all_stuff <- miranda_mi %>%
  full_join(filtered_gene_counts, by = "gene")
```

Look at correlation
```{r}
# Initialize a data frame to store the results
correlation_results <- data.frame(
  miRNA = character(),
  gene = character(),
  score = numeric(),
  energy = numeric(),
  correlation = numeric(),
  p_value = numeric(),
  stringsAsFactors = FALSE
)

# Loop through each row in all_stuff to calculate correlations
for (i in 1:nrow(all_stuff)) {
  # Extract the miRNA and gene names for the current row
  miRNA <- all_stuff$miRNA[i]
  gene <- all_stuff$gene[i]
  score <- all_stuff$score[i]
  energy <- all_stuff$energy[i]
  
  # Extract miRNA expression values
  miRNA_values <- as.numeric(all_stuff[i, c("sample140", "sample145", "sample173", "sample178")])
  
  # Extract gene expression values
  gene_values <- as.numeric(all_stuff[i, c("sample140.gene", "sample145.gene", "sample173.gene", "sample178.gene")])
  
  # Calculate Pearson correlation
  cor_test <- cor.test(miRNA_values, gene_values, method = "pearson")
  
  # Store the results
  correlation_results <- rbind(correlation_results, 
                               data.frame(miRNA = miRNA,
                                          gene = gene,
                                          score = score,
                                          energy = energy,
                                          correlation = cor_test$estimate,
                                          p_value = cor_test$p.value))
}
```


Select miRNA of interest
```{r}
miRNA_select <- c("apul-mir-100")
```


```{r}
miranda_mirnas <- unique(miranda_apul$miRNA)
expression_mirnas <- rownames(miRNA_counts)
print(sum(miranda_mirnas %in% expression_mirnas))
print(sum(expression_mirnas %in% miranda_mirnas))
# same number of miRNAs
```

Subset data 
```{r}
# Select a subset of miRNAs and mRNAs
subset_mirnas <- head(rownames(miRNA_counts), 5)
subset_mrnas <- head(rownames(filtered_gene_counts), 1000)

# Filter your data
subset_miRNA_counts <- miRNA_counts[subset_mirnas, , drop = FALSE]
subset_gene_counts <- filtered_gene_counts[subset_mrnas, , drop = FALSE]
subset_miranda <- miranda_apul[miranda_apul$miRNA %in% subset_mirnas & miranda_apul$gene %in% subset_mrnas, ]

# Run the model on the subset
subset_run <- runAllMirnaModels(
  mirnas = rownames(subset_miRNA_counts),
  DiffExpmRNA = subset_gene_counts,
  DiffExpmiRNA = subset_miRNA_counts,
  miranda_data = subset_miranda,
  prob = 0.75,
  cutoff = 0.05,
  fdr_cutoff = 0.1,
  method = "fdr",
  family = glm_multi(),
  scale = 2,
  mode = "multi"
)

print("Gene names in filtered_gene_counts:")
print(head(rownames(filtered_gene_counts)))
print("Gene names in miranda_apul:")
print(head(miranda_apul$gene))
print("Number of matching genes:")
print(sum(rownames(filtered_gene_counts) %in% miranda_apul$gene))

print(str(miranda_apul))
print(table(miranda_apul$miRNA))
```

Getting error: Error in apply(p_adjusted, 1, function(x) any(is.finite(x) & x < value)) : dim(X) must have a positive length. Going to debug 
```{r}

miRanComp_custom <- function(DiffExpmRNA, miranda_data, mirna_pair) {
    # Filter miranda_data for the specific miRNA pair
    miranda_subset <- miranda_data[miranda_data$miRNA %in% mirna_pair, ]
    
    # Get the genes targeted by either miRNA in the pair
    targeted_genes <- unique(miranda_subset$gene)
    
    # Subset DiffExpmRNA to include only the targeted genes
    DiffExpmRNASub <- DiffExpmRNA[rownames(DiffExpmRNA) %in% targeted_genes, , drop = FALSE]
    
    print(paste("Number of genes targeted by", paste(mirna_pair, collapse = " or "), ":", nrow(DiffExpmRNASub)))
    
    return(DiffExpmRNASub)
}

runAllMirnaModels_debug <- function(mirnas, DiffExpmRNA, DiffExpmiRNA, miranda_data,
                                    prob = 0.75, fdr_cutoff = 0.1, method = "fdr", cutoff = 0.05,
                                    all_coeff = FALSE, mode = NULL, family = glm_poisson(), scale = 1) {
    assertthat::assert_that(is.character(mirnas))

    # make unique
    unique_mirnas <- unique(mirnas)
    print("Unique miRNAs:")
    print(unique_mirnas)

    # generate combinations of 2 miRNAs
    if (!is.null(mode) && mode == "multi") {
        unique_mirnas <- combn(unique_mirnas, 2, simplify = FALSE)
        print("miRNA combinations:")
        print(head(unique_mirnas))
    }

    # run models
    count <- 0
    ret <- lapply(unique_mirnas, function(mirna_pair) {
        count <<- count + 1
        cat(sprintf("%d: %s\n", count, paste(mirna_pair, collapse = ", ")))

        # For Determining Prob it needs to percent
        valmMedia <- quantile(miranda_data$score, probs = prob)[1]
        print(paste("valmMedia:", valmMedia))

        # MirandaParsingOfmyRNA
        DiffExpmRNASub <- miRanComp_custom(DiffExpmRNA, miranda_data, mirna_pair)
        print("Dimensions of DiffExpmRNASub:")
        print(dim(DiffExpmRNASub))

        if (nrow(DiffExpmRNASub) == 0) {
            print("No genes targeted by this miRNA pair. Skipping...")
            return(NULL)
        }

        # combiner
        Combine <- combiner(DiffExpmRNASub, DiffExpmiRNA, mirna_pair)
        print("Dimensions of Combine:")
        print(dim(Combine))

        # GeneVari
        geneVariant <- geneVari(Combine, mirna_pair)
        print("Number of gene variants:")
        print(length(geneVariant))

        if (length(geneVariant) == 0) {
            print("No gene variants found. Skipping...")
            return(NULL)
        }

        MRun <- runModels(Combine, geneVariant, mirna_pair, mode = mode, family = family, scale = scale, cutoff = cutoff)

        print("MRun results:")
        print(str(MRun))

        # Bonferroni sig Genes
        FDRModel <- fdrSig_fixed(MRun, value = fdr_cutoff, method = method)

        print("FDRModel results:")
        print(str(FDRModel))

        if (length(FDRModel$sig_models) > 0) {
            # look for negative coeffs; "-1" to exclude intercept while checking...#If true all if false any
            if (all_coeff) {
                valid_models <- sapply(FDRModel$sig_models, function(m) all(modelCoefficients(m) < 0))
            } else {
                valid_models <- sapply(FDRModel$sig_models, function(m) any(modelCoefficients(m) < 0))
            }
            # subset fdrmodel's things
            FDRModel$sig_models <- FDRModel$sig_models[valid_models]
            FDRModel$sig_pvalues <- FDRModel$sig_pvalues[valid_models, , drop = FALSE]
            FDRModel$sig_p_adjusted <- FDRModel$sig_p_adjusted[valid_models, , drop = FALSE]
            FDRModel$sig_genes <- FDRModel$sig_genes[valid_models]
        } else {
            print("No significant models found")
        }
        # make siggenes data.frame
        SigFDRGenes <- data.frame(cbind(FDRModel$sig_pvalues, FDRModel$sig_p_adjusted))

        names(SigFDRGenes) <- c("P Value", "FDR")

        return(list(SigFDRGenes = SigFDRGenes, FDRModel = FDRModel))
    })
    names(ret) <- sapply(unique_mirnas, paste, collapse = " and ")
    ret <- ret[!sapply(ret, is.null)]  # Remove NULL results
    return(ret)
}
```

Run debug code 
```{r}
debug_run <- runAllMirnaModels_debug(
  mirnas = rownames(miRNA_counts),
  DiffExpmRNA = filtered_gene_counts,
  DiffExpmiRNA = miRNA_counts,
  miranda_data = miranda_apul,
  prob = 0.75,
  cutoff = 0.05,
  fdr_cutoff = 0.1,
  method = "fdr",
  family = glm_multi(),
  scale = 2,
  mode = "multi"
)
```


```{r}
vMiRNA <- rownames(miRNA_counts)

All_miRNAs_run <- runAllMirnaModels(
  mirnas = "apul-mir-100",
  DiffExpmRNA = filtered_gene_counts,
  DiffExpmiRNA = miRNA_counts,
  miranda_data = miranda_apul,
  prob = 0.75,
  cutoff = 0.05,
  fdr_cutoff = 0.1,
  method = "fdr",
  family = glm_gaussian(),
  scale = 2
)
```


Run regression model
```{r}
regression_results <- runModels(combined_data, gene_variant, miRNA_select, 
                                family = glm_gaussian(), 
                                scale = 100)
test <- regression_results$
min(pval_multi)
max(pval_multi)


regression_results_p <- runModels(combined_data, gene_variant, miRNA_select, 
                                family = glm_poisson(), 
                                scale = 100)
```

ahhh so model looks at what interactions are significant...and then we run correlation analysis???

```{r}
library(mirTarRnaSeq)

# Ensure your data is in the correct format
gene_counts <- filtered_gene_counts
mirna_counts <- miRNA_counts
miranda_data <- miranda_apul

# Function to run model for a single miRNA-mRNA pair
run_single_model <- function(mRNA, miRNA, combined_data) {
  formula <- as.formula(paste(mRNA, "~", miRNA))
  model_result <- tryCatch({
    runModel(formula, combined_data, model = glm_poisson(), scale = 100)
  }, error = function(e) {
    message("Error in model for ", mRNA, " and ", miRNA, ": ", e$message)
    return(NULL)
  })
  return(model_result)
}

# Get all possible miRNA-mRNA pairs from miranda_data
miRNA_mRNA_pairs <- unique(miranda_data[, c("miRNA", "gene")])

# Combine gene and miRNA data
all_data <- combiner(gene_counts, mirna_counts, rownames(mirna_counts))

# Run models for all pairs
results <- lapply(1:nrow(miRNA_mRNA_pairs), function(i) {
  mRNA <- miRNA_mRNA_pairs$gene[i]
  miRNA <- miRNA_mRNA_pairs$miRNA[i]
  
  if (mRNA %in% colnames(all_data) && miRNA %in% colnames(all_data)) {
    model <- run_single_model(mRNA, miRNA, all_data)
    if (!is.null(model)) {
      return(data.frame(
        mRNA = mRNA,
        miRNA = miRNA,
        coefficient = coef(model)[2],
        p_value = summary(model)$coefficients[2, 4],
        AIC = AIC(model)
      ))
    }
  }
  return(NULL)
})

# Combine results and remove NULL entries
results_df <- do.call(rbind, results[!sapply(results, is.null)])

# Add FDR-adjusted p-values
results_df$adj_p_value <- p.adjust(results_df$p_value, method = "fdr")

# Sort by adjusted p-value
results_df <- results_df[order(results_df$adj_p_value), ]

# Print the top results
print(head(results_df, 20))

# Save results to a CSV file
#write.csv(results_df, "univariate_results.csv", row.names = FALSE)

# Plot top significant results
library(ggplot2)

top_results <- head(results_df, 50)  # Adjust the number as needed
ggplot(top_results, aes(x = reorder(paste(miRNA, mRNA, sep = " - "), adj_p_value), y = -log10(adj_p_value))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top Significant miRNA-mRNA Interactions",
       x = "miRNA - mRNA Pair",
       y = "-log10(Adjusted P-value)") +
  theme_minimal()
```


```{r}
print(dim(filtered_gene_counts))
print(dim(miRNA_counts))
print(dim(miranda_apul))

print(head(rownames(miRNA_counts)))
print(head(colnames(filtered_gene_counts)))
print(head(miranda_apul$miRNA))
print(head(miranda_apul$gene))

sum(is.na(filtered_gene_counts))
sum(is.infinite(as.matrix(filtered_gene_counts)))
sum(is.na(miRNA_counts))
sum(is.infinite(as.matrix(miRNA_counts)))

print(nrow(miranda_apul))

# Select a subset of miRNAs and mRNAs
subset_mirnas <- head(rownames(miRNA_counts), 10)
subset_mrnas <- head(rownames(filtered_gene_counts), 100)

# Filter your data
subset_miRNA_counts <- miRNA_counts[subset_mirnas, ]
subset_gene_counts <- filtered_gene_counts[subset_mrnas, ]
subset_miranda <- miranda_apul[miranda_apul$miRNA %in% subset_mirnas & miranda_apul$gene %in% subset_mrnas, ]

# Run the model on the subset
subset_run <- runAllMirnaModels(
  mirnas = rownames(subset_miRNA_counts),
  DiffExpmRNA = subset_gene_counts,
  DiffExpmiRNA = subset_miRNA_counts,
  miranda_data = subset_miranda,
  prob = 0.75,
  cutoff = 0.05,
  fdr_cutoff = 0.1,
  method = "fdr",
  family = glm_multi(),
  scale = 2,
  mode = "multi"
)
```


```{r}
All_miRNAs_run <- runAllMirnaModels(
  mirnas = rownames(miRNA_counts),  # Use all miRNAs
  DiffExpmRNA = filtered_gene_counts,    # Your mRNA expression data
  DiffExpmiRNA = miRNA_counts,      # Your miRNA expression data
  miranda_data = miranda_apul,       # Your miRanda predictions
  prob = 0.75,                  # Probability threshold for filtering
  cutoff = 0.05,                # P-value cutoff
  fdr_cutoff = 0.1,             # FDR cutoff
  method = "fdr",               # Multiple testing correction method
  family = glm_multi(),         # Use multiple model types
  scale = 2,                    # Scaling factor
  mode = "multi"                # Multivariate mode
)
```


```{r}
multivariate_results <- runModels(combined_data, gene_variant, miRNA_select, 
                                  family = glm_multi(models=list(glm_gaussian(), glm_poisson())),
                                  mode = "multi", 
                                  scale = 10)

interaction_results <- runModels(combined_data, gene_variant, miRNA_select, 
                                 family = glm_multi(models=list(glm_gaussian(), glm_poisson())),
                                 mode = "inter", 
                                 scale = 10)
```

```{r}
# Extract coefficients from multivariate models
coef_matrix <- do.call(rbind, lapply(names(multivariate_results$all_models), function(gene) {
  model <- multivariate_results$all_models[[gene]]
  coef(model)[-1]  # Remove intercept
}))
rownames(coef_matrix) <- names(multivariate_results$all_models)

# Create heatmap
pheatmap(coef_matrix, 
         main = "miRNA-mRNA Multivariate Coefficients",
         color = colorRampPalette(c("blue", "white", "red"))(100),
         cluster_rows = F, 
         cluster_cols = F)
```


Look at results 
```{r}
# View model summaries
regression_results$all_models

# Plot results
plot(regression_results$all_models$FUN_035039)

aic_values <- do.call(rbind.data.frame, regression_results$AICvalues, regression_results_p$AICvalues)
ggplot(data = melt(aic_values), aes(x = value)) +
  geom_density(adjust = 1.5, alpha = 0.3) +
  ggtitle("AIC Distribution for miRNA-mRNA Regressions") +
  xlab("AIC Value") +
  ylab("Density")
```


Having trouble getting models to run...

```{r}
# Step 1: Filter miranda_apul to only include miRNA and gene names that are present in miRNA_counts and filtered_gene_counts
filtered_pairs <- miranda_apul %>%
  filter(gene %in% rownames(filtered_gene_counts) & miRNA %in% rownames(miRNA_counts))

# Step 2: Initialize an empty data frame to store correlation results
correlation_results <- data.frame(miRNA = character(),
                                  gene = character(),
                                  correlation = numeric(),
                                  p_value = numeric(),
                                  stringsAsFactors = FALSE)

# Step 3: Loop through each miRNA-mRNA pair in filtered_pairs
for (i in 1:nrow(filtered_pairs)) {
  # Extract the miRNA and gene names for the current pair
  miRNA <- filtered_pairs$miRNA[i]
  gene <- filtered_pairs$gene[i]
  
  # Get the expression values for the miRNA and gene
  miRNA_values <- as.numeric(miRNA_counts[miRNA, ])
  gene_values <- as.numeric(filtered_gene_counts[gene, ])
  
  # Calculate Pearson correlation between miRNA and gene expression values
  cor_test <- cor.test(miRNA_values, gene_values, method = "pearson")
  
  # Store the results
  correlation_results <- rbind(correlation_results, 
                               data.frame(miRNA = miRNA,
                                          gene = gene,
                                          correlation = cor_test$estimate,
                                          p_value = cor_test$p.value))
}

# View the results
head(correlation_results)

```


Run one to one gaussian regression model 
```{r}
j <- runModel(`FUN_038206` ~ `apul-mir-100`,
              Combine, model = glm_poisson(),
              scale = 100)

# Obtain pvalues for terms in model formula
print(modelTermPvalues(j)) 
modelModelName(j)
```

Run multiple models at once 
```{r}
#miRNA_select<-c("ebv-mir-BART9-5p","ebv-mir-BART6-3p")
#Combine <- combiner(DiffExp, miRNAExp, miRNA_select)
#geneVariant <- geneVari(Combine, miRNA_select)
MultiModel <- runModels(Combine, geneVariant,
                        miRNA_select, family = glm_multi(),
                        mode="multi", scale = 10)
# Print the name of the models used for the analysis (note the printed outputs are the number of models ran by various models based on the AIC scores using the glm_multi())
print(table(unlist(lapply(MultiModel$all_models, modelModelName))))
```


Run gaussian model over all potential interactions with miRNA of interest 
```{r}
blaGaus <- runModels(Combine,
                     geneVariant, miRNA_select,
                     family = glm_gaussian(),
                     scale = 100)

# Plot the regression relationship for the mRNA and miRNAs (BHLF1 is the EBV mRNA). Note, these are standard quality check plot outputs from plot regression.
par(oma=c(2,2,2,2))
par(mfrow=c(2,3),mar=c(4,3,3,2))
plot(blaGaus[["all_models"]][["BALF2"]])
plot(modelData(blaGaus[["all_models"]][["BALF2"]]))

# To comprehend the performance of all models we look at Akaike information criterion (AIC) across all miRNA-mRNA model performances and then look at the density plots. Note, the models with lower comparitive AICs have better performace.
G <- do.call(rbind.data.frame, blaGaus[["AICvalues"]])
names(G) <- c("AIC_G")
# Low values seems like a reasonable model
plot(density(G$AIC_G))
# Print out the AIC of all miRNA-mRNA models ( All observed AIC values for the
# miRNA-mRNA models)
GM <- melt(G)
```


Get correlation 
```{r}
corr_test <- corMirnaRna(filtered_gene_counts, miRNA_counts)
```


```{r}
corr_test <- corMirnaRna(gene_counts, miRNA_counts)
newcorr <- corMirnaRnaMiranda(DiffExpmRNASub, miRNAExp, -0.1, miRanda)
mirRnaHeatmap(newcorr)
```

Select only genes in 
```{r}

```


Using the test data as an initial pass to understand Part 1 of the package. Part 1 has a lot of different options one can explore. For the deep dive data, I think the univariate model or the synergistic model will be the best for our data. The univariate model can look at the relationshop of all miRNAs in regards to all mRNAs of interest (one to one relationship). 

I will use the `miRNA` df as miRNA counts input and `mRNA` df as mRNA counts input; this data is a part of the package. I will also use `miRanda` for interaction data. Looking at the manual pages and source [code](https://rdrr.io/github/Mercedeh66/mirTarRnaSeq/man/), prior to combining the files with the function `combiner`, they need to be transformed using `TZtrans`. 

Transform data
```{r}
miRNA_transform <- tzTrans(miRNA)
mRNA_transform <- tzTrans(mRNA)
```

Select only mRNAs that were identified as targets 
```{r}
mRNA_transform_sub <- miRanComp(mRNA_transform, miRanda)
```

Before combining, the `combiner` function requires an miRNA vector of miRNAs of interest. The manual says: "A vector of character's for miRNAs which the user is interested in investigating if glm is use 1 miRNA should be input. If multivariate several miRNAs should be imported, same goes for interaction determination for miRNAs. Note we do not recommend more than 3-4 miRNAs at a time for the latter cases."


Select specific miRNA. Not sure why this needs to be done...
```{r}
miRNA_select<-rownames(miRNA)
```

Combine miRNA and mRNA files; make list of files to be used in model 
```{r}
combine <- combiner(mRNASub, miRNA, miRNA_select)
gene_var <- geneVari(combine, miRNA_select)
```

Run Gausian model 
```{r}
test <- runModels(combine, geneVariant, miRNA_select, family = glm_gaussian(), scale = 100)

par(oma=c(2,2,2,2))
par(mfrow=c(2,3),mar=c(4,3,3,2))
plot(test[["all_models"]][["BHLF1"]])
plot(modelData(test[["all_models"]][["BHLF1"]]))
#To test AIC model performance
G <- do.call(rbind.data.frame, test[["AICvalues"]])
names(G) <- c("AIC_G")
#Low values seems like a reasonable model
plot(density(G$AIC_G))
```

Run all combos
```{r}
vmiRNA <- rownames(miRNA)

all_miRNA_models <- runAllMirnaModels(mirnas = vmiRNA[1:6], DiffExpmRNA = mRNA, DiffExpmiRNA = miRNA, miranda_data = miRanda, prob = 0.75, cutoff = 0.05, fdr_cutoff = 0.1, method = "fdr", family = glm_multi(), scale = 2, mode = "multi")

hasgenes <- lapply(all_miRNA_models, function(x) nrow(x$SigFDRGenes)) > 0
all_miRNA_models <- all_miRNA_models[hasgenes]
```

Look at data 
```{r}
all_miRNA_models$`ebv-mir-bart1-3p and ebv-mir-bart1-5p`
```

Need to play with test data more 


Playing w data November 3-4 24

# Part1 - miRNA mRNA regressions across sample cohorts
The users can utilize TPM/RPKM or count data for this section of analysis as long as if the data is normalized (TPM/RPKM) is used for the mRNA expression files the miRNA expression files are also normalized (TPM). The format of the files for miRNA and mRNA needs to be odd the class dataframe with the sample names as colnames and mRNA names as rownames. Note, for this section of analysis we have only included all differentially expressed mRNAs for the analysis but the user can included all mRNAs. However, the user should realize if they choose to use all mRNAs expressed the analysis will take a longer time.

Load data 
```{r}
# Helper function to access test files in extdata
get_test_file <- function(file_name) {
  if (missing(file_name)) {
    return(system.file("extdata", "test", package="mirTarRnaSeq"))
  }
  return(system.file("extdata", "test", file_name, package="mirTarRnaSeq"))
}

# Read input files
DiffExp<-read.table(get_test_file("EBV_mRNA.txt.gz"), as.is=TRUE, header=TRUE, row.names=1)
miRNAExp<-read.table(get_test_file("EBV_miRNA.txt.gz"), as.is=TRUE, header=TRUE, row.names=1)
```

`DiffExp` represents mRNA expression. Each row represents a gene and each column represents a sample. I'm not sure if these counts are already normalized. 
`miRNAExp` represents miRNA expression. Each row repsents a miRNA and each column represents a sample. Not sure if these are normalized but they are higher than the mRNA expression. The two dfs also do not have the same number of columns. 

Ensure both dfs have same number of samples and that sample columns are in the same order. 
```{r}
# Reorder columns in miRNAExp to match the column order in DiffExp
miRNAExp <- miRNAExp[, colnames(DiffExp)]

# Check if columns are now ordered the same
all(colnames(miRNAExp) == colnames(DiffExp)) # should be true 
```

Load miranda. Note, for other organisms (not provided by the pacakged) the appropriate format for miRanda input file is V1 column: name of the miRNA, V2 column: name of the predicted gene, V3: column score/rank of the miranda interaction (or any other score for interaction if TargetScan ( for TargetScan prediction user can provide context++ score and threshold accordingly) is used, note these three columns are required), V4 column: folding energy of miRNA-mRNA interaction. V5 column: target Identity value and V6 column: miRNA idnetity value. (Note, all these values are available after miRanda analysis but V1-V3 must be provided not matter the prediction method)
```{r}
miRanda <- getInputSpecies("Epstein_Barr", threshold = 140)

# Select only 
DiffExpmRNASub <- miRanComp(DiffExp, miRanda)
```

`miRanComp` only keeps the mRNAs that are present in the expression file. 

Select miRNA
```{r}
miRNA_select<-c("ebv-mir-BART9-5p")
```

Combine miRNA and mRNA file and define boundaries
```{r}
# Make intersection df for mRNA and miRNA(s) of interest to be tested 
Combine <- combiner(DiffExp, miRNAExp, miRNA_select)
# The rows are the sample, and the columns are the mRNAs of interest and the miRNA(s) of interest

# Define boundaries of mRNAs vs miRNAs - essentially, just looking at the mRNAs targeted by the miRNA of interest
geneVariant <- geneVari(Combine, miRNA_select)
```

Running various univariate models(1 to 1 miRNA-mRNA relationships) with various miRNA-mRNA distribution assumptions for either 1 miRNA and 1 mRNA relationship chosen/selected by the user or across all miRNAs and mRNAs in the expression dataframes. Note,the users can choose to run any of the available distribution model assumptions (glm_poisson, glm_gaussian, glm_nb (negative binomial), glm_zeroinfl (zero inflated model or zero inflated negative binomial)).

First, run a one to one miRNA/mRNA gaussian regression model (univariate model for 1 miRNA and 1 mRNA). BALF2 is EBV mRNA, the ebv-mir-BART9-5p is the miRNA and we are running a glm poisson model.

Run one to one gaussian regression model 
```{r}
j <- runModel(`BALF2` ~ `ebv-mir-BART9-5p`,
              Combine, model = glm_poisson(),
              scale = 100)

# Obtain pvalues for terms in model formula
print(modelTermPvalues(j)) 
modelModelName(j)
```

So what does this mean? There is a correlation between BALF2 and ebv-mir-BART9-5p because there is a significant pvalue? 

I think I am interested in running the multi mode, where the best fit model for every miRNA-mRNA relationship is selected through AIC comparisons for model selection. After determining the appropriate model, statistical predictions can be made and significant miRNA-mRNA relationships/correlations can be identified. 

```{r}
newcorr <- corMirnaRnaMiranda(DiffExpmRNASub, miRNAExp, -0.1, miRanda)
mirRnaHeatmap(newcorr)

# my data
corr_test_me <- corMirnaRnaMiranda(filtered_gene_counts, miRNA_counts, -0.1, miranda_apul)
```

so confused...why does test code work but my data does not work????


Run gaussian model over all potential interactions with miRNA of interest 
```{r}
blaGaus <- runModels(Combine,
                     geneVariant, miRNA_select,
                     family = glm_gaussian(),
                     scale = 100)

# Plot the regression relationship for the mRNA and miRNAs (BHLF1 is the EBV mRNA). Note, these are standard quality check plot outputs from plot regression.
par(oma=c(2,2,2,2))
par(mfrow=c(2,3),mar=c(4,3,3,2))
plot(blaGaus[["all_models"]][["BALF2"]])
plot(modelData(blaGaus[["all_models"]][["BALF2"]]))

# To comprehend the performance of all models we look at Akaike information criterion (AIC) across all miRNA-mRNA model performances and then look at the density plots. Note, the models with lower comparitive AICs have better performace.
G <- do.call(rbind.data.frame, blaGaus[["AICvalues"]])
names(G) <- c("AIC_G")
# Low values seems like a reasonable model
plot(density(G$AIC_G))
# Print out the AIC of all miRNA-mRNA models ( All observed AIC values for the
# miRNA-mRNA models)
GM <- melt(G)
```

Run poisson model
```{r}
blaPois <- runModels(Combine,
                     geneVariant, miRNA_select,
                     family = glm_poisson(),
                     scale = 100)
par(oma=c(2,2,2,2))
par(mfrow=c(2,3),mar=c(4,3,3,2))
plot(blaPois[["all_models"]][["LMP-2A"]])
plot(modelData(blaPois[["all_models"]][["LMP-2A"]]))
P <- do.call(rbind.data.frame, blaPois[["AICvalues"]])
names(P) <- c("AIC_Po")
PM <- melt(P)
```

Run NB model
```{r}
blaNB <- runModels(Combine,
                   geneVariant, miRNA_select,
                   family = glm_nb(), scale = 100)
par(mar=c(4,3,3,2))
plot(modelData(blaNB[["all_models"]][["BALF1"]]))
B <- do.call(rbind.data.frame, blaNB[["AICvalues"]])
names(B) <- c("AIC_NB")
BM <- melt(B)
```

Run zero inflated NB model 
```{r}
blazeroinflNB <- runModels(Combine, geneVariant,
                            miRNA_select,
                            family = glm_zeroinfl(dist = "negbin"),
                            scale = 100)
# To test AIC model performance
ZNB <- do.call(rbind.data.frame, blazeroinflNB[["AICvalues"]])
names(ZNB) <- c("AIC_ZNB")
par(mar=c(4,3,3,2))
plot(density(ZNB$AIC_ZNB))
ZNBM<-melt(ZNB)
```

Run zero inflated poisson binomial model 
```{r}
blazeroinfl <- runModels(Combine, geneVariant,
                         miRNA_select,
                         family = glm_zeroinfl(),
                         scale = 100)
# To test AIC model performance
Zp <- do.call(rbind.data.frame, blazeroinfl[["AICvalues"]])
names(Zp) <- c("AIC_Zp")
par(mar=c(4,3,3,2))
plot(density(Zp$AIC_Zp))
ZpM <- melt(Zp)
```

Figure out which model to use 
```{r}
bindM <- rbind(PM, BM, GM, ZpM, ZNBM)
p2 <- ggplot(data = bindM, aes(x = value, group = variable,
                               fill = variable)) +
  geom_density(adjust = 1.5, alpha = .3) +
  xlim(-400, 2000)+
  ggtitle("Plot of of AIC for ebv-mir-BART9-5p regressed all mRNAs ")+
  ylab("Density")+ xlab ("AIC Value")
p2
```

The user can decide to use runModels() with glm_multi() (with multi and inter mode options). When using glm_multi(), where all available models will be run, the AICs will be compared and the best model will be chosen based on the miRNA-mRNA model AIC score. In the example bellow we are using the mode= "multi" option for combination of 2 miRNAs (multivariate model) for interaction model the user can choose the mode= "inter" option. Note all_coeff parameters defaults TRUE all interactions (only negative miRNA-mRNA relationships) are reported. More comments on the mode is provided in the next section of the vignette. If all miRNA-mRNA relationships are wanted this parameter can be set to false.

Run multiple models at once 
```{r}
miRNA_select<-c("ebv-mir-BART9-5p","ebv-mir-BART6-3p")
Combine <- combiner(DiffExp, miRNAExp, miRNA_select)
geneVariant <- geneVari(Combine, miRNA_select)
MultiModel <- runModels(Combine, geneVariant,
                        miRNA_select, family = glm_multi(),
                        mode="multi", scale = 10)
# Print the name of the models used for the analysis (note the printed outputs are the number of models ran by various models based on the AIC scores using the glm_multi())
print(table(unlist(lapply(MultiModel$all_models, modelModelName))))
```

Run multiple models again? 
```{r}
miRNA_select<-c("ebv-mir-BART9-5p","ebv-mir-BART6-3p")
Combine <- combiner(DiffExp, miRNAExp, miRNA_select)
geneVariant <- geneVari(Combine, miRNA_select)
InterModel <- runModels(Combine,
                        geneVariant, miRNA_select,
                        family = glm_multi(
                          models=list(glm_gaussian,
                          glm_poisson())),mode="inter", scale = 10)

# Print the name of the models used for the analysis (note, you can see although
# we have defined for the models to be run using either gaussian or poisson
# options, mirTarRnaSeq chooses the poisson model for all as it performs with a
# better AIC (lower AIC value) for all miRNA-mRNA interactions)
print(table(unlist(lapply(InterModel$all_models, modelModelName))))
```

Run all combos
```{r}
vMiRNA<-rownames(miRNAExp)
# Note, the user can run all miRNAs but for speed reasons we have chosen the first 5 here for mirnas input for the analysis.
All_miRNAs_run<-runAllMirnaModels(mirnas =vMiRNA[1:5] ,
                                  DiffExpmRNA = DiffExpmRNASub,
                                  DiffExpmiRNA = miRNAExp,
                                  miranda_data = miRanda,prob=0.75,
                                  cutoff=0.05,fdr_cutoff = 0.1, method = "fdr",
                                  family = glm_multi(), scale = 2, mode="multi")

#select significant genes
hasgenes <- lapply(All_miRNAs_run, function(x) nrow(x$SigFDRGenes)) > 0
All_miRNAs_run <- All_miRNAs_run[hasgenes]
print(table(unlist(lapply
                   (All_miRNAs_run[[1]][["FDRModel"]][["all_models"]],
                     modelModelName))))
# Print specific models for specific miRNAs (in this example the significant multivariate model for ebv-mir-BART1-5p and ebv-mir-BART11-3p )
print(
  table(
    unlist(lapply(
      (All_miRNAs_run[["ebv-mir-BART1-5p and ebv-mir-BART11-3p"]]
                     [["FDRModel"]]
                     [["all_models"]]),
        modelModelName))))

```

```{r}
# Make miRanda file compatible with SPONGE package mir_predicted_targets()
sponge_mir_input<-miranda_sponge_predict(miRNAExp,DiffExp,miRanda)

#Perform sparse partial correlation for miRNA-mRNA relationships
one2many_output<-one2manySponge(miRNAExp,DiffExp,sponge_mir_input)
```