--- title: "mirTarRnaSeq" author: "Jill Ashey" date: "2024-10-01" output: html_document --- 11/4/24 - so confused. trying to run my data but it is very unhappy with me. truly no clue...I have been running the test data and that has worked fine. When I try to run my own data, it fails. Things to try next: - normalizing gene expression data - normalizing miRNA expression data - Making 'fake' data from a subset of my own data in the exact format of the test data This script will use the R package [mirTarRnaSeq](https://bioconductor.org/packages/3.13/bioc/html/mirTarRnaSeq.html), which an be used for interactive mRNA miRNA sequencing statistical analysis, to assess coexpression between mRNAs and miRNAs. I will be using Part 1 of their workflow (see [vignette](https://bioconductor.org/packages/3.13/bioc/vignettes/mirTarRnaSeq/inst/doc/mirTarRnaSeq.pdf)). Part 1 looks at the mRNA miRNA associations across sample cohorts using count matrices and miranda output as input. The data is then modeled and the best regression is selected based on the best AIC value. Using a specific regression model, the pvalue, FDR and coefficients are provided for mRNA-miRNA associations, which will answer the question: is there statistical evidence for mRNA-miRNA associations in the dataset? ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) #BiocManager::install("SPONGE") library(tidyverse) library(mirTarRnaSeq) library(reshape2) library(SPONGE) library(pheatmap) ``` Read in gene expression data ```{r} gene_counts <- read_csv("../../D-Apul/output/07-Apul-Hisat/gene_count_matrix.csv") gene_counts <- as.data.frame(gene_counts) rownames(gene_counts) <- gene_counts[,1] #set first column that contains gene names as rownames gene_counts <- gene_counts[,-1] # remove column w/ gene names ``` Read in miRNA expression data ```{r} miRNA_counts <- read.delim("../../D-Apul/output/03.1-Apul-sRNA-summary/Apul_counts_miRNA_normalized.txt") ``` Read in miranda info ```{r} miranda_apul <- read.delim("~/Desktop/PutnamLab/Repositories/deep-dive/D-Apul/output/21-Apul-miRNA-target-prediction/miranda_strict_all_1kb_parsed_apul.txt", header = F) colnames(miranda_apul) <- c("miRNA", "gene_coord", "score", "energy", "query_start_end", "subject_start_end", "total_bp_shared", "query_similar", "subject_similar") # Format miranda df miranda_apul$miRNA <- sub(">", "", miranda_apul$miRNA) # Separate gene_coord into 'gene' and the rest of the original column miranda_apul <- miranda_apul %>% separate(gene_coord, into = c("gene", "gene_coord"), sep = "::") # Select specific cols miranda_apul <- miranda_apul %>% select(miRNA, gene, score, energy) ``` The gene and miRNA counts do not have the same number of samples - gene counts has only 4 (ACR-140, ACR-145, ACR-173, ACR-179) and miRNA counts has 5 (sample140, sample 145, sample 150, sample173, and sample178). Remove sample150 from the miRNA counts data and rename columns so they are the same in each df ```{r} # Remove sample150 column from mirna counts miRNA_counts <- miRNA_counts %>% select(-sample150) # Rename gene count cols colnames(gene_counts) <- c("sample140.gene", "sample145.gene", "sample173.gene", "sample178.gene") ``` Select only genes that are present in interaction data ```{r} filtered_gene_counts <- gene_counts[rownames(gene_counts) %in% miranda_apul$gene, ] ``` Join miranda w/ mirna counts ```{r} miRNA_counts$miRNA <- rownames(miRNA_counts) miranda_mi <- miranda_apul %>% full_join(miRNA_counts, by = "miRNA") ``` Join miranda + miRNA info with gene count info ```{r} filtered_gene_counts$gene <- rownames(filtered_gene_counts) all_stuff <- miranda_mi %>% full_join(filtered_gene_counts, by = "gene") ``` Look at correlation ```{r} # Initialize a data frame to store the results correlation_results <- data.frame( miRNA = character(), gene = character(), score = numeric(), energy = numeric(), correlation = numeric(), p_value = numeric(), stringsAsFactors = FALSE ) # Loop through each row in all_stuff to calculate correlations for (i in 1:nrow(all_stuff)) { # Extract the miRNA and gene names for the current row miRNA <- all_stuff$miRNA[i] gene <- all_stuff$gene[i] score <- all_stuff$score[i] energy <- all_stuff$energy[i] # Extract miRNA expression values miRNA_values <- as.numeric(all_stuff[i, c("sample140", "sample145", "sample173", "sample178")]) # Extract gene expression values gene_values <- as.numeric(all_stuff[i, c("sample140.gene", "sample145.gene", "sample173.gene", "sample178.gene")]) # Calculate Pearson correlation cor_test <- cor.test(miRNA_values, gene_values, method = "pearson") # Store the results correlation_results <- rbind(correlation_results, data.frame(miRNA = miRNA, gene = gene, score = score, energy = energy, correlation = cor_test$estimate, p_value = cor_test$p.value)) } ``` Select miRNA of interest ```{r} miRNA_select <- c("apul-mir-100") ``` ```{r} miranda_mirnas <- unique(miranda_apul$miRNA) expression_mirnas <- rownames(miRNA_counts) print(sum(miranda_mirnas %in% expression_mirnas)) print(sum(expression_mirnas %in% miranda_mirnas)) # same number of miRNAs ``` Subset data ```{r} # Select a subset of miRNAs and mRNAs subset_mirnas <- head(rownames(miRNA_counts), 5) subset_mrnas <- head(rownames(filtered_gene_counts), 1000) # Filter your data subset_miRNA_counts <- miRNA_counts[subset_mirnas, , drop = FALSE] subset_gene_counts <- filtered_gene_counts[subset_mrnas, , drop = FALSE] subset_miranda <- miranda_apul[miranda_apul$miRNA %in% subset_mirnas & miranda_apul$gene %in% subset_mrnas, ] # Run the model on the subset subset_run <- runAllMirnaModels( mirnas = rownames(subset_miRNA_counts), DiffExpmRNA = subset_gene_counts, DiffExpmiRNA = subset_miRNA_counts, miranda_data = subset_miranda, prob = 0.75, cutoff = 0.05, fdr_cutoff = 0.1, method = "fdr", family = glm_multi(), scale = 2, mode = "multi" ) print("Gene names in filtered_gene_counts:") print(head(rownames(filtered_gene_counts))) print("Gene names in miranda_apul:") print(head(miranda_apul$gene)) print("Number of matching genes:") print(sum(rownames(filtered_gene_counts) %in% miranda_apul$gene)) print(str(miranda_apul)) print(table(miranda_apul$miRNA)) ``` Getting error: Error in apply(p_adjusted, 1, function(x) any(is.finite(x) & x < value)) : dim(X) must have a positive length. Going to debug ```{r} miRanComp_custom <- function(DiffExpmRNA, miranda_data, mirna_pair) { # Filter miranda_data for the specific miRNA pair miranda_subset <- miranda_data[miranda_data$miRNA %in% mirna_pair, ] # Get the genes targeted by either miRNA in the pair targeted_genes <- unique(miranda_subset$gene) # Subset DiffExpmRNA to include only the targeted genes DiffExpmRNASub <- DiffExpmRNA[rownames(DiffExpmRNA) %in% targeted_genes, , drop = FALSE] print(paste("Number of genes targeted by", paste(mirna_pair, collapse = " or "), ":", nrow(DiffExpmRNASub))) return(DiffExpmRNASub) } runAllMirnaModels_debug <- function(mirnas, DiffExpmRNA, DiffExpmiRNA, miranda_data, prob = 0.75, fdr_cutoff = 0.1, method = "fdr", cutoff = 0.05, all_coeff = FALSE, mode = NULL, family = glm_poisson(), scale = 1) { assertthat::assert_that(is.character(mirnas)) # make unique unique_mirnas <- unique(mirnas) print("Unique miRNAs:") print(unique_mirnas) # generate combinations of 2 miRNAs if (!is.null(mode) && mode == "multi") { unique_mirnas <- combn(unique_mirnas, 2, simplify = FALSE) print("miRNA combinations:") print(head(unique_mirnas)) } # run models count <- 0 ret <- lapply(unique_mirnas, function(mirna_pair) { count <<- count + 1 cat(sprintf("%d: %s\n", count, paste(mirna_pair, collapse = ", "))) # For Determining Prob it needs to percent valmMedia <- quantile(miranda_data$score, probs = prob)[1] print(paste("valmMedia:", valmMedia)) # MirandaParsingOfmyRNA DiffExpmRNASub <- miRanComp_custom(DiffExpmRNA, miranda_data, mirna_pair) print("Dimensions of DiffExpmRNASub:") print(dim(DiffExpmRNASub)) if (nrow(DiffExpmRNASub) == 0) { print("No genes targeted by this miRNA pair. Skipping...") return(NULL) } # combiner Combine <- combiner(DiffExpmRNASub, DiffExpmiRNA, mirna_pair) print("Dimensions of Combine:") print(dim(Combine)) # GeneVari geneVariant <- geneVari(Combine, mirna_pair) print("Number of gene variants:") print(length(geneVariant)) if (length(geneVariant) == 0) { print("No gene variants found. Skipping...") return(NULL) } MRun <- runModels(Combine, geneVariant, mirna_pair, mode = mode, family = family, scale = scale, cutoff = cutoff) print("MRun results:") print(str(MRun)) # Bonferroni sig Genes FDRModel <- fdrSig_fixed(MRun, value = fdr_cutoff, method = method) print("FDRModel results:") print(str(FDRModel)) if (length(FDRModel$sig_models) > 0) { # look for negative coeffs; "-1" to exclude intercept while checking...#If true all if false any if (all_coeff) { valid_models <- sapply(FDRModel$sig_models, function(m) all(modelCoefficients(m) < 0)) } else { valid_models <- sapply(FDRModel$sig_models, function(m) any(modelCoefficients(m) < 0)) } # subset fdrmodel's things FDRModel$sig_models <- FDRModel$sig_models[valid_models] FDRModel$sig_pvalues <- FDRModel$sig_pvalues[valid_models, , drop = FALSE] FDRModel$sig_p_adjusted <- FDRModel$sig_p_adjusted[valid_models, , drop = FALSE] FDRModel$sig_genes <- FDRModel$sig_genes[valid_models] } else { print("No significant models found") } # make siggenes data.frame SigFDRGenes <- data.frame(cbind(FDRModel$sig_pvalues, FDRModel$sig_p_adjusted)) names(SigFDRGenes) <- c("P Value", "FDR") return(list(SigFDRGenes = SigFDRGenes, FDRModel = FDRModel)) }) names(ret) <- sapply(unique_mirnas, paste, collapse = " and ") ret <- ret[!sapply(ret, is.null)] # Remove NULL results return(ret) } ``` Run debug code ```{r} debug_run <- runAllMirnaModels_debug( mirnas = rownames(miRNA_counts), DiffExpmRNA = filtered_gene_counts, DiffExpmiRNA = miRNA_counts, miranda_data = miranda_apul, prob = 0.75, cutoff = 0.05, fdr_cutoff = 0.1, method = "fdr", family = glm_multi(), scale = 2, mode = "multi" ) ``` ```{r} vMiRNA <- rownames(miRNA_counts) All_miRNAs_run <- runAllMirnaModels( mirnas = "apul-mir-100", DiffExpmRNA = filtered_gene_counts, DiffExpmiRNA = miRNA_counts, miranda_data = miranda_apul, prob = 0.75, cutoff = 0.05, fdr_cutoff = 0.1, method = "fdr", family = glm_gaussian(), scale = 2 ) ``` Run regression model ```{r} regression_results <- runModels(combined_data, gene_variant, miRNA_select, family = glm_gaussian(), scale = 100) test <- regression_results$ min(pval_multi) max(pval_multi) regression_results_p <- runModels(combined_data, gene_variant, miRNA_select, family = glm_poisson(), scale = 100) ``` ahhh so model looks at what interactions are significant...and then we run correlation analysis??? ```{r} library(mirTarRnaSeq) # Ensure your data is in the correct format gene_counts <- filtered_gene_counts mirna_counts <- miRNA_counts miranda_data <- miranda_apul # Function to run model for a single miRNA-mRNA pair run_single_model <- function(mRNA, miRNA, combined_data) { formula <- as.formula(paste(mRNA, "~", miRNA)) model_result <- tryCatch({ runModel(formula, combined_data, model = glm_poisson(), scale = 100) }, error = function(e) { message("Error in model for ", mRNA, " and ", miRNA, ": ", e$message) return(NULL) }) return(model_result) } # Get all possible miRNA-mRNA pairs from miranda_data miRNA_mRNA_pairs <- unique(miranda_data[, c("miRNA", "gene")]) # Combine gene and miRNA data all_data <- combiner(gene_counts, mirna_counts, rownames(mirna_counts)) # Run models for all pairs results <- lapply(1:nrow(miRNA_mRNA_pairs), function(i) { mRNA <- miRNA_mRNA_pairs$gene[i] miRNA <- miRNA_mRNA_pairs$miRNA[i] if (mRNA %in% colnames(all_data) && miRNA %in% colnames(all_data)) { model <- run_single_model(mRNA, miRNA, all_data) if (!is.null(model)) { return(data.frame( mRNA = mRNA, miRNA = miRNA, coefficient = coef(model)[2], p_value = summary(model)$coefficients[2, 4], AIC = AIC(model) )) } } return(NULL) }) # Combine results and remove NULL entries results_df <- do.call(rbind, results[!sapply(results, is.null)]) # Add FDR-adjusted p-values results_df$adj_p_value <- p.adjust(results_df$p_value, method = "fdr") # Sort by adjusted p-value results_df <- results_df[order(results_df$adj_p_value), ] # Print the top results print(head(results_df, 20)) # Save results to a CSV file #write.csv(results_df, "univariate_results.csv", row.names = FALSE) # Plot top significant results library(ggplot2) top_results <- head(results_df, 50) # Adjust the number as needed ggplot(top_results, aes(x = reorder(paste(miRNA, mRNA, sep = " - "), adj_p_value), y = -log10(adj_p_value))) + geom_bar(stat = "identity") + coord_flip() + labs(title = "Top Significant miRNA-mRNA Interactions", x = "miRNA - mRNA Pair", y = "-log10(Adjusted P-value)") + theme_minimal() ``` ```{r} print(dim(filtered_gene_counts)) print(dim(miRNA_counts)) print(dim(miranda_apul)) print(head(rownames(miRNA_counts))) print(head(colnames(filtered_gene_counts))) print(head(miranda_apul$miRNA)) print(head(miranda_apul$gene)) sum(is.na(filtered_gene_counts)) sum(is.infinite(as.matrix(filtered_gene_counts))) sum(is.na(miRNA_counts)) sum(is.infinite(as.matrix(miRNA_counts))) print(nrow(miranda_apul)) # Select a subset of miRNAs and mRNAs subset_mirnas <- head(rownames(miRNA_counts), 10) subset_mrnas <- head(rownames(filtered_gene_counts), 100) # Filter your data subset_miRNA_counts <- miRNA_counts[subset_mirnas, ] subset_gene_counts <- filtered_gene_counts[subset_mrnas, ] subset_miranda <- miranda_apul[miranda_apul$miRNA %in% subset_mirnas & miranda_apul$gene %in% subset_mrnas, ] # Run the model on the subset subset_run <- runAllMirnaModels( mirnas = rownames(subset_miRNA_counts), DiffExpmRNA = subset_gene_counts, DiffExpmiRNA = subset_miRNA_counts, miranda_data = subset_miranda, prob = 0.75, cutoff = 0.05, fdr_cutoff = 0.1, method = "fdr", family = glm_multi(), scale = 2, mode = "multi" ) ``` ```{r} All_miRNAs_run <- runAllMirnaModels( mirnas = rownames(miRNA_counts), # Use all miRNAs DiffExpmRNA = filtered_gene_counts, # Your mRNA expression data DiffExpmiRNA = miRNA_counts, # Your miRNA expression data miranda_data = miranda_apul, # Your miRanda predictions prob = 0.75, # Probability threshold for filtering cutoff = 0.05, # P-value cutoff fdr_cutoff = 0.1, # FDR cutoff method = "fdr", # Multiple testing correction method family = glm_multi(), # Use multiple model types scale = 2, # Scaling factor mode = "multi" # Multivariate mode ) ``` ```{r} multivariate_results <- runModels(combined_data, gene_variant, miRNA_select, family = glm_multi(models=list(glm_gaussian(), glm_poisson())), mode = "multi", scale = 10) interaction_results <- runModels(combined_data, gene_variant, miRNA_select, family = glm_multi(models=list(glm_gaussian(), glm_poisson())), mode = "inter", scale = 10) ``` ```{r} # Extract coefficients from multivariate models coef_matrix <- do.call(rbind, lapply(names(multivariate_results$all_models), function(gene) { model <- multivariate_results$all_models[[gene]] coef(model)[-1] # Remove intercept })) rownames(coef_matrix) <- names(multivariate_results$all_models) # Create heatmap pheatmap(coef_matrix, main = "miRNA-mRNA Multivariate Coefficients", color = colorRampPalette(c("blue", "white", "red"))(100), cluster_rows = F, cluster_cols = F) ``` Look at results ```{r} # View model summaries regression_results$all_models # Plot results plot(regression_results$all_models$FUN_035039) aic_values <- do.call(rbind.data.frame, regression_results$AICvalues, regression_results_p$AICvalues) ggplot(data = melt(aic_values), aes(x = value)) + geom_density(adjust = 1.5, alpha = 0.3) + ggtitle("AIC Distribution for miRNA-mRNA Regressions") + xlab("AIC Value") + ylab("Density") ``` Having trouble getting models to run... ```{r} # Step 1: Filter miranda_apul to only include miRNA and gene names that are present in miRNA_counts and filtered_gene_counts filtered_pairs <- miranda_apul %>% filter(gene %in% rownames(filtered_gene_counts) & miRNA %in% rownames(miRNA_counts)) # Step 2: Initialize an empty data frame to store correlation results correlation_results <- data.frame(miRNA = character(), gene = character(), correlation = numeric(), p_value = numeric(), stringsAsFactors = FALSE) # Step 3: Loop through each miRNA-mRNA pair in filtered_pairs for (i in 1:nrow(filtered_pairs)) { # Extract the miRNA and gene names for the current pair miRNA <- filtered_pairs$miRNA[i] gene <- filtered_pairs$gene[i] # Get the expression values for the miRNA and gene miRNA_values <- as.numeric(miRNA_counts[miRNA, ]) gene_values <- as.numeric(filtered_gene_counts[gene, ]) # Calculate Pearson correlation between miRNA and gene expression values cor_test <- cor.test(miRNA_values, gene_values, method = "pearson") # Store the results correlation_results <- rbind(correlation_results, data.frame(miRNA = miRNA, gene = gene, correlation = cor_test$estimate, p_value = cor_test$p.value)) } # View the results head(correlation_results) ``` Run one to one gaussian regression model ```{r} j <- runModel(`FUN_038206` ~ `apul-mir-100`, Combine, model = glm_poisson(), scale = 100) # Obtain pvalues for terms in model formula print(modelTermPvalues(j)) modelModelName(j) ``` Run multiple models at once ```{r} #miRNA_select<-c("ebv-mir-BART9-5p","ebv-mir-BART6-3p") #Combine <- combiner(DiffExp, miRNAExp, miRNA_select) #geneVariant <- geneVari(Combine, miRNA_select) MultiModel <- runModels(Combine, geneVariant, miRNA_select, family = glm_multi(), mode="multi", scale = 10) # Print the name of the models used for the analysis (note the printed outputs are the number of models ran by various models based on the AIC scores using the glm_multi()) print(table(unlist(lapply(MultiModel$all_models, modelModelName)))) ``` Run gaussian model over all potential interactions with miRNA of interest ```{r} blaGaus <- runModels(Combine, geneVariant, miRNA_select, family = glm_gaussian(), scale = 100) # Plot the regression relationship for the mRNA and miRNAs (BHLF1 is the EBV mRNA). Note, these are standard quality check plot outputs from plot regression. par(oma=c(2,2,2,2)) par(mfrow=c(2,3),mar=c(4,3,3,2)) plot(blaGaus[["all_models"]][["BALF2"]]) plot(modelData(blaGaus[["all_models"]][["BALF2"]])) # To comprehend the performance of all models we look at Akaike information criterion (AIC) across all miRNA-mRNA model performances and then look at the density plots. Note, the models with lower comparitive AICs have better performace. G <- do.call(rbind.data.frame, blaGaus[["AICvalues"]]) names(G) <- c("AIC_G") # Low values seems like a reasonable model plot(density(G$AIC_G)) # Print out the AIC of all miRNA-mRNA models ( All observed AIC values for the # miRNA-mRNA models) GM <- melt(G) ``` Get correlation ```{r} corr_test <- corMirnaRna(filtered_gene_counts, miRNA_counts) ``` ```{r} corr_test <- corMirnaRna(gene_counts, miRNA_counts) newcorr <- corMirnaRnaMiranda(DiffExpmRNASub, miRNAExp, -0.1, miRanda) mirRnaHeatmap(newcorr) ``` Select only genes in ```{r} ``` Using the test data as an initial pass to understand Part 1 of the package. Part 1 has a lot of different options one can explore. For the deep dive data, I think the univariate model or the synergistic model will be the best for our data. The univariate model can look at the relationshop of all miRNAs in regards to all mRNAs of interest (one to one relationship). I will use the `miRNA` df as miRNA counts input and `mRNA` df as mRNA counts input; this data is a part of the package. I will also use `miRanda` for interaction data. Looking at the manual pages and source [code](https://rdrr.io/github/Mercedeh66/mirTarRnaSeq/man/), prior to combining the files with the function `combiner`, they need to be transformed using `TZtrans`. Transform data ```{r} miRNA_transform <- tzTrans(miRNA) mRNA_transform <- tzTrans(mRNA) ``` Select only mRNAs that were identified as targets ```{r} mRNA_transform_sub <- miRanComp(mRNA_transform, miRanda) ``` Before combining, the `combiner` function requires an miRNA vector of miRNAs of interest. The manual says: "A vector of character's for miRNAs which the user is interested in investigating if glm is use 1 miRNA should be input. If multivariate several miRNAs should be imported, same goes for interaction determination for miRNAs. Note we do not recommend more than 3-4 miRNAs at a time for the latter cases." Select specific miRNA. Not sure why this needs to be done... ```{r} miRNA_select<-rownames(miRNA) ``` Combine miRNA and mRNA files; make list of files to be used in model ```{r} combine <- combiner(mRNASub, miRNA, miRNA_select) gene_var <- geneVari(combine, miRNA_select) ``` Run Gausian model ```{r} test <- runModels(combine, geneVariant, miRNA_select, family = glm_gaussian(), scale = 100) par(oma=c(2,2,2,2)) par(mfrow=c(2,3),mar=c(4,3,3,2)) plot(test[["all_models"]][["BHLF1"]]) plot(modelData(test[["all_models"]][["BHLF1"]])) #To test AIC model performance G <- do.call(rbind.data.frame, test[["AICvalues"]]) names(G) <- c("AIC_G") #Low values seems like a reasonable model plot(density(G$AIC_G)) ``` Run all combos ```{r} vmiRNA <- rownames(miRNA) all_miRNA_models <- runAllMirnaModels(mirnas = vmiRNA[1:6], DiffExpmRNA = mRNA, DiffExpmiRNA = miRNA, miranda_data = miRanda, prob = 0.75, cutoff = 0.05, fdr_cutoff = 0.1, method = "fdr", family = glm_multi(), scale = 2, mode = "multi") hasgenes <- lapply(all_miRNA_models, function(x) nrow(x$SigFDRGenes)) > 0 all_miRNA_models <- all_miRNA_models[hasgenes] ``` Look at data ```{r} all_miRNA_models$`ebv-mir-bart1-3p and ebv-mir-bart1-5p` ``` Need to play with test data more Playing w data November 3-4 24 # Part1 - miRNA mRNA regressions across sample cohorts The users can utilize TPM/RPKM or count data for this section of analysis as long as if the data is normalized (TPM/RPKM) is used for the mRNA expression files the miRNA expression files are also normalized (TPM). The format of the files for miRNA and mRNA needs to be odd the class dataframe with the sample names as colnames and mRNA names as rownames. Note, for this section of analysis we have only included all differentially expressed mRNAs for the analysis but the user can included all mRNAs. However, the user should realize if they choose to use all mRNAs expressed the analysis will take a longer time. Load data ```{r} # Helper function to access test files in extdata get_test_file <- function(file_name) { if (missing(file_name)) { return(system.file("extdata", "test", package="mirTarRnaSeq")) } return(system.file("extdata", "test", file_name, package="mirTarRnaSeq")) } # Read input files DiffExp<-read.table(get_test_file("EBV_mRNA.txt.gz"), as.is=TRUE, header=TRUE, row.names=1) miRNAExp<-read.table(get_test_file("EBV_miRNA.txt.gz"), as.is=TRUE, header=TRUE, row.names=1) ``` `DiffExp` represents mRNA expression. Each row represents a gene and each column represents a sample. I'm not sure if these counts are already normalized. `miRNAExp` represents miRNA expression. Each row repsents a miRNA and each column represents a sample. Not sure if these are normalized but they are higher than the mRNA expression. The two dfs also do not have the same number of columns. Ensure both dfs have same number of samples and that sample columns are in the same order. ```{r} # Reorder columns in miRNAExp to match the column order in DiffExp miRNAExp <- miRNAExp[, colnames(DiffExp)] # Check if columns are now ordered the same all(colnames(miRNAExp) == colnames(DiffExp)) # should be true ``` Load miranda. Note, for other organisms (not provided by the pacakged) the appropriate format for miRanda input file is V1 column: name of the miRNA, V2 column: name of the predicted gene, V3: column score/rank of the miranda interaction (or any other score for interaction if TargetScan ( for TargetScan prediction user can provide context++ score and threshold accordingly) is used, note these three columns are required), V4 column: folding energy of miRNA-mRNA interaction. V5 column: target Identity value and V6 column: miRNA idnetity value. (Note, all these values are available after miRanda analysis but V1-V3 must be provided not matter the prediction method) ```{r} miRanda <- getInputSpecies("Epstein_Barr", threshold = 140) # Select only DiffExpmRNASub <- miRanComp(DiffExp, miRanda) ``` `miRanComp` only keeps the mRNAs that are present in the expression file. Select miRNA ```{r} miRNA_select<-c("ebv-mir-BART9-5p") ``` Combine miRNA and mRNA file and define boundaries ```{r} # Make intersection df for mRNA and miRNA(s) of interest to be tested Combine <- combiner(DiffExp, miRNAExp, miRNA_select) # The rows are the sample, and the columns are the mRNAs of interest and the miRNA(s) of interest # Define boundaries of mRNAs vs miRNAs - essentially, just looking at the mRNAs targeted by the miRNA of interest geneVariant <- geneVari(Combine, miRNA_select) ``` Running various univariate models(1 to 1 miRNA-mRNA relationships) with various miRNA-mRNA distribution assumptions for either 1 miRNA and 1 mRNA relationship chosen/selected by the user or across all miRNAs and mRNAs in the expression dataframes. Note,the users can choose to run any of the available distribution model assumptions (glm_poisson, glm_gaussian, glm_nb (negative binomial), glm_zeroinfl (zero inflated model or zero inflated negative binomial)). First, run a one to one miRNA/mRNA gaussian regression model (univariate model for 1 miRNA and 1 mRNA). BALF2 is EBV mRNA, the ebv-mir-BART9-5p is the miRNA and we are running a glm poisson model. Run one to one gaussian regression model ```{r} j <- runModel(`BALF2` ~ `ebv-mir-BART9-5p`, Combine, model = glm_poisson(), scale = 100) # Obtain pvalues for terms in model formula print(modelTermPvalues(j)) modelModelName(j) ``` So what does this mean? There is a correlation between BALF2 and ebv-mir-BART9-5p because there is a significant pvalue? I think I am interested in running the multi mode, where the best fit model for every miRNA-mRNA relationship is selected through AIC comparisons for model selection. After determining the appropriate model, statistical predictions can be made and significant miRNA-mRNA relationships/correlations can be identified. ```{r} newcorr <- corMirnaRnaMiranda(DiffExpmRNASub, miRNAExp, -0.1, miRanda) mirRnaHeatmap(newcorr) # my data corr_test_me <- corMirnaRnaMiranda(filtered_gene_counts, miRNA_counts, -0.1, miranda_apul) ``` so confused...why does test code work but my data does not work???? Run gaussian model over all potential interactions with miRNA of interest ```{r} blaGaus <- runModels(Combine, geneVariant, miRNA_select, family = glm_gaussian(), scale = 100) # Plot the regression relationship for the mRNA and miRNAs (BHLF1 is the EBV mRNA). Note, these are standard quality check plot outputs from plot regression. par(oma=c(2,2,2,2)) par(mfrow=c(2,3),mar=c(4,3,3,2)) plot(blaGaus[["all_models"]][["BALF2"]]) plot(modelData(blaGaus[["all_models"]][["BALF2"]])) # To comprehend the performance of all models we look at Akaike information criterion (AIC) across all miRNA-mRNA model performances and then look at the density plots. Note, the models with lower comparitive AICs have better performace. G <- do.call(rbind.data.frame, blaGaus[["AICvalues"]]) names(G) <- c("AIC_G") # Low values seems like a reasonable model plot(density(G$AIC_G)) # Print out the AIC of all miRNA-mRNA models ( All observed AIC values for the # miRNA-mRNA models) GM <- melt(G) ``` Run poisson model ```{r} blaPois <- runModels(Combine, geneVariant, miRNA_select, family = glm_poisson(), scale = 100) par(oma=c(2,2,2,2)) par(mfrow=c(2,3),mar=c(4,3,3,2)) plot(blaPois[["all_models"]][["LMP-2A"]]) plot(modelData(blaPois[["all_models"]][["LMP-2A"]])) P <- do.call(rbind.data.frame, blaPois[["AICvalues"]]) names(P) <- c("AIC_Po") PM <- melt(P) ``` Run NB model ```{r} blaNB <- runModels(Combine, geneVariant, miRNA_select, family = glm_nb(), scale = 100) par(mar=c(4,3,3,2)) plot(modelData(blaNB[["all_models"]][["BALF1"]])) B <- do.call(rbind.data.frame, blaNB[["AICvalues"]]) names(B) <- c("AIC_NB") BM <- melt(B) ``` Run zero inflated NB model ```{r} blazeroinflNB <- runModels(Combine, geneVariant, miRNA_select, family = glm_zeroinfl(dist = "negbin"), scale = 100) # To test AIC model performance ZNB <- do.call(rbind.data.frame, blazeroinflNB[["AICvalues"]]) names(ZNB) <- c("AIC_ZNB") par(mar=c(4,3,3,2)) plot(density(ZNB$AIC_ZNB)) ZNBM<-melt(ZNB) ``` Run zero inflated poisson binomial model ```{r} blazeroinfl <- runModels(Combine, geneVariant, miRNA_select, family = glm_zeroinfl(), scale = 100) # To test AIC model performance Zp <- do.call(rbind.data.frame, blazeroinfl[["AICvalues"]]) names(Zp) <- c("AIC_Zp") par(mar=c(4,3,3,2)) plot(density(Zp$AIC_Zp)) ZpM <- melt(Zp) ``` Figure out which model to use ```{r} bindM <- rbind(PM, BM, GM, ZpM, ZNBM) p2 <- ggplot(data = bindM, aes(x = value, group = variable, fill = variable)) + geom_density(adjust = 1.5, alpha = .3) + xlim(-400, 2000)+ ggtitle("Plot of of AIC for ebv-mir-BART9-5p regressed all mRNAs ")+ ylab("Density")+ xlab ("AIC Value") p2 ``` The user can decide to use runModels() with glm_multi() (with multi and inter mode options). When using glm_multi(), where all available models will be run, the AICs will be compared and the best model will be chosen based on the miRNA-mRNA model AIC score. In the example bellow we are using the mode= "multi" option for combination of 2 miRNAs (multivariate model) for interaction model the user can choose the mode= "inter" option. Note all_coeff parameters defaults TRUE all interactions (only negative miRNA-mRNA relationships) are reported. More comments on the mode is provided in the next section of the vignette. If all miRNA-mRNA relationships are wanted this parameter can be set to false. Run multiple models at once ```{r} miRNA_select<-c("ebv-mir-BART9-5p","ebv-mir-BART6-3p") Combine <- combiner(DiffExp, miRNAExp, miRNA_select) geneVariant <- geneVari(Combine, miRNA_select) MultiModel <- runModels(Combine, geneVariant, miRNA_select, family = glm_multi(), mode="multi", scale = 10) # Print the name of the models used for the analysis (note the printed outputs are the number of models ran by various models based on the AIC scores using the glm_multi()) print(table(unlist(lapply(MultiModel$all_models, modelModelName)))) ``` Run multiple models again? ```{r} miRNA_select<-c("ebv-mir-BART9-5p","ebv-mir-BART6-3p") Combine <- combiner(DiffExp, miRNAExp, miRNA_select) geneVariant <- geneVari(Combine, miRNA_select) InterModel <- runModels(Combine, geneVariant, miRNA_select, family = glm_multi( models=list(glm_gaussian, glm_poisson())),mode="inter", scale = 10) # Print the name of the models used for the analysis (note, you can see although # we have defined for the models to be run using either gaussian or poisson # options, mirTarRnaSeq chooses the poisson model for all as it performs with a # better AIC (lower AIC value) for all miRNA-mRNA interactions) print(table(unlist(lapply(InterModel$all_models, modelModelName)))) ``` Run all combos ```{r} vMiRNA<-rownames(miRNAExp) # Note, the user can run all miRNAs but for speed reasons we have chosen the first 5 here for mirnas input for the analysis. All_miRNAs_run<-runAllMirnaModels(mirnas =vMiRNA[1:5] , DiffExpmRNA = DiffExpmRNASub, DiffExpmiRNA = miRNAExp, miranda_data = miRanda,prob=0.75, cutoff=0.05,fdr_cutoff = 0.1, method = "fdr", family = glm_multi(), scale = 2, mode="multi") #select significant genes hasgenes <- lapply(All_miRNAs_run, function(x) nrow(x$SigFDRGenes)) > 0 All_miRNAs_run <- All_miRNAs_run[hasgenes] print(table(unlist(lapply (All_miRNAs_run[[1]][["FDRModel"]][["all_models"]], modelModelName)))) # Print specific models for specific miRNAs (in this example the significant multivariate model for ebv-mir-BART1-5p and ebv-mir-BART11-3p ) print( table( unlist(lapply( (All_miRNAs_run[["ebv-mir-BART1-5p and ebv-mir-BART11-3p"]] [["FDRModel"]] [["all_models"]]), modelModelName)))) ``` ```{r} # Make miRanda file compatible with SPONGE package mir_predicted_targets() sponge_mir_input<-miranda_sponge_predict(miRNAExp,DiffExp,miRanda) #Perform sparse partial correlation for miRNA-mRNA relationships one2many_output<-one2manySponge(miRNAExp,DiffExp,sponge_mir_input) ```