Ptuh mRNA-miRNA interactions ================ Jill Ashey 2025-02-06 This script will investigate mRNA-miRNA interactions using miranda predictions for Ptuh. Read in mRNA data ``` r mRNA_counts <- read_csv("../output/06.2-Ptuh-Hisat/Ptuh-gene_count_matrix.csv") ``` ## Rows: 31840 Columns: 6 ## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: "," ## chr (1): gene_id ## dbl (5): RNA-POC-47, RNA-POC-48, RNA-POC-50, RNA-POC-53, RNA-POC-57 ## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message. ``` r mRNA_counts <- as.data.frame(mRNA_counts) rownames(mRNA_counts) <- mRNA_counts[,1] #set first column that contains gene names as rownames mRNA_counts <- mRNA_counts[,-1] # remove column w/ gene names # Remove any genes with 0 counts across samples mRNA_counts<-mRNA_counts %>% mutate(Total = rowSums(.[, 1:5]))%>% filter(!Total==0)%>% dplyr::select(!Total) ``` Read in miRNA data ``` r miRNA_counts <- read.delim("../output/03.1-Ptuh-sRNA-summary/Ptuh_miRNA_ShortStack_counts_formatted.txt") head(miRNA_counts) ``` ## sample47 sample48 sample50 sample53 sample57 ## Cluster_21 4846 2433 1968 1452 1397 ## Cluster_36 39 44 20 41 33 ## Cluster_360 142599 101636 141199 84760 52034 ## Cluster_390 741 614 748 876 353 ## Cluster_757 2201 960 1596 944 621 ## Cluster_925 27988 22742 20636 26150 8840 ``` r # Remove any miRNAs with 0 for all samples miRNA_counts <- miRNA_counts %>% mutate(Total = rowSums(.[, 1:5]))%>% filter(!Total==0)%>% dplyr::select(!Total) ``` Rename columns to match miRNA df ``` r mRNA_counts <- mRNA_counts %>% dplyr::rename("sample47" = "RNA-POC-47", "sample48" = "RNA-POC-48", "sample50" = "RNA-POC-50", "sample53" = "RNA-POC-53", "sample57" = "RNA-POC-57") ``` Read in miranda data ``` r miranda_ptuh <- read.delim("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/miranda_strict_all_1kb_parsed_ptuh_updated.txt", header = F) colnames(miranda_ptuh) <- c("miRNA", "mRNA", "score", "energy", "query_start_end", "subject_start_end", "total_bp_shared", "query_similar", "subject_similar") # Format miranda df miranda_ptuh$miRNA <- sub("^>", "", miranda_ptuh$miRNA) # Remove leading ">" miranda_ptuh$miRNA <- sub("\\..*", "", miranda_ptuh$miRNA) # Remove everything from the first period onwards miranda_ptuh$mRNA <- sub("::.*", "", miranda_ptuh$mRNA) # Remove everything from "::" onwards miranda_ptuh$mRNA <- sub("ID=", "", miranda_ptuh$mRNA) # Remove ID= length(unique(miranda_ptuh$miRNA)) ``` ## [1] 37 ``` r length(unique(miranda_ptuh$mRNA)) ``` ## [1] 3628 ``` r dim(miranda_ptuh) ``` ## [1] 4105 9 Summarize how many mRNAs each miRNA are predicted to interact with. Summarize how many miRNAs each mRNA are predicted to interact with. ``` r sum_mirna <- miranda_ptuh %>% group_by(miRNA) %>% summarise(mRNA_count = n_distinct(mRNA)) %>% arrange(desc(mRNA_count)) ggplot(sum_mirna %>% arrange(desc(mRNA_count)), aes(x = forcats::fct_reorder(miRNA, mRNA_count), y = mRNA_count)) + geom_col() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x = "miRNA", y = "mRNA Count", title = "miRNA Regulation of mRNAs") #+ ``` ![](11-Ptuh-mRNA-miRNA-interactions_files/figure-gfm/unnamed-chunk-5-1.png) ``` r #coord_flip() # Optional: horizontal bar plot for better readability ggsave("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miRNA_number_of_targets.png", last_plot(), width = 12, height = 10, dpi = 300) ggsave("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miRNA_number_of_targets.pdf", last_plot(), width = 12, height = 10, dpi = 300) # mRNA sum_mrna <- miranda_ptuh %>% group_by(mRNA) %>% summarise(miRNA_count = n_distinct(miRNA)) ``` Normalize counts ``` r # Function to normalize counts (simple RPM normalization) normalize_counts <- function(counts) { rpm <- t(t(counts) / colSums(counts)) * 1e6 return(rpm) } # Normalize miRNA and mRNA counts miRNA_norm <- normalize_counts(miRNA_counts) #miRNA_norm <- as.matrix(miRNA_counts_filt) mRNA_norm <- normalize_counts(mRNA_counts) #mRNA_norm <- as.matrix(mRNA_counts_filt) ``` In my [Apul mRNA-miRNA interactions code](https://github.com/urol-e5/deep-dive-expression/blob/main/D-Apul/code/09-Apul-mRNA-miRNA-interactions.Rmd), I calculated the distance correlation coefficient using the energy package. I am not going to do that now, but may come back to it later. Calculate PCC ``` r # Function to calculate PCC and p-value for a pair of vectors calc_pcc <- function(x, y) { result <- cor.test(x, y, method = "pearson") return(c(PCC = result$estimate, p_value = result$p.value)) } # Create a data frame of all miRNA-mRNA pairs pairs <- expand.grid(miRNA = rownames(miRNA_norm), mRNA = rownames(mRNA_norm)) # Calculate PCC and p-value for each pair pcc_results <- pairs %>% rowwise() %>% mutate( pcc_stats = list(calc_pcc(miRNA_norm[miRNA,], mRNA_norm[mRNA,])) ) %>% unnest_wider(pcc_stats) # Adjust p-values for FDR pcc_results <- pcc_results %>% mutate(adjusted_p_value = p.adjust(p_value, method = "fdr")) # Remove 'gene-' from mRNA names pcc_results$mRNA <- gsub("gene-", "", pcc_results$mRNA) # Save as csv write.csv(pcc_results, "../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-PCC_miRNA_mRNA.csv") ``` This df is too large to push to github, instead stored here: `https://gannet.fish.washington.edu/kdurkin1/ravenbackups/deep-dive-expression/F-Ptuh/output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-PCC_miRNA_mRNA.csv` Load back in for additional use ``` r pcc_results <- read.csv("https://gannet.fish.washington.edu/kdurkin1/ravenbackups/deep-dive-expression/F-Ptuh/output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-PCC_miRNA_mRNA.csv") ``` Merge with miranda data ``` r combined_data_pcc <- pcc_results %>% inner_join(miranda_ptuh, by = c("miRNA", "mRNA")) %>% na.omit() dim(combined_data_pcc) ``` ## [1] 3389 13 ``` r length(unique(combined_data_pcc$miRNA)) ``` ## [1] 37 ``` r length(unique(combined_data_pcc$mRNA)) ``` ## [1] 2989 ``` r # Save as csv write.csv(combined_data_pcc, "../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miranda_PCC_miRNA_mRNA.csv") ``` Plot number of positive and negative interactions for each miRNA ``` r # Summarize the data summary_data <- combined_data_pcc %>% group_by(miRNA) %>% summarise( positive_count = sum(PCC.cor > 0), negative_count = sum(PCC.cor < 0) ) %>% mutate(total_count = positive_count + negative_count) %>% arrange(desc(total_count)) sum(summary_data$positive_count) ``` ## [1] 1660 ``` r sum(summary_data$negative_count) ``` ## [1] 1729 ``` r # Assess how many miRNAs have more positive correlations vs more negative ones positive_dominant <- sum(summary_data$positive_count > summary_data$negative_count) negative_dominant <- sum(summary_data$negative_count > summary_data$positive_count) equal <- sum(summary_data$positive_count == summary_data$negative_count) cat("miRNAs with more positive correlations:", positive_dominant, "\n") ``` ## miRNAs with more positive correlations: 14 ``` r cat("miRNAs with more negative correlations:", negative_dominant, "\n") ``` ## miRNAs with more negative correlations: 21 ``` r cat("miRNAs with equal positive and negative correlations:", equal, "\n") ``` ## miRNAs with equal positive and negative correlations: 2 ``` r # Reshape the data for plotting plot_data <- summary_data %>% tidyr::pivot_longer(cols = c(positive_count, negative_count), names_to = "correlation_type", values_to = "count") # Create the stacked bar plot ggplot(plot_data, aes(x = reorder(miRNA, -total_count), y = count, fill = correlation_type)) + geom_bar(stat = "identity") + theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + labs(x = "miRNA", y = "Count", fill = "Correlation Type", title = "Positive and Negative PCC Correlations per miRNA") ``` ![](11-Ptuh-mRNA-miRNA-interactions_files/figure-gfm/unnamed-chunk-10-1.png) ``` r ggsave("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miRNA_positive_negative_correlations.png", last_plot(), width = 12, height = 10, dpi = 300) ggsave("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miRNA_positive_negative_correlations.pdf", last_plot(), width = 12, height = 10, dpi = 300) ``` Investigate data ``` r # How many p-values are < 0.05 or < 0.1? pvalue_summary_pcc <- combined_data_pcc %>% summarise( pvalue_0.05 = sum(p_value < 0.05), pvalue_0.1 = sum(p_value < 0.1), ) print(pvalue_summary_pcc) ``` ## pvalue_0.05 pvalue_0.1 ## 1 203 353 ``` r # How many pairs have a PCC correlation > |0.5|? corr_0.5 <- sum(abs(combined_data_pcc$PCC.cor) > 0.5) cat("\nPairs with Pearson Correlation > 0.5:", corr_0.5, "\n") ``` ## ## Pairs with Pearson Correlation > 0.5: 1301 ``` r # Are there any pairs that have a PCC correlation > |0.5| and a p-value < 0.05? pairs_of_interest_pcc <- combined_data_pcc %>% filter(abs(PCC.cor) > 0.5 & p_value < 0.05 ) cat("PCC correlation > |0.5| and a p-value < 0.05:", nrow(pairs_of_interest_pcc), "\n") ``` ## PCC correlation > |0.5| and a p-value < 0.05: 203 ``` r # How many unique miRNAs and mRNAs have a PCC correlation > |0.5| and a p-value < 0.05? length(unique(pairs_of_interest_pcc$miRNA)) ``` ## [1] 29 ``` r length(unique(pairs_of_interest_pcc$mRNA)) ``` ## [1] 191 ``` r # Save pairs of interest df as csv write.csv(pairs_of_interest_pcc, "../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miranda_PCC_sig_miRNA_mRNA.csv") ``` Read in data so that PCC does not have to be run again, as it is computationally intensive ``` r pairs_of_interest_pcc <- read.csv("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miranda_PCC_sig_miRNA_mRNA.csv") ``` Plot number of SIGNIFICANT positive and negative interactions for each miRNA ``` r # Summarize the data summary_data <- pairs_of_interest_pcc %>% group_by(miRNA) %>% summarise( positive_count = sum(PCC.cor > 0), negative_count = sum(PCC.cor < 0) ) %>% mutate(total_count = positive_count + negative_count) %>% arrange(desc(total_count)) sum(summary_data$positive_count) ``` ## [1] 97 ``` r sum(summary_data$negative_count) ``` ## [1] 106 ``` r # Assess how many miRNAs have more positive correlations vs more negative ones positive_dominant <- sum(summary_data$positive_count > summary_data$negative_count) negative_dominant <- sum(summary_data$negative_count > summary_data$positive_count) equal <- sum(summary_data$positive_count == summary_data$negative_count) cat("miRNAs with more positive correlations:", positive_dominant, "\n") ``` ## miRNAs with more positive correlations: 13 ``` r cat("miRNAs with more negative correlations:", negative_dominant, "\n") ``` ## miRNAs with more negative correlations: 11 ``` r cat("miRNAs with equal positive and negative correlations:", equal, "\n") ``` ## miRNAs with equal positive and negative correlations: 5 ``` r # Reshape the data for plotting plot_data <- summary_data %>% tidyr::pivot_longer(cols = c(positive_count, negative_count), names_to = "correlation_type", values_to = "count") # Create the stacked bar plot ggplot(plot_data, aes(x = reorder(miRNA, -total_count), y = count, fill = correlation_type)) + geom_bar(stat = "identity") + theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + labs(x = "miRNA", y = "Count", fill = "Correlation Type", title = "Significant Positive and Negative PCC Correlations per miRNA") ``` ![](11-Ptuh-mRNA-miRNA-interactions_files/figure-gfm/unnamed-chunk-13-1.png) ``` r ggsave("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miRNA_significant_positive_negative_correlations.png", last_plot(), width = 12, height = 10, dpi = 300) ggsave("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-miRNA_significant_positive_negative_correlations.pdf", last_plot(), width = 12, height = 10, dpi = 300) ``` Plot using igraph ``` r # Create the graph g <- graph_from_data_frame(pairs_of_interest_pcc, directed = FALSE) # Add edge attributes E(g)$weight <- abs(E(g)$PCC.cor) # Use absolute PCC for edge weight E(g)$color <- ifelse(E(g)$PCC.cor > 0, "blue", "red") # Blue for positive, red for negative correlations # Add node attributes V(g)$type <- ifelse(V(g)$name %in% pairs_of_interest_pcc$miRNA, "miRNA", "mRNA") # Convert to tbl_graph for ggraph g_tbl <- as_tbl_graph(g) # Create the plot p <- ggraph(g_tbl, layout = "fr") + geom_edge_link(aes(edge_width = weight, color = color), alpha = 0.6) + geom_node_point(aes(color = type), size = 5) + #geom_node_text(aes(label = name), repel = TRUE, size = 3) + scale_edge_width(range = c(0.5, 3)) + scale_color_manual(values = c("miRNA" = "lightblue", "mRNA" = "lightgreen", "Positive correlation" = "blue", "Negative correlation" = "red")) + theme_graph() + labs(title = "miRNA-mRNA Interaction Network", subtitle = "Edge width represents |PCC|, color represents correlation direction");p ``` ![](11-Ptuh-mRNA-miRNA-interactions_files/figure-gfm/unnamed-chunk-14-1.png) ``` r ggsave("../output/11-Ptuh-mRNA-miRNA-interactions/three_prime_interaction/Ptuh-significant_miRNA_mRNA_network.png", p, width = 12, height = 10, dpi = 300) ```