---
title: "17-Apul merge matrix"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`"  
output: 
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(methylKit)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```


Read in mRNA data 

```{r}
mRNA_counts <- read.csv(file = "../output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv", check.names = FALSE)
mRNA_counts <- as.data.frame(mRNA_counts)
rownames(mRNA_counts) <- mRNA_counts[,1] #set first column that contains gene names as rownames
mRNA_counts <- mRNA_counts[,-1] # remove column w/ gene names 

# Remove any genes with 0 counts across samples 
mRNA_counts<-mRNA_counts %>%
     mutate(Total = rowSums(.[, 1:4]))%>%
    filter(!Total==0)%>%
    dplyr::select(!Total)

```


```{r}
# Rename columns to include colony ID and timepoint
# read in metadata
metadata <- read.csv("../../M-multi-species/data/rna_metadata.csv") %>% filter(Species.Strain == "Acropora pulchra") %>% dplyr::select(AzentaSampleName, ColonyID, Timepoint) 


metadata$Sample <- paste(metadata$AzentaSampleName, metadata$ColonyID, metadata$Timepoint, sep="_")
# Rename columns
colnames(mRNA_counts) <- colnames(mRNA_counts) %>%
  replace(., . %in% metadata$AzentaSampleName, metadata$Sample[match(., metadata$AzentaSampleName)])

# format metadata for DESeq2
list<-colnames(mRNA_counts)
list<-as.factor(list)
metadata$Sample<-as.factor(metadata$Sample)
# Re-order the levels
metadata$Sample <- factor(as.character(metadata$Sample), levels=list)
# Re-order the data.frame
metadata_ordered <- metadata[order(metadata$Sample),]
metadata_ordered$Sample
rownames(metadata_ordered) <- metadata_ordered$Sample
```


Read in miRNA data

```{r}
# Note the miRNA counts I'm loading have already been normalized
# Were normalized with all sRNA counts using DESeq2 (same design as used above for mRNA)
miRNA_counts <- read.delim("../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_counts_miRNA_normalized.txt")
head(miRNA_counts)

# Remove any miRNAs with 0 for all samples 
miRNA_counts <- miRNA_counts %>%
     mutate(Total = rowSums(.[, 1:4]))%>%
    filter(!Total==0)%>%
    dplyr::select(!Total)

# Fix miRNa column names
colnames(miRNA_counts) <- gsub("X", "", colnames(miRNA_counts))
colnames(miRNA_counts) <- gsub("\\.", "-", colnames(miRNA_counts))
colnames(miRNA_counts) <- gsub("\\.", "_", colnames(miRNA_counts))
```


Filter reads by proportion in sample 
```{r}
ffun<-filterfun(pOverA(0.1,5))  #set up filtering parameters--must have at least 5 counts present in at least 4 samples

# miRNA
filt_outrm_poa <- genefilter((miRNA_counts), ffun) #apply filter
sum(filt_outrm_poa) #count number of genes left
miRNA_counts_filt <- miRNA_counts[filt_outrm_poa,] #keep only rows that passed filter

# mRNA
filt_outrm_poa <- genefilter((mRNA_counts), ffun) #apply filter
sum(filt_outrm_poa) #count number of genes left
mRNA_counts_filt <- mRNA_counts[filt_outrm_poa,] #keep only rows that passed filter
```


# lncRNA counts

```{r}
lncRNA_counts <- read.delim("../output/08-Apul-lncRNA/lncRNA_counts.txt") %>%
  dplyr::select(-c(2:6))

lncRNA_counts <- as.data.frame(lncRNA_counts)
rownames(lncRNA_counts) <- lncRNA_counts[,1] #set first column that contains gene names as rownames
lncRNA_counts <- lncRNA_counts[,-1] # remove column w/ gene names 


# Fix miRNa column names
colnames(lncRNA_counts) <- gsub("X", "", colnames(lncRNA_counts))
colnames(lncRNA_counts) <- gsub("\\.", "-", colnames(lncRNA_counts))
colnames(lncRNA_counts) <- gsub("\\.", "_", colnames(lncRNA_counts))


head(lncRNA_counts)

```


Filter reads by proportion in sample 
```{r}
ffun<-filterfun(pOverA(0.1,5))  #set up filtering parameters--must have at least 5 counts present in at least 4 samples

# miRNA
filt_outrm_poa <- genefilter((miRNA_counts), ffun) #apply filter
sum(filt_outrm_poa) #count number of genes left
miRNA_counts_filt <- miRNA_counts[filt_outrm_poa,] #keep only rows that passed filter

# mRNA
filt_outrm_poa <- genefilter((mRNA_counts), ffun) #apply filter
sum(filt_outrm_poa) #count number of genes left
mRNA_counts_filt <- mRNA_counts[filt_outrm_poa,] #keep only rows that passed filter


# lncRBA
filt_outrm_poa <- genefilter((lncRNA_counts), ffun) #apply filter
sum(filt_outrm_poa) #count number of genes left
lncRNA_counts_filt <- lncRNA_counts[filt_outrm_poa,] #keep only rows that passed filter
```


Normalize counts
```{r}
# Function to normalize counts (simple RPM normalization)
normalize_counts <- function(counts) {
  rpm <- t(t(counts) / colSums(counts)) * 1e6
  return(rpm)
}

# Normalize miRNA and mRNA counts
#miRNA_norm <- normalize_counts(miRNA_counts_filt)
miRNA_norm <- as.matrix(miRNA_counts_filt)

#mRNA_norm <- normalize_counts(mRNA_counts_filt)
mRNA_norm <- as.matrix(mRNA_counts_filt)

#lncRNA_norm <- normalize_counts(mRNA_counts_filt)
lncRNA_norm <- as.matrix(lncRNA_counts)
```


```{r}
# Number of samples
n_samples <- ncol(miRNA_counts)


# Compute correlation matrix
cor_matrix <- cor(t(miRNA_counts), t(lncRNA_counts), method = "pearson")

# Calculate t-statistics
t_stat <- cor_matrix * sqrt((n_samples - 2) / (1 - cor_matrix^2))

# Calculate p-values
p_values <- 2 * pt(-abs(t_stat), df = n_samples - 2)

# Filter significant correlations (e.g., p < 0.05 and |correlation| > 0.7)
cor_threshold <- 0.7
sig_indices <- which(p_values < 0.05 & abs(cor_matrix) > cor_threshold, arr.ind = TRUE)

# Extract significant miRNA-gene pairs
significant_with_pvalues <- data.frame(
  miRNA = rownames(cor_matrix)[sig_indices[, 1]],
  lncRNA = colnames(cor_matrix)[sig_indices[, 2]],
  Correlation = cor_matrix[sig_indices],
  P_value = p_values[sig_indices]
)

# View results
head(significant_with_pvalues)


```

```{r}
write.csv(significant_with_pvalues, "../output/17-Apul-merge-matrix/miRNA_lnc_correlations.csv", row.names = FALSE)
```


```{r}
write.csv(miRNA_counts_filt, "../output/17-Apul-merge-matrix/miRNA_counts.csv", row.names = TRUE)
```