---
title: "18.1_Peve-piRNA-PPmeter"
author: "Javier Rodriguez Casariego"
date: '`r Sys.Date()`'
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, cache = TRUE)
knitr::opts_knit$set(root.dir = '/Users/jarcasariego/Dropbox/Other_projects/E5_work/deep-dive/F-Pmea/')
```

PING PONG SIGNATURES FOR ALL SPECIES WAS COMPLETED IN: deep-dive/D-Apul/code/18.1_Apul-piRNA-PPmeter

```{r}
#Ping-pong analysis using the output of Ping-Pong Meter

#set prefix
my.outprefix <- paste(Sys.Date(),"ppr_Pmea_analysis",sep="_")

#load list of file names
L = list.files(path = "output/18-Pmea-piRNA/1_PPmeter/ppr_Pmea/", pattern=".txt", recursive = T)
names <- gsub("_ppr.txt", "", L)

#this function keeps only the instances of 10 bp overlaps per million read pairs
DFs = lapply(L, function(x) {
  data <- read.table(paste0("output/18-Pmea-piRNA/1_PPmeter/ppr_Pmea/", x), skip = 2, nrows = 100)
  data <- data[,c(1,33)]
  colnames(data) <- c("rep", "score")
  score <- median(data$score)
  return(score)
})

#run for loop to get median ppm per replicate
med_ppm <- data.frame()
for(i in 1:length(L)) {
  total <- cbind(names[i], DFs[[i]])
  med_ppm <- rbind(med_ppm, total)
}
colnames(med_ppm) <- c("group", "score")
med_ppm$score <- as.numeric(med_ppm$score)

#divide ppr-mbr score by 10e5 for plotting
med_ppm$score <- med_ppm$score/100000


#my.boxplot.out <- paste0("output/18-Pmea-piRNA//1_PPmeter/", my.outprefix,"_PP_Percentages.pdf")
#pdf(my.boxplot.out, height = 5, width = 5)
boxplot(med_ppm$score,
        xlab = "Median x10e5 ppr-mbr (100 bootstraps)",
        horizontal = TRUE)
#dev.off()
```
Next, I need to create a matrix of the lenghts of the pairs of sequenced with a ping-pong signal. There might be a way of extracting this information from the PPmeter output, but I cannot find it and no one has it public. Hence I will mcgiver a way that has hogh potential to be wrong, but let's see.

First, I'm mapping all _preproc reads to the AMIL genome with bowtie and creating bed files with bedops showing the location of each mapped read. 

```{r, engine='bash'}

bowtie-build /scratch/jeirinlo/javirodr/sncRNA_E5_corals/genomes/PMEA/Pocillopora_meandrina_HIv1.assembly.fa PMEA_index

for f in $(find "." -name '*_preproc')
do
        f2="${f}";
        of="${f}_mapped";
        of_bam="${of}.bam"
        of_sorted="${of}_sorted"
        of_sorted_bam="${of_sorted}.bam"
        of_sorted_bambai="${of_sorted}.bam.bai"
        of_sorted_bai="${of_sorted}.bai"

        bowtie -v 3 -a -p 4 --best --strata -S PMEA_index $f2 $of
        samtools view -bS -o $of_bam $of
        rm $of
        samtools sort $of_bam -o $of_sorted_bam
        rm $of_bam
        samtools index $of_sorted_bam
        mv $of_sorted_bambai $of_sorted_bai

done

#Now we need to convert to bed files for subsequent processing steps

for f in $(find "." -name '*.bam')
        do
    f2=$(basename "${f}" | sed 's/\.bam/\.bed/g');

bamToBed -i $f > $f2

done


```

Now, create bed files for + reads starting at 5' (start) and extending 10 bases, and for - reads starting at 5' (end) and extending -10 bases.
 
 5' U----------
    A---------- 5'

something like this.

If I get complete overlap of these reads I got pingpong pairs. 
I will create a column with the original sizes to build the ping-pong read length matrix.

```{r, engine='bash'}

for f in *.bed
do
pos_bed=${f/_sorted.bed}_5p_10_positive.bed
neg_bed=${f/_sorted.bed}_5p_10_negative.bed
pos_bed_sorted=${pos_bed/.bed}_sorted.bed
neg_bed_sorted=${neg_bed/.bed}_sorted.bed
outfile=${f/_preproc_mapped_sorted.bed}_ping_pong_lengths.txt

awk '$6 == "+" {print $1"\t"$2"\t"$2+10"\t"$4"\t"$5"\t"$6"\t"$3-$2}' $f > $pos_bed
awk '$6 == "-" {print $1"\t"$3-10"\t"$3"\t"$4"\t"$5"\t"$6"\t"$3-$2}' $f > $neg_bed

sort -k1,1 -k2,2n $pos_bed > $pos_bed_sorted
sort -k1,1 -k2,2n $neg_bed > $neg_bed_sorted

bedtools intersect -a $pos_bed_sorted -b $neg_bed_sorted -wo -f 1  | awk 'BEGIN { OFS = "\t" } { print $1, $4, $7, $11, $14 }' > $outfile
done

# merge all outputs and only keep the lengths pairs

cat *_lengths.txt | awk '{print $3"\t"$5}' > Pmea_pingpong_matrix.txt

```

```{r}

pp_df <- read.table("output/18-Pmea-piRNA/1_PPmeter/Pmea_pingpong_matrix.txt", header = F)
colnames(pp_df) <- c("pos", "neg")

pp_matrix <- as_tibble(pp_df) %>% 
    arrange(pos, neg) %>% 
    group_by(pos, neg) %>% 
    summarise(num_pairs = n(), .groups = "drop") %>% 
    pivot_wider(names_from = neg, values_from = num_pairs) %>% 
    remove_rownames() %>%
    gather("neg", "count", -pos)

# plotting the heatmap
plt <- ggplot(pp_matrix,aes(neg, pos, fill=count)) + 
  geom_tile() + 
  scale_y_continuous(n.breaks = 7) +
  scale_fill_gradient(low = "white",
  high = "darkred",
  space = "Lab",
  na.value = "grey50",
  guide = "colourbar",
  aesthetics = "fill" ) + 
  theme_linedraw() +
  labs(fill = "number of read pairs") +
  theme(axis.title = element_blank(),
        legend.text = element_blank(),
        legend.title = element_text(angle = 90))


 plt

ggsave(filename = paste0("output/18-Pmea-piRNA/1_PPmeter/", my.outprefix,"piRNA_pp_pair_heat.png"), plt, width = 6, height = 5)
```
```{r}
library(Biostrings)

#function to process ping-pong reads FASTA file and extract all, 1U, and 10A read length
process_fasta <- function(fasta_file) {
    sequences <- readDNAStringSet(fasta_file, format = "fasta")
    sequences_with_1U <- sequences[substr(as.character(sequences), 1, 1) == "T"] 
    sequences_with_10A <- sequences[substr(as.character(sequences), 10, 10) == "A"]
    sequence_1U_lengths <- data.frame(length = Biostrings::width(sequences_with_1U)) %>%
        group_by(length) %>%
        summarise(freq = n()) %>%
        mutate(group = "1U")
    sequence_10A_lengths <- data.frame(length = Biostrings::width(sequences_with_10A)) %>%
        group_by(length) %>%
        summarise(freq = n()) %>%
        mutate(group = "10A")
    sequence_lengths <- data.frame(length = head(Biostrings::width(as.character(sequences)),-1)) %>%
        group_by(length) %>%
        summarise(freq = n()) %>%
        mutate(group = "all")
    lengths_df <- rbind(sequence_1U_lengths, sequence_10A_lengths, sequence_lengths)
    return(lengths_df)
}

# Define the function to plot the distribution
plot_length_distribution <- function(lengths) {
    # Create a data frame for plotting
    lengths_df <- data.frame(Length =sequence_lengths)
    
    # Plot the length distribution using ggplot2
    ggplot(lengths_df) +
        geom_bar(aes(x = length, y = )) +
        theme_minimal() +
        labs(title = "Read Length Distribution",
             x = "Read Length",
             y = "Frequency")
}

# Path to your FASTA file
fasta_file <- "output/18-Pmea-piRNA/0_piRNA_pipeline_proTRAC/PMEA_mapped_piRNA.fasta"

# Process the FASTA file to get sequence lengths
sequence_lengths <- process_fasta(fasta_file)

# Plot the lengths distribution
plot_length_distribution(sequence_lengths)
```