---
title: "65 - Exon Coverage"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`" 
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
---

```{r setup, include=FALSE}
library(tidyverse)
library(kableExtra)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
library(Biostrings)
library(methylKit)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = TRUE,     # Hide warnings
  message = TRUE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

Following Li et al. Will take bams and get some data..

Trimmed reads were mapped to the Aiptasia genome using HISAT2 v2.1.0, and mapping coverage per position was extracted using BEDTools v2.17.0. To assess spurious transcription levels, we deter- mined the coverage per exon normalized across all six replicates (assuming every replicate had a coverage of 1 million sequences in total) and then calculated the average coverage ratios of exons 2 to 6 versus exon 1 for every gene

Individual BAMS and their corresponding index files are in each individual sample subirectory:

https://gannet.fish.washington.edu/Atumefaciens/20230821-cvir-stringtie-GCF_002022765.2-isoforms/

A merged BAM of all samples is here (79GB):

https://gannet.fish.washington.edu/Atumefaciens/20230821-cvir-stringtie-GCF_002022765.2-isoforms/20230821_cvir_stringtie_GCF_002022765-sorted-bams-merged.bam

Merged BAM index file (23MB):

https://gannet.fish.washington.edu/Atumefaciens/20230821-cvir-stringtie-GCF_002022765.2-isoforms/20230821_cvir_stringtie_GCF_002022765-sorted-bams-merged.bam.bai

# Download BAMs

```{r, engine='bash'}
wget -r \
--no-directories --no-parent \
-P ../data/big/ \
-A "*sorted.bam" https://gannet.fish.washington.edu/Atumefaciens/20230821-cvir-stringtie-GCF_002022765.2-isoforms/
```

```{r, engine='bash'}
wget -r \
--no-directories --no-parent \
-P ../data/big/ \
-A "*sorted.bam.bai" https://gannet.fish.washington.edu/Atumefaciens/20230821-cvir-stringtie-GCF_002022765.2-isoforms/
```


# bedtools coverage

```{r, engine='bash'}
/home/shared/bedtools2/bin/bedtools coverage \
-a ../genome-features/GCF_002022765.2_C_virginica-3.0_genomic.gff \
-b ../data/big/S9M.sorted.bam \
> ../output/65-exon-coverage/S9gff_exon_coverage.txt
```

```{r, engine='bash', eval=TRUE}
head ../genome-features/GCF_002022765.2_C_virginica-3.0_genomic.gff
```

```{bash}
cd ../data

/home/shared/datasets download genome accession GCF_002022765.2 --include gff3,gtf
```


```{r, engine='bash', eval=TRUE}
head /home/shared/8TB_HDD_01/sr320/github/ceabigr/data/ncbi_dataset/data/GCF_002022765.2/genomic.gtf

```
```{bash}
cat ../data/ncbi_dataset/data/GCF_002022765.2/genomic.gtf | grep "Gnomon	exon" | head
```

```{bash}
cat ../data/ncbi_dataset/data/GCF_002022765.2/genomic.gtf | grep "Gnomon	exon" > ../genome-features/GCF_002022765.2_exon.gtf

```


```{r, engine='bash'}
cd    ../genome-features
curl -O https://eagle.fish.washington.edu/Cvirg_tracks/C_virginica-3.0_Gnomon_exon.bed
```


```{r, engine='bash', eval=TRUE}
head ../genome-features/C_virginica-3.0_Gnomon_exon.bed
```


Lets get bam that overlaps with exons

`bedtools intersect -a sorted.bam -b exons.gtf > exon_reads.bam`


`bedtools multicov -bams exon_reads.bam -bed exons.gtf > exon_expression_counts.txt`

```{r, engine='bash'}
/home/shared/bedtools2/bin/bedtools multicov \
-bams ../data/big/S77F.sorted.bam \
-bed ../genome-features/GCF_002022765.2_exon.gtf \
> ../output/65-exon-coverage/S77_exon_exp_counts.txt
```

```{r, engine='bash'}
tail -20 ../output/65-exon-coverage/S77_exon_exp_counts.txt
```


```{r, engine='bash'}
/home/shared/bedtools2/bin/bedtools coverage \
-a ../genome-features/GCF_902806645.1_cgigas_uk_roslin_v1_genomic-mito.gtf \
-b ../data/big/S9M.sorted.bam \
> ../output/65-exon-coverage/S9gtf_exon_coverage.txt
```

```{r, engine='bash', eval=TRUE}
head -1 ../output/65-exon-coverage/S9gtf_exon_coverage.txt
```

loops

bedtools intersect -a sorted.bam -b exons.gtf > exon_reads.bam

```{r, engine='bash'}
for bamfile in ../data/big/*sorted.bam; do
    # Calculate coverage using BEDTools
    echo ${bamfile%.bam}
done
```

```{r, engine='bash'}
for bamfile in ../data/big/*sorted.bam; do
    # Extract the base name of the file without the path and extension
    base_name=$(basename "$bamfile" .sorted.bam)

    # Define the output file name
    output_file="${base_name}_exon_reads.bam"

    # Calculate coverage using BEDTools
    /home/shared/bedtools2/bin/bedtools intersect -a "$bamfile" -b ../genome-features/GCF_002022765.2_exon.gtf > ../output/65-exon-coverage/$output_file
done
```

```{bash}
find ../data/big/ -name "*sorted.bam" | xargs -I {} -P 20 bash -c '
    bamfile="{}"
    base_name=$(basename "$bamfile" .sorted.bam)
    output_file="../output/65-exon-coverage/${base_name}_exon_reads.bam"
    /home/shared/bedtools2/bin/bedtools intersect -a "$bamfile" -b ../genome-features/GCF_002022765.2_exon.gtf > "$output_file"
'

```

```{bash}
for bamfile in ../output/65-exon-coverage/*.bam; do
    /home/shared/samtools-1.12/samtools index -@ 40 "$bamfile"
done
```


```{bash}
find ../output/65-exon-coverage/ -name "*exon_reads.bam" | xargs -I {} -P 40 bash -c '
    bamfile="{}"
    base_name=$(basename "$bamfile" .exon_reads.bam)
    output_file="../output/65-exon-coverage/${base_name}_multicov.txt"
    /home/shared/bedtools2/bin/bedtools multicov -bams "$bamfile" -bed ../genome-features/GCF_002022765.2_exon.gtf > "$output_file"
'

```


```{r, engine='bash'}
head ../output/65-exon-coverage/*extracted_data.txt | head -25

```

```{r}

S9data <- read.table("../output/65-exon-coverage/S9M_exon_reads.extracted_data.txt", header = TRUE)
```

```{r}

S9data <- read.table("../output/65-exon-coverage/S9M_exon_reads.extracted_data.txt", header = TRUE)

summarized_data <- S9data %>%
  group_by(GeneID, ExonNumber) %>%
  summarize(CoverageValue = mean(CoverageValue))

# Reshape the data using pivot_wider
reshaped_data <- summarized_data %>%
  pivot_wider(names_from = ExonNumber, values_from = CoverageValue, names_prefix = "Exon")

```

```{r}
library(tidyverse)

# Set the directory where your files are located
directory_path <- "../output/65-exon-coverage"

# List all files ending with 'extracted_data.txt'
files <- list.files(directory_path, pattern = "extracted_data\\.txt$", full.names = TRUE)

# Function to read, process, and write a single file
process_and_write_file <- function(file_path) {
  data <- read.table(file_path, header = TRUE)
  
  summarized_data <- data %>%
    group_by(GeneID, ExonNumber) %>%
    summarize(CoverageValue = mean(CoverageValue), .groups = 'drop')

  reshaped_data <- summarized_data %>%
    pivot_wider(names_from = ExonNumber, values_from = CoverageValue, names_prefix = "Exon")

  # Generate a new file name based on the original file name
  output_file_name <- gsub("extracted_data\\.txt$", "processed_data.csv", basename(file_path))
  output_file_path <- file.path(directory_path, output_file_name)

  # Write the reshaped data to a new file
  write_csv(reshaped_data, output_file_path)
}

# Apply the function to each file
lapply(files, process_and_write_file)

```


```{r, engine='bash'}
head ../output/65-exon-coverage/S31M*process*
```
```{bash}
head ../output/65-exon-coverage/S9M_exon_reads.processed_data.csv

```


```{r}
S9Mdata <- read.csv("../output/65-exon-coverage/S9M_exon_reads.processed_data.csv")
```


```{r}
test <- S9Mdata %>%
  dplyr::select(GeneID,Exon1,Exon2,Exon3,Exon4,Exon5,Exon6) %>%
  mutate(fold2 = log(Exon2 / Exon1)) %>%
  mutate(fold3 = log(Exon3 / Exon1)) %>%
  mutate(fold4 = log(Exon4 / Exon1)) %>%
  mutate(fold5 = log(Exon5 / Exon1)) %>%
  mutate(fold6 = log(Exon6 / Exon1))
```

```{r}
test02 <- S9Mdata %>%
  dplyr::select(GeneID,Exon1,Exon2,Exon3,Exon4,Exon5,Exon6) %>%
  na.omit() %>%
  mutate(fold2 = log(Exon2 / Exon1)) %>%
  mutate(fold3 = log(Exon3 / Exon1)) %>%
  mutate(fold4 = log(Exon4 / Exon1)) %>%
  mutate(fold5 = log(Exon5 / Exon1)) %>%
  mutate(fold6 = log(Exon6 / Exon1)) %>%
  dplyr::select(GeneID, fold2, fold3, fold4, fold5, fold6)
```

```{r}
head(test)
```
```{r}
# Remove NA values and exclude the specified column
cleaned_data <- test02

# Reshape the data to long format
long_data <- tidyr::pivot_longer(cleaned_data, cols = starts_with("fold"), names_to = "fold", values_to = "Values")

# Plotting a dot plot (scatter plot)
ggplot(long_data, aes(x = factor(fold), y = Values)) +
  geom_point(position = position_jitter(width = 0.3, height = 0), alpha = 0.15) +
  geom_smooth(method = "lm", se = FALSE) +  # Add linear regression lines +
  labs(x = "Exon", y = "Read Counts (log10)") +
  theme_minimal()


```


```{r}
long_df <- pivot_longer(test02, cols = starts_with("fold"), names_to = "fold", values_to = "value")

```


```{r}

ggplot(long_df, aes(x = fold, y = value, color = GeneID, group = GeneID)) + 
  geom_point() +  # Plot points
  geom_smooth(method = "lm", se = FALSE) +  # Add linear regression lines
  labs(title = "Gene Expression Over Different Folds", 
       x = "Fold", 
       y = "Expression Value")  +
  theme_minimal()

```


ADDING IN METHYLATION DATA TO DUmb


```{r}
data_long <- test %>%
  pivot_longer(cols = starts_with("fold"), names_to = "fold", values_to = "Value") %>%
  mutate(fold = as.factor(fold))  # Convert column to a factor 

```

```{r}
data_long$Value[data_long$Value == "-Inf" | data_long$Value == "Inf"] <- NA

```


```{r}
summary_data <- data_long %>%
  group_by(fold) %>%
  summarise(
    Mean = mean(Value, na.rm = TRUE),
    SE = sd(Value, na.rm = TRUE) / sqrt(sum(!is.na(Value)))
  )
```

```{r}
class(summary_data$Mean)
```


```{r}
ggplot(summary_data, aes(x = fold, y = Mean)) +
  geom_point() +
  geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.2) +
  labs(
    x = "Exon",
    y = "Mean Value",
    title = "Mean Values with Standard Error by fold"
  ) +
  theme_minimal()

```

By Exon expression 

```{r}
data_long_exon <- test %>%
  pivot_longer(cols = starts_with("Exon"), names_to = "Exon", values_to = "Value") %>%
  mutate(Exon = as.factor(Exon))  # Convert column to a factor 

```


```{r}
summary_data_exon <- data_long_exon %>%
  group_by(Exon) %>%
  summarise(
    Mean = mean(Value, na.rm = TRUE),
    SE = sd(Value, na.rm = TRUE) / sqrt(sum(!is.na(Value)))
  )
```


```{r}
ggplot(summary_data_exon, aes(x = Exon, y = Mean)) +
  geom_point() +
  geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.2) +
  labs(
    x = "Exon",
    y = "Mean Value",
    title = "Mean Values with Standard Error by fold"
  ) +
  theme_minimal()

```
```{r}
# Example data frame preparation
data <- data.frame(
  LOC_ID = rep(c("LOC111099029", "LOC111099030", "LOC111099031", "LOC111099032", "LOC111099033", "LOC111099034"), each = 4),
  Exon = rep(1:4, 6),
  Value = c(3, 18, 34, 22, 20, 43, 22, 11, 67, 9, 6.67, 9, 28, 213, NA, NA, 23, 38, 59, 44, 16, 14, 25, 113) # example values
)

# Install and load ggplot2

# Plot
ggplot(data, aes(x = Exon, y = Value)) + 
  geom_point() + 
  facet_wrap(~ LOC_ID) + 
  theme_minimal() + 
  labs(title = "Exon Data Plot", x = "Exon Number", y = "Value")

```

```{r}
# Example data frame preparation
data <- data.frame(
  LOC_ID = rep(c("LOC111099029", "LOC111099030", "LOC111099031", "LOC111099032", "LOC111099033", "LOC111099034"), each = 4),
  Exon = rep(1:4, 6),
  Value = c(3, 18, 34, 22, 20, 43, 22, 11, 67, 9, 6.67, 9, 28, 213, NA, NA, 23, 38, 59, 44, 16, 14, 25, 113) # example values
)


# Plot all in one graph, differentiating by color
ggplot(data, aes(x = Exon, y = Value, color = LOC_ID)) + 
  geom_point() + 
  theme_minimal() + 
  labs(title = "Exon Data Plot", x = "Exon Number", y = "Value", color = "LOC ID")

```