---
title: "lncRNA Comparison"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`" 
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true

---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
library(formattable)
library(Biostrings)
library(spaa)
library(tm)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

## grabbing 3 fastas..

```{r, engine='bash'}
cd ../data

curl -O https://raw.githubusercontent.com/zbengt/coral-lncRNA/main/ouput/apul_bedtools_lncRNAs.fasta
```

```{r, engine='bash'}
cd ../data

curl -O https://raw.githubusercontent.com/zbengt/coral-lncRNA/main/ouput/peve_bedtools_lncRNAs.fasta
```

```{r, engine='bash'}
cd ../data

curl -O https://raw.githubusercontent.com/zbengt/coral-lncRNA/main/ouput/pmea_bedtools_lncRNAs.fasta
```


## File format

```{r, engine='bash', eval=TRUE}

head ../data/*fasta

```
## length distribution

```{r, eval=TRUE}
# Read FASTA file
fasta_file <- "../data/apul_bedtools_lncRNAs.fasta"  # Replace with the name of your FASTA file
sequences <- readDNAStringSet(fasta_file)

# Calculate sequence lengths
sequence_lengths <- width(sequences)

# Create a data frame
sequence_lengths_df <- data.frame(Length = sequence_lengths)

# Plot histogram using ggplot2
ggplot(sequence_lengths_df, aes(x = Length)) +
  geom_histogram(binwidth = 1, color = "grey", fill = "blue", alpha = 0.75) +
  labs(title = "Histogram of Sequence Lengths",
       x = "Sequence Length",
       y = "Frequency") +
  theme_minimal()
```

```{r, eval=TRUE}
# Read FASTA file
fasta_file <- "../data/peve_bedtools_lncRNAs.fasta"  # Replace with the name of your FASTA file
sequences <- readDNAStringSet(fasta_file)

# Calculate sequence lengths
sequence_lengths <- width(sequences)

# Create a data frame
sequence_lengths_df <- data.frame(Length = sequence_lengths)

# Plot histogram using ggplot2
ggplot(sequence_lengths_df, aes(x = Length)) +
  geom_histogram(binwidth = 1, color = "grey", fill = "blue", alpha = 0.75) +
  labs(title = "Histogram of Sequence Lengths",
       x = "Sequence Length",
       y = "Frequency") +
  theme_minimal()
```


```{r, eval=TRUE}
# Read FASTA file
fasta_file <- "../data/pmea_bedtools_lncRNAs.fasta"  # Replace with the name of your FASTA file
sequences <- readDNAStringSet(fasta_file)

# Calculate sequence lengths
sequence_lengths <- width(sequences)

# Create a data frame
sequence_lengths_df <- data.frame(Length = sequence_lengths)

# Plot histogram using ggplot2
ggplot(sequence_lengths_df, aes(x = Length)) +
  geom_histogram(binwidth = 1, color = "grey", fill = "blue", alpha = 0.75) +
  labs(title = "Histogram of Sequence Lengths",
       x = "Sequence Length",
       y = "Frequency") +
  theme_minimal()
```


## counts

```{r, engine='bash', eval=TRUE}
fgrep ">" -c ../data/*fasta
```

## Peve Count matrix

### avg expression distribution 

https://raw.githubusercontent.com/zbengt/coral-lncRNA/main/ouput/peve_lncRNA.isoform.counts.matrix


```{r, engine='bash'}
cd ../data

curl -O https://raw.githubusercontent.com/zbengt/coral-lncRNA/main/ouput/peve_lncRNA.isoform.counts.matrix
```


```{r, eval=TRUE}
pevect <- read.csv("../data/peve_lncRNA.isoform.counts.matrix", sep = '\t')
```

```{r, eval=TRUE}
pevect %>%
  rowwise() %>%
  mutate(avg = mean(c_across(2:5))) %>%
  ggplot(aes(x = avg)) +
  geom_histogram(bins = 100, fill = "blue", color = "white", alpha = 0.7) +
  xlim(0, 100) +
  labs(title = "Histogram of Average Column",
       x = "Average Expression Value",
       y = "Frequency") +
  theme_minimal()
```


## blast comparison


```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/apul_bedtools_lncRNAs.fasta \
-dbtype nucl \
-out ../data/blast/apul_bedtools_lncRNAs
```
```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/peve_bedtools_lncRNAs.fasta \
-dbtype nucl \
-out ../data/blast/peve_bedtools_lncRNAs
```

```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/pmea_bedtools_lncRNAs.fasta \
-dbtype nucl \
-out ../data/blast/pmea_bedtools_lncRNAs
```


apul_bedtools_lncRNAs
peve_bedtools_lncRNAs
pmea_bedtools_lncRNAs


```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ../data/apul_bedtools_lncRNAs.fasta \
-db ../data/blast/peve_bedtools_lncRNAs \
-out ../output/apul_peve_blastn.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ../output/apul_peve_blastn.tab
```

```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ../data/apul_bedtools_lncRNAs.fasta \
-db ../data/blast/pmea_bedtools_lncRNAs \
-out ../output/apul_pmea_blastn.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ../output/apul_pmea_blastn.tab
```

```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ../data/peve_bedtools_lncRNAs.fasta \
-db ../data/blast/pmea_bedtools_lncRNAs \
-out ../output/peve_pmea_blastn.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ../output/peve_pmea_blastn.tab
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/*tab
```