---
title: "02 Apul lncRNA distribution"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`" 
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true

---


```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
library(formattable)
library(Biostrings)
library(spaa)
library(tm)
# Global knitr chunk defaults; individual chunks override these in their headers.
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Do NOT evaluate chunks by default; chunks opt in with eval=TRUE
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)

```

Let's take the lncRNA FASTA file and see where in the genome these lncRNAs are located.


## lncRNA fasta 


```{r, engine='bash', eval=TRUE}
# Show the last lines of the lncRNA FASTA as a quick sanity check.
tail ../../DEF-cross-species/data/apul_bedtools_lncRNAs.fasta

# Count sequences: one header line (starting with ">") per record.
# Uses grep -c instead of fgrep, which GNU grep has deprecated and warns about.
grep -c '^>' ../../DEF-cross-species/data/apul_bedtools_lncRNAs.fasta
```


```{r, engine='bash', eval=TRUE}

# Preview the per-label header tally (first 10 lines).
# From each FASTA header, sed keeps the text between "::" and the following ":"
# (presumably the scaffold name -- confirm against the header format), then the
# labels are sorted and counted. awk's $1=$1 strips the left padding uniq -c adds.
grep '^>' ../../DEF-cross-species/data/apul_bedtools_lncRNAs.fasta \
  | sed -n 's/.*::\([^:]*\):.*/\1/p' \
  | sort \
  | uniq -c \
  | awk '{$1=$1; print}' \
  | head


```


```{r, engine='bash'}

# Write the full per-label header tally as "<count> <label>" lines.
# NOTE: this chunk does not set eval=TRUE, so under the global eval = FALSE
# default it is skipped on knit; run it manually to (re)generate the file that
# the following R chunk reads.
out_dir=../output/07-Apul-lncRNA-dist
mkdir -p "$out_dir"  # the redirect below fails if the directory is missing

# sed keeps the text between "::" and the following ":" in each header
# (presumably the scaffold name -- confirm against the header format);
# awk's $1=$1 strips the left padding that uniq -c adds to the counts.
grep '^>' ../../DEF-cross-species/data/apul_bedtools_lncRNAs.fasta \
  | sed -n 's/.*::\([^:]*\):.*/\1/p' \
  | sort \
  | uniq -c \
  | awk '{$1=$1; print}' \
  > "$out_dir/scaffold-count.txt"


```


```{r, eval=TRUE}
# Load the space-separated tally file (no header row): the first column is
# named "Value" and the second "Label".
scaffold_count_path <- "../output/07-Apul-lncRNA-dist/scaffold-count.txt"
data <- read.csv(
  file = scaffold_count_path,
  header = FALSE,
  sep = " ",
  col.names = c("Value", "Label")
)

```


```{r, eval=TRUE}
# Histogram of the Value column, binned in steps of 20.
value_hist <- ggplot(data = data, mapping = aes(x = Value)) +
  geom_histogram(binwidth = 20, fill = "blue", alpha = 0.7) +
  labs(
    title = "Histogram of Values",
    x = "Value",
    y = "Frequency"
  )
value_hist
```