---
title: "28-Apul CpG Motifs"
author: "Steven Roberts"
date: "`r format(Sys.Date(), '%d %B, %Y')`"
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: false
editor_options: 
  markdown: 
    wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(Biostrings)
library(tm)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```


Annotating CpGs .... 

methylation calls derived at 
https://sr320.github.io/tumbling-oysters/posts/37-Apul-meth/

Now there are files for each of the Apul samples 

@ https://gannet.fish.washington.edu/seashell/bu-github/timeseries_molecular/D-Apul/output/15.5-Apul-bismark/

Maybe do all CpG in genome..

# genome

```{r, engine='bash', eval=TRUE}
head ../data/Apulcra-genome.fa
```


```{r}
library(seqinr)

# Replace 'input.fasta' with the name of your multi-sequence fasta file
input_file <- "../data/Apulcra-genome.fa"
sequences <- read.fasta(input_file)

```


```{r}
# Set the seed for reproducibility (optional)
set.seed(42)

number_of_sequences_to_select <- 10

if (length(sequences) < number_of_sequences_to_select) {
  warning("There are fewer than 10 sequences in the fasta file. All sequences will be selected.")
  number_of_sequences_to_select <- length(sequences)
}

selected_indices <- sample(length(sequences), number_of_sequences_to_select)
selected_sequences <- sequences[selected_indices]

```


```{r}
# Replace 'output.fasta' with your desired output file name
output_file <- "../output/28-Apul-CpG-Annotation/output.fasta"
write.fasta(selected_sequences, names(selected_sequences), output_file, open = "w")
```


```{bash}
#likely will not need; fix issue where gff and fa name did not match
# sed -i 's/>lcl|/>/g' ../output/10_seqs.fa
```


```{bash}
#needed downstream for IGV
/home/shared/samtools-1.12/samtools faidx \
../output/28-Apul-CpG-Annotation/output.fasta
```


```{bash}
fuzznuc -sequence ../output/28-Apul-CpG-Annotation/output.fasta -pattern CG -rformat gff -outfile ../output/28-Apul-CpG-Annotation/CGoutput-10seq.gff
```


```{bash}
tail ../output/28-Apul-CpG-Annotation/CGoutput-10seq.gff
```

# Full run

```{r, engine='bash'}
fuzznuc -sequence ../data/Apulcra-genome.fa -pattern CG -rformat gff -outfile ../output/28-Apul-CpG-Annotation/Apul-CG-motifs.gff
```

```{r, engine='bash'}
head ../output/28-Apul-CpG-Annotation/Apul-CG-motifs.gff
```


# Intersectbed

```{r, engine='bash'}
bedtools intersect \
-a ../output/28-Apul-CpG-Annotation/Apul-CG-motifs.gff \
-b ../data/Apulchra-genome.gff \
-wb \
> ../output/28-Apul-CpG-Annotation/intersect_both.gff
```

```{r, engine='bash'}
head ../output/28-Apul-CpG-Annotation/intersect_both.gff
```

```{r, engine='bash', eval=TRUE}
tail -50 ../data/Apulchra-genome.gff
```


```{r, engine='bash'}
awk -F'\t' '
BEGIN {
  header = "##gff-version 3"
}
$0 ~ /^#/ { next }
{
  outfile = "../output/28-Apul-CpG-Annotation/" $3 ".gff"
  if (!(outfile in written)) {
    print header > outfile
    written[outfile] = 1
  }
  print >> outfile
}' ../data/Apulchra-genome.gff
```


```{r, engine='bash', eval=TRUE}
ls ../output/28-Apul-CpG-Annotation/*gff
```

```{r, engine='bash'}
bedtools intersect \
-a ../output/28-Apul-CpG-Annotation/Apul-CG-motifs.gff \
-b ../output/28-Apul-CpG-Annotation/mRNA.gff \
-wb \
> ../output/28-Apul-CpG-Annotation/CG_intersect_mRNA.gff
```


```{r, engine='bash', eval=TRUE}
head ../output/28-Apul-CpG-Annotation/CG_intersect_mRNA.gff
```


```{r, engine='bash', eval=TRUE}
grep "ptg000015l_9999753" ../output/28-Apul-CpG-Annotation/CG_intersect_mRNA.gff
```