--- title: "28-Apul CpG Motifs" author: "Steven Roberts" date: "`r format(Sys.Date(), '%d %B, %Y')`" output: github_document: toc: true toc_depth: 3 number_sections: true html_preview: false editor_options: markdown: wrap: sentence --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(Biostrings) library(tm) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center", # Align plots to the center comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` Annotating CpGs .... methylation calls derived at https://sr320.github.io/tumbling-oysters/posts/37-Apul-meth/ Now there are files for each of the Apul samples @ https://gannet.fish.washington.edu/seashell/bu-github/timeseries_molecular/D-Apul/output/15.5-Apul-bismark/ Maybe do all CpG in genome.. # genome ```{r, engine='bash', eval=TRUE} head ../data/Apulcra-genome.fa ``` ```{r} library(seqinr) # Replace 'input.fasta' with the name of your multi-sequence fasta file input_file <- "../data/Apulcra-genome.fa" sequences <- read.fasta(input_file) ``` ```{r} # Set the seed for reproducibility (optional) set.seed(42) number_of_sequences_to_select <- 10 if (length(sequences) < number_of_sequences_to_select) { warning("There are fewer than 10 sequences in the fasta file. All sequences will be selected.") number_of_sequences_to_select <- length(sequences) } selected_indices <- sample(length(sequences), number_of_sequences_to_select) selected_sequences <- sequences[selected_indices] ``` ```{r} # Replace 'output.fasta' with your desired output file name output_file <- "../output/28-Apul-CpG-Annotation/output.fasta" write.fasta(selected_sequences, names(selected_sequences), output_file, open = "w") ``` ```{bash} #likely will not need; fix issue where gff and fa name did not match # sed -i 's/>lcl|/>/g' ../output/10_seqs.fa ``` ```{bash} #needed downstream for IGV /home/shared/samtools-1.12/samtools faidx \ ../output/28-Apul-CpG-Annotation/output.fasta ``` ```{bash} fuzznuc -sequence ../output/28-Apul-CpG-Annotation/output.fasta -pattern CG -rformat gff -outfile ../output/28-Apul-CpG-Annotation/CGoutput-10seq.gff ``` ```{bash} tail ../output/28-Apul-CpG-Annotation/CGoutput-10seq.gff ``` # Full run ```{r, engine='bash'} fuzznuc -sequence ../data/Apulcra-genome.fa -pattern CG -rformat gff -outfile ../output/28-Apul-CpG-Annotation/Apul-CG-motifs.gff ``` ```{r, engine='bash'} head ../output/28-Apul-CpG-Annotation/Apul-CG-motifs.gff ``` # Intersectbed ```{r, engine='bash'} bedtools intersect \ -a ../output/28-Apul-CpG-Annotation/Apul-CG-motifs.gff \ -b ../data/Apulchra-genome.gff \ -wb \ > ../output/28-Apul-CpG-Annotation/intersect_both.gff ``` ```{r, engine='bash'} head ../output/28-Apul-CpG-Annotation/intersect_both.gff ``` ```{r, engine='bash', eval=TRUE} tail -50 ../data/Apulchra-genome.gff ``` ```{r, engine='bash'} awk -F'\t' ' BEGIN { header = "##gff-version 3" } $0 ~ /^#/ { next } { outfile = "../output/28-Apul-CpG-Annotation/" $3 ".gff" if (!(outfile in written)) { print header > outfile written[outfile] = 1 } print >> outfile }' ../data/Apulchra-genome.gff ``` ```{r, engine='bash', eval=TRUE} ls ../output/28-Apul-CpG-Annotation/*gff ``` ```{r, engine='bash'} bedtools intersect \ -a ../output/28-Apul-CpG-Annotation/Apul-CG-motifs.gff \ -b ../output/28-Apul-CpG-Annotation/mRNA.gff \ -wb \ > ../output/28-Apul-CpG-Annotation/CG_intersect_mRNA.gff ``` ```{r, engine='bash', eval=TRUE} head ../output/28-Apul-CpG-Annotation/CG_intersect_mRNA.gff ``` ```{r, engine='bash', eval=TRUE} grep "ptg000015l_9999753" ../output/28-Apul-CpG-Annotation/CG_intersect_mRNA.gff ```