---
title: "05-IsoSeq-transcriptome-check"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  github_document:
    toc: true
    toc_depth: 2
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
library(Biostrings)

# Document-wide chunk defaults. Individual chunks override these in their
# header (e.g. `eval=TRUE` on the chunk that reads the FASTA).
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Do NOT evaluate chunks by default; opt in per chunk
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

Let's take a look at the IsoSeq FASTA.

```{bash}
# Download the IsoSeq high-quality transcripts into ../data/.
# This chunk inherits eval = FALSE from the setup chunk, so it is not run
# when the document is knitted; run it manually once before knitting.
# `&&` skips the download if ../data/ cannot be entered, and --fail makes
# curl exit non-zero on an HTTP error instead of saving the error page
# under the FASTA file name.
cd ../data/ && curl --fail -O https://owl.fish.washington.edu/halfshell/genomic-databank/Mtros-hq_transcripts.fasta
```

```{r, eval=TRUE}
# Load the sequences from a fasta file
fasta_path <- "../data/Mtros-hq_transcripts.fasta" # Change to your FASTA file path
dna_sequences <- readDNAStringSet(fasta_path)

# Calculate the lengths of the sequences
sequence_lengths <- width(dna_sequences)

# Convert to a data frame for ggplot
df <- data.frame(Length = sequence_lengths)

# Plot the length distribution
ggplot(df, aes(x = Length)) +
  geom_histogram(color = "black", fill = "white", bins = 30) +
  theme_minimal() +
  labs(
    title = "Sequence Length Distribution",
    x = "Sequence Length",
    y = "Count"
  )

# Number of sequences shorter than 500 bp
short_sequences <- sum(sequence_lengths < 500)
print(short_sequences)

# Number of sequences shorter than 200 bp
really_short_sequences <- sum(sequence_lengths < 200)
print(really_short_sequences)
```