---
title: "11-genome-explore"
format: html
editor: visual
---
```{bash}
./
```
```{r}
# Install Biostrings if not already installed
if (!requireNamespace("Biostrings", quietly = TRUE)) {
install.packages("BiocManager")
BiocManager::install("Biostrings")
}
# Load the necessary library
library(Biostrings)
# Function to calculate basic genome statistics
genome_stats <- function(fasta_file) {
# Read the FASTA file
sequences <- readDNAStringSet(fasta_file)
# Number of sequences
num_sequences <- length(sequences)
# Total base pairs
total_length <- sum(width(sequences))
# GC content calculation
gc_count <- sum(letterFrequency(sequences, letters = c("G", "C")))
gc_content <- (gc_count / total_length) * 100
# Average sequence length
avg_length <- mean(width(sequences))
# Print statistics
cat("Number of sequences:", num_sequences, "\n")
cat("Total base pairs:", total_length, "\n")
cat("GC content: ", sprintf("%.2f", gc_content), "%\n", sep = "")
cat("Average sequence length: ", sprintf("%.2f", avg_length), " bp\n", sep = "")
}
# Example usage
fasta_file <- "../data/Apulcra-genome.fa" # Replace with your file path
genome_stats(fasta_file)
```
```{r}
# Install necessary Bioconductor packages if not already installed
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
# Install required packages if not already installed
if (!requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
if (!requireNamespace("GenomicFeatures", quietly = TRUE)) {
BiocManager::install("GenomicFeatures")
}
# Load libraries
library(Biostrings)
library(GenomicFeatures)
# Function to calculate comprehensive genome statistics
genome_stats <- function(fasta_file, gff_file = NULL) {
# Read the genome FASTA file
sequences <- readDNAStringSet(fasta_file)
# Calculate basic statistics
num_sequences <- length(sequences)
total_length <- sum(width(sequences))
gc_count <- sum(letterFrequency(sequences, letters = c("G", "C")))
gc_content <- (gc_count / total_length) * 100
avg_length <- mean(width(sequences))
# Print basic statistics
cat("Number of sequences: ", num_sequences, "\n")
cat("Total base pairs: ", total_length, "\n")
cat("GC content: ", sprintf("%.2f", gc_content), "%\n", sep = "")
cat("Average sequence length: ", sprintf("%.2f", avg_length), " bp\n", sep = "")
# Check if a GFF/GTF file is provided for feature statistics
if (!is.null(gff_file)) {
# Load GFF/GTF file and create a TxDb object
txdb <- makeTxDbFromGFF(gff_file, format = ifelse(grepl("gtf$", gff_file), "gtf", "gff"))
# Get the number of features
feature_summary <- list(
Genes = length(genes(txdb)),
Transcripts = length(transcripts(txdb)),
Exons = length(exons(txdb)),
CDS = length(cds(txdb))
)
cat("\nFeature summary:\n")
print(feature_summary)
# Additional details on features
feature_details <- as.data.frame(genes(txdb))
cat("\nFirst few gene features:\n")
print(head(feature_details))
} else {
cat("No GFF/GTF file provided, skipping feature statistics.\n")
}
}
# Example usage:
fasta_file <- "../data/Apulcra-genome.fa" # Replace with your genome file path
gff_file <- "../data/Apulcra-genome.gff" # Replace with your GFF/GTF file path (optional)
genome_stats(fasta_file, gff_file)
```