---
title: "11-genome-explore"
format: html
editor: visual

---


```{bash}
./
```


```{r}
# Install Biostrings if not already installed
if (!requireNamespace("Biostrings", quietly = TRUE)) {
    install.packages("BiocManager")
    BiocManager::install("Biostrings")
}

# Load the necessary library
library(Biostrings)

# Function to calculate basic genome statistics
genome_stats <- function(fasta_file) {
    # Read the FASTA file
    sequences <- readDNAStringSet(fasta_file)
    
    # Number of sequences
    num_sequences <- length(sequences)
    
    # Total base pairs
    total_length <- sum(width(sequences))
    
    # GC content calculation
    gc_count <- sum(letterFrequency(sequences, letters = c("G", "C")))
    gc_content <- (gc_count / total_length) * 100
    
    # Average sequence length
    avg_length <- mean(width(sequences))
    
    # Print statistics
    cat("Number of sequences:", num_sequences, "\n")
    cat("Total base pairs:", total_length, "\n")
    cat("GC content: ", sprintf("%.2f", gc_content), "%\n", sep = "")
    cat("Average sequence length: ", sprintf("%.2f", avg_length), " bp\n", sep = "")
}

# Example usage
fasta_file <- "../data/Apulcra-genome.fa"  # Replace with your file path
genome_stats(fasta_file)

```


```{r}
# Install necessary Bioconductor packages if not already installed
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

# Install required packages if not already installed
if (!requireNamespace("Biostrings", quietly = TRUE)) {
    BiocManager::install("Biostrings")
}

if (!requireNamespace("GenomicFeatures", quietly = TRUE)) {
    BiocManager::install("GenomicFeatures")
}

# Load libraries
library(Biostrings)
library(GenomicFeatures)

# Function to calculate comprehensive genome statistics
genome_stats <- function(fasta_file, gff_file = NULL) {
    # Read the genome FASTA file
    sequences <- readDNAStringSet(fasta_file)
    
    # Calculate basic statistics
    num_sequences <- length(sequences)
    total_length <- sum(width(sequences))
    gc_count <- sum(letterFrequency(sequences, letters = c("G", "C")))
    gc_content <- (gc_count / total_length) * 100
    avg_length <- mean(width(sequences))
    
    # Print basic statistics
    cat("Number of sequences: ", num_sequences, "\n")
    cat("Total base pairs: ", total_length, "\n")
    cat("GC content: ", sprintf("%.2f", gc_content), "%\n", sep = "")
    cat("Average sequence length: ", sprintf("%.2f", avg_length), " bp\n", sep = "")
    
    # Check if a GFF/GTF file is provided for feature statistics
    if (!is.null(gff_file)) {
        # Load GFF/GTF file and create a TxDb object
        txdb <- makeTxDbFromGFF(gff_file, format = ifelse(grepl("gtf$", gff_file), "gtf", "gff"))

        # Get the number of features
        feature_summary <- list(
            Genes = length(genes(txdb)),
            Transcripts = length(transcripts(txdb)),
            Exons = length(exons(txdb)),
            CDS = length(cds(txdb))
        )
        
        cat("\nFeature summary:\n")
        print(feature_summary)
        
        # Additional details on features
        feature_details <- as.data.frame(genes(txdb))
        cat("\nFirst few gene features:\n")
        print(head(feature_details))
    } else {
        cat("No GFF/GTF file provided, skipping feature statistics.\n")
    }
}

# Example usage:
fasta_file <- "../data/Apulcra-genome.fa"  # Replace with your genome file path
gff_file <- "../data/Apulcra-genome.gff"  # Replace with your GFF/GTF file path (optional)
genome_stats(fasta_file, gff_file)

```