--- title: "11-genome-explore" format: html editor: visual --- ```{bash} ./ ``` ```{r} # Install Biostrings if not already installed if (!requireNamespace("Biostrings", quietly = TRUE)) { install.packages("BiocManager") BiocManager::install("Biostrings") } # Load the necessary library library(Biostrings) # Function to calculate basic genome statistics genome_stats <- function(fasta_file) { # Read the FASTA file sequences <- readDNAStringSet(fasta_file) # Number of sequences num_sequences <- length(sequences) # Total base pairs total_length <- sum(width(sequences)) # GC content calculation gc_count <- sum(letterFrequency(sequences, letters = c("G", "C"))) gc_content <- (gc_count / total_length) * 100 # Average sequence length avg_length <- mean(width(sequences)) # Print statistics cat("Number of sequences:", num_sequences, "\n") cat("Total base pairs:", total_length, "\n") cat("GC content: ", sprintf("%.2f", gc_content), "%\n", sep = "") cat("Average sequence length: ", sprintf("%.2f", avg_length), " bp\n", sep = "") } # Example usage fasta_file <- "../data/Apulcra-genome.fa" # Replace with your file path genome_stats(fasta_file) ``` ```{r} # Install necessary Bioconductor packages if not already installed if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } # Install required packages if not already installed if (!requireNamespace("Biostrings", quietly = TRUE)) { BiocManager::install("Biostrings") } if (!requireNamespace("GenomicFeatures", quietly = TRUE)) { BiocManager::install("GenomicFeatures") } # Load libraries library(Biostrings) library(GenomicFeatures) # Function to calculate comprehensive genome statistics genome_stats <- function(fasta_file, gff_file = NULL) { # Read the genome FASTA file sequences <- readDNAStringSet(fasta_file) # Calculate basic statistics num_sequences <- length(sequences) total_length <- sum(width(sequences)) gc_count <- sum(letterFrequency(sequences, letters = c("G", "C"))) gc_content <- (gc_count / total_length) * 100 avg_length <- mean(width(sequences)) # Print basic statistics cat("Number of sequences: ", num_sequences, "\n") cat("Total base pairs: ", total_length, "\n") cat("GC content: ", sprintf("%.2f", gc_content), "%\n", sep = "") cat("Average sequence length: ", sprintf("%.2f", avg_length), " bp\n", sep = "") # Check if a GFF/GTF file is provided for feature statistics if (!is.null(gff_file)) { # Load GFF/GTF file and create a TxDb object txdb <- makeTxDbFromGFF(gff_file, format = ifelse(grepl("gtf$", gff_file), "gtf", "gff")) # Get the number of features feature_summary <- list( Genes = length(genes(txdb)), Transcripts = length(transcripts(txdb)), Exons = length(exons(txdb)), CDS = length(cds(txdb)) ) cat("\nFeature summary:\n") print(feature_summary) # Additional details on features feature_details <- as.data.frame(genes(txdb)) cat("\nFirst few gene features:\n") print(head(feature_details)) } else { cat("No GFF/GTF file provided, skipping feature statistics.\n") } } # Example usage: fasta_file <- "../data/Apulcra-genome.fa" # Replace with your genome file path gff_file <- "../data/Apulcra-genome.gff" # Replace with your GFF/GTF file path (optional) genome_stats(fasta_file, gff_file) ```