--- title: "12-Apul genome-explore" author: "Steven Roberts" date: "`r format(Sys.time(), '%d %B, %Y')`" output: github_document: toc: true toc_depth: 3 number_sections: true html_preview: true html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DT) library(Biostrings) library(tm) knitr::opts_chunk$set( echo = FALSE, # Display code chunks eval = TRUE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center", # Align plots to the center comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` # Genome Analysis ```{r, engine='bash'} # Set the fasta file variable fasta_file="../data/Apulcra-genome.fa" # Check if input file exists if [ ! -f "$fasta_file" ]; then echo "File not found: $fasta_file" exit 1 fi # Number of sequences num_seqs=$(grep -c '^>' "$fasta_file") echo "Number of sequences: $num_seqs" # Get sequence lengths awk ' /^>/ { if (seqlen) { print seqlen; seqlen=0 } next } { seqlen += length($0) } END { if (seqlen) print seqlen } ' "$fasta_file" > seq_lengths.txt # Total length total_length=$(awk '{sum+=$1} END{print sum}' seq_lengths.txt) printf "Total length: %'d\n" "$total_length" # Longest and shortest sequence using a direct approach with proper handling of zero-length sequences read longest shortest < <(awk 'NR==1 {max=$1; min=$1} {if ($1>max) max=$1; if ($10) min=$1} END {print max, min}' seq_lengths.txt) printf "Longest sequence length: %'d\n" "$longest" printf "Shortest sequence length: %'d\n" "$shortest" # N50 and L50 calculations sort -nr seq_lengths.txt > seq_lengths_sorted.txt n50=$(awk -v total_length="$total_length" ' BEGIN { half_total = total_length / 2; sum = 0 } { sum += $1 if (sum >= half_total) { print $1 exit } } ' seq_lengths_sorted.txt) printf "N50: %'d\n" "$n50" l50=$(awk -v total_length="$total_length" ' BEGIN { half_total = total_length / 2; sum = 0; count = 0 } { sum += $1 count++ if (sum >= half_total) { print count exit } } ' seq_lengths_sorted.txt) echo "L50: $l50" # GC content and base counts awk ' /^>/ { next } { seq = toupper($0) g += gsub(/G/, "", seq) c += gsub(/C/, "", seq) a += gsub(/A/, "", seq) t += gsub(/T/, "", seq) n += gsub(/N/, "", seq) } END { total = a + c + g + t + n gc = g + c printf "Total bases: %'\''d\n", total printf "A: %'\''d\n", a printf "C: %'\''d\n", c printf "G: %'\''d\n", g printf "T: %'\''d\n", t printf "N: %'\''d\n", n printf "GC Content: %.2f%%\n", (gc / total) * 100 } ' "$fasta_file" # Cleanup temporary files rm seq_lengths.txt seq_lengths_sorted.txt ``` # GFF Stats ```{r, engine='bash'} # Define the path to the GFF file gff_file="../data/Apulcra-genome.gff" # Check if the file exists if [ ! -f "$gff_file" ]; then echo "File does not exist: $gff_file" exit 1 fi echo "Processing file: $gff_file" # Total number of entries total_entries=$(grep -vc '^#' $gff_file) echo "Total entries: $total_entries" # Number of unique features unique_features=$(cut -f3 $gff_file | grep -v '^#' | sort | uniq | wc -l) echo "Unique features: $unique_features" # Number of entries per feature type echo "Entries per feature type:" cut -f3 $gff_file | grep -v '^#' | sort | uniq -c | sort -nr # Number of unique sources and list each with counts echo "Unique sources and their counts:" unique_sources=$(cut -f2 $gff_file | grep -v '^#' | sort | uniq -c | sort -nr) echo "$unique_sources" ``` # Genes ```{r, engine='bash', eval=FALSE} curl -o "../data/Apul-genes.fasta" https://gannet.fish.washington.edu/seashell/bu-github/mechanism_to_model/data/Apul-genes.fasta ``` ```{r, engine='bash'} # Define the path to the FASTA file fasta_file="../data/Apul-genes.fasta" # Check if the file exists if [ ! -f "$fasta_file" ]; then echo "File does not exist: $fasta_file" exit 1 fi echo "Processing file: $fasta_file" # Total number of sequences total_sequences=$(grep -c '^>' $fasta_file) echo "Total sequences: $total_sequences" # Average, minimum, and maximum sequence length awk '/^>/ {if (seqname) {print seqlen; total += seqlen; if (seqlen < min || min == 0) min = seqlen; if (seqlen > max) max = seqlen;} seqname = $0; seqlen = 0; next;} {seqlen += length($0)} END {print seqlen; total += seqlen; if (seqlen < min || min == 0) min = seqlen; if (seqlen > max) max = seqlen; print total/NR, min, max}' $fasta_file | awk 'BEGIN {min = 0; max = 0; total = 0; count = 0} {total += $1; count++; if (NR == 1 || $1 < min) min = $1; if ($1 > max) max = $1} END {print "Average sequence length: " total/count; print "Shortest sequence length: " min; print "Longest sequence length: " max;}' # Total number of bases total_bases=$(awk '/^>/ {next} {total += length($0)} END {print total}' $fasta_file) echo "Total bases (nucleotides/amino acids): $total_bases" ```