---
title: "04-Ptuh genome-explore"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"  
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---


```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
knitr::opts_chunk$set(
  echo = FALSE,         # Display code chunks
  eval = TRUE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```


# Genome Analysis

```{r, engine='bash', eval=FALSE}
curl -o "../data/Pocillopora_meandrina_HIv1.assembly.fasta" https://owl.fish.washington.edu/halfshell/genomic-databank/Pocillopora_meandrina_HIv1.assembly.fasta
```


```{r, engine='bash'}
# Set the fasta file variable
fasta_file="../data/Pocillopora_meandrina_HIv1.assembly.fasta"


# Check if input file exists
if [ ! -f "$fasta_file" ]; then
    echo "File not found: $fasta_file"
    exit 1
fi

# Number of sequences
num_seqs=$(grep -c '^>' "$fasta_file")
echo "Number of sequences: $num_seqs"

# Get sequence lengths
awk '
    /^>/ { if (seqlen) { print seqlen; seqlen=0 } next }
    { seqlen += length($0) }
    END { if (seqlen) print seqlen }
' "$fasta_file" > seq_lengths.txt

# Total length
total_length=$(awk '{sum+=$1} END{print sum}' seq_lengths.txt)
printf "Total length: %'d\n" "$total_length"

# Longest and shortest sequence using a direct approach with proper handling of zero-length sequences
read longest shortest < <(awk 'NR==1 {max=$1; min=$1} {if ($1>max) max=$1; if ($1<min && $1>0) min=$1} END {print max, min}' seq_lengths.txt)

printf "Longest sequence length: %'d\n" "$longest"
printf "Shortest sequence length: %'d\n" "$shortest"

# N50 and L50 calculations
sort -nr seq_lengths.txt > seq_lengths_sorted.txt

n50=$(awk -v total_length="$total_length" '
BEGIN { half_total = total_length / 2; sum = 0 }
{
    sum += $1
    if (sum >= half_total) {
        print $1
        exit
    }
}
' seq_lengths_sorted.txt)
printf "N50: %'d\n" "$n50"

l50=$(awk -v total_length="$total_length" '
BEGIN { half_total = total_length / 2; sum = 0; count = 0 }
{
    sum += $1
    count++
    if (sum >= half_total) {
        print count
        exit
    }
}
' seq_lengths_sorted.txt)
echo "L50: $l50"

# GC content and base counts
awk '
    /^>/ { next }
    {
        seq = toupper($0)
        g += gsub(/G/, "", seq)
        c += gsub(/C/, "", seq)
        a += gsub(/A/, "", seq)
        t += gsub(/T/, "", seq)
        n += gsub(/N/, "", seq)
    }
    END {
        total = a + c + g + t + n
        gc = g + c
        printf "Total bases: %'\''d\n", total
        printf "A: %'\''d\n", a
        printf "C: %'\''d\n", c
        printf "G: %'\''d\n", g
        printf "T: %'\''d\n", t
        printf "N: %'\''d\n", n
        printf "GC Content: %.2f%%\n", (gc / total) * 100
    }
' "$fasta_file"

# Cleanup temporary files
rm seq_lengths.txt seq_lengths_sorted.txt

```


# GFF Stats 


```{r, engine='bash', eval=FALSE}
curl -o "../data/Pocillopora_meandrina_HIv1.genes.gff3" https://owl.fish.washington.edu/halfshell/genomic-databank/Pocillopora_meandrina_HIv1.genes.gff3
```

```{r, engine='bash'}
# Define the path to the GFF file
gff_file="../data/Pocillopora_meandrina_HIv1.genes.gff3"


# Check if the file exists
if [ ! -f "$gff_file" ]; then
    echo "File does not exist: $gff_file"
    exit 1
fi

echo "Processing file: $gff_file"

# Total number of entries
total_entries=$(grep -vc '^#' $gff_file)
echo "Total entries: $total_entries"

# Number of unique features
unique_features=$(cut -f3 $gff_file | grep -v '^#' | sort | uniq | wc -l)
echo "Unique features: $unique_features"

# Number of entries per feature type
echo "Entries per feature type:"
cut -f3 $gff_file | grep -v '^#' | sort | uniq -c | sort -nr

# Number of unique sources and list each with counts
echo "Unique sources and their counts:"
unique_sources=$(cut -f2 $gff_file | grep -v '^#' | sort | uniq -c | sort -nr)
echo "$unique_sources"
```

# Genes


```{r, engine='bash', eval=FALSE}
curl -o "../data/Pocillopora_meandrina_HIv1.genes.cds.fna" https://gannet.fish.washington.edu/acropora/E5-deep-dive/Transcripts/Pocillopora_meandrina_HIv1.genes.cds.fna
```


```{r, engine='bash'}
# Define the path to the FASTA file
fasta_file="../data/Pocillopora_meandrina_HIv1.genes.cds.fna"

# Check if the file exists
if [ ! -f "$fasta_file" ]; then
    echo "File does not exist: $fasta_file"
    exit 1
fi

echo "Processing file: $fasta_file"

# Total number of sequences
total_sequences=$(grep -c '^>' $fasta_file)
echo "Total sequences: $total_sequences"

# Average, minimum, and maximum sequence length
awk '/^>/ {if (seqname) {print seqlen; total += seqlen; if (seqlen < min || min == 0) min = seqlen; if (seqlen > max) max = seqlen;} seqname = $0; seqlen = 0; next;} {seqlen += length($0)} END {print seqlen; total += seqlen; if (seqlen < min || min == 0) min = seqlen; if (seqlen > max) max = seqlen; print total/NR, min, max}' $fasta_file | awk 'BEGIN {min = 0; max = 0; total = 0; count = 0} {total += $1; count++; if (NR == 1 || $1 < min) min = $1; if ($1 > max) max = $1} END {print "Average sequence length: " total/count; print "Shortest sequence length: " min; print "Longest sequence length: " max;}'

# Total number of bases
total_bases=$(awk '/^>/ {next} {total += length($0)} END {print total}' $fasta_file)
echo "Total bases (nucleotides/amino acids): $total_bases"
```