---
title: "Biomin blast"
author: "Jill Ashey"
date: "2025-11-14"
output: html_document
---

# Biomineralization protein BLAST 

Upload biomin fasta sequences to Unity (URI server) and check number of sequences.
```{bash}
grep -c ">" Biomineralization_Toolkit_FScucchia.fasta
```

172 biomineralization proteins sequences in the fasta. 

Blast the Styplophora pistillata against the proteins of our species. This was done on Unity (URI server). `nano biomin_blast.sh`
```{bash}
#!/usr/bin/env bash
#SBATCH --export=NONE
#SBATCH --nodes=1 --ntasks-per-node=1
#SBATCH --partition=uri-cpu
#SBATCH --no-requeue
#SBATCH --mem=100GB
#SBATCH -t 72:00:00
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH -o slurm-%j.out
#SBATCH -e slurm-%j.error
#SBATCH -D /work/pi_hputnam_uri_edu/jillashey/e5/biomin_blast

module load uri/main
module load BLAST+/2.15.0-gompi-2023a 

cd /work/pi_hputnam_uri_edu/jillashey/e5/biomin_blast

echo "BLAST protein seqs against biomin seqs" $(date)
echo "Apul BLAST" $(date)

makeblastdb \
  -in Apulchra-genome.pep.faa \
  -out Apul_prot \
  -dbtype prot
  
blastp \
  -query Biomineralization_Toolkit_FScucchia.fasta \
  -db Apul_prot \
  -out Apul_biomin_blast_results.tsv \
  -outfmt "6 qseqid sseqid evalue bitscore" \
  -evalue 1e-20 \
  -max_target_seqs 1 \
  -max_hsps 1

echo "Apul BLAST complete" $(date)
echo "Puta BLAST (using Pmea prot seqs)" $(date)

makeblastdb \
  -in Pocillopora_meandrina_HIv1.genes.pep.faa \
  -out Ptua_prot \
  -dbtype prot
  
blastp \
  -query Biomineralization_Toolkit_FScucchia.fasta \
  -db Ptua_prot \
  -out Ptua_biomin_blast_results.tsv \
  -outfmt "6 qseqid sseqid evalue bitscore" \
  -evalue 1e-20 \
  -max_target_seqs 1 \
  -max_hsps 1

echo "Ptua BLAST complete" $(date)
echo "Peve BLAST" $(date)

makeblastdb \
  -in Porites_evermanni_v1.annot.pep.fa \
  -out Peve_prot \
  -dbtype prot
  
blastp \
  -query Biomineralization_Toolkit_FScucchia.fasta \
  -db Peve_prot \
  -out Peve_biomin_blast_results.tsv \
  -outfmt "6 qseqid sseqid evalue bitscore" \
  -evalue 1e-20 \
  -max_target_seqs 1 \
  -max_hsps 1

echo "Peve BLAST complete" $(date)
```

Submitted batch job 48960546

```{r}
library(readr)
library(dplyr)
library(stringr)

# List the TSV files
tsv_files <- list.files("../output/31.1-biomin-blast", pattern = "\\.tsv$", full.names = TRUE)

# Specify col names 
my_colnames <- c("biomin_hit", "gene_id", "evalue", "score")

# Function to read in TSVs and add species id as a column
read_and_tag <- function(file) {
  species_id <- str_sub(basename(file), 1, 4)
  df <- read_tsv(file, col_names = my_colnames)
  df$species_id <- species_id
  return(df)
}

# Join blast results as one df
tsv_combined <- tsv_files %>% lapply(read_and_tag) %>% bind_rows()

# Read in ortholog csv 
mapping <- read.csv("../output/11.3-ortholog-annotation/gene_to_ortholog_mapping.csv")

# Join blast and ortholog dfs
joined <- left_join(tsv_combined, mapping, by = "gene_id")  

head(joined)
str(joined)
```

```{r}
library(readxl)
biomin_excel <- read_excel("../data/Biomineralization_Toolkit_FScucchia.xlsx")

# Join the joined df with the biomin excel annotations
joined <- left_join(joined, biomin_excel, by = c("biomin_hit" = "accessionnumber/geneID"))
head(joined)
str(joined)

# Write out csv
write_csv(joined, "../output/31.1-biomin-blast/biomin-blast-orthologs.csv")
```

Read in Apul counts
```{r}
apul_counts <- read.csv("../../D-Apul/output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv")
```

Subset Apul and remove -TXX from gene IDs 
```{r}
joined_apul <- joined %>%
  filter(species_id == "Apul")
joined_apul$gene_id <- sub("-.*", "", joined_apul$gene_id)
```

Merge counts and biomin info 
```{r}
joined_counts_apul <- left_join(joined_apul, apul_counts, by = "gene_id")

# Save as csv
write_csv(joined_counts_apul, "../output/31.1-biomin-blast/apul-biomin-blast-counts.csv")
```

```{r}
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)

# Reshape to long format, focusing on columns with TP time points
df_long <- joined_counts_apul %>%
  select(gene_id, starts_with("ACR")) %>%
  pivot_longer(
    cols = starts_with("ACR"),
    names_to = "sample_time",
    values_to = "count"
  ) %>%
  mutate(
    time = str_extract(sample_time, "TP[0-9]+"),
    time_numeric = as.numeric(str_remove(time, "TP"))
  )

# Calculate mean and standard error grouped by gene_id and time
summary_df <- df_long %>%
  group_by(gene_id, time_numeric) %>%
  summarise(
    mean_count = mean(count, na.rm = TRUE),
    se_count = sd(count, na.rm = TRUE) / sqrt(n()),
    .groups = "drop"
  )

# Plot with mean and SE error bars, faceted by gene_id
ggplot(summary_df, aes(x = time_numeric, y = mean_count)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = mean_count - se_count, ymax = mean_count + se_count), width = 0.1) +
  facet_wrap(~ gene_id, scales = "free_y") +
  labs(x = "Time Point (TP)", y = "Mean Gene Count", title = "Gene Counts Across Time by Gene") +
  theme_minimal()
```