---
title: "04-Align-v1"
output: html_document
date: "2025-08-02"
---

```{bash}
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate

# Set number of threads
threads=42

# Set paths
ref_genome="../data/merged_out.fasta.TBtools.fa"
input_dir="../output/02-RNAseq"
output_dir="../output/04-align-v1"

# Loop over each FASTQ file
for fq in ${input_dir}/*.fastq.gz; do
  # Extract sample name (e.g., Pum_barcode02 from Pum_barcode02_combined.fastq.gz)
  sample=$(basename "$fq" .fastq.gz)

  echo "Processing $sample..."

  # Run minimap2 and samtools sort
  minimap2 -ax splice -k14 --MD -t $threads "$ref_genome" "$fq" | \
    samtools sort -@ $threads -o "${output_dir}/${sample}.sorted.bam"

  # Index the sorted BAM
  samtools index "${output_dir}/${sample}.sorted.bam"
done
```


# Bigwig


```
find . -name "*.bam" | parallel 'bedtools genomecov -split -bg -ibam {} > {.}.sp.bedGraph'
```

# make bigwig
```{bash}

cut -f1,2 ../data/merged_out.fasta.TBtools.fa.fai > ../output/04-align-v1/genome.chrom.sizes
```

#!/bin/bash

# Loop through all bedGraph files in the current directory
for file in *.bedGraph; do
  echo "Fixing sorting for $file"
  LC_COLLATE=C sort -k1,1 -k2,2n "$file" > "${file}.sorted"
  mv "${file}.sorted" "$file"
done


```{bash}
for bed in *.bedGraph; do
  base=$(basename "$bed" _combined.sorted.sp.bedGraph)
  bedGraphToBigWig "$bed" genome.chrom.sizes "${base}.bw"
done
```


#stringtie


```{bash}
mkdir -p ../output/04-align-v1/stringtie_out

for bam in ../output/04-align-v1/*combined.sorted.bam; do
    sample=$(basename "$bam" _combined.sorted.bam)
    /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie "$bam" \
        -L \
        -o ../output/04-align-v1/stringtie_out/${sample}.gtf \
        -p 32 \
        -v
done
```
```{bash}
ls ../output/04-align-v1/stringtie_out/*.gtf > ../output/04-align-v1/stringtie_out/mergelist.txt

/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie --merge \
    -L \
    -G ../data/GN.gene.gtf \
    -o ../output/04-align-v1/stringtie_out/merged.gtf \
    ../output/04-align-v1/stringtie_out/mergelist.txt
```


```{bash}
mkdir -p ../output/04-align-v1/stringtie_quant

for bam in ../output/04-align-v1/*combined.sorted.bam; do
    sample=$(basename "$bam" _combined.sorted.bam)
    /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie "$bam" \
        -L \
        -e \
        -B \
        -G ../output/04-align-v1/stringtie_out/merged.gtf \
        -o ../output/04-align-v1/stringtie_quant/${sample}.gtf
done
```


```{bash}
python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \
-i ../output/04-align-v1/stringtie_quant/samplelist.txt \
-g ../output/04-align-v1/stringtie_quant/gene_count_matrix.csv \
-t ../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv
```


```{bash}
wc -l ../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv
wc -l ../output/04-align-v1/stringtie_quant/gene_count_matrix.csv
```


```{r}
library(DESeq2)

# Load raw counts
gene_counts <- read.csv("../output/04-align-v1/stringtie_quant/gene_count_matrix.csv", row.names = 1)

# Make sample metadata
sample_info <- data.frame(
  row.names = colnames(gene_counts),
  condition = c("A", "A", "A","A", "B", "B", "B","B","C","C", "C", "C")  # example
)

# DESeq2 object and normalization
dds <- DESeqDataSetFromMatrix(countData = gene_counts, colData = sample_info, design = ~ condition)
dds <- DESeq(dds)
norm_counts <- counts(dds, normalized = TRUE)
```
```{r}
# Assume norm_counts is your normalized count matrix (genes x samples)
cv_values <- apply(norm_counts, 1, function(x) sd(x) / mean(x))

# Sort genes by increasing CV (lower = more stable)
cv_ranked <- sort(cv_values)
head(cv_ranked, 20)  # Top 10 most stable genes
```

```{r}
library(limma)
library(tidyverse)

# Prepare expression matrix (genes x samples)
expr <- as.matrix(norm_counts)

# Create design matrix
group <- factor(case_when(
  str_detect(colnames(expr), "^Pum") ~ "Pum",
  str_detect(colnames(expr), "^Qui") ~ "Qui",
  str_detect(colnames(expr), "^Rio") ~ "Rio"
))

design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)

# Fit linear model
fit <- lmFit(expr, design)

# Create contrast for all pairwise comparisons
contrast_matrix <- makeContrasts(
  Pum_vs_Qui = Pum - Qui,
  Pum_vs_Rio = Pum - Rio,
  Qui_vs_Rio = Qui - Rio,
  levels = design
)

fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2)

# Get top DE genes across comparisons
top_table <- topTable(fit2, coef = "Qui_vs_Rio", number = 20)
head(top_table)
```

```{r}
library(tidyverse)
library(pheatmap)

# Read the data
# Start with DESeq2 normalized count matrix
# norm_counts is a matrix: genes x samples

# Convert to data frame and add gene_id column
df <- as.data.frame(norm_counts) %>%
  rownames_to_column("gene_id")

# Pivot longer and extract species prefix
df_long <- df %>%
  pivot_longer(
    cols = -gene_id,
    names_to = "sample",
    values_to = "count"
  ) %>%
  mutate(species = str_sub(sample, 1, 3))  # Get first 3 letters as species

# Summarize average count per gene per species
df_summary <- df_long %>%
  group_by(gene_id, species) %>%
  summarize(mean_count = mean(count), .groups = "drop")

# Pivot wider so each species is a column
df_wide <- df_summary %>%
  pivot_wider(names_from = species, values_from = mean_count)

# Compute variance across species per gene
df_var <- df_wide %>%
  rowwise() %>%
  mutate(variance = var(c_across(where(is.numeric)), na.rm = TRUE)) %>%
  ungroup()

# Get top 20 genes with highest variance
top_genes <- df_var %>%
  arrange(desc(variance)) %>%
  slice_head(n = 20)

# Set row names to gene IDs and remove 'variance' column
top_matrix <- top_genes %>%
  column_to_rownames("gene_id") %>%
  select(-variance) %>%
  as.matrix()

# Plot heatmap
pheatmap(
  mat = top_matrix,
  scale = "row", # scale across genes
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  fontsize_row = 8,
  main = "Top 20 Most Variable Genes Across Species"
)
```
```{r}
library(tidyverse)

# Step 1: Convert norm_counts to long format
norm_df <- as.data.frame(norm_counts) %>%
  rownames_to_column("gene_id") %>%
  pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>%
  mutate(population = str_sub(sample, 1, 3))  # Extract Pum, Qui, Rio
```

```{r}
top_genes <- norm_df %>%
  group_by(gene_id) %>%
  summarize(variance = var(expression, na.rm = TRUE)) %>%
  arrange(desc(variance)) %>%
  slice_head(n = 20) %>%
  pull(gene_id)
```

```{r}
# Filter to top genes
plot_df <- norm_df %>%
  filter(gene_id %in% top_genes)

# Plot
ggplot(plot_df, aes(x = sample, y = expression, fill = population)) +
  geom_col() +
  facet_wrap(~ gene_id, scales = "free_y", ncol = 4) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, size = 6),
    strip.text = element_text(size = 8)
  ) +
  labs(
    title = "Top 20 Most Variable Genes Across All Samples",
    x = "Sample",
    y = "Normalized Expression"
  )
```
```{r}

# Choose the gene ID
gene_id <- "MSTRG.27106"

# Extract expression values for that gene
gene_df <- data.frame(
  sample = colnames(norm_counts),
  expression = as.numeric(norm_counts[gene_id, ])
) %>%
  mutate(population = str_sub(sample, 1, 3))  # Extract Pum, Qui, Rio
```

```{r}
ggplot(gene_df, aes(x = sample, y = expression, fill = population)) +
  geom_col() +
  theme_bw() +
  labs(
    title = paste("Expression of", gene_id),
    x = "Sample",
    y = "Normalized Expression"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7))
```

```{r}
ggplot(gene_df, aes(x = population, y = expression, fill = population)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.7) +
  theme_minimal() +
  labs(
    title = paste("Expression of", gene_id, "by Population"),
    x = "Population",
    y = "Normalized Expression"
  )
```


```{r}
library(limma)

# Get DEGs (adjusted p < 0.05, for example)
deg_table <- topTable(fit2, adjust.method = "BH", p.value = 0.05, number = Inf)

# Extract gene IDs
deg_genes <- rownames(deg_table)
```


```{r}
deg_expr <- norm_counts[deg_genes, ]
```


```{r}
library(tidyverse)

deg_long <- as.data.frame(deg_expr) %>%
  rownames_to_column("gene_id") %>%
  pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>%
  mutate(
    population = str_sub(sample, 1, 3),
    log_expression = log2(expression + 1)  # optional
  )
```


```{r}
ggplot(deg_long, aes(x = population, y = log_expression, fill = population)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, size = 0.5, alpha = 0.6) +
  facet_wrap(~ gene_id, scales = "free_y", ncol = 4) +
  theme_bw() +
  labs(
    title = "Differentially Expressed Genes (limma)",
    x = "Population",
    y = "log2(Normalized Expression + 1)"
  ) +
  theme(strip.text = element_text(size = 8),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 7))
```
```{r}
library(ggplot2)

# Get unique gene IDs
all_genes <- unique(deg_long$gene_id)

# Split into groups of 12 genes
gene_chunks <- split(all_genes, ceiling(seq_along(all_genes) / 12))

# Loop through chunks and plot
pdf("deg_boxplots.pdf", width = 11, height = 8.5)
for (genes in gene_chunks) {
  p <- ggplot(filter(deg_long, gene_id %in% genes),
              aes(x = population, y = log_expression, fill = population)) +
    geom_boxplot(outlier.shape = NA, alpha = 0.7) +
    geom_jitter(width = 0.2, size = 0.5, alpha = 0.6) +
    facet_wrap(~ gene_id, scales = "free_y", ncol = 4) +
    theme_bw(base_size = 10) +
    labs(
      title = "DEGs Across Populations",
      x = "Population",
      y = "log2(Normalized Expression + 1)"
    ) +
    theme(
      strip.text = element_text(size = 9),
      axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
    )
  print(p)
}
dev.off()
```


# all contrasts

```{r}
# Get all contrast names
contrast_names <- colnames(fit2$contrasts)

# Extract all DEGs per contrast
deg_list <- lapply(contrast_names, function(contrast) {
  tt <- topTable(fit2, coef = contrast, number = Inf, sort.by = "P")  # all genes
  tt$gene_id <- rownames(tt)
  tt$contrast <- contrast
  tt
})

# Combine into one long data frame
all_degs <- bind_rows(deg_list)
```


```{r}
# Filter for significant DEGs across all contrasts
sig_degs <- all_degs %>%
  filter(adj.P.Val < 0.05 & abs(logFC) > 1)
```


```{r}
sig_degs %>%
  count(contrast)
```

# annotate sig diffren

```{r}
library(tidyverse)

blast_annot <- read_tsv("../output/04-align-v1/stringtie_out/transcript-uniprot_dia-blastx.tab", col_names = FALSE)  # or read.delim("...") for .txt
```
```{r}
colnames(blast_annot)
```

```{r}
blast_annot <- blast_annot %>%
  mutate(gene_id = str_extract(X1, "^MSTRG\\.\\d+"))
```

```{r}
# Keep first hit per gene (e.g., based on lowest evalue or highest bit score)
blast_best <- blast_annot %>%
  mutate(gene_id = str_extract(X1, "^MSTRG\\.\\d+")) %>%
  group_by(gene_id) %>%
  slice_min(order_by = X3, n = 1) %>%  # replace X3 with correct column (e.g., evalue)
  ungroup()
```


```{r}

suppressWarnings({
  sig_degs_annotated <- sig_degs %>%
    left_join(blast_annot, by = "gene_id")
})
```


```{r}
library(UpSetR)

# Make DEG sets by contrast
deg_sets <- sig_degs %>%
  group_by(contrast) %>%
  summarize(genes = list(gene_id)) %>%
  deframe()

upset(fromList(deg_sets), nsets = length(deg_sets), order.by = "freq")
```

```{r}
library(limma)
library(tidyverse)
library(pheatmap)

# Get contrast names
contrast_names <- colnames(fit2$contrasts)

# Extract all logFCs into a list
logfc_list <- lapply(contrast_names, function(contrast) {
  tt <- topTable(fit2, coef = contrast, number = Inf, sort.by = "none")
  logFCs <- tt$logFC
  names(logFCs) <- rownames(tt)
  return(logFCs)
})

# Combine into a matrix (genes x contrasts)
logfc_matrix <- logfc_list %>%
  purrr::reduce(full_join, by = "row.names") %>%
  column_to_rownames("Row.names")

# Rename columns with contrast names
colnames(logfc_matrix) <- contrast_names
```

```{r}
library(tidyverse)

# If not already done, create a long-format data frame from norm_counts
expr_long <- as.data.frame(norm_counts) %>%
  rownames_to_column("gene_id") %>%
  pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>%
  mutate(population = str_sub(sample, 1, 3),
         log_expression = log2(expression + 1))
```

```{r}
write_csv(sig_degs_annotated, "../output/04-align-v1/sig_degs_annotated.csv")
```
```{r}
sig_degs_annotated %>%
  select(gene_id, contrast, starts_with("X")) %>%
  knitr::kable(format = "markdown")
```


```{r}
deg_genes <- unique(sig_degs$gene_id)  # from your filtered limma results
```

```{r}
library(tidyverse)
library(ggplot2)
library(gridExtra)

# Group genes into chunks of 12
gene_chunks <- split(deg_genes, ceiling(seq_along(deg_genes) / 12))

# Loop over chunks and save each group as a PNG page
for (i in seq_along(gene_chunks)) {
  
  genes <- gene_chunks[[i]]
  plot_list <- list()
  
  for (gene in genes) {
    p <- ggplot(filter(expr_long, gene_id == gene),
                aes(x = population, y = log_expression, fill = population)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(width = 0.2, size = 1, alpha = 0.6) +
      theme_minimal() +
      labs(
        title = paste("Expression of", gene),
        x = "Population",
        y = "log2(Normalized Expression + 1)"
      ) +
      theme(plot.title = element_text(size = 9),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
            axis.title = element_text(size = 8))
    
    plot_list[[gene]] <- p
  }
  
  # Save as PNG
  png(filename = sprintf("DEG_boxplots_page_%02d.png", i), width = 10, height = 12, units = "in", res = 300)
  grid.arrange(grobs = plot_list, ncol = 3, nrow = 4)
  dev.off()
}
```

```{r}
library(ggplot2)
library(plotly)

# Plot 1: ggplot version
p <- ggplot(deg_long, aes(x = population, y = log_expression, fill = population)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, size = 0.5, alpha = 0.6, aes(text = sample)) +
  facet_wrap(~ gene_id, scales = "free_y", ncol = 3) +
  theme_minimal(base_size = 11) +
  labs(
    title = "Differentially Expressed Genes (Interactive)",
    x = "Population",
    y = "log2(Normalized Expression + 1)"
  ) +
  theme(
    strip.text = element_text(size = 9),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
  )

# Plot 2: make it interactive
ggplotly(p, tooltip = c("x", "y", "text"))
```


# BLAST


# get fasta

```{bash}
cut -f3 ../output/04-align-v1/stringtie_out/merged.gtf | grep -v "^#" | sort | uniq 
```


```{bash}
grep -v "^#" ../output/04-align-v1/stringtie_out/merged.gtf | awk '$3 == "transcript"' | \
awk 'BEGIN{OFS="\t"} {print $1, $4-1, $5, $10, ".", $7}' | sed -E 's/"//g; s/;//g' > ../output/04-align-v1/stringtie_out/merged_transcripts.bed
```


```{bash}
head ../output/04-align-v1/stringtie_out/merged_transcripts.bed
```
```{bash}
bedtools getfasta -fi ../data/merged_out.fasta.TBtools.fa \
-bed ../output/04-align-v1/stringtie_out/merged_transcripts.bed \
-s -name -fo ../output/04-align-v1/stringtie_out/transcripts.fa
```


```{bash}
grep -c ">" ../output/04-align-v1/stringtie_out/transcripts.fa
```


```{r, engine='bash'}
/home/shared/diamond-2.1.8 makedb \
--in ../data/uniprot_sprot_r2024_03.fasta \
--threads 30 \
--db ../blastdb/dia-uniprot_sprot_r2024_03
```

# Blast

```{bash}
/home/shared/diamond-2.1.8 help
```

```{r, engine='bash'}
/home/shared/diamond-2.1.8 blastx \
--query ../output/04-align-v1/stringtie_out/transcripts.fa \
--db ../blastdb/dia-uniprot_sprot_r2024_03 \
--out ../output/04-align-v1/stringtie_out/transcript-uniprot_dia-blastx.tab \
--threads 42 \
--max-target-seqs 1 \
--max-hsps 1 \
--outfmt 6
```


```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/blastx \
-query ../output/04-align-v1/stringtie_out/transcripts.fa \
-db ../blastdb/uniprot_sprot_r2024_03 \
-out ../output/04-align-v1/stringtie_out/transcript-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```


```{bash}
head ../output/04-align-v1/stringtie_out/transcript-uniprot_blastx.tab
```