---
title: "04-Align-v1"
output: html_document
date: "2025-08-02"
---
```{bash}
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate
# Set number of threads
threads=42
# Set paths
ref_genome="../data/merged_out.fasta.TBtools.fa"
input_dir="../output/02-RNAseq"
output_dir="../output/04-align-v1"
# Loop over each FASTQ file
for fq in ${input_dir}/*.fastq.gz; do
# Extract sample name (e.g., Pum_barcode02 from Pum_barcode02_combined.fastq.gz)
sample=$(basename "$fq" .fastq.gz)
echo "Processing $sample..."
# Run minimap2 and samtools sort
minimap2 -ax splice -k14 --MD -t $threads "$ref_genome" "$fq" | \
samtools sort -@ $threads -o "${output_dir}/${sample}.sorted.bam"
# Index the sorted BAM
samtools index "${output_dir}/${sample}.sorted.bam"
done
```
# Bigwig
```
find . -name "*.bam" | parallel 'bedtools genomecov -split -bg -ibam {} > {.}.sp.bedGraph'
```
# make bigwig
```{bash}
cut -f1,2 ../data/merged_out.fasta.TBtools.fa.fai > ../output/04-align-v1/genome.chrom.sizes
```
#!/bin/bash
# Loop through all bedGraph files in the current directory
for file in *.bedGraph; do
echo "Fixing sorting for $file"
LC_COLLATE=C sort -k1,1 -k2,2n "$file" > "${file}.sorted"
mv "${file}.sorted" "$file"
done
```{bash}
for bed in *.bedGraph; do
base=$(basename "$bed" _combined.sorted.sp.bedGraph)
bedGraphToBigWig "$bed" genome.chrom.sizes "${base}.bw"
done
```
#stringtie
```{bash}
mkdir -p ../output/04-align-v1/stringtie_out
for bam in ../output/04-align-v1/*combined.sorted.bam; do
sample=$(basename "$bam" _combined.sorted.bam)
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie "$bam" \
-L \
-o ../output/04-align-v1/stringtie_out/${sample}.gtf \
-p 32 \
-v
done
```
```{bash}
ls ../output/04-align-v1/stringtie_out/*.gtf > ../output/04-align-v1/stringtie_out/mergelist.txt
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie --merge \
-L \
-G ../data/GN.gene.gtf \
-o ../output/04-align-v1/stringtie_out/merged.gtf \
../output/04-align-v1/stringtie_out/mergelist.txt
```
```{bash}
mkdir -p ../output/04-align-v1/stringtie_quant
for bam in ../output/04-align-v1/*combined.sorted.bam; do
sample=$(basename "$bam" _combined.sorted.bam)
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie "$bam" \
-L \
-e \
-B \
-G ../output/04-align-v1/stringtie_out/merged.gtf \
-o ../output/04-align-v1/stringtie_quant/${sample}.gtf
done
```
```{bash}
python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \
-i ../output/04-align-v1/stringtie_quant/samplelist.txt \
-g ../output/04-align-v1/stringtie_quant/gene_count_matrix.csv \
-t ../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv
```
```{bash}
wc -l ../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv
wc -l ../output/04-align-v1/stringtie_quant/gene_count_matrix.csv
```
```{r}
library(DESeq2)
# Load raw counts
gene_counts <- read.csv("../output/04-align-v1/stringtie_quant/gene_count_matrix.csv", row.names = 1)
# Make sample metadata
sample_info <- data.frame(
row.names = colnames(gene_counts),
condition = c("A", "A", "A","A", "B", "B", "B","B","C","C", "C", "C") # example
)
# DESeq2 object and normalization
dds <- DESeqDataSetFromMatrix(countData = gene_counts, colData = sample_info, design = ~ condition)
dds <- DESeq(dds)
norm_counts <- counts(dds, normalized = TRUE)
```
```{r}
# Assume norm_counts is your normalized count matrix (genes x samples)
cv_values <- apply(norm_counts, 1, function(x) sd(x) / mean(x))
# Sort genes by increasing CV (lower = more stable)
cv_ranked <- sort(cv_values)
head(cv_ranked, 20) # Top 10 most stable genes
```
```{r}
library(limma)
library(tidyverse)
# Prepare expression matrix (genes x samples)
expr <- as.matrix(norm_counts)
# Create design matrix
group <- factor(case_when(
str_detect(colnames(expr), "^Pum") ~ "Pum",
str_detect(colnames(expr), "^Qui") ~ "Qui",
str_detect(colnames(expr), "^Rio") ~ "Rio"
))
design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)
# Fit linear model
fit <- lmFit(expr, design)
# Create contrast for all pairwise comparisons
contrast_matrix <- makeContrasts(
Pum_vs_Qui = Pum - Qui,
Pum_vs_Rio = Pum - Rio,
Qui_vs_Rio = Qui - Rio,
levels = design
)
fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2)
# Get top DE genes across comparisons
top_table <- topTable(fit2, coef = "Qui_vs_Rio", number = 20)
head(top_table)
```
```{r}
library(tidyverse)
library(pheatmap)
# Read the data
# Start with DESeq2 normalized count matrix
# norm_counts is a matrix: genes x samples
# Convert to data frame and add gene_id column
df <- as.data.frame(norm_counts) %>%
rownames_to_column("gene_id")
# Pivot longer and extract species prefix
df_long <- df %>%
pivot_longer(
cols = -gene_id,
names_to = "sample",
values_to = "count"
) %>%
mutate(species = str_sub(sample, 1, 3)) # Get first 3 letters as species
# Summarize average count per gene per species
df_summary <- df_long %>%
group_by(gene_id, species) %>%
summarize(mean_count = mean(count), .groups = "drop")
# Pivot wider so each species is a column
df_wide <- df_summary %>%
pivot_wider(names_from = species, values_from = mean_count)
# Compute variance across species per gene
df_var <- df_wide %>%
rowwise() %>%
mutate(variance = var(c_across(where(is.numeric)), na.rm = TRUE)) %>%
ungroup()
# Get top 20 genes with highest variance
top_genes <- df_var %>%
arrange(desc(variance)) %>%
slice_head(n = 20)
# Set row names to gene IDs and remove 'variance' column
top_matrix <- top_genes %>%
column_to_rownames("gene_id") %>%
select(-variance) %>%
as.matrix()
# Plot heatmap
pheatmap(
mat = top_matrix,
scale = "row", # scale across genes
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize_row = 8,
main = "Top 20 Most Variable Genes Across Species"
)
```
```{r}
library(tidyverse)
# Step 1: Convert norm_counts to long format
norm_df <- as.data.frame(norm_counts) %>%
rownames_to_column("gene_id") %>%
pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>%
mutate(population = str_sub(sample, 1, 3)) # Extract Pum, Qui, Rio
```
```{r}
top_genes <- norm_df %>%
group_by(gene_id) %>%
summarize(variance = var(expression, na.rm = TRUE)) %>%
arrange(desc(variance)) %>%
slice_head(n = 20) %>%
pull(gene_id)
```
```{r}
# Filter to top genes
plot_df <- norm_df %>%
filter(gene_id %in% top_genes)
# Plot
ggplot(plot_df, aes(x = sample, y = expression, fill = population)) +
geom_col() +
facet_wrap(~ gene_id, scales = "free_y", ncol = 4) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, size = 6),
strip.text = element_text(size = 8)
) +
labs(
title = "Top 20 Most Variable Genes Across All Samples",
x = "Sample",
y = "Normalized Expression"
)
```
```{r}
# Choose the gene ID
gene_id <- "MSTRG.27106"
# Extract expression values for that gene
gene_df <- data.frame(
sample = colnames(norm_counts),
expression = as.numeric(norm_counts[gene_id, ])
) %>%
mutate(population = str_sub(sample, 1, 3)) # Extract Pum, Qui, Rio
```
```{r}
ggplot(gene_df, aes(x = sample, y = expression, fill = population)) +
geom_col() +
theme_bw() +
labs(
title = paste("Expression of", gene_id),
x = "Sample",
y = "Normalized Expression"
) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7))
```
```{r}
ggplot(gene_df, aes(x = population, y = expression, fill = population)) +
geom_boxplot() +
geom_jitter(width = 0.2, alpha = 0.7) +
theme_minimal() +
labs(
title = paste("Expression of", gene_id, "by Population"),
x = "Population",
y = "Normalized Expression"
)
```
```{r}
library(limma)
# Get DEGs (adjusted p < 0.05, for example)
deg_table <- topTable(fit2, adjust.method = "BH", p.value = 0.05, number = Inf)
# Extract gene IDs
deg_genes <- rownames(deg_table)
```
```{r}
deg_expr <- norm_counts[deg_genes, ]
```
```{r}
library(tidyverse)
deg_long <- as.data.frame(deg_expr) %>%
rownames_to_column("gene_id") %>%
pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>%
mutate(
population = str_sub(sample, 1, 3),
log_expression = log2(expression + 1) # optional
)
```
```{r}
ggplot(deg_long, aes(x = population, y = log_expression, fill = population)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, size = 0.5, alpha = 0.6) +
facet_wrap(~ gene_id, scales = "free_y", ncol = 4) +
theme_bw() +
labs(
title = "Differentially Expressed Genes (limma)",
x = "Population",
y = "log2(Normalized Expression + 1)"
) +
theme(strip.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, size = 7))
```
```{r}
library(ggplot2)
# Get unique gene IDs
all_genes <- unique(deg_long$gene_id)
# Split into groups of 12 genes
gene_chunks <- split(all_genes, ceiling(seq_along(all_genes) / 12))
# Loop through chunks and plot
pdf("deg_boxplots.pdf", width = 11, height = 8.5)
for (genes in gene_chunks) {
p <- ggplot(filter(deg_long, gene_id %in% genes),
aes(x = population, y = log_expression, fill = population)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, size = 0.5, alpha = 0.6) +
facet_wrap(~ gene_id, scales = "free_y", ncol = 4) +
theme_bw(base_size = 10) +
labs(
title = "DEGs Across Populations",
x = "Population",
y = "log2(Normalized Expression + 1)"
) +
theme(
strip.text = element_text(size = 9),
axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
)
print(p)
}
dev.off()
```
# all contrasts
```{r}
# Get all contrast names
contrast_names <- colnames(fit2$contrasts)
# Extract all DEGs per contrast
deg_list <- lapply(contrast_names, function(contrast) {
tt <- topTable(fit2, coef = contrast, number = Inf, sort.by = "P") # all genes
tt$gene_id <- rownames(tt)
tt$contrast <- contrast
tt
})
# Combine into one long data frame
all_degs <- bind_rows(deg_list)
```
```{r}
# Filter for significant DEGs across all contrasts
sig_degs <- all_degs %>%
filter(adj.P.Val < 0.05 & abs(logFC) > 1)
```
```{r}
sig_degs %>%
count(contrast)
```
# annotate sig diffren
```{r}
library(tidyverse)
blast_annot <- read_tsv("../output/04-align-v1/stringtie_out/transcript-uniprot_dia-blastx.tab", col_names = FALSE) # or read.delim("...") for .txt
```
```{r}
colnames(blast_annot)
```
```{r}
blast_annot <- blast_annot %>%
mutate(gene_id = str_extract(X1, "^MSTRG\\.\\d+"))
```
```{r}
# Keep first hit per gene (e.g., based on lowest evalue or highest bit score)
blast_best <- blast_annot %>%
mutate(gene_id = str_extract(X1, "^MSTRG\\.\\d+")) %>%
group_by(gene_id) %>%
slice_min(order_by = X3, n = 1) %>% # replace X3 with correct column (e.g., evalue)
ungroup()
```
```{r}
suppressWarnings({
sig_degs_annotated <- sig_degs %>%
left_join(blast_annot, by = "gene_id")
})
```
```{r}
library(UpSetR)
# Make DEG sets by contrast
deg_sets <- sig_degs %>%
group_by(contrast) %>%
summarize(genes = list(gene_id)) %>%
deframe()
upset(fromList(deg_sets), nsets = length(deg_sets), order.by = "freq")
```
```{r}
library(limma)
library(tidyverse)
library(pheatmap)
# Get contrast names
contrast_names <- colnames(fit2$contrasts)
# Extract all logFCs into a list
logfc_list <- lapply(contrast_names, function(contrast) {
tt <- topTable(fit2, coef = contrast, number = Inf, sort.by = "none")
logFCs <- tt$logFC
names(logFCs) <- rownames(tt)
return(logFCs)
})
# Combine into a matrix (genes x contrasts)
logfc_matrix <- logfc_list %>%
purrr::reduce(full_join, by = "row.names") %>%
column_to_rownames("Row.names")
# Rename columns with contrast names
colnames(logfc_matrix) <- contrast_names
```
```{r}
library(tidyverse)
# If not already done, create a long-format data frame from norm_counts
expr_long <- as.data.frame(norm_counts) %>%
rownames_to_column("gene_id") %>%
pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>%
mutate(population = str_sub(sample, 1, 3),
log_expression = log2(expression + 1))
```
```{r}
write_csv(sig_degs_annotated, "../output/04-align-v1/sig_degs_annotated.csv")
```
```{r}
sig_degs_annotated %>%
select(gene_id, contrast, starts_with("X")) %>%
knitr::kable(format = "markdown")
```
```{r}
deg_genes <- unique(sig_degs$gene_id) # from your filtered limma results
```
```{r}
library(tidyverse)
library(ggplot2)
library(gridExtra)
# Group genes into chunks of 12
gene_chunks <- split(deg_genes, ceiling(seq_along(deg_genes) / 12))
# Loop over chunks and save each group as a PNG page
for (i in seq_along(gene_chunks)) {
genes <- gene_chunks[[i]]
plot_list <- list()
for (gene in genes) {
p <- ggplot(filter(expr_long, gene_id == gene),
aes(x = population, y = log_expression, fill = population)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, size = 1, alpha = 0.6) +
theme_minimal() +
labs(
title = paste("Expression of", gene),
x = "Population",
y = "log2(Normalized Expression + 1)"
) +
theme(plot.title = element_text(size = 9),
axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
axis.title = element_text(size = 8))
plot_list[[gene]] <- p
}
# Save as PNG
png(filename = sprintf("DEG_boxplots_page_%02d.png", i), width = 10, height = 12, units = "in", res = 300)
grid.arrange(grobs = plot_list, ncol = 3, nrow = 4)
dev.off()
}
```
```{r}
library(ggplot2)
library(plotly)
# Plot 1: ggplot version
p <- ggplot(deg_long, aes(x = population, y = log_expression, fill = population)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, size = 0.5, alpha = 0.6, aes(text = sample)) +
facet_wrap(~ gene_id, scales = "free_y", ncol = 3) +
theme_minimal(base_size = 11) +
labs(
title = "Differentially Expressed Genes (Interactive)",
x = "Population",
y = "log2(Normalized Expression + 1)"
) +
theme(
strip.text = element_text(size = 9),
axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
)
# Plot 2: make it interactive
ggplotly(p, tooltip = c("x", "y", "text"))
```
# BLAST
# get fasta
```{bash}
cut -f3 ../output/04-align-v1/stringtie_out/merged.gtf | grep -v "^#" | sort | uniq
```
```{bash}
grep -v "^#" ../output/04-align-v1/stringtie_out/merged.gtf | awk '$3 == "transcript"' | \
awk 'BEGIN{OFS="\t"} {print $1, $4-1, $5, $10, ".", $7}' | sed -E 's/"//g; s/;//g' > ../output/04-align-v1/stringtie_out/merged_transcripts.bed
```
```{bash}
head ../output/04-align-v1/stringtie_out/merged_transcripts.bed
```
```{bash}
bedtools getfasta -fi ../data/merged_out.fasta.TBtools.fa \
-bed ../output/04-align-v1/stringtie_out/merged_transcripts.bed \
-s -name -fo ../output/04-align-v1/stringtie_out/transcripts.fa
```
```{bash}
grep -c ">" ../output/04-align-v1/stringtie_out/transcripts.fa
```
```{r, engine='bash'}
/home/shared/diamond-2.1.8 makedb \
--in ../data/uniprot_sprot_r2024_03.fasta \
--threads 30 \
--db ../blastdb/dia-uniprot_sprot_r2024_03
```
# Blast
```{bash}
/home/shared/diamond-2.1.8 help
```
```{r, engine='bash'}
/home/shared/diamond-2.1.8 blastx \
--query ../output/04-align-v1/stringtie_out/transcripts.fa \
--db ../blastdb/dia-uniprot_sprot_r2024_03 \
--out ../output/04-align-v1/stringtie_out/transcript-uniprot_dia-blastx.tab \
--threads 42 \
--max-target-seqs 1 \
--max-hsps 1 \
--outfmt 6
```
```{r, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/blastx \
-query ../output/04-align-v1/stringtie_out/transcripts.fa \
-db ../blastdb/uniprot_sprot_r2024_03 \
-out ../output/04-align-v1/stringtie_out/transcript-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```
```{bash}
head ../output/04-align-v1/stringtie_out/transcript-uniprot_blastx.tab
```