--- title: "04-Align-v1" output: html_document date: "2025-08-02" --- ```{bash} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" conda activate # Set number of threads threads=42 # Set paths ref_genome="../data/merged_out.fasta.TBtools.fa" input_dir="../output/02-RNAseq" output_dir="../output/04-align-v1" # Loop over each FASTQ file for fq in ${input_dir}/*.fastq.gz; do # Extract sample name (e.g., Pum_barcode02 from Pum_barcode02_combined.fastq.gz) sample=$(basename "$fq" .fastq.gz) echo "Processing $sample..." # Run minimap2 and samtools sort minimap2 -ax splice -k14 --MD -t $threads "$ref_genome" "$fq" | \ samtools sort -@ $threads -o "${output_dir}/${sample}.sorted.bam" # Index the sorted BAM samtools index "${output_dir}/${sample}.sorted.bam" done ``` # Bigwig ``` find . -name "*.bam" | parallel 'bedtools genomecov -split -bg -ibam {} > {.}.sp.bedGraph' ``` # make bigwig ```{bash} cut -f1,2 ../data/merged_out.fasta.TBtools.fa.fai > ../output/04-align-v1/genome.chrom.sizes ``` #!/bin/bash # Loop through all bedGraph files in the current directory for file in *.bedGraph; do echo "Fixing sorting for $file" LC_COLLATE=C sort -k1,1 -k2,2n "$file" > "${file}.sorted" mv "${file}.sorted" "$file" done ```{bash} for bed in *.bedGraph; do base=$(basename "$bed" _combined.sorted.sp.bedGraph) bedGraphToBigWig "$bed" genome.chrom.sizes "${base}.bw" done ``` #stringtie ```{bash} mkdir -p ../output/04-align-v1/stringtie_out for bam in ../output/04-align-v1/*combined.sorted.bam; do sample=$(basename "$bam" _combined.sorted.bam) /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie "$bam" \ -L \ -o ../output/04-align-v1/stringtie_out/${sample}.gtf \ -p 32 \ -v done ``` ```{bash} ls ../output/04-align-v1/stringtie_out/*.gtf > ../output/04-align-v1/stringtie_out/mergelist.txt /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie --merge \ -L \ -G ../data/GN.gene.gtf \ -o ../output/04-align-v1/stringtie_out/merged.gtf \ ../output/04-align-v1/stringtie_out/mergelist.txt ``` ```{bash} mkdir -p ../output/04-align-v1/stringtie_quant for bam in ../output/04-align-v1/*combined.sorted.bam; do sample=$(basename "$bam" _combined.sorted.bam) /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie "$bam" \ -L \ -e \ -B \ -G ../output/04-align-v1/stringtie_out/merged.gtf \ -o ../output/04-align-v1/stringtie_quant/${sample}.gtf done ``` ```{bash} python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \ -i ../output/04-align-v1/stringtie_quant/samplelist.txt \ -g ../output/04-align-v1/stringtie_quant/gene_count_matrix.csv \ -t ../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv ``` ```{bash} wc -l ../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv wc -l ../output/04-align-v1/stringtie_quant/gene_count_matrix.csv ``` ```{r} library(DESeq2) # Load raw counts gene_counts <- read.csv("../output/04-align-v1/stringtie_quant/gene_count_matrix.csv", row.names = 1) # Make sample metadata sample_info <- data.frame( row.names = colnames(gene_counts), condition = c("A", "A", "A","A", "B", "B", "B","B","C","C", "C", "C") # example ) # DESeq2 object and normalization dds <- DESeqDataSetFromMatrix(countData = gene_counts, colData = sample_info, design = ~ condition) dds <- DESeq(dds) norm_counts <- counts(dds, normalized = TRUE) ``` ```{r} # Assume norm_counts is your normalized count matrix (genes x samples) cv_values <- apply(norm_counts, 1, function(x) sd(x) / mean(x)) # Sort genes by increasing CV (lower = more stable) cv_ranked <- sort(cv_values) head(cv_ranked, 20) # Top 10 most stable genes ``` ```{r} library(limma) library(tidyverse) # Prepare expression matrix (genes x samples) expr <- as.matrix(norm_counts) # Create design matrix group <- factor(case_when( str_detect(colnames(expr), "^Pum") ~ "Pum", str_detect(colnames(expr), "^Qui") ~ "Qui", str_detect(colnames(expr), "^Rio") ~ "Rio" )) design <- model.matrix(~ 0 + group) colnames(design) <- levels(group) # Fit linear model fit <- lmFit(expr, design) # Create contrast for all pairwise comparisons contrast_matrix <- makeContrasts( Pum_vs_Qui = Pum - Qui, Pum_vs_Rio = Pum - Rio, Qui_vs_Rio = Qui - Rio, levels = design ) fit2 <- contrasts.fit(fit, contrast_matrix) fit2 <- eBayes(fit2) # Get top DE genes across comparisons top_table <- topTable(fit2, coef = "Qui_vs_Rio", number = 20) head(top_table) ``` ```{r} library(tidyverse) library(pheatmap) # Read the data # Start with DESeq2 normalized count matrix # norm_counts is a matrix: genes x samples # Convert to data frame and add gene_id column df <- as.data.frame(norm_counts) %>% rownames_to_column("gene_id") # Pivot longer and extract species prefix df_long <- df %>% pivot_longer( cols = -gene_id, names_to = "sample", values_to = "count" ) %>% mutate(species = str_sub(sample, 1, 3)) # Get first 3 letters as species # Summarize average count per gene per species df_summary <- df_long %>% group_by(gene_id, species) %>% summarize(mean_count = mean(count), .groups = "drop") # Pivot wider so each species is a column df_wide <- df_summary %>% pivot_wider(names_from = species, values_from = mean_count) # Compute variance across species per gene df_var <- df_wide %>% rowwise() %>% mutate(variance = var(c_across(where(is.numeric)), na.rm = TRUE)) %>% ungroup() # Get top 20 genes with highest variance top_genes <- df_var %>% arrange(desc(variance)) %>% slice_head(n = 20) # Set row names to gene IDs and remove 'variance' column top_matrix <- top_genes %>% column_to_rownames("gene_id") %>% select(-variance) %>% as.matrix() # Plot heatmap pheatmap( mat = top_matrix, scale = "row", # scale across genes cluster_rows = TRUE, cluster_cols = TRUE, fontsize_row = 8, main = "Top 20 Most Variable Genes Across Species" ) ``` ```{r} library(tidyverse) # Step 1: Convert norm_counts to long format norm_df <- as.data.frame(norm_counts) %>% rownames_to_column("gene_id") %>% pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>% mutate(population = str_sub(sample, 1, 3)) # Extract Pum, Qui, Rio ``` ```{r} top_genes <- norm_df %>% group_by(gene_id) %>% summarize(variance = var(expression, na.rm = TRUE)) %>% arrange(desc(variance)) %>% slice_head(n = 20) %>% pull(gene_id) ``` ```{r} # Filter to top genes plot_df <- norm_df %>% filter(gene_id %in% top_genes) # Plot ggplot(plot_df, aes(x = sample, y = expression, fill = population)) + geom_col() + facet_wrap(~ gene_id, scales = "free_y", ncol = 4) + theme_bw() + theme( axis.text.x = element_text(angle = 90, vjust = 0.5, size = 6), strip.text = element_text(size = 8) ) + labs( title = "Top 20 Most Variable Genes Across All Samples", x = "Sample", y = "Normalized Expression" ) ``` ```{r} # Choose the gene ID gene_id <- "MSTRG.27106" # Extract expression values for that gene gene_df <- data.frame( sample = colnames(norm_counts), expression = as.numeric(norm_counts[gene_id, ]) ) %>% mutate(population = str_sub(sample, 1, 3)) # Extract Pum, Qui, Rio ``` ```{r} ggplot(gene_df, aes(x = sample, y = expression, fill = population)) + geom_col() + theme_bw() + labs( title = paste("Expression of", gene_id), x = "Sample", y = "Normalized Expression" ) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7)) ``` ```{r} ggplot(gene_df, aes(x = population, y = expression, fill = population)) + geom_boxplot() + geom_jitter(width = 0.2, alpha = 0.7) + theme_minimal() + labs( title = paste("Expression of", gene_id, "by Population"), x = "Population", y = "Normalized Expression" ) ``` ```{r} library(limma) # Get DEGs (adjusted p < 0.05, for example) deg_table <- topTable(fit2, adjust.method = "BH", p.value = 0.05, number = Inf) # Extract gene IDs deg_genes <- rownames(deg_table) ``` ```{r} deg_expr <- norm_counts[deg_genes, ] ``` ```{r} library(tidyverse) deg_long <- as.data.frame(deg_expr) %>% rownames_to_column("gene_id") %>% pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>% mutate( population = str_sub(sample, 1, 3), log_expression = log2(expression + 1) # optional ) ``` ```{r} ggplot(deg_long, aes(x = population, y = log_expression, fill = population)) + geom_boxplot(outlier.shape = NA, alpha = 0.7) + geom_jitter(width = 0.2, size = 0.5, alpha = 0.6) + facet_wrap(~ gene_id, scales = "free_y", ncol = 4) + theme_bw() + labs( title = "Differentially Expressed Genes (limma)", x = "Population", y = "log2(Normalized Expression + 1)" ) + theme(strip.text = element_text(size = 8), axis.text.x = element_text(angle = 45, hjust = 1, size = 7)) ``` ```{r} library(ggplot2) # Get unique gene IDs all_genes <- unique(deg_long$gene_id) # Split into groups of 12 genes gene_chunks <- split(all_genes, ceiling(seq_along(all_genes) / 12)) # Loop through chunks and plot pdf("deg_boxplots.pdf", width = 11, height = 8.5) for (genes in gene_chunks) { p <- ggplot(filter(deg_long, gene_id %in% genes), aes(x = population, y = log_expression, fill = population)) + geom_boxplot(outlier.shape = NA, alpha = 0.7) + geom_jitter(width = 0.2, size = 0.5, alpha = 0.6) + facet_wrap(~ gene_id, scales = "free_y", ncol = 4) + theme_bw(base_size = 10) + labs( title = "DEGs Across Populations", x = "Population", y = "log2(Normalized Expression + 1)" ) + theme( strip.text = element_text(size = 9), axis.text.x = element_text(angle = 45, hjust = 1, size = 8) ) print(p) } dev.off() ``` # all contrasts ```{r} # Get all contrast names contrast_names <- colnames(fit2$contrasts) # Extract all DEGs per contrast deg_list <- lapply(contrast_names, function(contrast) { tt <- topTable(fit2, coef = contrast, number = Inf, sort.by = "P") # all genes tt$gene_id <- rownames(tt) tt$contrast <- contrast tt }) # Combine into one long data frame all_degs <- bind_rows(deg_list) ``` ```{r} # Filter for significant DEGs across all contrasts sig_degs <- all_degs %>% filter(adj.P.Val < 0.05 & abs(logFC) > 1) ``` ```{r} sig_degs %>% count(contrast) ``` # annotate sig diffren ```{r} library(tidyverse) blast_annot <- read_tsv("../output/04-align-v1/stringtie_out/transcript-uniprot_dia-blastx.tab", col_names = FALSE) # or read.delim("...") for .txt ``` ```{r} colnames(blast_annot) ``` ```{r} blast_annot <- blast_annot %>% mutate(gene_id = str_extract(X1, "^MSTRG\\.\\d+")) ``` ```{r} # Keep first hit per gene (e.g., based on lowest evalue or highest bit score) blast_best <- blast_annot %>% mutate(gene_id = str_extract(X1, "^MSTRG\\.\\d+")) %>% group_by(gene_id) %>% slice_min(order_by = X3, n = 1) %>% # replace X3 with correct column (e.g., evalue) ungroup() ``` ```{r} suppressWarnings({ sig_degs_annotated <- sig_degs %>% left_join(blast_annot, by = "gene_id") }) ``` ```{r} library(UpSetR) # Make DEG sets by contrast deg_sets <- sig_degs %>% group_by(contrast) %>% summarize(genes = list(gene_id)) %>% deframe() upset(fromList(deg_sets), nsets = length(deg_sets), order.by = "freq") ``` ```{r} library(limma) library(tidyverse) library(pheatmap) # Get contrast names contrast_names <- colnames(fit2$contrasts) # Extract all logFCs into a list logfc_list <- lapply(contrast_names, function(contrast) { tt <- topTable(fit2, coef = contrast, number = Inf, sort.by = "none") logFCs <- tt$logFC names(logFCs) <- rownames(tt) return(logFCs) }) # Combine into a matrix (genes x contrasts) logfc_matrix <- logfc_list %>% purrr::reduce(full_join, by = "row.names") %>% column_to_rownames("Row.names") # Rename columns with contrast names colnames(logfc_matrix) <- contrast_names ``` ```{r} library(tidyverse) # If not already done, create a long-format data frame from norm_counts expr_long <- as.data.frame(norm_counts) %>% rownames_to_column("gene_id") %>% pivot_longer(-gene_id, names_to = "sample", values_to = "expression") %>% mutate(population = str_sub(sample, 1, 3), log_expression = log2(expression + 1)) ``` ```{r} write_csv(sig_degs_annotated, "../output/04-align-v1/sig_degs_annotated.csv") ``` ```{r} sig_degs_annotated %>% select(gene_id, contrast, starts_with("X")) %>% knitr::kable(format = "markdown") ``` ```{r} deg_genes <- unique(sig_degs$gene_id) # from your filtered limma results ``` ```{r} library(tidyverse) library(ggplot2) library(gridExtra) # Group genes into chunks of 12 gene_chunks <- split(deg_genes, ceiling(seq_along(deg_genes) / 12)) # Loop over chunks and save each group as a PNG page for (i in seq_along(gene_chunks)) { genes <- gene_chunks[[i]] plot_list <- list() for (gene in genes) { p <- ggplot(filter(expr_long, gene_id == gene), aes(x = population, y = log_expression, fill = population)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2, size = 1, alpha = 0.6) + theme_minimal() + labs( title = paste("Expression of", gene), x = "Population", y = "log2(Normalized Expression + 1)" ) + theme(plot.title = element_text(size = 9), axis.text.x = element_text(angle = 45, hjust = 1, size = 7), axis.title = element_text(size = 8)) plot_list[[gene]] <- p } # Save as PNG png(filename = sprintf("DEG_boxplots_page_%02d.png", i), width = 10, height = 12, units = "in", res = 300) grid.arrange(grobs = plot_list, ncol = 3, nrow = 4) dev.off() } ``` ```{r} library(ggplot2) library(plotly) # Plot 1: ggplot version p <- ggplot(deg_long, aes(x = population, y = log_expression, fill = population)) + geom_boxplot(outlier.shape = NA, alpha = 0.7) + geom_jitter(width = 0.2, size = 0.5, alpha = 0.6, aes(text = sample)) + facet_wrap(~ gene_id, scales = "free_y", ncol = 3) + theme_minimal(base_size = 11) + labs( title = "Differentially Expressed Genes (Interactive)", x = "Population", y = "log2(Normalized Expression + 1)" ) + theme( strip.text = element_text(size = 9), axis.text.x = element_text(angle = 45, hjust = 1, size = 8) ) # Plot 2: make it interactive ggplotly(p, tooltip = c("x", "y", "text")) ``` # BLAST # get fasta ```{bash} cut -f3 ../output/04-align-v1/stringtie_out/merged.gtf | grep -v "^#" | sort | uniq ``` ```{bash} grep -v "^#" ../output/04-align-v1/stringtie_out/merged.gtf | awk '$3 == "transcript"' | \ awk 'BEGIN{OFS="\t"} {print $1, $4-1, $5, $10, ".", $7}' | sed -E 's/"//g; s/;//g' > ../output/04-align-v1/stringtie_out/merged_transcripts.bed ``` ```{bash} head ../output/04-align-v1/stringtie_out/merged_transcripts.bed ``` ```{bash} bedtools getfasta -fi ../data/merged_out.fasta.TBtools.fa \ -bed ../output/04-align-v1/stringtie_out/merged_transcripts.bed \ -s -name -fo ../output/04-align-v1/stringtie_out/transcripts.fa ``` ```{bash} grep -c ">" ../output/04-align-v1/stringtie_out/transcripts.fa ``` ```{r, engine='bash'} /home/shared/diamond-2.1.8 makedb \ --in ../data/uniprot_sprot_r2024_03.fasta \ --threads 30 \ --db ../blastdb/dia-uniprot_sprot_r2024_03 ``` # Blast ```{bash} /home/shared/diamond-2.1.8 help ``` ```{r, engine='bash'} /home/shared/diamond-2.1.8 blastx \ --query ../output/04-align-v1/stringtie_out/transcripts.fa \ --db ../blastdb/dia-uniprot_sprot_r2024_03 \ --out ../output/04-align-v1/stringtie_out/transcript-uniprot_dia-blastx.tab \ --threads 42 \ --max-target-seqs 1 \ --max-hsps 1 \ --outfmt 6 ``` ```{r, engine='bash'} /home/shared/ncbi-blast-2.11.0+/bin/blastx \ -query ../output/04-align-v1/stringtie_out/transcripts.fa \ -db ../blastdb/uniprot_sprot_r2024_03 \ -out ../output/04-align-v1/stringtie_out/transcript-uniprot_blastx.tab \ -evalue 1E-20 \ -num_threads 42 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 ``` ```{bash} head ../output/04-align-v1/stringtie_out/transcript-uniprot_blastx.tab ```