--- title: "04-Align-v1" output: html_document date: "2025-08-02" --- ```{bash} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" conda activate # Set number of threads threads=42 # Set paths ref_genome="../data/merged_out.fasta.TBtools.fa" input_dir="../output/02-RNAseq" output_dir="../output/04-align-v1" # Loop over each FASTQ file for fq in ${input_dir}/*.fastq.gz; do # Extract sample name (e.g., Pum_barcode02 from Pum_barcode02_combined.fastq.gz) sample=$(basename "$fq" .fastq.gz) echo "Processing $sample..." # Run minimap2 and samtools sort minimap2 -ax splice -k14 --MD -t $threads "$ref_genome" "$fq" | \ samtools sort -@ $threads -o "${output_dir}/${sample}.sorted.bam" # Index the sorted BAM samtools index "${output_dir}/${sample}.sorted.bam" done ``` ```{bash} python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \ -i ../output/04-align-v1/stringtie_quant/samplelist.txt \ -g ../output/04-align-v1/stringtie_quant/gene_count_matrix.csv \ -t ../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv ``` ```{bash} wc -l ../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv wc -l ../output/04-align-v1/stringtie_quant/gene_count_matrix.csv ``` ```{r} library(DESeq2) # Load raw counts gene_counts <- read.csv("../output/04-align-v1/stringtie_quant/transcript_count_matrix.csv", row.names = 1) # Make sample metadata sample_info <- data.frame( row.names = colnames(gene_counts), condition = c("A", "A", "A","A", "B", "B", "B","B","C","C", "C", "C") # example ) # DESeq2 object and normalization dds <- DESeqDataSetFromMatrix(countData = gene_counts, colData = sample_info, design = ~ condition) dds <- DESeq(dds) norm_counts <- counts(dds, normalized = TRUE) ``` ```{r} library(limma) library(tidyverse) # Prepare expression matrix (genes x samples) expr <- as.matrix(norm_counts) # Create design matrix group <- factor(case_when( str_detect(colnames(expr), "^Pum") ~ "Pum", str_detect(colnames(expr), "^Qui") ~ "Qui", str_detect(colnames(expr), "^Rio") ~ "Rio" )) design <- model.matrix(~ 0 + group) colnames(design) <- levels(group) # Fit linear model fit <- lmFit(expr, design) # Create contrast for all pairwise comparisons contrast_matrix <- makeContrasts( Pum_vs_Qui = Pum - Qui, Pum_vs_Rio = Pum - Rio, Qui_vs_Rio = Qui - Rio, levels = design ) fit2 <- contrasts.fit(fit, contrast_matrix) fit2 <- eBayes(fit2) ``` all contrasts ```{r} # Get all contrast names contrast_names <- colnames(fit2$contrasts) # Extract all DEGs per contrast deg_list <- lapply(contrast_names, function(contrast) { tt <- topTable(fit2, coef = contrast, number = Inf, sort.by = "P") # all genes tt$gene_id <- rownames(tt) tt$contrast <- contrast tt }) # Combine into one long data frame all_degs <- bind_rows(deg_list) ``` ```{r} # Filter for significant DEGs across all contrasts sig_degs <- all_degs %>% filter(adj.P.Val < 0.05 & abs(logFC) > 1) ``` ```{r} sig_degs %>% count(contrast) ``` ```{r} library(tidyverse) # Step 1: Convert matrix to data frame and calculate variance gene_variances <- as.data.frame(norm_counts) %>% rownames_to_column("gene") %>% mutate(variance = apply(select(., -gene), 1, var)) # Step 2: Select top 500 most variable genes top500 <- gene_variances %>% arrange(desc(variance)) %>% slice_head(n = 500) %>% select(-variance) %>% column_to_rownames("gene") # Step 3: Transpose for PCA (samples as rows) t_counts <- t(top500) # Step 4: Run PCA pca_res <- prcomp(t_counts, scale. = TRUE) # Step 5: Calculate percent variance explained percentVar <- pca_res$sdev^2 / sum(pca_res$sdev^2) # Step 6: Format PCA output for plotting pca_df <- as.data.frame(pca_res$x) %>% rownames_to_column("Sample") # Add group info based on sample name prefix pca_df <- pca_df %>% mutate(Group = case_when( str_starts(Sample, "Pum") ~ "Pum", str_starts(Sample, "Qui") ~ "Qui", str_starts(Sample, "Rio") ~ "Rio", TRUE ~ "Other" )) # Step 7: Plot PCA ggplot(pca_df, aes(x = PC1, y = PC2, color = Group)) + geom_point(size = 4) + xlab(paste0("PC1: ", round(percentVar[1] * 100, 1), "% variance")) + ylab(paste0("PC2: ", round(percentVar[2] * 100, 1), "% variance")) + theme_minimal() + scale_color_brewer(palette = "Set1") # Optional: nice colors ``` ```{r} library(tidyverse) # Convert normalized matrix to data frame df <- as.data.frame(norm_counts) # Extract sample names and group labels from column names sample_groups <- colnames(df) %>% as.data.frame() %>% rename(Sample = 1) %>% mutate(Group = str_extract(Sample, "^[^_]+")) # adjust if needed # Step 1: Transpose to long format: gene, sample, value df_long <- df %>% rownames_to_column("Gene") %>% pivot_longer(-Gene, names_to = "Sample", values_to = "Expression") %>% left_join(sample_groups, by = "Sample") # Step 2: Run one-way ANOVA per gene to test for group differences anova_results <- df_long %>% group_by(Gene) %>% summarise(p_value = tryCatch( summary(aov(Expression ~ Group))[[1]][["Pr(>F)"]][1], error = function(e) NA )) %>% drop_na() %>% arrange(p_value) # Step 3: Get top 500 genes with smallest p-values (most differential) top500_genes <- anova_results %>% slice_head(n = 500) %>% pull(Gene) # Step 4: Subset original matrix top500_matrix <- df[top500_genes, ] # Step 5: Transpose for PCA t_counts <- t(top500_matrix) # Step 6: Run PCA pca_res <- prcomp(t_counts, scale. = TRUE) # Step 7: Calculate % variance percentVar <- pca_res$sdev^2 / sum(pca_res$sdev^2) # Step 8: Format PCA output pca_df <- as.data.frame(pca_res$x) %>% rownames_to_column("Sample") %>% mutate(Group = str_extract(Sample, "^[^_]+")) # Step 9: Plot ggplot(pca_df, aes(x = PC1, y = PC2, color = Group)) + geom_point(size = 4) + xlab(paste0("PC1: ", round(percentVar[1] * 100, 1), "% variance")) + ylab(paste0("PC2: ", round(percentVar[2] * 100, 1), "% variance")) + theme_minimal() + scale_color_brewer(palette = "Set1") ``` ```{r} library(tidyverse) # Step 1: Ensure norm_counts is a data frame df <- as.data.frame(norm_counts) # Step 2: Transpose so samples are rows t_counts <- t(df) # Step 3: Remove genes with zero variance t_counts <- t_counts[, apply(t_counts, 2, var) != 0] # Step 4: Run PCA pca_res <- prcomp(t_counts, scale. = TRUE) # Step 5: Percent variance percentVar <- pca_res$sdev^2 / sum(pca_res$sdev^2) # Step 6: Format PCA output and group info pca_df <- as.data.frame(pca_res$x) %>% rownames_to_column("Sample") %>% mutate(Group = str_extract(Sample, "^[^_]+")) # Step 7: Plot ggplot(pca_df, aes(x = PC1, y = PC2, color = Group)) + geom_point(size = 4) + xlab(paste0("PC1: ", round(percentVar[1] * 100, 1), "% variance")) + ylab(paste0("PC2: ", round(percentVar[2] * 100, 1), "% variance")) + theme_minimal() + scale_color_brewer(palette = "Set1") ```