--- title: "07-Epimachinary Counts" author: "Steven Roberts" output: github_document: toc: true toc_depth: 3 number_sections: true html_preview: false editor_options: markdown: wrap: sentence --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(Biostrings) library(tm) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center", # Align plots to the center comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` Based on blast we have list of genes.. # Epi Machinery ```{r, engine='bash'} head ../output/03-Ptua-epimods-blast/Mach-blastp-Ptua_out.tab ``` ```{r} ### Load gene expression data ### # The gene count matrix already has properly formatted column names (ACR-139-TP1 format) Ptua_genes <- read_csv("https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/F-Ptua/output/02.20-F-Ptua-RNAseq-alignment-HiSat2/ptua-gene_count_matrix.csv") Ptua_genes <- as.data.frame(Ptua_genes) # Set gene_id as rownames for easier merging rownames(Ptua_genes) <- Ptua_genes$gene_id ``` ```{r} # Read blast results and clean gene IDs blast_data <- read.delim("../output/03-Ptua-epimods-blast/Mach-blastp-Ptua_out.tab", header = FALSE) # Remove the "-T1" (or any "-T") suffix from gene IDs in column 2 blast_data$V2 <- sub("-T[0-9]+$", "", blast_data$V2) # Preview the cleaned data head(blast_data[, 1:3]) # Show first 3 columns for clarity ``` ```{r} blast_data$V2 <- paste0("gene-", blast_data$V2) # Merge blast results with gene expression data # Keep gene_id as a column for merging, then merge by gene IDs epi_mach_exp <- merge(blast_data, Ptua_genes, by.x = "V2", by.y = "gene_id", all.x = TRUE) # Handle duplicate gene_ids by keeping only the row with the lowest e-value (V11) # First, convert V11 to numeric for proper comparison epi_mach_exp$V11 <- as.numeric(epi_mach_exp$V11) # Keep only the best hit (lowest e-value) for each gene_id epi_mach_exp <- epi_mach_exp %>% group_by(V2) %>% slice_min(V11, n = 1, with_ties = FALSE) %>% ungroup() # Clean up the result: remove columns V3-V12 and rename remaining columns # Keep V1 (protein match), V2 (gene_id), and all gene expression columns cols_to_remove <- paste0("V", 3:12) epi_mach_exp <- epi_mach_exp[, !colnames(epi_mach_exp) %in% cols_to_remove] # Rename V1 to protein_match and V2 to gene_id colnames(epi_mach_exp)[colnames(epi_mach_exp) == "V1"] <- "protein_match" colnames(epi_mach_exp)[colnames(epi_mach_exp) == "V2"] <- "gene_id" # Preview the cleaned result cat("Cleaned data dimensions:", dim(epi_mach_exp), "\n") cat("Column names (first 10):", colnames(epi_mach_exp)[1:10], "\n") cat("Number of unique gene_ids:", length(unique(epi_mach_exp$gene_id)), "\n") head(epi_mach_exp[, 1:5]) # Show first 5 columns ``` ```{bash} mkdir -p ../output/07-Ptua-epimachine-exp ``` ```{r} # Save the epimachinery expression data write.csv(epi_mach_exp, file = "../output/07-Ptua-epimachine-exp/Ptua-epimachine-expression.csv", row.names = FALSE) cat("Saved epimachinery expression data to ../output/07-Ptua-epimachine-exp/Ptua-epimachine-expression.csv") ``` ```{bash} head ../output/07-Ptua-epimachine-exp/Ptua-epimachine-expression.csv ```