--- title: "Apul molecular energetic state gene expression analysis" output: github_document: null date: "2025-02-19" editor_options: chunk_output_type: console --- Analysis of gene expression of A pulchra energetic state using the following gene set GO categories: - Glycolysis GO:0006096 - Gluconeogenesis GO:0006094 - Lipolysis/lipid catabolism GO:0016042 - Fatty acid beta oxidation GO:0006635 - Starvation GO:0042594 - Lipid biosynthesis GO:000861 - Protein catabolic process GO:0030163 # Set up Load libraries ```{r} library(ggplot2) library(vegan) library(mixOmics) library(readxl) library(factoextra) library(ggfortify) library(ComplexHeatmap) library(viridis) library(lme4) library(lmerTest) library(emmeans) library(broom.mixed) library(broom) library(tidyverse) ``` # Read in gene count matrix Gene count matrix for all genes ```{r} # raw gene counts data (will filter and variance stabilize) Apul_genes <- read_csv("D-Apul/output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv") Apul_genes <- as.data.frame(Apul_genes) # format gene IDs as rownames (instead of a column) rownames(Apul_genes) <- Apul_genes$gene_id Apul_genes <- Apul_genes%>%select(!gene_id) # load and format metadata metadata <- read_csv("M-multi-species/data/rna_metadata.csv")%>%select(AzentaSampleName, ColonyID, Timepoint) %>% filter(grepl("ACR", ColonyID)) metadata$Sample <- paste(metadata$ColonyID, metadata$Timepoint, sep = "_") colonies <- unique(metadata$ColonyID) # Load physiological data phys<-read_csv("https://github.com/urol-e5/timeseries/raw/refs/heads/master/time_series_analysis/Output/master_timeseries.csv")%>%filter(colony_id_corr %in% colonies)%>% select(colony_id_corr, species, timepoint, site, Host_AFDW.mg.cm2, Sym_AFDW.mg.cm2, Am, AQY, Rd, Ik, Ic, calc.umol.cm2.hr, cells.mgAFDW, prot_mg.mgafdw, Ratio_AFDW.mg.cm2, Total_Chl, Total_Chl_cell, cre.umol.mgafdw) # format timepoint phys$timepoint <- gsub("timepoint", "TP", phys$timepoint) #add column with full sample info phys <- merge(phys, metadata, by.x = c("colony_id_corr", "timepoint"), by.y = c("ColonyID", "Timepoint")) %>% select(-AzentaSampleName) #add site information into metadata metadata$Site<-phys$site[match(metadata$ColonyID, phys$colony_id_corr)] # Rename gene column names to include full sample info (as in miRNA table) colnames(Apul_genes) <- metadata$Sample[match(colnames(Apul_genes), metadata$AzentaSampleName)] ``` # Gene set 1: Glycolysis GO:0006096 Load in gene set generated by Apul-energy-go script ```{r} glycolysis_go<-read_table(file="D-Apul/output/23-Apul-energy-GO/Apul_blastp-GO:0006096_out.tab")%>%pull(var=1) glycolysis_go <- str_remove(glycolysis_go, "-T1$") glycolysis_go <- str_remove(glycolysis_go, "-T2$") ``` Subset gene count matrix for this gene set. ```{r} glycolysis_genes<-Apul_genes%>% filter(rownames(.) %in% glycolysis_go) ``` Calculate the sum of the total gene set for each sample. ```{r} glycolysis_genes<-as.data.frame(t(glycolysis_genes)) glycolysis_genes$Sample<-rownames(glycolysis_genes) glycolysis_genes<-glycolysis_genes %>% rowwise() %>% mutate(glycolysis_count = sum(c_across(where(is.numeric)))) %>% ungroup()%>% as.data.frame() ``` Merge into master data frame with metadata and physiology as a new column called "glycolysis". ```{r} data<-left_join(phys, glycolysis_genes) ``` Plot over timepoints. ```{r} plot1<-data%>% ggplot(aes(x=timepoint, y=glycolysis_count, group=colony_id_corr))+ facet_wrap(~species)+ geom_point()+ geom_line()+ theme_classic();plot1 ``` Plot as a PCA. ```{r} pca_data <- data %>% select(c(starts_with("FUN"), colony_id_corr, timepoint)) # Identify numeric columns numeric_cols <- sapply(pca_data, is.numeric) # Among numeric columns, find those with non-zero sum non_zero_cols <- colSums(pca_data[, numeric_cols]) != 0 # Combine non-numeric columns with numeric columns that have non-zero sum pca_data_cleaned <- cbind( pca_data[, !numeric_cols], # All non-numeric columns pca_data[, numeric_cols][, non_zero_cols] # Numeric columns with non-zero sum ) ``` ```{r} scaled.pca<-prcomp(pca_data_cleaned%>%select(where(is.numeric)), scale=TRUE, center=TRUE) ``` Prepare a PCA plot ```{r} # scale data vegan <- scale(pca_data_cleaned%>%select(where(is.numeric))) # PerMANOVA permanova<-adonis2(vegan ~ timepoint, data = pca_data_cleaned, method='eu') permanova ``` Significant differences in glycolysis gene expression profile between time points. View by species ```{r} plot2<-ggplot2::autoplot(scaled.pca, data=pca_data_cleaned, loadings=FALSE, colour="timepoint", loadings.label.colour="black", loadings.colour="black", loadings.label=FALSE, frame=TRUE, loadings.label.size=5, loadings.label.vjust=-1, size=5) + theme_classic()+ theme(legend.text = element_text(size=18), legend.position="right", plot.background = element_blank(), legend.title = element_text(size=18, face="bold"), axis.text = element_text(size=18), axis.title = element_text(size=18, face="bold"));plot2 ``` # Gene set 1: Glycolysis GO:0006094 Load in gene set generated by Apul-energy-go script ```{r} gluconeo_go<-read_table(file="D-Apul/output/23-Apul-energy-GO/Apul_blastp-GO:0006094_out.tab")%>%pull(var=1) gluconeo_go <- str_remove(gluconeo_go, "-T1$") gluconeo_go <- str_remove(gluconeo_go, "-T2$") gluconeo_go <- str_remove(gluconeo_go, "-T3$") ``` Subset gene count matrix for this gene set. ```{r} gluconeo_genes<-Apul_genes%>% filter(rownames(.) %in% gluconeo_go) ``` Calculate the sum of the total gene set for each sample. ```{r} gluconeo_genes<-as.data.frame(t(gluconeo_genes)) gluconeo_genes$Sample<-rownames(gluconeo_genes) gluconeo_genes<-gluconeo_genes %>% rowwise() %>% mutate(gluconeo_count = sum(c_across(where(is.numeric)))) %>% ungroup()%>% as.data.frame() ``` Merge into master data frame with metadata and physiology as a new column called "glycolysis". ```{r} data2<-left_join(phys, gluconeo_genes) ``` Plot over timepoints. ```{r} plot3<-data2%>% ggplot(aes(x=timepoint, y=gluconeo_count, group=colony_id_corr))+ facet_wrap(~species)+ geom_point()+ geom_line()+ theme_classic();plot3 ``` Plot as a PCA. ```{r} pca_data <- data2 %>% dplyr::select(c(starts_with("FUN"), colony_id_corr, timepoint)) # Identify numeric columns numeric_cols <- sapply(pca_data, is.numeric) # Among numeric columns, find those with non-zero sum non_zero_cols <- colSums(pca_data[, numeric_cols]) != 0 # Combine non-numeric columns with numeric columns that have non-zero sum pca_data_cleaned <- cbind( pca_data[, !numeric_cols], # All non-numeric columns pca_data[, numeric_cols][, non_zero_cols] # Numeric columns with non-zero sum ) ``` ```{r} scaled.pca<-prcomp(pca_data_cleaned%>%select(where(is.numeric)), scale=TRUE, center=TRUE) ``` Prepare a PCA plot ```{r} # scale data vegan <- scale(pca_data_cleaned%>%dplyr::select(where(is.numeric))) # PerMANOVA permanova<-adonis2(vegan ~ timepoint, data = pca_data_cleaned, method='eu') permanova ``` Significant differences in gluconeo gene expression profile between time points. View by species ```{r} plot4<-ggplot2::autoplot(scaled.pca, data=pca_data_cleaned, loadings=FALSE, colour="timepoint", loadings.label.colour="black", loadings.colour="black", loadings.label=FALSE, frame=TRUE, loadings.label.size=5, loadings.label.vjust=-1, size=5) + theme_classic()+ theme(legend.text = element_text(size=18), legend.position="right", plot.background = element_blank(), legend.title = element_text(size=18, face="bold"), axis.text = element_text(size=18), axis.title = element_text(size=18, face="bold"));plot4 ```