--- title: "PCA and PLSDA analysis of ortholog expression" author: "Ariana S Huffmyer, E5 RoL Team" date: "2025" output: html_document: code_folding: hide toc: yes toc_depth: 6 toc_float: yes editor_options: chunk_output_type: console --- Next, try removing or imputing samples that are not represented in all time points. Also try adding factor for colony_id. # Set up ```{r} library(vegan) library(mixOmics) library(ggplot2) library(tidyverse) library(DESeq2) library(factoextra) ``` ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) # The following setting is important, do not omit. options(stringsAsFactors = FALSE) #Set Strings to character ``` # Load data Read in data from MOSAIC. ```{r} orthos<-read_csv("https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/M-multi-species/output/12-ortho-annot/ortholog_groups_annotated.csv") apul<-read_csv("https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv") peve<-read_csv("https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/E-Peve/output/02.20-E-Peve-RNAseq-alignment-HiSat2/peve-gene_count_matrix.csv") ptua<-read_csv("https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/F-Ptua/output/02.20-F-Ptua-RNAseq-alignment-HiSat2/ptua-gene_count_matrix.csv") ``` Select orthologs and join data together into a count matrix. Apul ```{r} apul_o<-orthos%>% select(group_id, apul)%>% mutate(apul=gsub("-T1", "", apul))%>% dplyr::rename(gene_id=apul) apul_mat<-apul%>% filter(gene_id %in% apul_o$gene_id) apul_mat<-left_join(apul_mat, apul_o) apul_mat<-apul_mat%>% select(group_id, gene_id, everything())%>% select(!gene_id) head(apul_mat) ``` Peve ```{r} peve_o<-orthos%>% select(group_id, peve)%>% dplyr::rename(gene_id=peve)%>% mutate(gene_id = paste0("gene-", gene_id)) peve_mat<-peve%>% filter(gene_id %in% peve_o$gene_id) peve_mat<-left_join(peve_mat, peve_o) peve_mat<-peve_mat%>% select(group_id, gene_id, everything())%>% select(!gene_id) head(peve_mat) ``` Ptua ```{r} ptua_o<-orthos%>% select(group_id, ptua)%>% dplyr::rename(gene_id=ptua)%>% mutate(gene_id = paste0("gene-", gene_id)) ptua_mat<-ptua%>% filter(gene_id %in% ptua_o$gene_id) ptua_mat<-left_join(ptua_mat, ptua_o) ptua_mat<-ptua_mat%>% select(group_id, gene_id, everything())%>% select(!gene_id) head(ptua_mat) ``` Join all together. ```{r} dim(apul_mat) dim(peve_mat) dim(ptua_mat) mat<-full_join(apul_mat, peve_mat, by="group_id") mat<-full_join(mat, ptua_mat, by="group_id") dim(mat) ``` 117 samples as expected Create metadata file. ```{r} metadata<-as.data.frame(colnames(mat))%>% dplyr::rename(sample="colnames(mat)")%>% filter(!sample=="group_id")%>% mutate( # Species mapping species = case_when( str_detect(sample, regex("\\bACR\\b", ignore_case = TRUE)) ~ "Acropora", str_detect(sample, regex("\\bPOR\\b", ignore_case = TRUE)) ~ "Porites", str_detect(sample, regex("\\bPOC\\b", ignore_case = TRUE)) ~ "Pocillopora", TRUE ~ NA_character_ ), # Extract timepoint digit from TP1/TP2/TP3/TP4 .tp_digit = str_match(sample, regex("T\\s*P?\\s*([1-4])", ignore_case = TRUE))[, 2], timepoint = if_else(!is.na(.tp_digit), paste0("T", .tp_digit), NA_character_) ) %>% select(-.tp_digit) head(metadata) ``` # Conduct vst transforation Remove any orthologs with NA in any sample. ```{r} dim(mat) #18598 orthos currently mat<-mat%>% filter(if_all(everything(), ~ !is.na(.)))%>% unique() dim(mat) #9800 orthologs present in all samples ``` Conduct VST transformation ```{r} rownames(metadata)<-metadata$sample metadata<-metadata%>% select(!sample) head(metadata) counts<-as.data.frame(mat) rownames(counts)<-counts$group_id counts<-counts%>% select(!group_id) head(counts) # --- Construct DESeqDataSet --- dds <- DESeqDataSetFromMatrix( countData = counts, colData = metadata, design = ~ species * timepoint # or your formula, e.g., ~ condition ) # --- VST --- # blind = TRUE ignores the design for estimating the trend (good for QC/PCA). # Set blind = FALSE if you have a strong design and want to retain it during transformation. vst_obj <- vst(dds, blind = TRUE) # Extract the transformed matrix (genes x samples, roughly homoskedastic) vst_mat <- assay(vst_obj) head(vst_mat) vst_df<-as.data.frame(vst_mat) vst_df$group_id<-rownames(vst_df) vst_df<-vst_df%>% select(group_id, everything()) #output data write_csv(vst_df, "M-multi-species/output/14-pca-orthologs/vst_counts_matrix.csv") ``` Count matrix is now transformed and ready for PCA. # Conduct PCA/PERMANOVA Export data for PERMANOVA test. ```{r} test<-t(vst_mat) #export as matrix test<-as.data.frame(test) #add category columns test$sample<-rownames(test) test$species<-metadata$species[match(test$sample, rownames(metadata))] test$timepoint<-metadata$timepoint[match(test$sample, rownames(metadata))] ``` Build PERMANOVA model for all genes in the dataset. ```{r} dim(test) scaled_test <-prcomp(test[c(1:9800)], scale=TRUE, center=TRUE) fviz_eig(scaled_test, ncp=20) # scale data vegan <- scale(test[c(1:9800)]) # PerMANOVA permanova<-adonis2(vegan ~ species*timepoint, data = test, method='eu') permanova ``` adonis2(formula = vegan ~ species * timepoint, data = test, method = "eu") Df SumOfSqs R2 F Pr(>F) species 2 486499 0.42813 47.1107 0.001 *** timepoint 3 43356 0.03815 2.7990 0.001 *** species:timepoint 6 64327 0.05661 2.0764 0.001 *** Residual 105 542153 0.47711 Total 116 1136336 1.00000 Ortholog expression is significantly different between species and timepoint, with significant interactions. ## Examine PCA and sample distances of all genes Plot a PCA of samples by species and timepoint for all orthologs. ```{r} gPCAdata <- plotPCA(vst_obj, intgroup = c("species", "timepoint"), returnData=TRUE, ntop=9800) percentVar <- round(100*attr(gPCAdata, "percentVar")) #plot PCA of samples with all data allgenesfilt_PCA <- ggplot(gPCAdata, aes(PC1, PC2, color=species, shape=timepoint)) + geom_point(size=3) + ggtitle(expression(bold("All orthologs (9,800 genes)")))+ xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + theme_classic() + #Set background color ggtitle("A")+ theme(panel.border = element_blank(), # Set border axis.line = element_line(colour = "black"), #Set axes color plot.background=element_blank())+ stat_ellipse(); allgenesfilt_PCA ``` View the loadings of orthologs along PC1 and PC2, which explain a majority of the variance (55%). ```{r} library(matrixStats) library(ggrepel) ## 1) Get the VST matrix mat <- assay(vst_obj) # genes x samples (already variance-stabilized) ## 2) Select the same top-variable genes as plotPCA(ntop = 9800) rv <- rowVars(mat) ntop <- min(9800, nrow(mat)) top_genes <- order(rv, decreasing = TRUE)[seq_len(ntop)] mat_top <- mat[top_genes, ] ## 3) Run PCA on (genes x samples) -> transpose for prcomp # center=TRUE centers each variable (gene); scale.=FALSE is typical for VST pr <- prcomp(t(mat_top), center = TRUE, scale. = FALSE) ## 4) Percent variance (for axis labels) pvar <- (pr$sdev^2) / sum(pr$sdev^2) pv1 <- round(100 * pvar[1]) pv2 <- round(100 * pvar[2]) ## 5) Build sample scores data frame and attach metadata scores <- as.data.frame(pr$x[, 1:2]) %>% rownames_to_column("sample") %>% left_join( as.data.frame(colData(vst_obj)) %>% rownames_to_column("sample") %>% select(sample, species, timepoint), by = "sample" ) ## 6) Build loadings (genes) data frame loadings <- as.data.frame(pr$rotation[, 1:2]) %>% rownames_to_column("ortholog") %>% mutate(magnitude = sqrt(PC1^2 + PC2^2)) ## 7) Choose top features to display (by loading magnitude on PC1/PC2 plane) top_n_loadings <- 30 loads_top <- loadings %>% arrange(desc(magnitude)) %>% slice_head(n = top_n_loadings) ## 8) Make a biplot: samples + top loading arrows (scaled to fit nicely) # Compute a scale factor so arrows fit within the sample score ranges sx <- diff(range(scores$PC1)) sy <- diff(range(scores$PC2)) lx <- diff(range(loads_top$PC1)) ly <- diff(range(loads_top$PC2)) arrow_scale <- 0.7 * min(sx / (lx %||% 1e-9), sy / (ly %||% 1e-9)) # %||% avoids divide-by-zero biplot_gg <- ggplot(scores, aes(PC1, PC2, color = species, shape = timepoint)) + geom_point(size = 3) + stat_ellipse(linewidth = 0.3, linetype = 2, alpha = 0.6) + # loading arrows geom_segment(data = loads_top, aes(x = 0, y = 0, xend = PC1 * arrow_scale, yend = PC2 * arrow_scale), arrow = arrow(length = unit(0.15, "cm")), linewidth = 0.3, color = "gray30", inherit.aes = FALSE) + geom_text_repel(data = loads_top, aes(x = PC1 * arrow_scale, y = PC2 * arrow_scale, label = ortholog), size = 3, color = "gray20", max.overlaps = 50, segment.size = 0.2, inherit.aes = FALSE) + labs(title = "PCA biplot: samples + top 30 ortholog loadings", x = paste0("PC1 (", pv1, "%)"), y = paste0("PC2 (", pv2, "%)")) + theme_classic();biplot_gg ``` Try lollipop plot. ```{r} # Top loadings along PC1 (abs value) pc1_lolli <- loadings %>% slice_max(order_by = abs(PC1), n = 30) %>% mutate(ortholog = reorder(ortholog, abs(PC1))) ggplot(pc1_lolli, aes(x = abs(PC1), y = ortholog)) + geom_segment(aes(x = 0, xend = abs(PC1), y = ortholog, yend = ortholog)) + geom_point() + labs(title = "Top PC1 loadings (ortholog)", x = "PC1 loading", y = NULL) + theme_classic() # Top loadings along PC2 (abs value) pc2_lolli <- loadings %>% slice_max(order_by = abs(PC2), n = 25) %>% mutate(ortholog = reorder(ortholog, abs(PC2))) ggplot(pc2_lolli, aes(x = abs(PC2), y = ortholog)) + geom_segment(aes(x = 0, xend = abs(PC2), y = ortholog, yend = ortholog)) + geom_point() + labs(title = "Top PC2 loadings (ortholog)", x = "PC2 loading", y = NULL) + theme_classic() ``` Plot PC 3 and 4 ```{r} gPCAdata <- plotPCA(vst_obj, intgroup = c("species", "timepoint"), returnData=TRUE, ntop=9800, pcsToUse=3:4) percentVar <- round(100*attr(gPCAdata, "percentVar")) #plot PCA of samples with all data allgenesfilt_PCA <- ggplot(gPCAdata, aes(PC3, PC4, color=timepoint, shape=species, group=timepoint)) + geom_point(size=3) + xlab(paste0("PC3: ",percentVar[1],"% variance")) + ylab(paste0("PC4: ",percentVar[2],"% variance")) + theme_classic() + #Set background color theme(panel.border = element_blank(), # Set border axis.line = element_line(colour = "black"), #Set axes color plot.background=element_blank())+ stat_ellipse(); allgenesfilt_PCA ``` Plot PC 5 and 6 ```{r} gPCAdata <- plotPCA(vst_obj, intgroup = c("species", "timepoint"), returnData=TRUE, ntop=9800, pcsToUse=5:6) percentVar <- round(100*attr(gPCAdata, "percentVar")) #plot PCA of samples with all data allgenesfilt_PCA <- ggplot(gPCAdata, aes(PC5, PC6, color=timepoint, shape=species, group=timepoint)) + geom_point(size=3) + xlab(paste0("PC5: ",percentVar[1],"% variance")) + ylab(paste0("PC6: ",percentVar[2],"% variance")) + theme_classic() + #Set background color theme(panel.border = element_blank(), # Set border axis.line = element_line(colour = "black"), #Set axes color plot.background=element_blank())+ stat_ellipse(); allgenesfilt_PCA ``` Plot PC 7 and 8 ```{r} gPCAdata <- plotPCA(vst_obj, intgroup = c("species", "timepoint"), returnData=TRUE, ntop=9800, pcsToUse=7:8) percentVar <- round(100*attr(gPCAdata, "percentVar")) #plot PCA of samples with all data allgenesfilt_PCA <- ggplot(gPCAdata, aes(PC7, PC8, color=timepoint, shape=species, group=timepoint)) + geom_point(size=3) + xlab(paste0("PC7: ",percentVar[1],"% variance")) + ylab(paste0("PC8: ",percentVar[2],"% variance")) + theme_classic() + #Set background color theme(panel.border = element_blank(), # Set border axis.line = element_line(colour = "black"), #Set axes color plot.background=element_blank())+ stat_ellipse(); allgenesfilt_PCA ```