--- title: "32-gene-loading-histo" output: html_document date: "2025-11-20" --- Lets plot histograms of the distibutions of loading scores for all gene wit rank35 - lambda .02 ~/GitHub/timeseries_molecular/M-multi-species/output/27-rank35-optimization/lambda_gene_0.2/barnacle_factors/gene_factors.csv ```{r, echo=FALSE, warning=FALSE, message=FALSE} # Load data gene_factors <- read_csv("../output/27-rank35-optimization/lambda_gene_0.2/barnacle_factors/gene_factors.csv") #edit so columns names are shifted 1 to the left colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1))) #code to create histogram of gene_factors: library(ggplot2) library(tidyr) # Reshape data to long format gene_factors_long <- gene_factors %>% pivot_longer(cols = -1, names_to = "component", values_to = "loading") # Plot histogram ggplot(gene_factors_long, aes(x = loading)) + geom_histogram(bins = 50, fill = "blue", alpha = 0.7) + facet_wrap(~ component, scales = "free") + theme_minimal() + labs(title = "Distribution of Gene Loadings for Rank 35 Components", x = "Gene Loading", y = "Count") # Plot histogram of only comp5 ggplot(gene_factors_long %>% filter(component == "comp5"), aes(x = loading)) + geom_histogram(bins = 50, fill = "green", alpha = 0.7) + theme_minimal() + labs(title = "Distribution of Gene Loadings for Component 5", x = "Gene Loading", y = "Count") #plot histogram of only comp24 and make pink and draw lines indicating standard deviations ggplot(gene_factors_long %>% filter(component == "comp5"), aes(x = loading)) + geom_histogram(bins = 50, fill = "green", alpha = 0.7) + geom_vline(aes(xintercept = mean(loading)), color = "red", linetype = "dashed", size = 1) + geom_vline(aes(xintercept = mean(loading) + sd(loading)), color = "blue", linetype = "dashed", size = 1) + geom_vline(aes(xintercept = mean(loading) - sd(loading)), color = "blue", linetype = "dashed", size = 1) + theme_minimal() + labs(title = "Distribution of Gene Loadings for Component 5", x = "Gene Loading", y = "Count") ggplot(gene_factors_long %>% filter(component == "comp24"), aes(x = loading)) + geom_histogram(bins = 50, fill = "pink", alpha = 0.7) + geom_vline(aes(xintercept = mean(loading)), color = "red", linetype = "dashed", size = 1) + geom_vline(aes(xintercept = mean(loading) + sd(loading)), color = "blue", linetype = "dashed", size = 1) + geom_vline(aes(xintercept = mean(loading) - sd(loading)), color = "blue", linetype = "dashed", size = 1) + theme_minimal() + labs(title = "Distribution of Gene Loadings for Component 24", x = "Gene Loading", y = "Count") ggplot(gene_factors_long %>% filter(component == "comp12"), aes(x = loading)) + geom_histogram(bins = 50, fill = "orange", alpha = 0.7) + geom_vline(aes(xintercept = mean(loading)), color = "red", linetype = "dashed", size = 1) + geom_vline(aes(xintercept = mean(loading) + sd(loading)), color = "blue", linetype = "dashed", size = 1) + geom_vline(aes(xintercept = mean(loading) - sd(loading)), color = "blue", linetype = "dashed", size = 1) + theme_minimal() + labs(title = "Distribution of Gene Loadings for Component 24", x = "Gene Loading", y = "Count") ``` ```{r} # exract genes from comp5 where loading is greater than mean + 1.96*sd comp5_genes <- gene_factors_long %>% filter(component == "comp5") %>% mutate(mean_loading = mean(loading), sd_loading = sd(loading), upper95 = mean_loading + 1.96 * sd_loading) %>% filter(loading > upper95) %>% arrange(desc(loading)) ``` ```{r} library(dplyr) df <- gene_factors_long %>% filter(component == "comp5") m <- mean(df$loading, na.rm = TRUE) sd <- sd(df$loading, na.rm = TRUE) genes_above_sd <- df %>% filter(loading > m + sd) %>% pull(gene) # change column name if your gene column is named differently genes_above_sd ``` ```{r} library(dplyr) df <- gene_factors_long %>% filter(component == "comp5") m <- mean(df$loading, na.rm = TRUE) sd <- sd(df$loading, na.rm = TRUE) genes_above_sd <- df %>% filter(loading > m + sd) %>% pull(gene) # change column name if your gene column is named differently genes_above_sd ``` ```{r} upper95 <- m + 1.96 * sd lower95 <- m - 1.96 * sd genes_outside_95 <- df %>% filter(loading > upper95 | loading < lower95) %>% pull(gene) ``` # ```{r} library(dplyr) library(ggplot2) df <- gene_factors_long %>% filter(component == "comp12") m <- mean(df$loading, na.rm = TRUE) sd <- sd(df$loading, na.rm = TRUE) lower95 <- m - 1.96 * sd upper95 <- m + 1.96 * sd ggplot(df, aes(x = loading)) + geom_histogram(bins = 50, fill = "orange", alpha = 0.7) + geom_vline(xintercept = m, color = "red", linetype = "dashed", size = 1) + geom_vline(xintercept = lower95, color = "blue", linetype = "dashed", size = 1) + geom_vline(xintercept = upper95, color = "blue", linetype = "dashed", size = 1) + theme_minimal() + labs( title = "Distribution of Gene Loadings for Component 12", x = "Gene Loading", y = "Count" ) df <- gene_factors_long %>% filter(component == "comp5") m <- mean(df$loading, na.rm = TRUE) sd <- sd(df$loading, na.rm = TRUE) lower95 <- m - 1.96 * sd upper95 <- m + 1.96 * sd ggplot(df, aes(x = loading)) + geom_histogram(bins = 50, fill = "green", alpha = 0.7) + geom_vline(xintercept = m, color = "red", linetype = "dashed", size = 1) + geom_vline(xintercept = lower95, color = "blue", linetype = "dashed", size = 1) + geom_vline(xintercept = upper95, color = "blue", linetype = "dashed", size = 1) + theme_minimal() + labs( title = "Distribution of Gene Loadings for Component 12", x = "Gene Loading", y = "Count" ) df <- gene_factors_long %>% filter(component == "comp24") m <- mean(df$loading, na.rm = TRUE) sd <- sd(df$loading, na.rm = TRUE) lower95 <- m - 1.96 * sd upper95 <- m + 1.96 * sd ggplot(df, aes(x = loading)) + geom_histogram(bins = 50, fill = "grey", alpha = 0.7) + geom_vline(xintercept = m, color = "red", linetype = "dashed", size = 1) + geom_vline(xintercept = lower95, color = "blue", linetype = "dashed", size = 1) + geom_vline(xintercept = upper95, color = "blue", linetype = "dashed", size = 1) + theme_minimal() + labs( title = "Distribution of Gene Loadings for Component 12", x = "Gene Loading", y = "Count" ) ``` ```{r} # extract data for comp12 genes that are above upper95 comp5_genes <- gene_factors_long %>% filter(component == "comp5" & loading > upper95) %>% arrange(desc(loading)) ```