---
title: "32-gene-loading-histo"
output: html_document
date: "2025-11-20"
---
Lets plot histograms of the distibutions of loading scores for all gene wit rank35 - lambda .02
~/GitHub/timeseries_molecular/M-multi-species/output/27-rank35-optimization/lambda_gene_0.2/barnacle_factors/gene_factors.csv
```{r, echo=FALSE, warning=FALSE, message=FALSE}
# Load data
gene_factors <- read_csv("../output/27-rank35-optimization/lambda_gene_0.2/barnacle_factors/gene_factors.csv")
#edit so columns names are shifted 1 to the left
colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1)))
#code to create histogram of gene_factors:
library(ggplot2)
library(tidyr)
# Reshape data to long format
gene_factors_long <- gene_factors %>%
pivot_longer(cols = -1, names_to = "component", values_to = "loading")
# Plot histogram
ggplot(gene_factors_long, aes(x = loading)) +
geom_histogram(bins = 50, fill = "blue", alpha = 0.7) +
facet_wrap(~ component, scales = "free") +
theme_minimal() +
labs(title = "Distribution of Gene Loadings for Rank 35 Components",
x = "Gene Loading",
y = "Count")
# Plot histogram of only comp5
ggplot(gene_factors_long %>% filter(component == "comp5"), aes(x = loading)) +
geom_histogram(bins = 50, fill = "green", alpha = 0.7) +
theme_minimal() +
labs(title = "Distribution of Gene Loadings for Component 5",
x = "Gene Loading",
y = "Count")
#plot histogram of only comp24 and make pink and draw lines indicating standard deviations
ggplot(gene_factors_long %>% filter(component == "comp5"), aes(x = loading)) +
geom_histogram(bins = 50, fill = "green", alpha = 0.7) +
geom_vline(aes(xintercept = mean(loading)), color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = mean(loading) + sd(loading)), color = "blue", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = mean(loading) - sd(loading)), color = "blue", linetype = "dashed", size = 1) +
theme_minimal() +
labs(title = "Distribution of Gene Loadings for Component 5",
x = "Gene Loading",
y = "Count")
ggplot(gene_factors_long %>% filter(component == "comp24"), aes(x = loading)) +
geom_histogram(bins = 50, fill = "pink", alpha = 0.7) +
geom_vline(aes(xintercept = mean(loading)), color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = mean(loading) + sd(loading)), color = "blue", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = mean(loading) - sd(loading)), color = "blue", linetype = "dashed", size = 1) +
theme_minimal() +
labs(title = "Distribution of Gene Loadings for Component 24",
x = "Gene Loading",
y = "Count")
ggplot(gene_factors_long %>% filter(component == "comp12"), aes(x = loading)) +
geom_histogram(bins = 50, fill = "orange", alpha = 0.7) +
geom_vline(aes(xintercept = mean(loading)), color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = mean(loading) + sd(loading)), color = "blue", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = mean(loading) - sd(loading)), color = "blue", linetype = "dashed", size = 1) +
theme_minimal() +
labs(title = "Distribution of Gene Loadings for Component 24",
x = "Gene Loading",
y = "Count")
```
```{r}
# exract genes from comp5 where loading is greater than mean + 1.96*sd
comp5_genes <- gene_factors_long %>%
filter(component == "comp5") %>%
mutate(mean_loading = mean(loading),
sd_loading = sd(loading),
upper95 = mean_loading + 1.96 * sd_loading) %>%
filter(loading > upper95) %>%
arrange(desc(loading))
```
```{r}
library(dplyr)
df <- gene_factors_long %>%
filter(component == "comp5")
m <- mean(df$loading, na.rm = TRUE)
sd <- sd(df$loading, na.rm = TRUE)
genes_above_sd <- df %>%
filter(loading > m + sd) %>%
pull(gene) # change column name if your gene column is named differently
genes_above_sd
```
```{r}
library(dplyr)
df <- gene_factors_long %>%
filter(component == "comp5")
m <- mean(df$loading, na.rm = TRUE)
sd <- sd(df$loading, na.rm = TRUE)
genes_above_sd <- df %>%
filter(loading > m + sd) %>%
pull(gene) # change column name if your gene column is named differently
genes_above_sd
```
```{r}
upper95 <- m + 1.96 * sd
lower95 <- m - 1.96 * sd
genes_outside_95 <- df %>%
filter(loading > upper95 | loading < lower95) %>%
pull(gene)
```
#
```{r}
library(dplyr)
library(ggplot2)
df <- gene_factors_long %>%
filter(component == "comp12")
m <- mean(df$loading, na.rm = TRUE)
sd <- sd(df$loading, na.rm = TRUE)
lower95 <- m - 1.96 * sd
upper95 <- m + 1.96 * sd
ggplot(df, aes(x = loading)) +
geom_histogram(bins = 50, fill = "orange", alpha = 0.7) +
geom_vline(xintercept = m, color = "red", linetype = "dashed", size = 1) +
geom_vline(xintercept = lower95, color = "blue", linetype = "dashed", size = 1) +
geom_vline(xintercept = upper95, color = "blue", linetype = "dashed", size = 1) +
theme_minimal() +
labs(
title = "Distribution of Gene Loadings for Component 12",
x = "Gene Loading",
y = "Count"
)
df <- gene_factors_long %>%
filter(component == "comp5")
m <- mean(df$loading, na.rm = TRUE)
sd <- sd(df$loading, na.rm = TRUE)
lower95 <- m - 1.96 * sd
upper95 <- m + 1.96 * sd
ggplot(df, aes(x = loading)) +
geom_histogram(bins = 50, fill = "green", alpha = 0.7) +
geom_vline(xintercept = m, color = "red", linetype = "dashed", size = 1) +
geom_vline(xintercept = lower95, color = "blue", linetype = "dashed", size = 1) +
geom_vline(xintercept = upper95, color = "blue", linetype = "dashed", size = 1) +
theme_minimal() +
labs(
title = "Distribution of Gene Loadings for Component 12",
x = "Gene Loading",
y = "Count"
)
df <- gene_factors_long %>%
filter(component == "comp24")
m <- mean(df$loading, na.rm = TRUE)
sd <- sd(df$loading, na.rm = TRUE)
lower95 <- m - 1.96 * sd
upper95 <- m + 1.96 * sd
ggplot(df, aes(x = loading)) +
geom_histogram(bins = 50, fill = "grey", alpha = 0.7) +
geom_vline(xintercept = m, color = "red", linetype = "dashed", size = 1) +
geom_vline(xintercept = lower95, color = "blue", linetype = "dashed", size = 1) +
geom_vline(xintercept = upper95, color = "blue", linetype = "dashed", size = 1) +
theme_minimal() +
labs(
title = "Distribution of Gene Loadings for Component 12",
x = "Gene Loading",
y = "Count"
)
```
```{r}
# extract data for comp12 genes that are above upper95
comp5_genes <- gene_factors_long %>%
filter(component == "comp5" & loading > upper95) %>%
arrange(desc(loading))
```