--- title: "initial visualization of biomarker data" authors: Chris M date: February 20, 2024 --- # load packages ```{r} #install.packages("vctrs") library(tidyr) library(tidyverse) library(ggplot2) library(vegan) ``` # load data #biomarkerfull= p450 and SOD by site, condition factor and economic index #morphometrics= weight, length, height, and width measurements #shellthickness= shell thickness measurements ```{r} getwd() data<- read_csv("/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/data/biomarkerfull.csv") #morph<- read_csv("data/morphometrics.csv", header= TRUE, sep=",") #thick<- read_csv("data/shellthickness.csv", header= TRUE, sep=",") ``` # inspect data ```{r} str(data) ``` # inspect data with summary ```{r} summary(data) ``` # fix character/ numeric values ```{r} as.numeric("weight_initial", "weight_change", "length", "width", "height", "condition_factor", "economic_index") ``` #still having directory issues, so i need to use the full path ```{r} #getwd() ``` # plotting sites in order of expression value for SOD ```{r} # Order the sites by SOD value data_ordered <- data[order(data$SOD),] # Create a factor with the ordered site names data_ordered$site_name <- factor(data_ordered$site_name, levels = unique(data_ordered$site_name)) # Plot the SOD values in a boxplot with ordered site names plot<- ggplot(data_ordered, aes(x = site_name, y = SOD)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x labels if needed #ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/sodranked.png", width=15, height=8) ``` # plotting sites in order of expression value for p450 ```{r} # Order the sites by p450 value data_ordered <- data[order(data$p450),] # Create a factor with the ordered site names data_ordered$site_name <- factor(data_ordered$site_name, levels = unique(data_ordered$site_name)) # Plot the SOD values in a boxplot with ordered site names plot<- ggplot(data_ordered, aes(x = site_name, y = p450)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x labels if needed #ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/p450ranked.png", width=15, height=8) ``` # plot SOD by site - don't need to do this one right now. finesse the code so that we can plot each site on the same axis, maybe something that compares the lowest 25% to the highest 25% or whatever. ```{r} #loop #set y axis threshold for (site in 1:77) { # Filter the data for the current site sodsite_df <- data[data$site_number == site, ] # Plot the bar chart for the current site p <- ggplot(sodsite_df, aes(x = factor(sample), y = SOD)) + geom_bar(stat = "identity") + labs(x = "Individual Sample", y = "SOD Value") + ggtitle(paste("SOD Values for Site", site, "by Individual Sample")) # Print the plot print(p) } ``` # this visualization doesn't help at all. find a better way to plot location and biomarker level. this could also work better on a map. ```{r} #plotting sod + p450 together ggplot(data, aes(x = longitude, y = latitude)) + geom_point(aes(color = SOD, size = p450)) + scale_color_gradient(low = "blue", high = "red", name = "SOD Value") + scale_size_continuous(name = "P450 Value") + labs(x = "Longitude", y = "Latitude") + theme_minimal() + guides(color = guide_legend(), size = guide_legend()) ``` # plotting all points basic by site ```{r} ggplot(data) + geom_point(aes(x = site_name, y = SOD, color = "SOD")) + geom_point(aes(x = site_name, y = p450, color = "P450")) + labs(x = "Site Name", y = "Value", color = "Biomarker") + theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/allbysite.png", width=15, height=8) ``` # plotting a histogram of the p450 values with a density curve ```{r} ggplot(data, aes(x = p450)) + geom_histogram(aes(y = ..density..), binwidth = diff(range(data$p450))/30, colour = "black", fill = "white") + geom_density(alpha = .2, fill = "#FF6666") + labs(x = "P450 Values", y = "Density", title = "Histogram of P450 Values with Density Curve") + theme_minimal() ``` #plotting a histogram of SOD with a density curve ```{r} ggplot(data, aes(x = SOD)) + geom_histogram(aes(y = ..density..), binwidth = diff(range(data$SOD))/30, colour = "black", fill = "white") + geom_density(alpha = .2, fill = "#FF6666") + labs(x = "SOD Values", y = "Density", title = "Histogram of SOD Values with Density Curve") + theme_minimal() ``` ```{r} #plotting SOD alone ggplot(data, aes(x = longitude, y = latitude, color = SOD)) + geom_point() + scale_color_gradient(low = "blue", high = "red") + labs(x = "Longitude", y = "Latitude", color = "SOD Value") + theme_minimal() ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/sodmap.png", width=15, height=8) ``` ```{r} #plotting p450 alone ggplot(data, aes(x = longitude, y = latitude, color = p450)) + geom_point() + scale_color_gradient(low = "blue", high = "red") + labs(x = "Longitude", y = "Latitude", color = "p450 Value") + theme_minimal() ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/p450map.png", width=15, height=8) ``` #pearson correlation ```{r} #plain pearson if I want # cor.test(df$site_number, df$sod, method = "pearson") # Convert condition_factor to numeric if it's not already data$condition_factor <- as.numeric(data$condition_factor) # Perform Pearson correlation with NA values ignored cor(data[c("p450", "SOD", "condition_factor")], use = "complete.obs", method = "pearson") ``` #pearson correlation with visualization ```{r} # Perform Pearson correlation with NA values ignored cor_matrix <- cor(data[c("p450", "SOD", "condition_factor")], use = "complete.obs", method = "pearson") # Visualize the correlation matrix library(ggplot2) library(reshape2) # Melt the correlation matrix for visualization melted_cor_matrix <- melt(cor_matrix) ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "grey", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + labs(x = "", y = "") ``` ```{r} #this is an ugly plot - try something else data <- data.frame( p450 = as.numeric(rnorm(100)), SOD = as.numeric(rnorm(100)), condition_factor = as.numeric(rnorm(100)) ) #different option for pearson correlation visual cor_matrix <- cor(data[c("p450", "SOD", "condition_factor")], use = "complete.obs", method = "pearson") pairs(data, lower.panel = function(x, y, ...) { points(x, y, ...) text(mean(x, na.rm = TRUE), mean(y, na.rm = TRUE), round(cor_matrix[match(names(data), colnames(cor_matrix))], 2), cex = 0.7) }, upper.panel = NULL, labels = c("p450", "SOD", "condition_factor") ) ``` ```{r} # i don't like this one either # Example data data <- data.frame( p450 = rnorm(100), SOD = rnorm(100), condition_factor = rnorm(100) ) # Compute correlation matrix cor_matrix <- cor(data, use = "complete.obs") # Plot correlation matrix with annotations plot(1, type="n", xlim=c(0, ncol(data)), ylim=c(0, ncol(data)), xlab="", ylab="", axes=FALSE) text(rep(1:ncol(data), each=ncol(data)), rep(ncol(data):1, ncol(data)), format(round(cor_matrix, 2), nsmall = 2), cex=0.7, col="black", font=2) axis(1, at=1:ncol(data), labels=colnames(data), las=2) axis(2, at=ncol(data):1, labels=rev(colnames(data)), las=2) box() ``` # ANOVA ```{r} # Variation within each site for SOD and p450 sod_variation <- aggregate(SOD ~ site_number, data = data, var) #print(sod_variation) p450_variation <- aggregate(p450 ~ site_number, data = data, var) # ANOVA for site and SOD anova_sod <- aov(SOD ~ factor(site_number), data = data) summary(anova_sod) # ANOVA for site and p450 anova_p450 <- aov(p450 ~ factor(site_number), data = data) summary(anova_p450) # ANOVA for SOD and p450 anova_sod_p450 <- aov(SOD ~ p450, data = data) summary(anova_sod_p450) #site and biomarkers are statistically relevant co-factors, but the biomarkers aren't statistically significant to each other. #try to understand the variation stuff, it feels incorrect ``` # Wasteland #SOD plotting from AH ```{r} data$rank <- rank(data$SOD) plot<-data%>% ggplot(aes(x=site_name, y=SOD))+ geom_boxplot()+ geom_point(position=position_dodge(0.3), alpha=1, size=1)+ xlab(expression(bold("Site"))) + ylab(expression(bold(paste("SOD (mg protein)"^-1)))) + theme_classic()+ ylim(0, 150)+ #adjust this to remove values you are going to exclude theme(legend.position="right")+ theme(axis.text.x=element_text(angle=90, hjust=1)); plot ggsave(plot=plot, filename="~/output/sod.png", width=15, height=8) ``` #p450 plot by site #trying to rank the values but it isn't working ```{r} #change order based on value data$rank <- rank(data$p450) plot<-data%>% ggplot(aes(x=site_name, y=p450))+ geom_boxplot()+ geom_point(position=position_dodge(0.3), alpha=1, size=1)+ xlab(expression(bold("Site"))) + ylab(expression(bold(paste("p450 (mg protein)"^-1)))) + theme_classic()+ ylim(29000, 23000000)+ #adjust this to remove values you are going to exclude theme(legend.position="right")+ theme(axis.text.x=element_text(angle=90, hjust=1)); plot ggsave(plot=plot, filename="~/viz/p450.png", width=15, height=8) ``` #plot p450 by site ```{r} #individual # Example for site 21 #site21_df <- df[df$site_number == 21, ] #ggplot(site21_df, aes(x = factor(sample), y = p450)) + #geom_bar(stat = "identity") + #labs(x = "Individual Sample", y = "p450 Value") + #ggtitle("p450 Values for Site 21 by Individual Sample") #loop # Loop through each site from 1 to 77 for (site in 1:77) { # Filter the data for the current site psite_df <- data[data$site_number == site, ] # Check if there are samples for the current site if (nrow(psite_df) > 0) { # Plot the bar chart for the current site p <- ggplot(psite_df, aes(x = factor(sample), y = p450)) + geom_bar(stat = "identity") + labs(x = "Individual Sample", y = "p450 Value") + ggtitle(paste("p450 Values for Site", site, "by Individual Sample")) # Print the plot print(p) } } ``` #pca #correlation plot #combine condition factor, economic facotr + data for correlation and pca? ```{r} # Load necessary libraries library(vegan) # Selecting the relevant variables for CCA cca_data <- df[, c("p450", "sod", "condition_factor", "economic_index")] # Perform CCA cca_result <- cca(cca_data ~ site_name, data = df) # Plot CCA results using ggplot2 cca_plot <- autoplot(cca_result, data = df, colour = 'site_name', size = 3) + theme_minimal() + labs(title = "CCA of Mussel Data") # Print the plot print(cca_plot) ``` #extra stuff ```{r} df_clean <- na.omit(df) library(FactoMineR) library(factoextra) # Ensure that condition_factor and economic_index are numeric df$condition_factor <- as.numeric(df$condition_factor) df$economic_index <- as.numeric(df$economic_index) # Remove NAs from the dataset df_clean <- na.omit(df) # Selecting the relevant variables for PCA pca_data <- df_clean[, c("sod", "p450", "condition_factor", "economic_index")] # Performing PCA pca_res <- PCA(pca_data, scale.unit = TRUE, graph = FALSE) # Plotting the PCA fviz_pca_biplot(pca_res, label = "var", col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE) # Avoid text overlapping (slow if many points) ```