--- title: "Exploratory Visualizations" name: "Chris" date: "03-01-2024" --- # Load Packages ```{r} library(tidyr) library(tidyverse) library(ggplot2) library(vegan) ``` # Get working directory & load data ```{r} getwd() data<- read_csv("/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/data/biomarkerfull.csv") ``` # Inspect data structure ```{r} str(data) ``` # Inspect data summary stats ## Note the following measurement types Weight = g\ Length, width, height = mm\ p450, SOD = activity/ (mg/protein)\ Condition factor, economic factor = unitless ```{r} summary(data) ``` # Fix character/ numeric values ```{r} as.numeric("weight_initial", "weight_change", "length", "width", "height", "condition_factor", "economic_index") ``` # Detect outliers Code from stats notes/ stack overflow - updated with Chat GPT ```{r} detect_outliers_mad <- function(data, accuracy = 0.99) { # Function to calculate MAD mad_function <- function(x) { median(abs(x - median(x))) } # Calculate z-score equivalent for the given accuracy z_threshold <- qnorm(accuracy + (1 - accuracy) / 2) # Calculate MAD threshold mad_threshold <- z_threshold # Initialize a list to store outlier indices for each numeric column outliers_list <- list() # Initialize a vector to keep track of rows with outliers rows_with_outliers <- rep(FALSE, nrow(data)) # Loop through each column in the dataframe for (col_name in names(data)) { # Check if the column is numeric if (is.numeric(data[[col_name]])) { # Calculate MAD and median for the column mad_value <- mad_function(data[[col_name]]) median_value <- median(data[[col_name]]) # Calculate the deviation scores (using a modified z-score) deviation_scores <- 0.6745 * (data[[col_name]] - median_value) / mad_value # Identify indices of outliers outlier_indices <- which(abs(deviation_scores) > mad_threshold) # Store the indices in the list outliers_list[[col_name]] <- outlier_indices # Update rows with outliers rows_with_outliers[outlier_indices] <- TRUE } } # Calculate the number of outliers in each column num_outliers_each_col <- sapply(outliers_list, length) # Calculate the number of rows with at least one outlier num_rows_with_outliers <- sum(rows_with_outliers) # Combine the results into one vector combined_results <- c(num_outliers_each_col, "Rows w/ Outliers" = num_rows_with_outliers) # Return the combined results return(combined_results) } # Detect outliers using the previously defined function outliers_info <- detect_outliers_mad(data) # Check if there are any outliers if (all(outliers_info == 0)) { print("There are no outliers in any columns.") } else { # Create a data frame for plotting outliers_data_df <- data.frame( Column = names(outliers_info), Outliers = outliers_info, OutlierPercentage = (outliers_info / nrow(df)) * 100 # Calculate the percentage of outliers ) # Plot the number of outliers for all columns with labels ggplot(outliers_data_df, aes(x = Column, y = Outliers, fill = Column)) + geom_bar(stat = "identity") + geom_text(aes(label = sprintf("%.2f%%", OutlierPercentage)), hjust = -0.3, vjust = 0) + # Add labels to the bars coord_flip() + # Makes the bars horizontal labs(title = "Number of Outliers by Column", x = "Column", y = "Number of Outliers") + scale_fill_brewer(palette = "Set3") + # Use a color palette for different bars theme(legend.position = "none", axis.title.y = element_blank()) + # Remove the legend scale_y_continuous(expand = expansion(mult = c(0, 0.2))) # Extend the y-axis limits } ggsave(plot= plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/outliers.png", width=15, height=8) ``` #Alternate QC Plots Code from stats notes/ stack overflow - updated with Chat GPT ```{r} #library(gridExtra) #install.packages("e1071") #library("e1071") # for skewness # Function to check if the variable is highly skewed is_highly_skewed <- function(x) { # Remove missing values before computing skewness x <- na.omit(x) abs(e1071::skewness(x)) > 1 } create_plots <- function(data, var) { skewness_value <- round(e1071::skewness(na.omit(data[[var]])), 2) # Histogram p1 <- ggplot(data, aes_string(x = var)) + geom_histogram(bins = 15, fill = "skyblue", color = "black") + ggtitle(paste("Histogram of", var)) + annotate("text", x = Inf, y = Inf, label = paste("Skewness:", skewness_value), hjust = 1.1, vjust = 1.1) # QQ Plot with reference line p2 <- ggplot(data, aes_string(sample = var)) + stat_qq() + stat_qq_line(color = "red") + ggtitle(paste("QQ Plot of", var)) if (is_highly_skewed(data[[var]])) { df$log_transformed <- log(data[[var]] + 1) skewness_log_value <- round(e1071::skewness(na.omit(data$log_transformed)), 2) # Histogram after log transformation p3 <- ggplot(data, aes(x = log_transformed)) + geom_histogram(bins = 15, fill = "lightgreen", color = "black") + ggtitle(paste("Log-transformed", var)) + annotate("text", x = Inf, y = Inf, label = paste("Skewness:", skewness_log_value), hjust = 1.1, vjust = 1.1) # QQ Plot after log transformation p4 <- ggplot(data, aes(sample = log_transformed)) + stat_qq() + stat_qq_line(color = "red") + ggtitle(paste("log-transformed", var)) return(grid.arrange(p1, p2, p3, p4, ncol = 2)) } else { return(grid.arrange(p1, p2, ncol = 2)) } } # If the dataframe has more than 2000 rows, sample 2000 rows randomly df_reduced <- data if (nrow(df) > max_data_points_eda) { set.seed(123) # Set a random seed for reproducibility df_reduced <- df_reduced[sample(nrow(df_reduced), 2000), ] cat("Since there are too many rows, ", max_data_points_eda," randomly selected rows are shown scatter plots.") } # Apply the function to each numeric variable in the dataframe lapply(names(df_reduced)[sapply(df_reduced, is.numeric)], function(var) create_plots(df, var)) ggsave(plot= plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/qcplots.png", width=15, height=8) ``` # Plot all ```{r} plot<- ggplot(data) + geom_point(aes(x = site_name, y = SOD, color = "SOD")) + geom_point(aes(x = site_name, y = p450, color = "P450")) + labs(x = "Site Name", y = "Value", color = "Biomarker") + theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/allbysite.png", width=15, height=8) ``` # Plot ranked p450 all sites ```{r} # Order the sites by p450 value data_ordered <- data[order(data$p450),] # Create a factor with the ordered site names data_ordered$site_name <- factor(data_ordered$site_name, levels = unique(data_ordered$site_name)) # Plot the SOD values in a boxplot with ordered site names plot<- ggplot(data_ordered, aes(x = site_name, y = p450)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x labels if needed ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/p450ranked.png", width=15, height=8) ``` # Plot ranked SOD all sites ```{r} # Order the sites by SOD value data_ordered <- data[order(data$SOD),] # Create a factor with the ordered site names data_ordered$site_name <- factor(data_ordered$site_name, levels = unique(data_ordered$site_name)) # Plot the SOD values in a boxplot with ordered site names plot<- ggplot(data_ordered, aes(x = site_name, y = SOD)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x labels if needed ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/sodranked.png", width=15, height=8) ``` # Plot histogram of the p450 values with a density curve ```{r} ggplot(data, aes(x = p450)) + geom_histogram(aes(y = ..density..), binwidth = diff(range(data$p450))/30, colour = "black", fill = "white") + geom_density(alpha = .2, fill = "#FF6666") + labs(x = "P450 Values", y = "Density", title = "Histogram of P450 Values with Density Curve") + theme_minimal() ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/p450histogram.png", width=15, height=8) ``` #Plot histogram of SOD with a density curve ```{r} ggplot(data, aes(x = SOD)) + geom_histogram(aes(y = ..density..), binwidth = diff(range(data$SOD))/30, colour = "black", fill = "white") + geom_density(alpha = .2, fill = "#FF6666") + labs(x = "SOD Values", y = "Density", title = "Histogram of SOD Values with Density Curve") + theme_minimal() ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/sodhistogram.png", width=15, height=8) ``` # Plot p450 by site (lat/ long) ```{r} ggplot(data, aes(x = longitude, y = latitude, color = p450)) + geom_point() + scale_color_gradient(low = "blue", high = "red") + labs(x = "Longitude", y = "Latitude", color = "p450 Value") + theme_minimal() ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/p450map.png", width=15, height=8) ``` # Plot SOD by site (lat/ long) ```{r} ggplot(data, aes(x = longitude, y = latitude, color = SOD)) + geom_point() + scale_color_gradient(low = "blue", high = "red") + labs(x = "Longitude", y = "Latitude", color = "SOD Value") + theme_minimal() ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/sodmap.png", width=15, height=8) ``` # Correlation ## Values ```{r} #pearson correlation #plain pearson if I want # cor.test(df$site_number, df$sod, method = "pearson") # Convert condition_factor to numeric if it's not already data$condition_factor <- as.numeric(data$condition_factor) # Perform Pearson correlation with NA values ignored cor(data[c("p450", "SOD", "condition_factor")], use = "complete.obs", method = "pearson") ``` ## visualization (fix df/ data and any other data name issues) ```{r} library(corrplot) df2 <- data[sapply(data, is.numeric)] df2 <- na.omit(df2) M <- cor(df2) testRes <- cor.mtest(df2, conf.level = 0.95) ## leave blank on non-significant coefficient ## add significant correlation coefficients corrplot(M, p.mat = testRes$p, method = "circle", type = "lower", insig = "blank", addCoef.col = "black", number.cex = 0.8, order = "AOE", diag = FALSE ) ggsave(plot=plot, filename="/home/shared/8TB_HDD_02/cnmntgna/GitHub/WDFWmussels/output/correlation.png", width=15, height=8) ``` # PCA (fix df/ data and any other data name issues) ```{r} #install.packages("FactoMineR") <<<<<<< HEAD:output/exploratoryviz.Rmd #library('FactoMineR') library(factoextra) ======= #install.packages("factoextra") library('FactoMineR') library("factoextra") >>>>>>> 5a4d0b098c45ddd9b4b6bff47983d1b2b1b84df0:code/exploratoryviz.Rmd # Ensure that condition_factor and economic_index are numeric data$condition_factor <- as.numeric(data$condition_factor) data$economic_index <- as.numeric(data$economic_index) # Remove NAs from the dataset df_clean <- na.omit(data) # Selecting the relevant variables for PCA pca_data <- df_clean[, c("SOD", "p450", "condition_factor", "economic_index")] # Performing PCA pca_res <- PCA(pca_data, scale.unit = TRUE, graph = FALSE) # Plotting the PCA fviz_pca_biplot(pca_res, label = "var", col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE) # Avoid text overlapping (slow if many points) ```