--- title: "2_8_running_model" author: "Aidan Coyle" date: "8/12/2022" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ## Background In this script, we will create a generalized linear mixed model to examine potential associations between _Hematodinium_ infection status and variables within both the environment and the crab. This script will include both male and female crab, and only include crabs for which temperature data is available. #### Load libraries (and install if necessary) ```{r libraries, message=FALSE, warning=FALSE} # Add all required libraries here list.of.packages <- c("tidyverse", "lme4", "MuMIn", "rcompanion", "MASS", "generalhoslem", "mgcv", "beepr", "regclass", "car", "DHARMa", "broom.mixed", "dotwhisker", "glmmTMB", "performance", "effects") # Get names of all required packages that aren't installed new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])] # Install all new packages if(length(new.packages)) install.packages(new.packages) # Load all required libraries lapply(list.of.packages, FUN = function(X) { do.call("require", list(X)) }) # Read in custom functions source("hemat_modeling_functions.R") ``` # Read in data ```{r} crab_dat <- read.csv(file = "../output/ADFG_SE_AK_pot_surveys/cleaned_data/crab_data/BCS_examined_crab_with_temperature.csv") ``` ### Check all variables were read in correctly as either categorical or continuous predictors ```{r} # See class of each column str(crab_dat) # Looks like we've got lots of columns that should be converted to factors! crab_dat$Survey <- factor(crab_dat$Survey) crab_dat$Site <- factor(crab_dat$Site) crab_dat$Sex <- factor(crab_dat$Sex) crab_dat$Recruit.Status <- factor(crab_dat$Recruit.Status) crab_dat$Shell.Condition <- ordered(crab_dat$Shell.Condition) crab_dat$Egg.Condition <- factor(crab_dat$Egg.Condition) crab_dat$Egg.Development <- factor(crab_dat$Egg.Development) crab_dat$Leg.Condition <- factor(crab_dat$Leg.Condition) crab_dat$Bitter <- factor(crab_dat$Bitter) crab_dat$Blackmat <- factor(crab_dat$Blackmat) # See updated class of each column str(crab_dat) ``` ### Check for correlation Prior to modeling anything, we're going to run checks on combinations of variables to see if any are correlated ```{r} # CORRELATION BETWEEN CONTINUOUS VARIABLES crab_nums <- select_if(crab_dat, is.numeric) numcor <- cor(crab_nums, method = "pearson") # See the resulting table print(numcor) # See if any correlations are > 0.6 (our bar for correlation) and less than 1 (since every variable is perfectly correlated with itself) any(abs(numcor) > 0.6 & numcor < 1) which(abs(numcor) > 0.6 & numcor < 1, arr.ind = TRUE) # Looks like we have a tight correlation between latitude and longitude # Temperature and Julian day are pretty close (corr = 0.56), but nothing else is above 0.5 # CORRELATIONS BETWEEN CATEGORICAL VARIABLES # Now we're using Cramer's V test to look at correlation among our categorical variables crabcat <- select_if(crab_dat, is.factor) # Turn all from factors to numeric crabcat[] <- sapply(crabcat, as.numeric) # Initialize a blank matrix results_matrix <- matrix(nrow = length(crabcat), ncol = length(crabcat)) # Name all rows and columns with our variable names colnames(results_matrix) <- names(crabcat) rownames(results_matrix) <- names(crabcat) # Fill the matrix by performing Cramer's V test on each possible combination of factors for (i in 1:ncol(crabcat)) { for (j in 1:ncol(crabcat)) { cramer.table <- table(crabcat[,i],crabcat[,j]) 
### Check for correlation

Prior to modeling anything, we're going to run checks on combinations of variables to see if any are correlated.

```{r}
# CORRELATION BETWEEN CONTINUOUS VARIABLES

crab_nums <- select_if(crab_dat, is.numeric)
numcor <- cor(crab_nums, method = "pearson")

# See the resulting table
print(numcor)

# See if any correlations are > 0.6 in absolute value (our bar for correlation)
# and less than 1 (since every variable is perfectly correlated with itself)
any(abs(numcor) > 0.6 & abs(numcor) < 1)
which(abs(numcor) > 0.6 & abs(numcor) < 1, arr.ind = TRUE)

# Looks like we have a tight correlation between latitude and longitude
# Temperature and Julian day are pretty close (corr = 0.56), but nothing else is above 0.5

# CORRELATIONS BETWEEN CATEGORICAL VARIABLES

# Now we're using Cramer's V to look at correlation among our categorical variables
crabcat <- select_if(crab_dat, is.factor)

# Turn all from factors to numeric
crabcat[] <- sapply(crabcat, as.numeric)

# Initialize a blank matrix
results_matrix <- matrix(nrow = ncol(crabcat), ncol = ncol(crabcat))

# Name all rows and columns with our variable names
colnames(results_matrix) <- names(crabcat)
rownames(results_matrix) <- names(crabcat)

# Fill the matrix by computing Cramer's V for each possible pair of factors
for (i in 1:ncol(crabcat)) {
  for (j in 1:ncol(crabcat)) {
    cramer.table <- table(crabcat[, i], crabcat[, j])
    cramer.matrix <- as.matrix(cramer.table)
    results_matrix[i, j] <- cramerV(cramer.matrix)
  }
}

# See the resulting matrix
print(results_matrix)

# See if any of our correlations (aside from self-correlations) cross our boundary of too much correlation
any(results_matrix > 0.6 & results_matrix < 1)
which(abs(results_matrix) > 0.6 & results_matrix < 1, arr.ind = TRUE)

# Tight correlation between survey and site, which is fine - we weren't planning to include survey in any model
# Correlation of 1 between recruit status and sex too, indicating we should choose one for our models
# We'll pick sex, since recruit status is mostly captured by sex + CW
# No other strong correlations

# CORRELATIONS BETWEEN CATEGORICAL AND CONTINUOUS VARIABLES

# We'll use Spearman rank-order correlation to determine whether we have any correlation
crabrank <- crab_dat
crabrank[] <- sapply(crab_dat, as.numeric)
crabcomps <- cor(crabrank, method = "spearman")
any(abs(crabcomps) > 0.6 & abs(crabcomps) < 1)

# Looks like we do have some strong correlations this time! Let's pull them out
which(abs(crabcomps) > 0.6 & abs(crabcomps) < 1, arr.ind = TRUE)
print(crabcomps)

# Correlations are between:
# Julian day and survey (don't care, not including survey in model)
# Recruit status and CW (already decided to exclude recruit status from model since it's mostly CW + sex)
# Latitude and longitude (we'll likely just use latitude, or skip altogether and just use site)
# Temperature and Julian day (these are two continuous variables, which were already cleared using the more appropriate Pearson metric)
```

# Adjust numeric variables, scaling some

```{r}
# Shift years so our earliest year is 1 (e.g., subtract 2004 if the earliest year is 2005)
crab_dat$s.Year <- crab_dat$Year - (min(crab_dat$Year) - 1)

# Scale chela height, egg percent, latitude, longitude, Julian day, depth, and temperature
crab_dat$s.Chela.Ht <- scale(crab_dat$Chela.Ht)
crab_dat$s.Egg.Percent <- scale(crab_dat$Egg.Percent)
crab_dat$s.Latitude <- scale(crab_dat$Latitude)
crab_dat$s.Longitude <- scale(crab_dat$Longitude)
crab_dat$s.Jul.Day <- scale(crab_dat$Jul.Day)
crab_dat$s.Depth <- scale(crab_dat$Depth)
crab_dat$s.Temp <- scale(crab_dat$Temp)

# We'll scale CW twice - once for males, and once for females.
# This'll capture the sexual dimorphism within Tanner crabs
crab_dat_f <- crab_dat[crab_dat$Sex == "2", ]
crab_dat_m <- crab_dat[crab_dat$Sex == "1", ]
crab_dat_f$s.CW <- scale(crab_dat_f$CW)
crab_dat_m$s.CW <- scale(crab_dat_m$CW)
crab_dat <- rbind(crab_dat_f, crab_dat_m)
```
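The split-scale-rbind dance above can also be done in one grouped step. A minimal equivalent sketch (note that `scale()` returns a one-column matrix, so we wrap it in `as.numeric()`):

```{r eval=FALSE}
# Equivalent grouped scaling: CW is centered and scaled within each sex
crab_dat <- crab_dat %>%
  group_by(Sex) %>%
  mutate(s.CW = as.numeric(scale(CW))) %>%
  ungroup()
```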
#### Trashy lil test model

This is purely to look at the effects of temperature in combination with site and year.

```{r}
# Check that temperature scaled correctly
bitter <- crab_dat
# Convert the Bitter factor to 0/1 via its labels (as.numeric() alone would give 1/2)
bitter$Bitter <- as.numeric(as.character(bitter$Bitter))
bitter$Temp <- round(bitter$Temp, digits = 1)
bitter$s.Temp <- scale(bitter$Temp)
table(bitter$s.Temp)
table(bitter$Temp)
bitter <- bitter %>%
  group_by(s.Temp) %>%
  summarize(bitter_avg = mean(Bitter)) %>%
  ungroup()
ggplot(bitter) +
  geom_point(aes(x = s.Temp, y = bitter_avg))
# Yep, it scaled correctly!

# Alright, let's build a model with just temperature and site
trash_model <- glmmTMB(Bitter ~ Temp + (1 | Site) + (1 | s.Year),
                       data = crab_dat,
                       family = binomial,
                       na.action = "na.fail")
summary(trash_model)
plot(predictorEffect("Temp", trash_model))
# Okay, let's figure out what the heck is going on with temperature and our random effects!
```

# MODEL OF ALL CRABS

This model will include ALL crabs, both male and female. Therefore, it will not include sex-specific measurements, such as chela height and egg-related measurements. The response follows a Bernoulli distribution (binomial family), plus random effects of year and location.

```{r}
# Select all variables to be used in our model for all crabs
allcrabs_dat <- crab_dat %>%
  dplyr::select(c(s.Year, Site, Sex, s.CW, Shell.Condition, Leg.Condition,
                  Bitter, Blackmat, s.Latitude, s.Jul.Day, s.Depth, s.Temp))

# Select independent variables
modeled_vars <- names(allcrabs_dat)
modeled_vars <- modeled_vars[!modeled_vars %in% c("s.Year", "Site", "Bitter")]

# Initialize matrix with model values (one row for the null model, plus one per variable)
AIC_vals <- matrix(nrow = length(modeled_vars) + 1, ncol = 2)

# Create a null model and get AIC
null_mod <- glmer(Bitter ~ (1 | Site) + (1 | s.Year),
                  data = allcrabs_dat,
                  family = binomial)
AIC_null <- extractAIC(null_mod)[2]
AIC_vals[1, 1] <- "null_mod"
AIC_vals[1, 2] <- extractAIC(null_mod)[2] - AIC_null

# Create for loop to extract AIC for all variables
# (write to row i + 1 so we don't overwrite the null model row)
for (i in 1:length(modeled_vars)) {
  my_formula <- paste0("Bitter ~ ", modeled_vars[i], " + (1 | Site) + (1 | s.Year)")
  test_mod <- glmer(as.formula(my_formula),
                    data = allcrabs_dat,
                    family = binomial)
  AIC_vals[i + 1, 1] <- modeled_vars[i]
  AIC_vals[i + 1, 2] <- extractAIC(test_mod)[2] - AIC_null
}
beep()

# See which variables improve the model the most, ordered by delta-AIC
AIC_vals[order(as.numeric(AIC_vals[, 2])), ]
```

Here's the order of the impact of our variables, from most to least impactful:

Shell condition
Temperature
CW
Black Mat
Leg condition
Julian day
Sex
--------- Null model line ---------
Latitude
Depth

### Build model

```{r}
full_model <- glmmTMB(Bitter ~ Shell.Condition + s.Temp + s.CW + Blackmat +
                        Leg.Condition + s.Jul.Day + Sex + s.Latitude + s.Depth +
                        (1 | Site) + (1 | s.Year),
                      data = allcrabs_dat,
                      family = binomial,
                      na.action = "na.fail") # na.fail is for the dredge() function used later

check_collinearity(full_model)
# Alright, VIFs look good!
```

### Model Diagnostics

Now that we have produced a full model, before we start fine-tuning it, we need to do some diagnostics to ensure it meets our assumptions.

```{r}
# Simulate residuals and plot
testOutliers(full_model, alternative = "two.sided", margin = "both",
             type = "bootstrap", plot = TRUE)
simulateResiduals(full_model, plot = TRUE)

# Perform ANOVA on model fits
car::Anova(full_model)
# Alright, looks like in our full model, everything is significant except leg condition and depth (with latitude being barely significant)

# Test for effects. You can plot these all together with plot(allEffects(full_model)), but that gets crowded visually
plot(predictorEffect("Shell.Condition", full_model))
plot(predictorEffect("s.Temp", full_model))
plot(predictorEffect("s.CW", full_model))
plot(predictorEffect("Blackmat", full_model))
plot(predictorEffect("Leg.Condition", full_model))
plot(predictorEffect("s.Jul.Day", full_model))
plot(predictorEffect("Sex", full_model))
plot(predictorEffect("s.Latitude", full_model))
plot(predictorEffect("s.Depth", full_model))
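# Optional: read the fixed effects on the odds scale. This is just a
# convenience view (assuming the default logit link): the exponentiated
# conditional coefficients are odds ratios per unit of each predictor
exp(fixef(full_model)$cond)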
# Get influence measures of each datapoint
# Sadly, the only way to do this with glmmTMB is to brute-force it:
# refitting the model with each element removed using glmmTMB::influence_mixed()
# May prove to be too long to be feasible - let's test!
# Update: yep, it's too long. Did the calculations: it took about 12 hours to
# run dredge(), which analyzed 512 possible models. This would be ~150,000
# refits, which would take almost 150 days.

# Check significance of random effects using a likelihood ratio test
# First, fit a reduced model with only the Site random effect
full_model.onlySite <- glmmTMB(Bitter ~ Shell.Condition + s.Temp + s.CW + Blackmat +
                                 Leg.Condition + s.Jul.Day + Sex + s.Latitude + s.Depth +
                                 (1 | Site),
                               data = allcrabs_dat,
                               family = binomial,
                               na.action = "na.fail")
```
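To actually run that random-effect check, one option (a sketch, not part of the original pipeline) is a likelihood ratio test between the two nested fits; `anova()` works on glmmTMB objects. Note that p-values for a variance component tested on its boundary are conservative; a parametric bootstrap (simulating from the reduced model and refitting both) would be more defensible, but far slower.

```{r eval=FALSE}
# LRT: does adding the (1 | s.Year) random effect improve on the Site-only model?
anova(full_model.onlySite, full_model)
```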
plot(predictorEffect("Blackmat", full_model)) plot(predictorEffect("Leg.Condition", full_model)) plot(predictorEffect("s.Jul.Day", full_model)) plot(predictorEffect("Sex", full_model)) plot(predictorEffect("s.Latitude", full_model)) plot(predictorEffect("s.Depth", full_model)) ``` ### More tests ```{r} #### Test dispersion of each of our models # Note: they're all going to fail the KS test, just because our sample size is huge. As long as the QQ plot looks fine, no cause for concern. # First plot: a Q-Q plot and a plot of residuals. THe latter should appear as a line around 0.5, followed by a few dots up at 1 # Second plot: Standardized residuals for crabs predicted as bitter and not bitter. Has the number of the model in the title (1 = best model, 14 = worst model) # Third plot: Dispersion test of residuals. Red line should be approx around mean, p value should be not significant # Fourth plot: Dispersion test for zero inflation. Red line should be approx around mean, p value should be not significant for (i in 1:length(best_mods)){ print(paste("Model", i)) simulationOutput <- simulateResiduals(fittedModel = best_mods[i][[1]], plot = TRUE) plotResiduals(simulationOutput, allcrabs_dat$Bitter, quantreg = TRUE, rank = TRUE, main = paste("Model", i)) testDispersion(simulationOutput, alternative = "greater", plot = TRUE) testZeroInflation(simulationOutput) } # Lovely, all models look just fine! ``` Quick break just so we can visualize plots easier ```{r} # Create for loop, predicting data and plotting results for each model # Predicted should effectively be a line with no outliers, histogram should be one block for (i in 1:length(best_mods)) { # Simulate data, see how accurate models are. This uses our custom function pred_dat() pred_data <- pred_dat(model = best_mods[i][[1]], data = allcrabs_dat, num_sims = 500) # Plot simulated data with a red line as the real data plot(pred_data$sim_num, pred_data$total_bitter, main = paste("Model", i)) + abline(h = sum(allcrabs_dat$Bitter == 1), col = "red") # Plot percent change from real data hist(pred_data$pct_change_from_data, main = paste("Histogram of Model", i)) # Print total mean difference from data print(paste("Mean difference for Model", i, "is", mean(pred_data$total_bitter) - sum(allcrabs_dat$Bitter == 1))) } ``` # MODEL OF FEMALE CRABS This model will include all female crabs with egg-related measurements ### Filter and re-scale to get correct data for model ```{r} # Select only female crabs fem_crabs <- crab_dat %>% filter(Sex == "2") # Drop chela height column fem_crabs <- fem_crabs %>% dplyr::select(-Chela.Ht) # Drop scaled columns - we'll scale them again fem_crabs <- fem_crabs %>% dplyr::select(-contains("s.")) # We'll try to move Egg Development and Egg Condition into a single column, a lot of the data is redundant or useless # We likely should have just done this in script 2_7, but hey, no time like the present (and I don't want to go back and potentially have to re-run my all crabs model) # Move barren clean/matted data into the Egg Development column fem_crabs <- fem_crabs %>% mutate(Egg.Development = ifelse(Egg.Condition == "Barren_Clean", "Barren_Clean", as.character(Egg.Development))) fem_crabs <- fem_crabs %>% mutate(Egg.Development = ifelse(Egg.Condition == "Barren_Matted", "Barren_Matted", as.character(Egg.Development))) # Move dead eggs data into Egg Development column. 
# MODEL OF FEMALE CRABS

This model will include all female crabs with egg-related measurements.

### Filter and re-scale to get correct data for model

```{r}
# Select only female crabs
fem_crabs <- crab_dat %>%
  filter(Sex == "2")

# Drop chela height column
fem_crabs <- fem_crabs %>%
  dplyr::select(-Chela.Ht)

# Drop scaled columns - we'll scale them again
fem_crabs <- fem_crabs %>%
  dplyr::select(-contains("s."))

# We'll try to merge Egg Development and Egg Condition into a single column - a lot of the data is redundant or useless
# We likely should have just done this in script 2_7, but hey, no time like the present (and I don't want to go back and potentially have to re-run my all-crabs model)

# Move barren clean/matted data into the Egg Development column
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Condition == "Barren_Clean",
                                  "Barren_Clean", as.character(Egg.Development)))
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Condition == "Barren_Matted",
                                  "Barren_Matted", as.character(Egg.Development)))

# Move dead eggs data into the Egg Development column.
# We only have ~50 with dead eggs over 20%, so we'll just note all as having dead eggs
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Condition == "Dead_eggs_over_20pct",
                                  "Dead_Eggs", as.character(Egg.Development)))
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Condition == "Dead_eggs_under_20pct",
                                  "Dead_Eggs", as.character(Egg.Development)))

# Only three crabs have eyed eggs, we'll just remove those
fem_crabs <- fem_crabs %>%
  filter(Egg.Development != "Eyed")

# Change all crabs with a recruit status of Juvenile Female to Egg Development = Juvenile
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Recruit.Status == "Juvenile_Female",
                                  "Juvenile", as.character(Egg.Development)))

# Alright, we can remove the Egg Condition column
fem_crabs <- fem_crabs %>%
  dplyr::select(-Egg.Condition)

# If a crab has an egg percent of zero (and isn't a juvenile or barren), change egg development to NA (since we don't know if barren or matted)
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Percent == 0 &
                                    (Egg.Development == "Dead_Eggs" | Egg.Development == "Uneyed"),
                                  NA, as.character(Egg.Development)))

# If a juvenile or barren crab has a nonzero egg percent, the records conflict -
# set egg percent to NA so the row is dropped below
fem_crabs <- fem_crabs %>%
  mutate(Egg.Percent = ifelse(Egg.Percent != 0 &
                                (Egg.Development == "Barren_Clean" |
                                   Egg.Development == "Barren_Matted" |
                                   Egg.Development == "Juvenile"),
                              NA, Egg.Percent))

# See how many NAs we have
colSums(is.na(fem_crabs))

# Drop all rows with NAs
fem_crabs <- na.omit(fem_crabs)
# Great, we've got around 25,000 rows! Neat stuff.

#### Re-scale all variables for our new dataset

# Shift years so our earliest year is 1
fem_crabs$s.Year <- fem_crabs$Year - (min(fem_crabs$Year) - 1)

# Scale latitude, longitude, Julian day, depth, and temperature
fem_crabs$s.Latitude <- scale(fem_crabs$Latitude)
fem_crabs$s.Longitude <- scale(fem_crabs$Longitude)
fem_crabs$s.Jul.Day <- scale(fem_crabs$Jul.Day)
fem_crabs$s.Depth <- scale(fem_crabs$Depth)
fem_crabs$s.Temp <- scale(fem_crabs$Temp)

# Just need to scale CW once this time
fem_crabs$s.CW <- scale(fem_crabs$CW)

# Need to also scale egg percent this time!
fem_crabs$s.Egg.Percent <- scale(fem_crabs$Egg.Percent)

colnames(fem_crabs)
```

### Select variables to be used in model

```{r}
# Select all variables to be used in our model for female crabs
femcrabs_dat <- fem_crabs %>%
  dplyr::select(c(s.Year, Site, s.CW, Shell.Condition, Egg.Development,
                  Leg.Condition, Bitter, s.Egg.Percent, Blackmat,
                  s.Latitude, s.Jul.Day, s.Depth, s.Temp))

# Select independent variables
modeled_vars <- names(femcrabs_dat)
modeled_vars <- modeled_vars[!modeled_vars %in% c("s.Year", "Site", "Bitter")]

# Initialize matrix with model values (one row for the null model, plus one per variable)
AIC_vals <- matrix(nrow = length(modeled_vars) + 1, ncol = 2)

# Create a null model and get AIC
null_mod <- glmmTMB(Bitter ~ (1 | Site) + (1 | s.Year),
                    data = femcrabs_dat,
                    family = binomial,
                    na.action = "na.fail")
AIC_null <- extractAIC(null_mod)[2]
AIC_vals[1, 1] <- "null_mod"
AIC_vals[1, 2] <- extractAIC(null_mod)[2] - AIC_null

# Create for loop to extract AIC for all variables
# (fit with glmmTMB, the same engine as the null model, so the AICs are comparable)
for (i in 1:length(modeled_vars)) {
  my_formula <- paste0("Bitter ~ ", modeled_vars[i], " + (1 | Site) + (1 | s.Year)")
  test_mod <- glmmTMB(as.formula(my_formula),
                      data = femcrabs_dat,
                      family = binomial)
  AIC_vals[i + 1, 1] <- modeled_vars[i]
  AIC_vals[i + 1, 2] <- extractAIC(test_mod)[2] - AIC_null
}
beep()

# See which variables improve the model the most
AIC_vals[order(as.numeric(AIC_vals[, 2])), ]
```

Here's the order of the impact of our variables, from most to least impactful:

Shell Condition
Egg Percent
Egg Development
Black Mat
Leg Condition
CW
Temperature
Depth
Latitude
[Null model]
Julian Day

### Build model

Alright, we'll again use glmmTMB to build a full model, then we'll check for collinearity.

```{r}
fem_full_model <- glmmTMB(Bitter ~ Shell.Condition + s.Egg.Percent + Egg.Development +
                            Blackmat + Leg.Condition + s.CW + s.Temp + s.Depth +
                            s.Latitude + s.Jul.Day +
                            (1 | Site) + (1 | s.Year),
                          data = femcrabs_dat,
                          family = binomial,
                          na.action = "na.fail") # na.fail is for the dredge() function used later

check_collinearity(fem_full_model)
# Wow, everything looks great! Was worried about having issues with egg percent and egg development (since barren = 0), but apparently all good!
```

### Model Diagnostics

Now that we have produced a full model, before we start fine-tuning it, we need to do some diagnostics to ensure it meets our assumptions.

```{r}
# Simulate residuals and plot
testOutliers(fem_full_model, alternative = "two.sided", margin = "both",
             type = "bootstrap", plot = TRUE)
simulateResiduals(fem_full_model, plot = TRUE)

# Perform ANOVA on model fits
car::Anova(fem_full_model)
# Alright, looks like in our full model, everything is significant except Black Mat and depth, with latitude and leg condition being just barely significant.

# Test for effects. You can plot these all together with plot(allEffects(fem_full_model)), but that gets crowded visually
plot(predictorEffect("Shell.Condition", fem_full_model))
plot(predictorEffect("s.Egg.Percent", fem_full_model))
plot(predictorEffect("Egg.Development", fem_full_model))
plot(predictorEffect("Blackmat", fem_full_model))
plot(predictorEffect("Leg.Condition", fem_full_model))
plot(predictorEffect("s.CW", fem_full_model))
plot(predictorEffect("s.Temp", fem_full_model))
plot(predictorEffect("s.Depth", fem_full_model))
plot(predictorEffect("s.Latitude", fem_full_model))
plot(predictorEffect("s.Jul.Day", fem_full_model))
```

For reference, the collinearity output for the female full model:

```
Term             VIF   VIF 95% CI    Increased SE  Tolerance  Tolerance 95% CI
Shell.Condition  1.14  [1.13, 1.16]  1.07          0.87       [0.86, 0.89]
s.Egg.Percent    1.35  [1.33, 1.37]  1.16          0.74       [0.73, 0.75]
Egg.Development  1.48  [1.45, 1.50]  1.22          0.68       [0.67, 0.69]
Blackmat         1.03  [1.02, 1.04]  1.01          0.97       [0.96, 0.98]
Leg.Condition    1.01  [1.00, 1.04]  1.01          0.99       [0.97, 1.00]
s.CW             1.01  [1.00, 1.03]  1.01          0.99       [0.97, 1.00]
s.Temp           1.71  [1.68, 1.74]  1.31          0.58       [0.57, 0.59]
s.Depth          1.80  [1.77, 1.83]  1.34          0.56       [0.55, 0.57]
s.Latitude       1.16  [1.15, 1.18]  1.08          0.86       [0.85, 0.87]
s.Jul.Day        1.70  [1.67, 1.73]  1.31          0.59       [0.58, 0.60]
```

### Dredging

We will now use the dredge() function from the MuMIn package to go through each of our model possibilities and select an optimal full model using AICc.

```{r}
fem_all_mods <- dredge(fem_full_model, beta = "none", eval = TRUE, rank = "AICc")
plot(fem_all_mods)

# We have one very good model (weight = 0.30), a good model (weight = 0.18),
# five alright models (weights < 0.10 & > 0.05), and seven meh models (weights < 0.05 & > 0.01)
fem_best_mods <- get.models(fem_all_mods, subset = weight > 0.01)

# See what each of the models looks like. Comments compare it to the best model (1)
fem_best_mods[1]
fem_best_mods[2] # Adds depth
# Start of alright models
fem_best_mods[3] # Drops Black Mat
fem_best_mods[4] # Drops latitude
fem_best_mods[5] # Adds depth, drops latitude
fem_best_mods[6] # Drops leg condition
fem_best_mods[7] # Adds depth, drops Black Mat
# Start of meh models
fem_best_mods[8] # Drops leg condition, adds depth
fem_best_mods[9] # Drops Black Mat and latitude
fem_best_mods[10] # Drops Black Mat and latitude, adds depth
fem_best_mods[11] # Drops Black Mat and leg condition
fem_best_mods[12] # Drops leg condition and latitude
fem_best_mods[13] # Drops leg condition and latitude, adds depth
fem_best_mods[14] # Drops Black Mat and leg condition, adds depth

# Average models based on AICc
fem_avg_model <- model.avg(fem_best_mods, beta = "none")

# See what that average model looks like
fem_avg_model$coefficients
summary(fem_avg_model)

# Save each of the best models! This lets us reload them in case R crashes
# This saves a LOT of time - dredge() takes forever to run!
for (i in 1:length(fem_best_mods)) {
  saveRDS(fem_best_mods[i],
          file = paste0("../output/ADFG_SE_AK_pot_surveys/models/weighted_models/fem_crabs/model", i, ".rds"))
}

# Also save the average model
saveRDS(fem_avg_model, file = "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/fem_crabs/avg_model.rds")

# Un-comment the below code to re-read model files back in when R crashes
#### Set model filepath
# model_filepath <- "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/fem_crabs/"
#### Read in average model
# fem_avg_model <- readRDS(file = paste0(model_filepath, "avg_model.rds"))
# Read in best models individually
#### Get vector of relevant files
# fem_best_mod_files <- list.files(model_filepath)
#### Remove average model from vector
# fem_best_mod_files <- fem_best_mod_files[grep("^model", fem_best_mod_files)]
#### Create blank list
# fem_best_mods <- list()
#### Read in files
# for (i in 1:length(fem_best_mod_files)) {
#   fem_best_mods[i] <- readRDS(file = paste0(model_filepath, fem_best_mod_files[i]))
# }
```

### More tests

```{r}
#### Test dispersion of each of our models
# Note: they're all going to fail the KS test, just because our sample size is huge. As long as the QQ plot looks fine, no cause for concern.
# First plot: a Q-Q plot and a plot of residuals. The latter should appear as a line around 0.5, followed by a few dots up at 1
# Second plot: standardized residuals for crabs predicted as bitter and not bitter. Has the number of the model in the title (1 = best model, 14 = worst model)
# Third plot: dispersion test of residuals. Red line should be approximately at the mean, p-value should not be significant
# Fourth plot: dispersion test for zero inflation. Red line should be approximately at the mean, p-value should not be significant
for (i in 1:length(fem_best_mods)) {
  print(paste("Model", i))
  simulationOutput <- simulateResiduals(fittedModel = fem_best_mods[[i]], plot = TRUE)
  plotResiduals(simulationOutput, femcrabs_dat$Bitter, quantreg = TRUE,
                rank = TRUE, main = paste("Model", i))
  testDispersion(simulationOutput, alternative = "greater", plot = TRUE)
  testZeroInflation(simulationOutput)
}
# Lovely, all models look just fine!
```

Quick break just so we can visualize plots easier

```{r}
# Create for loop, predicting data and plotting results for each model
# Predicted should effectively be a line with no outliers, histogram should be one block
for (i in 1:length(fem_best_mods)) {
  # Simulate data, see how accurate models are. This uses our custom function pred_dat()
  pred_data <- pred_dat(model = fem_best_mods[[i]], data = femcrabs_dat, num_sims = 500)

  # Plot simulated data with a red line as the real data
  plot(pred_data$sim_num, pred_data$total_bitter, main = paste("Model", i))
  abline(h = sum(femcrabs_dat$Bitter == 1), col = "red")

  # Plot percent change from real data
  hist(pred_data$pct_change_from_data, main = paste("Histogram of Model", i))

  # Print total mean difference from data
  print(paste("Mean difference for Model", i, "is",
              mean(pred_data$total_bitter) - sum(femcrabs_dat$Bitter == 1)))
}
```

# MODEL OF MALE CRABS

This model will include all male crabs with chela height measurements.

### Filter and re-scale to get correct data for model

```{r}
# Select only male crabs
masc_crabs <- crab_dat %>%
  filter(Sex == "1")

# Drop egg columns
masc_crabs <- masc_crabs %>%
  dplyr::select(-c(Egg.Development, Egg.Condition, Egg.Percent))

# Drop scaled columns - we'll scale them again
masc_crabs <- masc_crabs %>%
  dplyr::select(-contains("s."))

# We'll use the ratio of chela height to carapace width to indicate crab maturity
# Note: this uses the Eastern Bering Sea (EBS) line for chela height vs. CW; we'll have to get the SE AK ratio later
# EBS equation: ln(chela_ht) = 1.189 * ln(CW) - 2.674
# Create new column for the maturity line. Remember, log() is the natural log in R
# Also, male crabs with a CW <= 60 are immature
masc_crabs$mat_line <- (1.189 * log(masc_crabs$CW)) - 2.674

# Drop crabs without a chela height value
masc_crabs <- masc_crabs[!is.na(masc_crabs$Chela.Ht), ]

# Calculate maturity value
masc_crabs$mat_val <- log(masc_crabs$Chela.Ht)

# Create column for maturity status: immature ("1") if at or below the maturity
# line or CW <= 60; otherwise mature ("2")
masc_crabs <- masc_crabs %>%
  mutate(mat_stat = case_when(
    mat_val <= mat_line | CW <= 60 ~ "1",
    mat_val > mat_line ~ "2"
  ))

# Remove maturity value and line columns, no longer needed
masc_crabs <- masc_crabs %>%
  dplyr::select(-c(mat_val, mat_line))

# See how many NAs we have
colSums(is.na(masc_crabs))
# Great, we've got around 25,000 rows! Neat stuff.
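# Quick sanity check of the maturity cutoff with an illustrative (hypothetical)
# value, not a data point: a crab with CW = 100 sits on the maturity line at
# ln(chela) = 1.189 * ln(100) - 2.674 ~= 2.80, i.e. a chela height of ~16.5
example_cw <- 100
exp(1.189 * log(example_cw) - 2.674)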
#### Re-scale all variables for our new dataset

# Shift years so our earliest year is 1
masc_crabs$s.Year <- masc_crabs$Year - (min(masc_crabs$Year) - 1)

# Scale latitude, longitude, Julian day, depth, and temperature
masc_crabs$s.Latitude <- scale(masc_crabs$Latitude)
masc_crabs$s.Longitude <- scale(masc_crabs$Longitude)
masc_crabs$s.Jul.Day <- scale(masc_crabs$Jul.Day)
masc_crabs$s.Depth <- scale(masc_crabs$Depth)
masc_crabs$s.Temp <- scale(masc_crabs$Temp)

# Just need to scale CW once this time
masc_crabs$s.CW <- scale(masc_crabs$CW)

colnames(masc_crabs)
```

### Select variables to be used in model

```{r}
# Select all variables to be used in our model for male crabs
masccrabs_dat <- masc_crabs %>%
  dplyr::select(c(s.Year, Site, s.CW, Shell.Condition, Leg.Condition, Bitter,
                  Blackmat, s.Latitude, s.Jul.Day, s.Depth, s.Temp, mat_stat))

# Select independent variables
modeled_vars <- names(masccrabs_dat)
modeled_vars <- modeled_vars[!modeled_vars %in% c("s.Year", "Site", "Bitter")]

# Initialize matrix with model values (one row for the null model, plus one per variable)
AIC_vals <- matrix(nrow = length(modeled_vars) + 1, ncol = 2)

# Create a null model and get AIC
null_mod <- glmmTMB(Bitter ~ (1 | Site) + (1 | s.Year),
                    data = masccrabs_dat,
                    family = binomial,
                    na.action = "na.fail")
AIC_null <- extractAIC(null_mod)[2]
AIC_vals[1, 1] <- "null_mod"
AIC_vals[1, 2] <- extractAIC(null_mod)[2] - AIC_null

# Create for loop to extract AIC for all variables
# (fit with glmmTMB, the same engine as the null model, so the AICs are comparable)
for (i in 1:length(modeled_vars)) {
  my_formula <- paste0("Bitter ~ ", modeled_vars[i], " + (1 | Site) + (1 | s.Year)")
  test_mod <- glmmTMB(as.formula(my_formula),
                      data = masccrabs_dat,
                      family = binomial)
  AIC_vals[i + 1, 1] <- modeled_vars[i]
  AIC_vals[i + 1, 2] <- extractAIC(test_mod)[2] - AIC_null
}
beep()

# See which variables improve the model the most
AIC_vals[order(as.numeric(AIC_vals[, 2])), ]
```

Here's the order of the impact of our variables, from most to least impactful:

Shell Condition
Maturity
Carapace width
Temperature
Julian day
Black Mat infection
Leg condition
[Null model]
Latitude
Depth

### Build model

Alright, we'll again use glmmTMB to build a full model, then we'll check for collinearity.

```{r}
masc_full_model <- glmmTMB(Bitter ~ Shell.Condition + mat_stat + s.CW + s.Temp +
                             s.Jul.Day + Blackmat + Leg.Condition + s.Latitude + s.Depth +
                             (1 | Site) + (1 | s.Year),
                           data = masccrabs_dat,
                           family = binomial,
                           na.action = "na.fail") # na.fail is for the dredge() function used later

check_collinearity(masc_full_model)
# Everything looks great!
```

### Model Diagnostics

Now that we have produced a full model, before we start fine-tuning it, we need to do some diagnostics to ensure it meets our assumptions.

```{r}
# Simulate residuals and plot
testOutliers(masc_full_model, alternative = "two.sided", margin = "both",
             type = "bootstrap", plot = TRUE)
simulateResiduals(masc_full_model, plot = TRUE)

# Perform ANOVA on model fits
car::Anova(masc_full_model)
# Alright, looks like in our full model, shell condition, maturity, CW, temperature, and leg condition are significant, while Julian day, Black Mat, latitude, and depth aren't.

# Test for effects. You can plot these all together with plot(allEffects(masc_full_model)), but that gets crowded visually
plot(predictorEffect("Shell.Condition", masc_full_model))
plot(predictorEffect("mat_stat", masc_full_model))
plot(predictorEffect("s.CW", masc_full_model))
plot(predictorEffect("s.Temp", masc_full_model))
plot(predictorEffect("s.Jul.Day", masc_full_model))
plot(predictorEffect("Blackmat", masc_full_model))
plot(predictorEffect("Leg.Condition", masc_full_model))
plot(predictorEffect("s.Latitude", masc_full_model))
plot(predictorEffect("s.Depth", masc_full_model))
```

### Dredging

We will now use the dredge() function from the MuMIn package to go through each of our model possibilities and select an optimal full model using AICc.

```{r}
masc_all_mods <- dredge(masc_full_model, beta = "none", eval = TRUE, rank = "AICc")
plot(masc_all_mods)

# Looks like we have 16 total models above 0.01. Of these, two are best (combined weight: 0.39),
# and an additional six are decent (combined weight with the two best: 0.83).
# The remaining eight models have weights between roughly 0.03 and 0.01
masc_best_mods <- get.models(masc_all_mods, subset = weight > 0.01)

# See what each of the models looks like
# Descriptions compare each to the best model (Model 1)
masc_best_mods[1]
masc_best_mods[2] # Drops Julian day
masc_best_mods[3] # Adds Depth
masc_best_mods[4] # Adds Depth, drops Julian day
masc_best_mods[5] # Adds Black Mat
masc_best_mods[6] # Adds Latitude
masc_best_mods[7] # Adds Latitude, drops Julian day
masc_best_mods[8] # Adds Black Mat, drops Julian day
masc_best_mods[9] # Adds Black Mat and Depth
masc_best_mods[10] # Adds Depth and Latitude
masc_best_mods[11] # Adds Depth and Latitude, drops Julian day
masc_best_mods[12] # Adds Black Mat and Depth, drops Julian day
masc_best_mods[13] # Adds Black Mat and Latitude
masc_best_mods[14] # Adds Black Mat and Latitude, drops Julian day
masc_best_mods[15] # Adds Black Mat, Depth, and Latitude
masc_best_mods[16] # Adds Black Mat, Depth, and Latitude, drops Julian day

# Average models based on AICc
masc_avg_model <- model.avg(masc_best_mods, beta = "none")

# See what that average model looks like
masc_avg_model$coefficients
summary(masc_avg_model)

# Save each of the best models! This lets us reload them in case R crashes
# This saves a LOT of time - dredge() takes forever to run!
for (i in 1:length(masc_best_mods)) {
  saveRDS(masc_best_mods[i],
          file = paste0("../output/ADFG_SE_AK_pot_surveys/models/weighted_models/male_crabs/model", i, ".rds"))
}

# Also save the average model
saveRDS(masc_avg_model, file = "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/male_crabs/avg_model.rds")

# Un-comment the below code to re-read model files back in when R crashes
#### Set model filepath
# model_filepath <- "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/male_crabs/"
#### Read in average model
# masc_avg_model <- readRDS(file = paste0(model_filepath, "avg_model.rds"))
# Read in best models individually
#### Get vector of relevant files
# masc_best_mod_files <- list.files(model_filepath)
#### Remove average model from vector
# masc_best_mod_files <- masc_best_mod_files[grep("^model", masc_best_mod_files)]
#### Create blank list
# masc_best_mods <- list()
#### Read in files
# for (i in 1:length(masc_best_mod_files)) {
#   masc_best_mods[i] <- readRDS(file = paste0(model_filepath, masc_best_mod_files[i]))
# }
```

### More tests

```{r}
#### Test dispersion of each of our models
# Note: they're all going to fail the KS test, just because our sample size is huge. As long as the QQ plot looks fine, no cause for concern.
# First plot: a Q-Q plot and a plot of residuals. The latter should appear as a line around 0.5, followed by a few dots up at 1
# Second plot: standardized residuals for crabs predicted as bitter and not bitter. Has the number of the model in the title (1 = best model, 16 = worst model)
# Third plot: dispersion test of residuals. Red line should be approximately at the mean, p-value should not be significant
# Fourth plot: dispersion test for zero inflation. Red line should be approximately at the mean, p-value should not be significant
for (i in 1:length(masc_best_mods)) {
  print(paste("Model", i))
  simulationOutput <- simulateResiduals(fittedModel = masc_best_mods[[i]], plot = TRUE)
  plotResiduals(simulationOutput, masccrabs_dat$Bitter, quantreg = TRUE,
                rank = TRUE, main = paste("Model", i))
  testDispersion(simulationOutput, alternative = "greater", plot = TRUE)
  testZeroInflation(simulationOutput)
}
# Lovely, all models look just fine!
```

Quick break just so we can visualize plots easier

```{r}
# Create for loop, predicting data and plotting results for each model
# Predicted should effectively be a line with no outliers, histogram should be one block
for (i in 1:length(masc_best_mods)) {
  # Simulate data, see how accurate models are. This uses our custom function pred_dat()
  pred_data <- pred_dat(model = masc_best_mods[[i]], data = masccrabs_dat, num_sims = 500)

  # Plot simulated data with a red line as the real data
  plot(pred_data$sim_num, pred_data$total_bitter, main = paste("Model", i))
  abline(h = sum(masccrabs_dat$Bitter == 1), col = "red")

  # Plot percent change from real data
  hist(pred_data$pct_change_from_data, main = paste("Histogram of Model", i))

  # Print total mean difference from data
  print(paste("Mean difference for Model", i, "is",
              mean(pred_data$total_bitter) - sum(masccrabs_dat$Bitter == 1)))
}
```

# MODEL W MATURITY

Hey, now that we have mature/immature status for both males and females, why not do another full model?
This time, it'll examine maturity as well - same as the model of all crabs, but only including crabs with maturity status (and, of course, adding maturity status as a variable). This model will include all crabs with maturity measurements.

### Filter and re-scale to get correct data for model

```{r}
# Create new dataframes from the sex-specific dataframes created earlier
mat_crabs_m <- masc_crabs
mat_crabs_f <- fem_crabs

# Drop all scaled variables from each dataframe - we'll re-scale them (once combined, for all except CW)
mat_crabs_m <- mat_crabs_m %>%
  dplyr::select(-contains("s."))
mat_crabs_f <- mat_crabs_f %>%
  dplyr::select(-contains("s."))

# Create new maturity column within the female dataframe (the male one already has one)
mat_crabs_f$mat_stat <- car::recode(mat_crabs_f$Egg.Development,
                                    "'Juvenile' = '1'; else = '2'")

# Drop sex-specific columns from dataframes
mat_crabs_f <- mat_crabs_f %>%
  dplyr::select(-c(Egg.Development, Egg.Percent))
mat_crabs_m <- mat_crabs_m %>%
  dplyr::select(-Chela.Ht)

# Scale CW for each sex separately
mat_crabs_f$s.CW <- scale(mat_crabs_f$CW)
mat_crabs_m$s.CW <- scale(mat_crabs_m$CW)

# Join the two datasets
mat_crabs <- rbind(mat_crabs_f, mat_crabs_m)

#### Re-scale all variables for our new dataset

# Shift years so our earliest year is 1
mat_crabs$s.Year <- mat_crabs$Year - (min(mat_crabs$Year) - 1)

# Scale latitude, longitude, Julian day, depth, and temperature
mat_crabs$s.Latitude <- scale(mat_crabs$Latitude)
mat_crabs$s.Longitude <- scale(mat_crabs$Longitude)
mat_crabs$s.Jul.Day <- scale(mat_crabs$Jul.Day)
mat_crabs$s.Depth <- scale(mat_crabs$Depth)
mat_crabs$s.Temp <- scale(mat_crabs$Temp)

# Change maturity status to factor
mat_crabs$mat_stat <- as.factor(mat_crabs$mat_stat)

colnames(mat_crabs)
```

### Select variables to be used in model

```{r}
# Select all variables to be used in our model of crabs with maturity status
matcrabs_dat <- mat_crabs %>%
  dplyr::select(c(s.Year, Site, Sex, s.CW, Shell.Condition, Leg.Condition, Bitter,
                  Blackmat, s.Latitude, s.Jul.Day, s.Depth, s.Temp, mat_stat))

# Select independent variables
modeled_vars <- names(matcrabs_dat)
modeled_vars <- modeled_vars[!modeled_vars %in% c("s.Year", "Site", "Bitter")]

# Initialize matrix with model values (one row for the null model, plus one per variable)
AIC_vals <- matrix(nrow = length(modeled_vars) + 1, ncol = 2)

# Create a null model and get AIC
null_mod <- glmmTMB(Bitter ~ (1 | Site) + (1 | s.Year),
                    data = matcrabs_dat,
                    family = binomial,
                    na.action = "na.fail")
AIC_null <- extractAIC(null_mod)[2]
AIC_vals[1, 1] <- "null_mod"
AIC_vals[1, 2] <- extractAIC(null_mod)[2] - AIC_null

# Create for loop to extract AIC for all variables
# (fit with glmmTMB, the same engine as the null model, so the AICs are comparable)
for (i in 1:length(modeled_vars)) {
  my_formula <- paste0("Bitter ~ ", modeled_vars[i], " + (1 | Site) + (1 | s.Year)")
  test_mod <- glmmTMB(as.formula(my_formula),
                      data = matcrabs_dat,
                      family = binomial)
  AIC_vals[i + 1, 1] <- modeled_vars[i]
  AIC_vals[i + 1, 2] <- extractAIC(test_mod)[2] - AIC_null
}
beep()

# See which variables improve the model the most
AIC_vals[order(as.numeric(AIC_vals[, 2])), ]
```

Here's the order of the impact of our variables, from most to least impactful:
Shell Condition
Black Mat
Carapace width
Temperature
Maturity
Leg Condition
Sex
Julian day
Latitude
Depth

### Build model

Alright, we'll again use glmmTMB to build a full model, then we'll check for collinearity.

```{r}
mat_full_model <- glmmTMB(Bitter ~ Shell.Condition + Blackmat + s.CW + s.Temp +
                            mat_stat + Leg.Condition + Sex + s.Jul.Day +
                            s.Latitude + s.Depth +
                            (1 | Site) + (1 | s.Year),
                          data = matcrabs_dat,
                          family = binomial,
                          na.action = "na.fail") # na.fail is for the dredge() function used later

check_collinearity(mat_full_model)
# Everything looks great!
```

### Model Diagnostics

Now that we have produced a full model, before we start fine-tuning it, we need to do some diagnostics to ensure it meets our assumptions.

```{r}
# Simulate residuals and plot
testOutliers(mat_full_model, alternative = "two.sided", margin = "both",
             type = "bootstrap", plot = TRUE)
simulateResiduals(mat_full_model, plot = TRUE)

# Perform ANOVA on model fits
car::Anova(mat_full_model)
# Alright, looks like in our full model, shell condition, maturity, CW, temperature, and leg condition are significant, while Julian day, Black Mat, latitude, and depth aren't.

# Test for effects. You can plot these all together with plot(allEffects(mat_full_model)), but that gets crowded visually
plot(predictorEffect("Shell.Condition", mat_full_model))
plot(predictorEffect("mat_stat", mat_full_model))
plot(predictorEffect("s.CW", mat_full_model))
plot(predictorEffect("s.Temp", mat_full_model))
plot(predictorEffect("s.Jul.Day", mat_full_model))
plot(predictorEffect("Blackmat", mat_full_model))
plot(predictorEffect("Leg.Condition", mat_full_model))
plot(predictorEffect("s.Latitude", mat_full_model))
plot(predictorEffect("s.Depth", mat_full_model))
plot(predictorEffect("Sex", mat_full_model))
```

### Dredging

We will now use the dredge() function from the MuMIn package to go through each of our model possibilities and select an optimal full model using AICc.

```{r}
mat_all_mods <- dredge(mat_full_model, beta = "none", eval = TRUE, rank = "AICc")
plot(mat_all_mods)

# Looks like we have two good models (weights ~ 0.2), four meh models (weights > 0.05 & < 0.15),
# and eight marginal models (weights between 0.04 and 0.01). Easier to see this in summary(mat_avg_model) below.
mat_best_mods <- get.models(mat_all_mods, subset = weight > 0.01)

# See what each of the best models looks like
# Comments compare each to the best model (mat_best_mods[1])
mat_best_mods[1]
mat_best_mods[2] # Adds Depth
mat_best_mods[3] # Drops Black Mat
mat_best_mods[4] # Adds Depth, drops Black Mat
mat_best_mods[5] # Drops Leg Condition
mat_best_mods[6] # Adds Depth, drops Leg Condition
mat_best_mods[7] # Drops Black Mat and Leg Condition
mat_best_mods[8] # Adds Depth, drops Black Mat and Leg Condition
mat_best_mods[9] # Drops Julian day
mat_best_mods[10] # Drops Black Mat and Julian day
mat_best_mods[11] # Adds Depth, drops Latitude
mat_best_mods[12] # Drops Latitude
mat_best_mods[13] # Drops Leg Condition and Julian day
mat_best_mods[14] # Adds Depth, drops Julian day

# Average models based on AICc
mat_avg_model <- model.avg(mat_best_mods, beta = "none")

# See what that average model looks like
mat_avg_model$coefficients
summary(mat_avg_model)

# Save each of the best models! This lets us reload them in case R crashes
# This saves a LOT of time - dredge() takes forever to run!
for (i in 1:length(mat_best_mods)) {
  saveRDS(mat_best_mods[i],
          file = paste0("../output/ADFG_SE_AK_pot_surveys/models/weighted_models/mature_crabs/model", i, ".rds"))
}

# Also save the average model
saveRDS(mat_avg_model, file = "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/mature_crabs/avg_model.rds")

# Un-comment the below code to re-read model files back in when R crashes
# (commented out here so it doesn't overwrite the models we just fit)
#### Set model filepath
# model_filepath <- "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/mature_crabs/"
#### Read in average model
# mat_avg_model <- readRDS(file = paste0(model_filepath, "avg_model.rds"))
# Read in best models individually
#### Get vector of relevant files
# mat_best_mod_files <- list.files(model_filepath)
#### Remove average model from vector
# mat_best_mod_files <- mat_best_mod_files[grep("^model", mat_best_mod_files)]
#### Create blank list
# mat_best_mods <- list()
#### Read in files
# for (i in 1:length(mat_best_mod_files)) {
#   mat_best_mods[i] <- readRDS(file = paste0(model_filepath, mat_best_mod_files[i]))
# }
```

### More tests

```{r}
#### Test dispersion of each of our models
# Note: they're all going to fail the KS test, just because our sample size is huge. As long as the QQ plot looks fine, no cause for concern.
# First plot: a Q-Q plot and a plot of residuals. The latter should appear as a line around 0.5, followed by a few dots up at 1
# Second plot: standardized residuals for crabs predicted as bitter and not bitter. Has the number of the model in the title (1 = best model, 14 = worst model)
# Third plot: dispersion test of residuals. Red line should be approximately at the mean, p-value should not be significant
# Fourth plot: dispersion test for zero inflation. Red line should be approximately at the mean, p-value should not be significant
for (i in 1:length(mat_best_mods)) {
  print(paste("Model", i))
  simulationOutput <- simulateResiduals(fittedModel = mat_best_mods[[i]], plot = TRUE)
  plotResiduals(simulationOutput, matcrabs_dat$Bitter, quantreg = TRUE,
                rank = TRUE, main = paste("Model", i))
  testDispersion(simulationOutput, alternative = "greater", plot = TRUE)
  testZeroInflation(simulationOutput)
}
# Lovely, all models look just fine!
```

Quick break just so we can visualize plots easier

```{r}
# Create for loop, predicting data and plotting results for each model
# Predicted should effectively be a line with no outliers, histogram should be one block
for (i in 1:length(mat_best_mods)) {
  # Simulate data, see how accurate models are. This uses our custom function pred_dat()
  pred_data <- pred_dat(model = mat_best_mods[[i]], data = matcrabs_dat, num_sims = 500)

  # Plot simulated data with a red line as the real data
  plot(pred_data$sim_num, pred_data$total_bitter, main = paste("Model", i))
  abline(h = sum(matcrabs_dat$Bitter == 1), col = "red")

  # Plot percent change from real data
  hist(pred_data$pct_change_from_data, main = paste("Histogram of Model", i))

  # Print total mean difference from data
  print(paste("Mean difference for Model", i, "is",
              mean(pred_data$total_bitter) - sum(matcrabs_dat$Bitter == 1)))
}
```