---
title: "2_8_running_model"
author: "Aidan Coyle"
date: "8/12/2022"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Background

In this script, we will create a generalized linear mixed model to examine potential associations between _Hematodinium_ infection status and variables within both the environment and the crab.

This script will include both male and female crab, and only include crabs for which temperature data is available.


#### Load libraries (and install if necessary)

```{r libraries, message=FALSE, warning=FALSE}
# Add all required libraries here
list.of.packages <- c("tidyverse", "lme4", "MuMIn", "rcompanion", "MASS", "generalhoslem", "mgcv", "beepr", "regclass", "car", "DHARMa", "broom.mixed", "dotwhisker", "glmmTMB", "performance", "effects")
# Get names of all required packages that aren't installed
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
# Install all new packages
if(length(new.packages)) install.packages(new.packages)


# Load all required libraries
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X))
})

# Read in custom functions
source("hemat_modeling_functions.R")
```

# Read in data
```{r}
crab_dat <- read.csv(file = "../output/ADFG_SE_AK_pot_surveys/cleaned_data/crab_data/BCS_examined_crab_with_temperature.csv")
```

### Check all variables were read in correctly as either categorical or continuous predictors

```{r}
# See class of each column
str(crab_dat)
# Looks like we've got lots of columns that should be converted to factors!
crab_dat$Survey <- factor(crab_dat$Survey)
crab_dat$Site <- factor(crab_dat$Site)
crab_dat$Sex <- factor(crab_dat$Sex)
crab_dat$Recruit.Status <- factor(crab_dat$Recruit.Status)
crab_dat$Shell.Condition <- ordered(crab_dat$Shell.Condition)
crab_dat$Egg.Condition <- factor(crab_dat$Egg.Condition)
crab_dat$Egg.Development <- factor(crab_dat$Egg.Development)
crab_dat$Leg.Condition <- factor(crab_dat$Leg.Condition)
crab_dat$Bitter <- factor(crab_dat$Bitter)
crab_dat$Blackmat <- factor(crab_dat$Blackmat)

# See updated class of each column
str(crab_dat)
```

### Check for correlation

Prior to modeling anything, we're going to run checks on combinations of variables to see if any are correlated

```{r}
# CORRELATION BETWEEN CONTINUOUS VARIABLES

crab_nums <- select_if(crab_dat, is.numeric)
numcor <- cor(crab_nums, method = "pearson")

# See the resulting table
print(numcor)

# See if any correlations are > 0.6 (our bar for correlation) and less than 1 (since every variable is perfectly correlated with itself)
any(abs(numcor) > 0.6 & numcor < 1)
which(abs(numcor) > 0.6 & numcor < 1, arr.ind = TRUE)
# Looks like we have a tight correlation between latitude and longitude
# Temperature and Julian day are pretty close (corr = 0.56), but nothing else is above 0.5

# CORRELATIONS BETWEEN CATEGORICAL VARIABLES

# Now we're using Cramer's V test to look at correlation among our categorical variables
crabcat <- select_if(crab_dat, is.factor)

# Turn all from factors to numeric
crabcat[] <- sapply(crabcat, as.numeric)

# Initialize a blank matrix
results_matrix <- matrix(nrow = length(crabcat), ncol = length(crabcat))
# Name all rows and columns with our variable names
colnames(results_matrix) <- names(crabcat)
rownames(results_matrix) <- names(crabcat)

# Fill the matrix by performing Cramer's V test on each possible combination of factors
for (i in 1:ncol(crabcat)) {
  for (j in 1:ncol(crabcat)) {
    cramer.table <- table(crabcat[,i],crabcat[,j])
    cramer.matrix <- as.matrix(cramer.table)
    results_matrix[i,j] <- cramerV(cramer.matrix)
  }
}
# See the resulting matrix
print(results_matrix)

# See if any of our correlations (aside from self-correlations) cross our boundary of too much correlation
any(results_matrix > 0.6 & results_matrix < 1)
which(abs(results_matrix) > 0.6 & results_matrix < 1, arr.ind = TRUE)

# Tight correlation between survey and site, which is fine - we weren't planning to include survey in any model
# Correlations of 1 between recruit status and sex too, indicating we should choose one for our models
#       We'll pick sex, since recruit status is mostly captured by sex + CW
# No other strong correlations

# CORRELATIONS BETWEEN CATEGORICAL AND CONTINUOUS VARIABLES

# We'll use Spearman rank-order correlation to determine whether we have any correlation
crabrank <- crab_dat
crabrank[] <- sapply(crab_dat, as.numeric)
crabcomps <- cor(crabrank, method = "spearman")
any(abs(crabcomps) > 0.6 & crabcomps < 1)
# Looks like we do have some significant correlations this time! Let's pull them out
which(abs(crabcomps) > 0.6 & crabcomps < 1, arr.ind = TRUE)
print(crabcomps)
# Correlations are between:
    # Julian day and survey (don't care, not including survey in model)
    # Recruit status and CW (already decided to exclude recruit status from model since it's mostly CW + sex)
    # Latitude and longitude (we'll likely just use latitude, or skip altogether and just use site)
    # Temperature and Julian day (these are two continuous variables, which were already cleared using the more appropriate Pearson metric)
```


# Adjust numeric variables, scaling some

```{r}
# Subtract 2004 from all years, so that our earliest year is 1
crab_dat$s.Year <- crab_dat$Year-(min(crab_dat$Year)-1)

# Scale chela height, egg percent, latitude, longitude, Julian day, depth, and temperature
crab_dat$s.Chela.Ht <- scale(crab_dat$Chela.Ht)
crab_dat$s.Egg.Percent <- scale(crab_dat$Egg.Percent)
crab_dat$s.Latitude <- scale(crab_dat$Latitude)
crab_dat$s.Longitude <- scale(crab_dat$Longitude)
crab_dat$s.Jul.Day <- scale(crab_dat$Jul.Day)
crab_dat$s.Depth <- scale(crab_dat$Depth)
crab_dat$s.Temp <- scale(crab_dat$Temp)

# We'll scale CW twice - once for males, and once for females. This'll capture the sexual dimorphism within Tanner crabs
crab_dat_f <- crab_dat[crab_dat$Sex == "2", ]
crab_dat_m <- crab_dat[crab_dat$Sex == "1", ]

crab_dat_f$s.CW <- scale(crab_dat_f$CW)
crab_dat_m$s.CW <- scale(crab_dat_m$CW)

crab_dat <- rbind(crab_dat_f, crab_dat_m)
```

#### TRASHY LIL TEST MODEL

This is purely to look at the effects of temperature in combination with site and year


```{r}

# Check that temperature scaled correctly
bitter <- crab_dat
bitter$Bitter <- as.numeric(bitter$Bitter)
bitter$Temp <- round(bitter$Temp, digits = 1)
bitter$s.Temp <- scale(bitter$Temp)
table(bitter$s.Temp)
table(bitter$Temp)


bitter <- bitter %>%
  group_by(s.Temp) %>%
  summarize(bitter_avg = mean(Bitter)) %>%
  ungroup()

ggplot(bitter) +
  geom_point(aes(x = s.Temp, y = bitter_avg))

# Yep, it scaled correctly!

# Alright, let's build a model with just temperature and site

trash_model <- glmmTMB(Bitter ~ Temp + (1 | Site) + (1 | s.Year),
                       data = crab_dat,
                       family = binomial,
                       na.action = "na.fail")

summary(trash_model)

plot(predictorEffect("Temp", trash_model))

# Okay, let's figure out what the heck is going on with temperature and our random effects!


```


# MODEL OF ALL CRABS

This model will include ALL crabs, both male and female. Therefore, it will not include sex-specific measurements, such as chela height and egg-related measurements

We have a Bernoulli distribution (binomial), plus random effects of year and location


```{r}
# Select all variables to be used in our model for all crabs
allcrabs_dat <- crab_dat %>%
  dplyr::select(c(s.Year, Site, Sex, s.CW, Shell.Condition, Leg.Condition, Bitter, Blackmat, s.Latitude, s.Jul.Day, s.Depth, s.Temp))

# Select independent variables
modeled_vars <- names(allcrabs_dat)
modeled_vars <- modeled_vars[!modeled_vars %in% c("s.Year", "Site", "Bitter")]

# Initialize dataframe with model values
AIC_vals <- matrix(nrow = length(modeled_vars), ncol = 2)

# Create a null model and get AIC
null_mod <- glmer(Bitter ~ (1 | Site) + (1 | s.Year), 
                  data = allcrabs_dat,
                  family = binomial)
AIC_null <- extractAIC(null_mod)[2]
AIC_vals[1, 1] <- "null_mod"
AIC_vals[1, 2] <- extractAIC(null_mod)[2] - AIC_null

# Create for loop to extract AIC for all variables
for (i in 1:length(modeled_vars)){
  my_formula = paste0("Bitter ~ ", modeled_vars[i], " + (1 | Site) + (1 | s.Year)")
  test_mod <- glmer(my_formula,
                    data = allcrabs_dat,
                    family = binomial)
  
  AIC_vals[i, 1] <- modeled_vars[i]
  AIC_vals[i, 2] <- extractAIC(test_mod)[2] - AIC_null
}
beep()


# See which variables improve the model the most
# I just printed these and reordered them around a bit
AIC_vals[order(as.numeric(AIC_vals[, 2])), ]
```

Here's the order of the impact of our variables, from most to least impactful. 

Shell condition
Temperature
CW
Black Mat
Leg condition
Julian day
Sex
---------Null model line ------------------
Latitude
Depth

### Build model

```{r}
full_model <- glmmTMB(Bitter ~ Shell.Condition + s.Temp + s.CW + Blackmat + Leg.Condition + s.Jul.Day + Sex + s.Latitude + s.Depth + (1 | Site) + (1 | s.Year),
                       data = allcrabs_dat,
                       family = binomial,
                       na.action = "na.fail")              # This line is for the dredge() function used later
                       

check_collinearity(full_model)

# Alright, VIFs look good!

```


### Model Diagnostics

Now that we have produced a full model, before we start fine-tuning it, we need to do some diagnostics to ensure it meets our assumptions

```{r}
# Simulate residuals and plot
testOutliers(full_model, alternative = "two.sided", margin = "both", type = "bootstrap", plot = TRUE)
simulateResiduals(full_model, plot = TRUE)

# Perform ANOVA on model fits
car::Anova(full_model)

# Alright, looks like in our full model, everything is significant except leg condition and depth (with latitude being barely significant)

# Test for effects. You can plot these all together with plot(allEffects(full_model)), but that gets crowded visually
plot(predictorEffect("Shell.Condition", full_model))
plot(predictorEffect("s.Temp", full_model))
plot(predictorEffect("s.CW", full_model))
plot(predictorEffect("Blackmat", full_model))
plot(predictorEffect("Leg.Condition", full_model))
plot(predictorEffect("s.Jul.Day", full_model))
plot(predictorEffect("Sex", full_model))
plot(predictorEffect("s.Latitude", full_model))
plot(predictorEffect("s.Depth", full_model))

# Get influence measures of each datapoint.
# Sadly, the only way to do this with glmmTMB is bruteforce it
# That means refitting the model with each element removed using glmmTMB::influence_mixed()
# May prove to be too long to be feasible, let's test!

# Update: yep, it's too long, Did the calculations, it took us 
# about 12 hours to run dredge(), which analyzed 512 possible models
# This would be 150,000 possible models, which would be almost 150 days

# Check significance of random effects using bootstrapped LRT

full_model.onlySite <- glmmTMB(Bitter ~ Shell.Condition + s.Temp + s.CW + Blackmat + Leg.Condition + s.Jul.Day + Sex + s.Latitude + s.Depth + (1 | Site),
                       data = allcrabs_dat,
                       family = binomial,
                       na.action = "na.fail")

  
```
### Dredging

We will now use the dredge() function from the MuMIn package to go through each of our model possibilities and select an optimal full model using AICc.

```{r}
all_mods <- dredge(full_model, beta = "none",
       eval = TRUE,
       rank = "AICc")

plot(all_mods)

# Looks like we have four good models (weights > 0.2) and four marginal models (weights between 0.04 and 0.01)
best_mods <- get.models(all_mods, subset = weight > 0.01)

all_best_mod <- get.models(all_mods, subset = 1)[[1]]

# See what each of the best models look like
best_mods[1]
best_mods[2] # Same as #1, but drops Depth
best_mods[3] # Same as #1, but includes Leg Condition
best_mods[4] # Same as #1, but includes Leg Condition and drops Depth
best_mods[5] # Same as #1, but drops Latitude
best_mods[6] # Same as #1, but includes Leg Condition and drops Latitude
best_mods[7] # Same as #1, but drops Depth and Latitude
best_mods[8] # Same as #1, but includes Leg Condition and drops Depth + Latitude

# Average models based on AICc
avg_model_test <- model.avg(best_mods, beta = "none")

# See what that average model looks like
avg_model$coefficients
summary(avg_model)

# Save each of the best models! This lets us reload them in case R crashes
# This saves a LOT of time - dredge() takes forever to run!
for (i in 1:length(best_mods)) {
  saveRDS(best_mods[i], file = paste0("../output/ADFG_SE_AK_pot_surveys/models/weighted_models/all_crabs/model", i, ".rds"))
}

# Also save the average model
saveRDS(avg_model, file = "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/all_crabs/avg_model.rds")

# Un-comment the below code to re-read model files back in when R crashes

#### Set model filepath
#model_filepath <- "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/all_crabs/"

##### Read in average model
#avg_model <- readRDS(file = paste0(model_filepath, "avg_model.rds"))

# Read in best models individually

#### Get vector of relevant files
#best_mod_files <- list.files(model_filepath)
#### Remove average model from vector
#best_mod_files <- best_mod_files[grep("^model", best_mod_files)]
#### Create blank list
#best_mods <- list()
##### Read in files
#for (i in 1:length(best_mod_files)) {
#  best_mods[i] <- readRDS(file = paste0(model_filepath, best_mod_files[i]))
#}
```

### Get a summary of full model
```{r}
# Set up path to put plots
plot_path <- "../output/ADFG_SE_AK_pot_surveys/model_results/full_models/all_crabs/images/"

# Get coefficients
summary(avg_model)

# Plot coefficients, and save file
png(filename = paste0(plot_path, "coefficients.png"))
plot(avg_model)
dev.off()

# Plot predictor effects
plot(allEffects("Shell.Condition", test))


# First, get the names of our variables
test <- summary(avg_model)
var_names <- colnames(test$coefficients)
attr(var_names, "order") <- NULL
var_names <- 


png(filename="your/file/location/name.png")
plot(fit)
dev.off()

# Plot predictor effects

plot(predictorEffect("s.Temp", avg_model))

plot(predictorEffect("Shell.Condition", best_mods[1]))
plot(predictorEffect("s.Temp", full_model))
plot(predictorEffect("s.CW", full_model))
plot(predictorEffect("Blackmat", full_model))
plot(predictorEffect("Leg.Condition", full_model))
plot(predictorEffect("s.Jul.Day", full_model))
plot(predictorEffect("Sex", full_model))
plot(predictorEffect("s.Latitude", full_model))
plot(predictorEffect("s.Depth", full_model))

```


### More tests 
```{r}
#### Test dispersion of each of our models

# Note: they're all going to fail the KS test, just because our sample size is huge. As long as the QQ plot looks fine, no cause for concern. 

# First plot: a Q-Q plot and a plot of residuals. THe latter should appear as a line around 0.5, followed by a few dots up at 1

# Second plot: Standardized residuals for crabs predicted as bitter and not bitter. Has the number of the model in the title (1 = best model, 14 = worst model)

# Third plot: Dispersion test of residuals. Red line should be approx around mean, p value should be not significant

# Fourth plot: Dispersion test for zero inflation. Red line should be approx around mean, p value should be not significant

for (i in 1:length(best_mods)){
  print(paste("Model", i))
  simulationOutput <- simulateResiduals(fittedModel = best_mods[i][[1]], plot = TRUE) 
  plotResiduals(simulationOutput, allcrabs_dat$Bitter, quantreg = TRUE, rank = TRUE, main = paste("Model", i))
  testDispersion(simulationOutput, alternative = "greater", plot = TRUE)
  testZeroInflation(simulationOutput)
}

# Lovely, all models look just fine!
```

Quick break just so we can visualize plots easier

```{r}

# Create for loop, predicting data and plotting results for each model
# Predicted should effectively be a line with no outliers, histogram should be one block

for (i in 1:length(best_mods)) {
  # Simulate data, see how accurate models are. This uses our custom function pred_dat()
  pred_data <- pred_dat(model = best_mods[i][[1]], data = allcrabs_dat, num_sims = 500)
  # Plot simulated data with a red line as the real data
  plot(pred_data$sim_num, pred_data$total_bitter, main = paste("Model", i)) +
    abline(h = sum(allcrabs_dat$Bitter == 1), col = "red")
  # Plot percent change from real data
  hist(pred_data$pct_change_from_data, main = paste("Histogram of Model", i))
  # Print total mean difference from data
  print(paste("Mean difference for Model", i, "is", mean(pred_data$total_bitter) - sum(allcrabs_dat$Bitter == 1)))
}
```

# MODEL OF FEMALE CRABS

This model will include all female crabs with egg-related measurements

### Filter and re-scale to get correct data for model

```{r}
# Select only female crabs
fem_crabs <- crab_dat %>%
  filter(Sex == "2")

# Drop chela height column
fem_crabs <- fem_crabs %>%
  dplyr::select(-Chela.Ht)

# Drop scaled columns - we'll scale them again
fem_crabs <- fem_crabs %>%
  dplyr::select(-contains("s."))

# We'll try to move Egg Development and Egg Condition into a single column, a lot of the data is redundant or useless

# We likely should have just done this in script 2_7, but hey, no time like the present (and I don't want to go back and potentially have to re-run my all crabs model)

# Move barren clean/matted data into the Egg Development column
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Condition == "Barren_Clean", "Barren_Clean", as.character(Egg.Development)))
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Condition == "Barren_Matted", "Barren_Matted", as.character(Egg.Development)))

# Move dead eggs data into Egg Development column. We only have ~50 with dead eggs over 20%, so we'll just note all as having dead eggs
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Condition == "Dead_eggs_over_20pct", "Dead_Eggs", as.character(Egg.Development)))
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Condition == "Dead_eggs_under_20pct", "Dead_Eggs", as.character(Egg.Development)))

# Only three crabs have eyed eggs, we'll just remove those
fem_crabs <- fem_crabs %>%
  filter(Egg.Development != "Eyed")

# Change all crabs with a recruit status of Juvenile Female to Egg Development = Juvenile
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Recruit.Status == "Juvenile_Female", "Juvenile", as.character(Egg.Development)))

# Alright, we can remove the Egg Condition column
fem_crabs <- fem_crabs %>%
  dplyr::select(-Egg.Condition)

# If a crab has an egg percent of zero (and isn't a juvenile or barren), change to NA in egg development (since we don't know if barren or matted)
fem_crabs <- fem_crabs %>%
  mutate(Egg.Development = ifelse(Egg.Percent == 0 & (Egg.Development == "Dead_Eggs" | Egg.Development == "Uneyed"), NA, as.character(Egg.Development)))

# If a crab is juvenile or barren, change egg percent to zero
fem_crabs <- fem_crabs %>%
  mutate(Egg.Percent = ifelse(Egg.Percent != 0 & (Egg.Development == "Barren_Clean" | Egg.Development == "Barren_Matted" | Egg.Development == "Juvenile"), NA, Egg.Percent))

# See how many NAs we have
colSums(is.na(fem_crabs))

# Drop all rows with NAs
fem_crabs <- na.omit(fem_crabs)
# Great, we've got around 25,000 rows! Neat stuff.

#### Re-scale all variables for our new dataset

# Subtract 2004 from all years, so that our earliest year is 1
fem_crabs$s.Year <- fem_crabs$Year-(min(fem_crabs$Year)-1)

# Scale latitude, longitude, Julian day, depth, and temperature
fem_crabs$s.Latitude <- scale(fem_crabs$Latitude)
fem_crabs$s.Longitude <- scale(fem_crabs$Longitude)
fem_crabs$s.Jul.Day <- scale(fem_crabs$Jul.Day)
fem_crabs$s.Depth <- scale(fem_crabs$Depth)
fem_crabs$s.Temp <- scale(fem_crabs$Temp)
# Just need to scale CW once this time
fem_crabs$s.CW <- scale(fem_crabs$CW)
# Need to also do egg percent this time!
fem_crabs$s.Egg.Percent <- scale(fem_crabs$Egg.Percent)

colnames(fem_crabs)
```


### Select variables to be used in model

```{r}
# Select all variables to be used in our model for female crabs
femcrabs_dat <- fem_crabs %>%
  dplyr::select(c(s.Year, Site, s.CW, Shell.Condition, Egg.Development, Leg.Condition, Bitter, s.Egg.Percent, Blackmat, s.Latitude, s.Jul.Day, s.Depth, s.Temp))

# Select independent variables
modeled_vars <- names(femcrabs_dat)
modeled_vars <- modeled_vars[!modeled_vars %in% c("s.Year", "Site", "Bitter")]

# Initialize dataframe with model values
AIC_vals <- matrix(nrow = length(modeled_vars), ncol = 2)

# Create a null model and get AIC
null_mod <- glmmTMB(Bitter ~ (1 | Site) + (1 | s.Year), 
                  data = femcrabs_dat,
                  family = binomial,
                  na.action = "na.fail")

AIC_null <- extractAIC(null_mod)[2]
AIC_vals[1, 1] <- "null_mod"
AIC_vals[1, 2] <- extractAIC(null_mod)[2] - AIC_null

# Create for loop to extract AIC for all variables
for (i in 1:length(modeled_vars)){
  my_formula = paste0("Bitter ~ ", modeled_vars[i], " + (1 | Site) + (1 | s.Year)")
  test_mod <- glmer(my_formula,
                    data = femcrabs_dat,
                    family = binomial)
  
  AIC_vals[i, 1] <- modeled_vars[i]
  AIC_vals[i, 2] <- extractAIC(test_mod)[2] - AIC_null
}
beep()


# See which variables improve the model the most
AIC_vals[order(as.numeric(AIC_vals[, 2])), ]

```

Here's the order of the impact of our variables, from most to least impactful. 

Shell Condition
Egg Percent
Egg Development
Black Mat
Leg Condition
CW
Temperature
Depth
Latitude
[Null model]
Julian Day

### Build model

Alright, we'll again use glmmTMB to build a full model, then we'll check for collinearity

```{r}
fem_full_model <- glmmTMB(Bitter ~ Shell.Condition + s.Egg.Percent + Egg.Development + Blackmat + Leg.Condition + s.CW + s.Temp + s.Depth + s.Latitude + s.Jul.Day + (1 | Site) + (1 | s.Year),
                       data = femcrabs_dat,
                       family = binomial,
                       na.action = "na.fail"              # This line is for the dredge() function used later
                       )

check_collinearity(fem_full_model)

# Wow, everything looks great! Was worried about having issues with egg percent and egg development (since barren = 0), but apparently all good!
```

### Model Diagnostics

Now that we have produced a full model, before we start fine-tuning it, we need to do some diagnostics to ensure it meets our assumptions

```{r}
# Simulate residuals and plot
testOutliers(fem_full_model, alternative = "two.sided", margin = "both", type = "bootstrap", plot = TRUE)
simulateResiduals(fem_full_model, plot = TRUE)

# Perform ANOVA on model fits
car::Anova(fem_full_model)

# Alright, looks like in our full model, everything is significant except Black Mat and depth, with latitude and leg condition being just barely significant.

# Test for effects. You can plot these all together with plot(allEffects(fem_full_model)), but that gets crowded visually
plot(predictorEffect("Shell.Condition", fem_full_model))
plot(predictorEffect("s.Egg.Percent", fem_full_model))
plot(predictorEffect("Egg.Development", fem_full_model))
plot(predictorEffect("Blackmat", fem_full_model))
plot(predictorEffect("Leg.Condition", fem_full_model))
plot(predictorEffect("s.CW", fem_full_model))
plot(predictorEffect("s.Temp", fem_full_model))
plot(predictorEffect("s.Depth", fem_full_model))
plot(predictorEffect("s.Latitude", fem_full_model))
plot(predictorEffect("s.Jul.Day", fem_full_model))

```

           Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
 Shell.Condition 1.14 [1.13, 1.16]         1.07      0.87     [0.86, 0.89]
   s.Egg.Percent 1.35 [1.33, 1.37]         1.16      0.74     [0.73, 0.75]
 Egg.Development 1.48 [1.45, 1.50]         1.22      0.68     [0.67, 0.69]
        Blackmat 1.03 [1.02, 1.04]         1.01      0.97     [0.96, 0.98]
   Leg.Condition 1.01 [1.00, 1.04]         1.01      0.99     [0.97, 1.00]
            s.CW 1.01 [1.00, 1.03]         1.01      0.99     [0.97, 1.00]
          s.Temp 1.71 [1.68, 1.74]         1.31      0.58     [0.57, 0.59]
         s.Depth 1.80 [1.77, 1.83]         1.34      0.56     [0.55, 0.57]
      s.Latitude 1.16 [1.15, 1.18]         1.08      0.86     [0.85, 0.87]
       s.Jul.Day 1.70 [1.67, 1.73]         1.31      0.59     [0.58, 0.60]
### Dredging

We will now use the dredge() function from the MuMIn package to go through each of our model possibilities and select an optimal full model using AIC.

```{r}
fem_all_mods <- dredge(fem_full_model, beta = "none",
       eval = TRUE,
       rank = "AICc")

plot(fem_all_mods)

# We have one very good model (weight = 0.30), a good model (weight = 0.18), 
# five alright models (weights < .10 & > 0.05), and seven meh models (weights < 0.05 & > 0.01)
fem_best_mods <- get.models(fem_all_mods, subset = weight > 0.01)

summary(fem_best_mods)

# See what each of the models look like. Comments compare it to the best model (1)
fem_best_mods[1]                               
fem_best_mods[2] # Adds depth
# Start of alright models
fem_best_mods[3] # Drops Black Mat
fem_best_mods[4] # Drops latitude
fem_best_mods[5] # Adds depth, drops latitude
fem_best_mods[6] # Drops leg condition
fem_best_mods[7] # Adds depth, drops Black Mat
# Start of meh models
fem_best_mods[8] # Drops leg condition, adds depth
fem_best_mods[9] # Drops Black Mat and latitude
fem_best_mods[10] # Drops Black Mat and latitude, adds depth
fem_best_mods[11] # Drops Black Mat and leg condition
fem_best_mods[12] # Drops leg condition and latitude
fem_best_mods[13] # Drops leg condition and latitude, adds depth
fem_best_mods[14] # Drops Black Mat and leg condition, adds depth

# Average models based on AICc
fem_avg_model <- model.avg(fem_best_mods, beta = "none")

# See what that average model looks like
fem_avg_model$coefficients
summary(fem_avg_model)

# Save each of the best models! This lets us reload them in case R crashes
# This saves a LOT of time - dredge() takes forever to run!
for (i in 1:length(fem_best_mods)) {
  saveRDS(fem_best_mods[i], file = paste0("../output/ADFG_SE_AK_pot_surveys/models/weighted_models/fem_crabs/model", i, ".rds"))
}

# Also save the average model
saveRDS(fem_avg_model, file = "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/fem_crabs/avg_model.rds")

# Un-comment the below code to re-read model files back in when R crashes

#### Set model filepath
#model_filepath <- "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/fem_crabs/"

##### Read in average model
#fem_avg_model <- readRDS(file = paste0(model_filepath, "avg_model.rds"))

# Read in best models individually

### Get vector of relevant files
#fem_best_mod_files <- list.files(model_filepath)
### Remove average model from vector
#fem_best_mod_files <- fem_best_mod_files[grep("^model", fem_best_mod_files)]
### Create blank list
#fem_best_mods <- list()
#### Read in files
#for (i in 1:length(fem_best_mod_files)) {
#  fem_best_mods[i] <- readRDS(file = paste0(model_filepath, fem_best_mod_files[i]))
#}
```

### More tests 
```{r}
#### Test dispersion of each of our models

# Note: they're all going to fail the KS test, just because our sample size is huge. As long as the QQ plot looks fine, no cause for concern. 

# First plot: a Q-Q plot and a plot of residuals. THe latter should appear as a line around 0.5, followed by a few dots up at 1

# Second plot: Standardized residuals for crabs predicted as bitter and not bitter. Has the number of the model in the title (1 = best model, 14 = worst model)

# Third plot: Dispersion test of residuals. Red line should be approx around mean, p value should be not significant

# Fourth plot: Dispersion test for zero inflation. Red line should be approx around mean, p value should be not significant

for (i in 1:length(fem_best_mods)){
  print(paste("Model", i))
  simulationOutput <- simulateResiduals(fittedModel = fem_best_mods[i][[1]], plot = TRUE) 
  plotResiduals(simulationOutput, femcrabs_dat$Bitter, quantreg = TRUE, rank = TRUE, main = paste("Model", i))
  testDispersion(simulationOutput, alternative = "greater", plot = TRUE)
  testZeroInflation(simulationOutput)
}

# Lovely, all models look just fine!
```
Quick break just so we can visualize plots easier

```{r}

# Create for loop, predicting data and plotting results for each model
# Predicted should effectively be a line with no outliers, histogram should be one block

for (i in 1:length(fem_best_mods)) {
  # Simulate data, see how accurate models are. This uses our custom function pred_dat()
  pred_data <- pred_dat(model = fem_best_mods[i][[1]], data = femcrabs_dat, num_sims = 500)
  # Plot simulated data with a red line as the real data
  plot(pred_data$sim_num, pred_data$total_bitter, main = paste("Model", i)) +
    abline(h = sum(femcrabs_dat$Bitter == 1), col = "red")
  # Plot percent change from real data
  hist(pred_data$pct_change_from_data, main = paste("Histogram of Model", i))
  # Print total mean difference from data
  print(paste("Mean difference for Model", i, "is", mean(pred_data$total_bitter) - sum(femcrabs_dat$Bitter == 1)))
}
```


# MODEL OF MALE CRABS

This model will include all male crabs with chela height measurements

### Filter and re-scale to get correct data for model

```{r}
# Select only male crabs
masc_crabs <- crab_dat %>%
  filter(Sex == "1")

# Drop egg columns
masc_crabs <- masc_crabs %>%
  dplyr::select(-c(Egg.Development, Egg.Condition, Egg.Percent))

# Drop scaled columns - we'll scale them again
masc_crabs <- masc_crabs %>%
  dplyr::select(-contains("s."))

# We'll use the ratio of chela height to carapace width to indicate crab maturity
# Note: This is using the EBS line for height-CW, will have to get the SE AK ratio later
# EBS equation = ln(chela_ht) = (1.189*ln(CW)) - 2.674
# Create new column for maturity line. Remember, log() computes ln, R is weird like that 
# Also, male crabs with a CW < 60 are immature
masc_crabs$mat_line <- (1.189 * log(masc_crabs$CW)) - 2.674

# Drop crabs without chela height value
masc_crabs <- masc_crabs[!is.na(masc_crabs$Chela.Ht),]

# Calculate maturity value. 
masc_crabs$mat_val <- log(masc_crabs$Chela.Ht)

# Create column for maturity status. If maturity value is greater than maturity line and CW >= 60 cm, is mature
masc_crabs <- masc_crabs %>%
  mutate(mat_stat = case_when(
    mat_val <= mat_line | CW <= 60 ~ "1",
    mat_val > mat_line ~ "2"
    ))

# Remove maturity value and line columns, no longer needed
masc_crabs <- masc_crabs %>%
  dplyr::select(-c(mat_val, mat_line))

# See how many NAs we have
colSums(is.na(masc_crabs))

# Great, we've got around 25,000 rows! Neat stuff.

#### Re-scale all variables for our new dataset

# Subtract 2004 from all years, so that our earliest year is 1
masc_crabs$s.Year <- masc_crabs$Year-(min(masc_crabs$Year)-1)

# Scale latitude, longitude, Julian day, depth, and temperature
masc_crabs$s.Latitude <- scale(masc_crabs$Latitude)
masc_crabs$s.Longitude <- scale(masc_crabs$Longitude)
masc_crabs$s.Jul.Day <- scale(masc_crabs$Jul.Day)
masc_crabs$s.Depth <- scale(masc_crabs$Depth)
masc_crabs$s.Temp <- scale(masc_crabs$Temp)
# Just need to scale CW once this time
masc_crabs$s.CW <- scale(masc_crabs$CW)

colnames(masc_crabs)
```


### Select variables to be used in model

```{r}
# Select all variables to be used in our model for male crabs
masccrabs_dat <- masc_crabs %>%
  dplyr::select(c(s.Year, Site, s.CW, Shell.Condition, Leg.Condition, Bitter, Blackmat, s.Latitude, s.Jul.Day, s.Depth, s.Temp, mat_stat))

# Select independent variables
modeled_vars <- names(masccrabs_dat)
modeled_vars <- modeled_vars[!modeled_vars %in% c("s.Year", "Site", "Bitter")]

# Initialize dataframe with model values
AIC_vals <- matrix(nrow = length(modeled_vars), ncol = 2)

# Create a null model and get AIC
null_mod <- glmmTMB(Bitter ~ (1 | Site) + (1 | s.Year), 
                  data = masccrabs_dat,
                  family = binomial,
                  na.action = "na.fail")

AIC_null <- extractAIC(null_mod)[2]
AIC_vals[1, 1] <- "null_mod"
AIC_vals[1, 2] <- extractAIC(null_mod)[2] - AIC_null

# Create for loop to extract AIC for all variables
for (i in 1:length(modeled_vars)){
  my_formula = paste0("Bitter ~ ", modeled_vars[i], " + (1 | Site) + (1 | s.Year)")
  test_mod <- glmer(my_formula,
                    data = masccrabs_dat,
                    family = binomial)
  
  AIC_vals[i, 1] <- modeled_vars[i]
  AIC_vals[i, 2] <- extractAIC(test_mod)[2] - AIC_null
}
beep()


# See which variables improve the model the most
AIC_vals[order(as.numeric(AIC_vals[, 2])), ]

```

Here's the order of the impact of our variables, from most to least impactful. 

Shell Condition
Maturity
Carapace width
Temperature
Julian day
Black Mat infection
Leg condition
[Null model]
Latitude
Depth

### Build model

Alright, we'll again use glmmTMB to build a full model, then we'll check for collinearity

```{r}
masc_full_model <- glmmTMB(Bitter ~ Shell.Condition + mat_stat + s.CW + s.Temp + s.Jul.Day + Blackmat + Leg.Condition + s.Latitude + s.Depth + (1 | Site) + (1 | s.Year),
                       data = masccrabs_dat,
                       family = binomial,
                       na.action = "na.fail"              # This line is for the dredge() function used later
                       )

check_collinearity(masc_full_model)

# Everything looks great!
```

### Model Diagnostics

Now that we have produced a full model, before we start fine-tuning it, we need to do some diagnostics to ensure it meets our assumptions

```{r}
# Simulate residuals and plot
testOutliers(masc_full_model, alternative = "two.sided", margin = "both", type = "bootstrap", plot = TRUE)
simulateResiduals(masc_full_model, plot = TRUE)

# Perform ANOVA on model fits
car::Anova(masc_full_model)

# Alright, looks like in our full model, shell condition, maturity, CW, temperature, and leg condition are significant, while Julian day, Black Mat, Latitude, and Depth aren't.

# Test for effects. You can plot these all together with plot(allEffects(masc_full_model)), but that gets crowded visually
plot(predictorEffect("Shell.Condition", masc_full_model))
plot(predictorEffect("mat_stat", masc_full_model))
plot(predictorEffect("s.CW", masc_full_model))
plot(predictorEffect("s.Temp", masc_full_model))
plot(predictorEffect("s.Jul.Day", masc_full_model))
plot(predictorEffect("Blackmat", masc_full_model))
plot(predictorEffect("Leg.Condition", masc_full_model))
plot(predictorEffect("s.Latitude", masc_full_model))
plot(predictorEffect("s.Depth", masc_full_model))
```

           Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
 Shell.Condition 1.14 [1.13, 1.16]         1.07      0.87     [0.86, 0.89]
   s.Egg.Percent 1.35 [1.33, 1.37]         1.16      0.74     [0.73, 0.75]
 Egg.Development 1.48 [1.45, 1.50]         1.22      0.68     [0.67, 0.69]
        Blackmat 1.03 [1.02, 1.04]         1.01      0.97     [0.96, 0.98]
   Leg.Condition 1.01 [1.00, 1.04]         1.01      0.99     [0.97, 1.00]
            s.CW 1.01 [1.00, 1.03]         1.01      0.99     [0.97, 1.00]
          s.Temp 1.71 [1.68, 1.74]         1.31      0.58     [0.57, 0.59]
         s.Depth 1.80 [1.77, 1.83]         1.34      0.56     [0.55, 0.57]
      s.Latitude 1.16 [1.15, 1.18]         1.08      0.86     [0.85, 0.87]
       s.Jul.Day 1.70 [1.67, 1.73]         1.31      0.59     [0.58, 0.60]
### Dredging

We will now use the dredge() function from the MuMIn package to go through each of our model possibilities and select an optimal full model using AIC.

```{r}
masc_all_mods <- dredge(masc_full_model, beta = "none",
       eval = TRUE,
       rank = "AICc")

plot(masc_all_mods)

# Looks like we have 16 total models above 0.01. Of these, two are best (combined weight: 0.39), and an additional 6 are decent (combined weight with the two best: combined weight: 0.83). There are then 9 more models with weights from 0.3 to 0.1

masc_best_mods <- get.models(masc_all_mods, subset = weight > 0.01)

# See what each of the models looks like
# Descriptions compare each to the best model (Model 1)
masc_best_mods[1]
masc_best_mods[2]  # Drops Julian day
masc_best_mods[3]  # Adds Depth
masc_best_mods[4]  # Adds Depth, drops Julian day
masc_best_mods[5]  # Adds Black Mat
masc_best_mods[6]  # Adds Latitude
masc_best_mods[7]  # Adds Latitude, drops Julian day
masc_best_mods[8]  # Adds Black Mat, drops Julian day
masc_best_mods[9]  # Adds Black Mat and Depth
masc_best_mods[10] # Adds Depth and Latitude
masc_best_mods[11] # Adds Depth and Latitude, drops Julian day
masc_best_mods[12] # Adds Black Mat and Depth, drops Julian day
masc_best_mods[13] # Adds Black Mat and Latitude
masc_best_mods[14] # Adds Black Mat and Latitude, drops Julian day
masc_best_mods[15] # Adds Black Mat, Depth, and Latitude
masc_best_mods[16] # Adds Black Mat, Depth, and Latitude, drops Julian day

# Average models based on AICc
masc_avg_model <- model.avg(masc_best_mods, beta = "none")

# See what that average model looks like
masc_avg_model$coefficients
summary(masc_avg_model)

# Save each of the best models! This lets us reload them in case R crashes
# This saves a LOT of time - dredge() takes forever to run!
for (i in 1:length(masc_best_mods)) {
  saveRDS(masc_best_mods[i], file = paste0("../output/ADFG_SE_AK_pot_surveys/models/weighted_models/male_crabs/model", i, ".rds"))
}

# Also save the average model
saveRDS(masc_avg_model, file = "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/male_crabs/avg_model.rds")

# Un-comment the below code to re-read model files back in when R crashes

#### Set model filepath
#model_filepath <- "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/male_crabs/"

##### Read in average model
#masc_avg_model <- readRDS(file = paste0(model_filepath, "avg_model.rds"))

# Read in best models individually

#### Get vector of relevant files
#masc_best_mod_files <- list.files(model_filepath)
#### Remove average model from vector
#masc_best_mod_files <- masc_best_mod_files[grep("^model", masc_best_mod_files)]
#### Create blank list
#masc_best_mods <- list()
#### Read in files
#for (i in 1:length(masc_best_mod_files)) {
#  masc_best_mods[i] <- readRDS(file = paste0(model_filepath, masc_best_mod_files[i]))
#}
```
### More tests 
```{r}
#### Test dispersion of each of our models

# Note: they're all going to fail the KS test, just because our sample size is huge. As long as the QQ plot looks fine, no cause for concern. 

# First plot: a Q-Q plot and a plot of residuals. THe latter should appear as a line around 0.5, followed by a few dots up at 1

# Second plot: Standardized residuals for crabs predicted as bitter and not bitter. Has the number of the model in the title (1 = best model, 14 = worst model)

# Third plot: Dispersion test of residuals. Red line should be approx around mean, p value should be not significant

# Fourth plot: Dispersion test for zero inflation. Red line should be approx around mean, p value should be not significant

for (i in 1:length(masc_best_mods)){
  print(paste("Model", i))
  simulationOutput <- simulateResiduals(fittedModel = masc_best_mods[i][[1]], plot = TRUE) 
  plotResiduals(simulationOutput, masccrabs_dat$Bitter, quantreg = TRUE, rank = TRUE, main = paste("Model", i))
  testDispersion(simulationOutput, alternative = "greater", plot = TRUE)
  testZeroInflation(simulationOutput)
}

# Lovely, all models look just fine!
```

Quick break just so we can visualize plots easier

```{r}

# Create for loop, predicting data and plotting results for each model
# Predicted should effectively be a line with no outliers, histogram should be one block

for (i in 1:length(masc_best_mods)) {
  # Simulate data, see how accurate models are. This uses our custom function pred_dat()
  pred_data <- pred_dat(model = masc_best_mods[i][[1]], data = masccrabs_dat, num_sims = 500)
  # Plot simulated data with a red line as the real data
  plot(pred_data$sim_num, pred_data$total_bitter, main = paste("Model", i)) +
    abline(h = sum(masccrabs_dat$Bitter == 1), col = "red")
  # Plot percent change from real data
  hist(pred_data$pct_change_from_data, main = paste("Histogram of Model", i))
  # Print total mean difference from data
  print(paste("Mean difference for Model", i, "is", mean(pred_data$total_bitter) - sum(masccrabs_dat$Bitter == 1)))
}
```

# MODEL W MATURITY

Hey, now that we have mature/immature status for both males and females, why not do another full model? This time, it'll examine maturity as well - same as the model of all crabs, but only including crabs with maturity status (and, of course, adding maturity status as a variable)


This model will include all crabs with maturity measurements

### Filter and re-scale to get correct data for model

```{r}
# Create new dataframes from sex-specific dataframes created earlier
mat_crabs_m <- masc_crabs
mat_crabs_f <- fem_crabs

# Drop all scaled variables from each dataframe - we'll re-scale them (once combined for all except CW)
mat_crabs_m <- mat_crabs_m %>%
  dplyr::select(-contains("s."))
mat_crabs_f <- mat_crabs_f %>%
  dplyr::select(-contains("s."))

# Create new maturity column within female dataframe (male one already has one)
mat_crabs_f$mat_stat <- car::recode(mat_crabs_f$Egg.Development, "'Juvenile' = '1'; else = '2'")

# Drop sex-specific columns from dataframes
mat_crabs_f <- mat_crabs_f %>%
  dplyr::select(-c(Egg.Development, Egg.Percent))
mat_crabs_m <- mat_crabs_m %>%
  dplyr::select(-Chela.Ht)

# Scale CW for each sex separately
mat_crabs_f$s.CW <- scale(mat_crabs_f$CW)
mat_crabs_m$s.CW <- scale(mat_crabs_m$CW)

# Join the two datasets
mat_crabs <- rbind(mat_crabs_f, mat_crabs_m)

#### Re-scale all variables for our new dataset

# Subtract 2004 from all years, so that our earliest year is 1
mat_crabs$s.Year <- mat_crabs$Year-(min(mat_crabs$Year)-1)

# Scale latitude, longitude, Julian day, depth, and temperature
mat_crabs$s.Latitude <- scale(mat_crabs$Latitude)
mat_crabs$s.Longitude <- scale(mat_crabs$Longitude)
mat_crabs$s.Jul.Day <- scale(mat_crabs$Jul.Day)
mat_crabs$s.Depth <- scale(mat_crabs$Depth)
mat_crabs$s.Temp <- scale(mat_crabs$Temp)

# Change maturity status to factor
mat_crabs$mat_stat <- as.factor(mat_crabs$mat_stat)

colnames(mat_crabs)
```


### Select variables to be used in model

```{r}
# Select all variables to be used in our model for male crabs
matcrabs_dat <- mat_crabs %>%
  dplyr::select(c(s.Year, Site, Sex, s.CW, Shell.Condition, Leg.Condition, Bitter, Blackmat, s.Latitude, s.Jul.Day, s.Depth, s.Temp, mat_stat))

# Select independent variables
modeled_vars <- names(matcrabs_dat)
modeled_vars <- modeled_vars[!modeled_vars %in% c("s.Year", "Site", "Bitter")]

# Initialize dataframe with model values
AIC_vals <- matrix(nrow = length(modeled_vars), ncol = 2)

# Create a null model and get AIC
null_mod <- glmmTMB(Bitter ~ (1 | Site) + (1 | s.Year), 
                  data = matcrabs_dat,
                  family = binomial,
                  na.action = "na.fail")

AIC_null <- extractAIC(null_mod)[2]
AIC_vals[1, 1] <- "null_mod"
AIC_vals[1, 2] <- extractAIC(null_mod)[2] - AIC_null

# Create for loop to extract AIC for all variables
for (i in 1:length(modeled_vars)){
  my_formula = paste0("Bitter ~ ", modeled_vars[i], " + (1 | Site) + (1 | s.Year)")
  test_mod <- glmer(my_formula,
                    data = matcrabs_dat,
                    family = binomial)
  
  AIC_vals[i, 1] <- modeled_vars[i]
  AIC_vals[i, 2] <- extractAIC(test_mod)[2] - AIC_null
}
beep()


# See which variables improve the model the most
AIC_vals[order(as.numeric(AIC_vals[, 2])), ]

```

Here's the order of the impact of our variables, from most to least impactful. 

Shell Condition
Black Mat
Carapace width
Temperature
Maturity
Leg Condition
Sex
Julian day
Latitude
Depth

### Build model

Alright, we'll again use glmmTMB to build a full model, then we'll check for collinearity

```{r}
mat_full_model <- glmmTMB(Bitter ~ Shell.Condition + Blackmat + s.CW + s.Temp + mat_stat + Leg.Condition + Sex + s.Jul.Day + s.Latitude + s.Depth + (1 | Site) + (1 | s.Year),
                       data = matcrabs_dat,
                       family = binomial,
                       na.action = "na.fail"              # This line is for the dredge() function used later
                       )

check_collinearity(mat_full_model)

# Everything looks great! 
```


### Model Diagnostics

Now that we have produced a full model, before we start fine-tuning it, we need to do some diagnostics to ensure it meets our assumptions

```{r}
# Simulate residuals and plot
testOutliers(mat_full_model, alternative = "two.sided", margin = "both", type = "bootstrap", plot = TRUE)
simulateResiduals(mat_full_model, plot = TRUE)

# Perform ANOVA on model fits
car::Anova(mat_full_model)

# Alright, looks like in our full model, shell condition, maturity, CW, temperature, and leg condition are significant, while Julian day, Black Mat, Latitude, and Depth aren't.

# Test for effects. You can plot these all together with plot(allEffects(mat_full_model)), but that gets crowded visually
plot(predictorEffect("Shell.Condition", mat_full_model))
plot(predictorEffect("mat_stat", mat_full_model))
plot(predictorEffect("s.CW", mat_full_model))
plot(predictorEffect("s.Temp", mat_full_model))
plot(predictorEffect("s.Jul.Day", mat_full_model))
plot(predictorEffect("Blackmat", mat_full_model))
plot(predictorEffect("Leg.Condition", mat_full_model))
plot(predictorEffect("s.Latitude", mat_full_model))
plot(predictorEffect("s.Depth", mat_full_model))
plot(predictorEffect("Sex", mat_full_model))
```

### Dredging

We will now use the dredge() function from the MuMIn package to go through each of our model possibilities and select an optimal full model using AIC.

```{r}
mat_all_mods <- dredge(mat_full_model, beta = "none",
       eval = TRUE,
       rank = "AICc")

plot(mat_all_mods)

# Looks like we have two good models (weights ~ 0.2), four meh models (weights > 0.05 & < 0.15) and eight marginal models (weights between 0.04 and 0.01). Easier to see this in summary(mat_avg_model) below. 
mat_best_mods <- get.models(mat_all_mods, subset = weight > 0.01)

# See what each of the best models look like
# Comments compare each to the best model (mat_best_mods[1])
mat_best_mods[1]
mat_best_mods[2] # Adds Depth
mat_best_mods[3] # Drops Black Mat
mat_best_mods[4] # Adds Depth, drops Black Mat
mat_best_mods[5] # Drops Leg Condition
mat_best_mods[6] # Adds Depth, drops Leg Condition
mat_best_mods[7] # Drops Black Mat and Leg Condition
mat_best_mods[8] # Adds Depth, drops Black Mat and Leg Condition
mat_best_mods[9] # Drops Julian day
mat_best_mods[10] # Drops Black Mat and Julian day
mat_best_mods[11] # Adds Depth, drops Latitude
mat_best_mods[12] # Drops Latitude
mat_best_mods[13] # Drops Leg Condition and Julian day
mat_best_mods[14] # Adds Depth, drops Julian day

# Average models based on AICc
mat_avg_model <- model.avg(mat_best_mods, beta = "none")

# See what that average model looks like
mat_avg_model$coefficients
summary(mat_avg_model)

# Save each of the best models! This lets us reload them in case R crashes
# This saves a LOT of time - dredge() takes forever to run!
for (i in 1:length(mat_best_mods)) {
  saveRDS(mat_best_mods[i], file = paste0("../output/ADFG_SE_AK_pot_surveys/models/weighted_models/mature_crabs/model", i, ".rds"))
}

# Also save the average model
saveRDS(mat_avg_model, file = "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/mature_crabs/avg_model.rds")

# Un-comment the below code to re-read model files back in when R crashes

### Set model filepath
model_filepath <- "../output/ADFG_SE_AK_pot_surveys/models/weighted_models/mature_crabs/"

#### Read in average model
mat_avg_model <- readRDS(file = paste0(model_filepath, "avg_model.rds"))

# Read in best models individually

### Get vector of relevant files
mat_best_mod_files <- list.files(model_filepath)
### Remove average model from vector
mat_best_mod_files <- mat_best_mod_files[grep("^model", mat_best_mod_files)]
### Create blank list
mat_best_mods <- list()
#### Read in files
for (i in 1:length(mat_best_mod_files)) {
  mat_best_mods[i] <- readRDS(file = paste0(model_filepath, mat_best_mod_files[i]))
}
```

### More tests 
```{r}
#### Test dispersion of each of our models

# Note: they're all going to fail the KS test, just because our sample size is huge. As long as the QQ plot looks fine, no cause for concern. 

# First plot: a Q-Q plot and a plot of residuals. THe latter should appear as a line around 0.5, followed by a few dots up at 1

# Second plot: Standardized residuals for crabs predicted as bitter and not bitter. Has the number of the model in the title (1 = best model, 14 = worst model)

# Third plot: Dispersion test of residuals. Red line should be approx around mean, p value should be not significant

# Fourth plot: Dispersion test for zero inflation. Red line should be approx around mean, p value should be not significant

for (i in 1:length(mat_best_mods)){
  print(paste("Model", i))
  simulationOutput <- simulateResiduals(fittedModel = mat_best_mods[i][[1]], plot = TRUE) 
  plotResiduals(simulationOutput, matcrabs_dat$Bitter, quantreg = TRUE, rank = TRUE, main = paste("Model", i))
  testDispersion(simulationOutput, alternative = "greater", plot = TRUE)
  testZeroInflation(simulationOutput)
}

# Lovely, all models look just fine!
```

Quick break just so we can visualize plots easier

```{r}

# Create for loop, predicting data and plotting results for each model
# Predicted should effectively be a line with no outliers, histogram should be one block

for (i in 1:length(mat_best_mods)) {
  # Simulate data, see how accurate models are. This uses our custom function pred_dat()
  pred_data <- pred_dat(model = mat_best_mods[i][[1]], data = matcrabs_dat, num_sims = 500)
  # Plot simulated data with a red line as the real data
  plot(pred_data$sim_num, pred_data$total_bitter, main = paste("Model", i)) +
    abline(h = sum(matcrabs_dat$Bitter == 1), col = "red")
  # Plot percent change from real data
  hist(pred_data$pct_change_from_data, main = paste("Histogram of Model", i))
  # Print total mean difference from data
  print(paste("Mean difference for Model", i, "is", mean(pred_data$total_bitter) - sum(matcrabs_dat$Bitter == 1)))
}
```