---
title: "Initial Data Visualization " 
author: Chris
date: "`r format(Sys.time(), '%d %B, %Y')`"
# All html_document options must be nested one level below `html_document:`
output:
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
editor:
  markdown:
    wrap: 72
---

### READ ME

We will use the following files to explore our data:

- algae.csv - the algal presence/absence by top layer and second layer
- percentCover.csv - the substrate and second layer information
- truncated.csv - this is a simplified version of species data

Note: Add parameters block/explanation of choices on render.

What I did - will chronicle as things work.

- OLD csv: presence.csv - Presence/absence data of seaweeds (may collapse this further, but it is here as a relic)
- OLD csv: data.csv - Full data set without manipulation

# Package Install and call

```{r}

# Installation is commented out so that knitting does not reinstall the
# packages on every render; remove the hashtag if you need to install
# install.packages("kableExtra")
# install.packages("gridExtra")

```

```{r}

#Use library to call your installed packages for use
library(data.table)
library(dplyr)
library('ggplot2')
library("indicspecies")
library(kableExtra)
library(knitr)
library(RColorBrewer)
library(tidyr)
library(tidyverse)
library(vegan)

```

# Working directory & data load

```{r}

# Print the current working directory (relative file paths are resolved from it)
getwd()

```

```{r}
# Tip: data can also be imported from the 'Files' pane (right click on the file).
# Change the paths below to match where the files live on your machine.

# Species counts per section
counts <- read.csv(file = "truncated.csv", header = TRUE)

# Algal cover in the first and second layer of each quadrat
algae <- read.csv(file = "algae.csv", header = TRUE)

# Substrate percent cover in the first and second layer of each quadrat
substrate <- read.csv(file = "substrate.csv", header = TRUE)

```

# At-a-Glance data frame check

# counts

```{r}

# Show the dimensions (rows, columns) of the data frame: a quick check that
# everything imported the way you expect

# Leftover aliases from an earlier version, kept commented out:
# counts <- truncated
# algae <- algae
# substrate <- substrate
dim(counts)

# Quick view of the data frame structure (column names and types)
str(counts)

# Per-column summary of the data
summary(counts)

```

# algae

```{r}

# Repeat the process for the second data frame (algae). Note that the data
# here are characters and will have to be changed to numbers before we can
# manipulate them.
dim(algae)

str(algae)

summary(algae)

```

# substrate

```{r}

# Repeat the process for the third data frame (substrate).
# NOTE(review): the original comment (copied from the algae chunk) said these
# data are characters that must be converted to numbers - confirm against the
# str() output below.
dim(substrate)

str(substrate)

summary(substrate)

```

# Change dataframe structures so we can work with the data

## counts

```{r}

# Reshape counts from wide (one column per phylum, ANNELIDA through
# OSTEICHTHYES) to long (one row per original row and phylum) so the phyla can
# be grouped and plotted as a stacked bar across the sections.
# pivot_longer() replaces the superseded gather().
counts1 <- counts %>%
  pivot_longer(
    cols = ANNELIDA:OSTEICHTHYES,
    names_to = "Phylum",
    values_to = "Count"
  )

```

```{r}
# Tally how many rows share each SECTION / ZONATION / Phylum / Count
# combination; the tally is stored in column `n` so it can be plotted
counts2 <- tally(group_by(counts1, SECTION, ZONATION, Phylum, Count))
```

```{r}

# Bar chart of the tallies in counts2: one panel per section, one bar per
# phylum, bars filled by the recorded Count value
ggplot(counts2, aes(Phylum, n, fill = Count)) +
  geom_col() +
  facet_wrap(vars(SECTION), ncol = 2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Phylum Count by Section",
    x = "Phylum",
    y = "Count",
    fill = "Count"
  )

```

# Species Richness

## How many different species are present in different sections on YI?

### Note 'species' is a misleading name because these aren't all species, some are phyla. This will be corrected

```{r}
# Total island richness = number of distinct (non-algal) phylum labels.
# NOTE(review): an earlier note recorded 28 unique species here - re-check
# that figure against the value printed below.
phylum_richness <- n_distinct(counts2$Phylum)

print(phylum_richness)
```

# Clarifying species/ phyla by site and zonation

```{r}

# Keep only rows where both ZONATION and Phylum are known
counts2_filtered <- counts2 %>%
  filter(!is.na(ZONATION), !is.na(Phylum))

# Total count per zone and phylum.
# counts2 holds one row per distinct Count value plus its tally `n`, so each
# value has to be weighted by `n`; a plain sum(Count) would count a value that
# occurs several times only once. Missing counts are skipped.
zonation <- counts2_filtered %>%
  group_by(ZONATION, Phylum) %>%
  summarize(Sum_Count = sum(Count * n, na.rm = TRUE), .groups = "drop")


```
```{r}
# Remove rows with NAs in the ZONATION and Phylum columns
counts2_cleaned <- drop_na(counts2_filtered, ZONATION, Phylum)

# Total count per zone and phylum, computed once on the cleaned data
# (the earlier repeated assignments to `zonation` were overwritten anyway).
# counts2 holds one row per distinct Count value plus its tally `n`, so each
# value is weighted by `n`; missing counts are skipped.
zonation <- counts2_cleaned %>%
  group_by(ZONATION, Phylum) %>%
  summarize(Sum_Count = sum(Count * n, na.rm = TRUE), .groups = "drop")


```

```{r}

# Total count per section, zone and phylum.
# counts2 holds one row per distinct Count value plus its tally `n`, so each
# value is weighted by `n` (a plain sum(Count) would undercount repeated
# values). Missing counts are skipped.
section_zone <- counts2 %>%
  group_by(SECTION, ZONATION, Phylum) %>%
  summarize(Sum_Count = sum(Count * n, na.rm = TRUE), .groups = "drop")

```

# Heatmap

## counts

```{r}

# Give ZONATION in section_zone an explicit level order: L, M, U
section_zone$ZONATION <- factor(section_zone$ZONATION, levels = c("L", "M", "U"))

# An earlier idea, kept for reference:
# heatmap_data <- section_zone %>%
#   select(Phylum, ZONATION, Sum_Count)

# Heatmap of the per-zone totals held in `zonation`: one tile per
# zone / phylum pair, shaded from white (low) to blue (high)
ggplot(zonation, aes(ZONATION, Phylum, fill = Sum_Count)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Heatmap of Phylum Counts by Zonation",
    x = "Zonation",
    y = "Phylum",
    fill = "Count"
  )


```

# Stacked Bar plot

## counts

```{r}
# We will have to choose zone over section (or vice versa) so the result is
# easy to read; maybe adjust the way the data are manipulated.

# Stacked bars: sections on the x-axis, summed counts on the y-axis,
# one colour per phylum
ggplot(section_zone, aes(factor(SECTION), Sum_Count, fill = Phylum)) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Phylum Counts by Section",
    x = "Section",
    y = "Count"
  )


```

# NMDS

# Not going to use this plot type - I just wanted to try it out to see if it is helpful

```{r}

# Count missing values in the entire counts data frame
sum(is.na(counts))

# Count missing values per column for columns 3 to 13 of counts
# (presumably the phylum count columns - confirm the column positions)
colSums(is.na(counts[, 3:13]))

# Bray-Curtis dissimilarities between the rows of columns 3 to 13
distance_matrix <- vegdist(counts[, 3:13], method = "bray")

# Replace any missing dissimilarities with the mean of the non-missing ones
# NOTE(review): this imputes distances, not counts - confirm that is intended
distance_matrix[is.na(distance_matrix)] <- mean(distance_matrix, na.rm = TRUE)

# Perform NMDS on the dissimilarities
nmds_result <- metaMDS(distance_matrix, k = 2)  # k = 2 for a 2-dimensional NMDS; change as needed

# Plot NMDS
plot(nmds_result, display = "sites", type = "n")  # Empty plot first (type = "n" draws no points)
points(nmds_result, display = "sites", pch = 16, col = counts$SECTION)  # Add points coloured by the SECTION value (assumes SECTION is usable as a colour, e.g. numeric - TODO confirm)
legend("bottomright", legend = unique(counts$SECTION), col = unique(counts$SECTION), pch = 16, title = "Section")

```

# PERMANOVA - this is glitchy - use the ANOVA

## counts

```{r}

# Bray-Curtis dissimilarities between the rows of columns 3 to 13 of counts
distance_matrix <- vegdist(counts[, 3:13], method = "bray")

# Replace any missing dissimilarities with the mean of the non-missing ones
distance_matrix[is.na(distance_matrix)] <- mean(distance_matrix, na.rm = TRUE)

# Perform PERMANOVA of the dissimilarities against SECTION.
# adonis2() is vegan's current PERMANOVA function; the older adonis() has been
# deprecated in favour of it.
# NOTE(review): if SECTION is stored as a number it is fitted as a continuous
# term; wrap it in factor() if a between-section comparison is wanted.
permanova_result <- adonis2(distance_matrix ~ SECTION, data = counts)

# Print PERMANOVA results
print(permanova_result)

```

# ANOVA - GLM

# counts

```{r}
library(stats)

# Keep the grouping columns plus the phylum count columns
species_data <- counts %>%
  select(SECTION, ZONATION, starts_with("ANNELIDA"), starts_with("BRYOZOA"), starts_with("TUNICATA"),
         starts_with("PORIFERA_DEMOSPONGIAE"), starts_with("CNIDAIRA_ANTHOZOA"),
         starts_with("MOLLUSCA"), starts_with("ARTHROPODA_NC"), starts_with("ARTHROPODA_C"),
         starts_with("ECHINODERMATA_A"), starts_with("ECHINODERMATA_O"), starts_with("OSTEICHTHYES"))

# Gather the count columns into long format. Everything except the two
# grouping columns is a count column after the select() above, so select by
# exclusion instead of a starts_with():starts_with() range, which only works
# when each helper matches exactly one column.
species_data_long <- species_data %>%
  pivot_longer(cols = -c(SECTION, ZONATION),
               names_to = "Species", values_to = "Count")

# Total count of each species for each section and zone (missing counts skipped)
total_counts <- species_data_long %>%
  group_by(SECTION, ZONATION, Species) %>%
  summarise(Total_Count = sum(Count, na.rm = TRUE), .groups = "drop")

# Poisson GLM for section comparison
# NOTE(review): if SECTION is numeric it enters as a single slope (1 df), not
# as a between-section comparison - use factor(SECTION) if the latter is meant.
section_glm <- glm(Total_Count ~ SECTION, data = total_counts, family = poisson)

# Poisson GLM for zone comparison
zone_glm <- glm(Total_Count ~ ZONATION, data = total_counts, family = poisson)

# Analysis of deviance for both GLMs; test = "Chisq" requests the chi-square
# p-values explicitly so they are reported on every R version
section_anova <- anova(section_glm, test = "Chisq")
zone_anova <- anova(zone_glm, test = "Chisq")

# Print the analysis-of-deviance tables
print(section_anova)
print(zone_anova)

```

### Interpretation

SECTION has a small deviance and few degrees of freedom, which we read as the model being a good fit for these data and SECTION having a significant effect on species count. The few degrees of freedom for ZONATION likewise point to a significant effect on species count - this we already knew. (Significance itself should be confirmed from the p-values in the tables above; degrees of freedom alone do not establish it.)

# ANOVA

## counts

```{r}

# One-way ANOVA of the per-section/zone/species totals against SECTION
# (total_counts is built in the GLM chunk above)
section_anova <- aov(Total_Count ~ SECTION, data = total_counts)

# Print the ANOVA table with p-values
summary(section_anova)

# One-way ANOVA of the same totals against ZONATION
zone_anova <- aov(Total_Count ~ ZONATION, data = total_counts)

# Print the ANOVA table with p-values
summary(zone_anova)

```

This result shows that the better fit for ZONATION yields a statistically significant result. The SECTION model did not fit well because the data are not normally distributed; the GLM above is a better fit for SECTION.

# ANOVA - GLM

## algae

```{r}

# First-layer (_FL) and second-layer (_SL) algal columns
algae_cover_cols <- c("BROWN_FL", "RED_FL", "GREEN_FL", "ENCRUSTING_FL", "OTHER_FL",
                      "BROWN_SL", "RED_SL", "GREEN_SL", "ENCRUSTING_SL", "OTHER_SL")

# The algae columns are read in as characters ("P" marks presence), and sum()
# cannot add characters. Convert each column to numbers first: numeric columns
# are kept as they are, anything else becomes 1 for "P" and 0 otherwise.
algae_numeric <- algae %>%
  mutate(across(all_of(algae_cover_cols),
                ~ if (is.numeric(.x)) .x else as.numeric(.x == "P")))

# Total over all algal columns per section and per zone (missing values skipped)
section_totals <- algae_numeric %>%
  group_by(SECTION) %>%
  summarise(total_count = sum(across(all_of(algae_cover_cols)), na.rm = TRUE))

zone_totals <- algae_numeric %>%
  group_by(ZONATION) %>%
  summarise(total_count = sum(across(all_of(algae_cover_cols)), na.rm = TRUE))

# Poisson GLM for section comparison
# NOTE(review): each group contributes a single total here, so the model has
# very few observations - interpret the tables with care.
section_glm <- glm(total_count ~ SECTION, data = section_totals, family = poisson)

# Poisson GLM for zone comparison
zone_glm <- glm(total_count ~ ZONATION, data = zone_totals, family = poisson)

# Analysis of deviance; test = "Chisq" requests the p-values explicitly
section_anova <- anova(section_glm, test = "Chisq")
zone_anova <- anova(zone_glm, test = "Chisq")

# Print the analysis-of-deviance tables
print(section_anova)
print(zone_anova)

```

# ANOVA

## algae

```{r}

# Keep the grouping columns plus the first-layer (_FL) and second-layer (_SL)
# algal columns
algae_data <- algae %>%
  select(SECTION, ZONATION, starts_with("BROWN_FL"), starts_with("RED_FL"), starts_with("GREEN_FL"),
         starts_with("ENCRUSTING_FL"), starts_with("OTHER_FL"),
         starts_with("BROWN_SL"), starts_with("RED_SL"), starts_with("GREEN_SL"),
         starts_with("ENCRUSTING_SL"), starts_with("OTHER_SL"))

# Gather the algal columns into long format.
# The algae columns are read in as characters ("P" marks presence), so they are
# converted to numbers first (numeric columns are kept; otherwise "P" -> 1 and
# anything else -> 0); sum() below cannot add characters.
# Columns are selected by exclusion because a starts_with():starts_with() range
# only works when each helper matches exactly one column.
algae_data_long <- algae_data %>%
  mutate(across(-c(SECTION, ZONATION),
                ~ if (is.numeric(.x)) .x else as.numeric(.x == "P"))) %>%
  pivot_longer(cols = -c(SECTION, ZONATION),
               names_to = "Algal_Type", values_to = "Count")

# Total of each algal type for each section and zone (missing values skipped)
total_counts_algae <- algae_data_long %>%
  group_by(SECTION, ZONATION, Algal_Type) %>%
  summarise(Total_Count = sum(Count, na.rm = TRUE), .groups = "drop")

# ANOVA for section comparison
section_anova_algae <- aov(Total_Count ~ SECTION, data = total_counts_algae)

# Print the ANOVA table with p-values
summary(section_anova_algae)

# ANOVA for zone comparison
zone_anova_algae <- aov(Total_Count ~ ZONATION, data = total_counts_algae)

# Print the ANOVA table with p-values
summary(zone_anova_algae)

```

Similar output to the counts analysis above: zonation, rather than section, makes the most impact; section didn't seem to have any impact here at all.

# Stacked bar

## algae

```{r}
# this visualization is unhelpful - trying to break it down below

# Build the long-format plotting data here: no chunk above this one creates
# `plot_data`, so the plot would otherwise fail with "object not found".
# The algal columns are converted to numbers first (numeric columns are kept;
# otherwise "P" -> 1 and anything else -> 0) so they can be stacked.
algae_cover_cols <- c("BROWN_FL", "RED_FL", "GREEN_FL", "ENCRUSTING_FL", "OTHER_FL",
                      "BROWN_SL", "RED_SL", "GREEN_SL", "ENCRUSTING_SL", "OTHER_SL")

plot_data <- algae %>%
  mutate(across(all_of(algae_cover_cols),
                ~ if (is.numeric(.x)) .x else as.numeric(.x == "P"))) %>%
  pivot_longer(cols = all_of(algae_cover_cols),
               names_to = "Algal_Type", values_to = "Count")

# Remove rows with NAs in ZONATION and Count columns
plot_data_cleaned <- drop_na(plot_data, ZONATION, Count)

# Plot the stacked bar plot with cleaned data
ggplot(data = plot_data_cleaned, aes(x = factor(SECTION), y = Count, fill = ZONATION)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Algal Counts by Section, Algal Type, and Zone",
       x = "Section", y = "Count", fill = "Zone") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

```

# Algal profile by zone and section

```{r}

# Long-format algae data: one row per original row and algal column, with the
# layer suffix (_FL / _SL) stripped from the algal type name
plot_data_by_section <- algae %>%
  gather(
    key = "Algal_Type",
    value = "Count",
    starts_with(c("BROWN_", "RED_", "GREEN_", "ENCRUSTING_", "OTHER_")),
    factor_key = TRUE
  ) %>%
  mutate(Algal_Type = gsub("_FL|_SL", "", Algal_Type))

# Same data, with ZONATION turned into a factor ordered L, M, U
plot_data_by_zone <- plot_data_by_section %>%
  mutate(ZONATION = factor(ZONATION, levels = c("L", "M", "U")))

# Shared styling for both plots: minimal theme with slanted x-axis labels
# 1. Algal type by section
plot_section <- ggplot(plot_data_by_section, aes(factor(SECTION), Count, fill = Algal_Type)) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Algal Type Distribution by Section",
    x = "Section",
    y = "Count",
    fill = "Algal Type"
  )

# 2. Algal type by zone
plot_zone <- ggplot(plot_data_by_zone, aes(Algal_Type, Count, fill = ZONATION)) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Algal Type Distribution by Zone",
    x = "Algal Type",
    y = "Count",
    fill = "Zone"
  )

# gridExtra provides grid.arrange() for laying plots out together
library(gridExtra)

# Show the two plots side by side
grid.arrange(plot_section, plot_zone, ncol = 2)

```
# New way to count algae
# trying to fix the presence absence plotting - this still gives NAs eventhough I'm trying to remove them
```{r}

algae <- read.csv("algae1.csv")

# Convert the data frame to long presence/absence format:
# "P" becomes "Present", any other non-missing value becomes "Absent",
# and missing values are dropped while pivoting.
algae_presence <- algae %>%
  mutate(across(all_of(c("BROWN", "RED", "GREEN", "ENCRUSTING", "OTHER")),
                ~ ifelse(.x == "P", "Present", "Absent"))) %>%
  pivot_longer(cols = c(BROWN, RED, GREEN, ENCRUSTING, OTHER),
               names_to = "Algal_Type", values_to = "Presence_Absence",
               values_drop_na = TRUE) %>%
  mutate(Algal_Type = gsub("_FL|_SL", "", Algal_Type),
         # trimws() so that stray spaces do not turn a valid zone into NA
         ZONATION = factor(trimws(as.character(ZONATION)), levels = c("L", "M", "U"))) %>%
  # values_drop_na only removes missing presence values; rows whose SECTION is
  # missing or whose ZONATION is not one of L / M / U would still show up as
  # NA bars and an NA facet, so drop them explicitly
  drop_na(SECTION, ZONATION)

# Plot the bar plot with facets for each zonation
ggplot(data = algae_presence, aes(x = factor(SECTION), fill = Presence_Absence)) +
  geom_bar() +
  labs(title = "Algae Presence-Absence by Section and Algal Type",
       x = "Section", fill = "Presence / Absence") +
  scale_fill_manual(values = c("Present" = "blue", "Absent" = "gray")) +  # Customize fill colors
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(. ~ ZONATION)  # Create separate facets for each zonation

```
# Asking chat gpt for different supports to plot type of algae per section per zone
```{r}
library(dplyr)
library(tidyr)
library(ggplot2)

# Works on the `algae` data frame loaded in the previous chunk

# Convert the data frame to long presence/absence format
# ("P" -> "Present", other non-missing values -> "Absent", missing dropped)
algae_presence <- algae %>%
  mutate(across(c("BROWN", "RED", "GREEN", "ENCRUSTING", "OTHER"), 
                ~ifelse(. == "P", "Present", "Absent"))) %>%
  pivot_longer(cols = c(BROWN, RED, GREEN, ENCRUSTING, OTHER),
               names_to = "Algal_Type", values_to = "Presence_Absence",
               values_drop_na = TRUE) %>%
  drop_na(Presence_Absence)%>%
  mutate(Algal_Type = gsub("_FL|_SL", "", Algal_Type),
         ZONATION = factor(ZONATION, levels = c("L", "M", "U"))) 

# Plot the stacked bar plot of types of algae present in each section by zone.
# Only the "Present" rows are plotted: algae_presence has one row per algal
# type for every record, so counting all rows (Absent included) would give
# every algal type the same bar height.
ggplot(data = filter(algae_presence, Presence_Absence == "Present"),
       aes(x = factor(SECTION), fill = Algal_Type)) +
  geom_bar() +
  labs(title = "Types of Algae Present in Each Section by Zone",
       x = "Section", fill = "Algal Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(. ~ ZONATION)  # Create separate facets for each zonation

```
#pie charts for algae
```{r}
library(dplyr)
library(tidyr)
library(ggplot2)


# Plot one pie per section and zonation.
# Only the "Present" rows are plotted: counting every row (Absent included)
# would give each algal type an identical slice.
ggplot(data = filter(algae_presence, Presence_Absence == "Present"),
       aes(x = "", fill = Algal_Type)) +
  geom_bar(width = 1) +  # Use geom_bar() with width = 1 to create pie charts
  coord_polar("y", start = 0) +  # Convert to polar coordinates for pie chart
  labs(title = "Types of Algae Present in Each Section",
       fill = "Algal Type") +
  theme_void() +
  facet_grid(~ SECTION + ZONATION)  # Create separate facets for each section and zonation

```

#try this one for pie charts
```{r}
# Colour assigned to each algal type
algae_colors <- c(GREEN = "#00FF00", BROWN = "#A52A2A", RED = "#FF0000", ENCRUSTING = "#800080", OTHER = "#808080")

# Store each row's colour as a factor: the levels are the algal types (in the
# order of algae_colors) and the labels are their colours.
# (A plain character lookup used to be assigned first, but it was overwritten
# immediately by this factor, so it has been removed.)
algae_presence$ALGAL_COLOR <- factor(algae_presence$Algal_Type, levels = names(algae_colors),
                                 labels = algae_colors)
```
#pie chart - time 2
```{r}

# Number of "Present" records per ZONATION and Algal_Type
zone_summary <- algae_presence %>%
  group_by(ZONATION, Algal_Type) %>%
  summarise(count = sum(Presence_Absence == "Present", na.rm = TRUE), .groups = "drop")

# Number of "Present" records per SECTION and Algal_Type
section_summary <- algae_presence %>%
  group_by(SECTION, Algal_Type) %>%
  summarise(count = sum(Presence_Absence == "Present", na.rm = TRUE), .groups = "drop")

# Create a pie chart of algal types with percent labels on the slices.
#
# data      - data frame with columns `Algal_Type` and `count` (plus the
#             column named by `group_var`, when one is given)
# title     - plot title
# group_var - optional name (string) of a column to split the chart by; when
#             given, one pie is drawn per value of that column and the
#             percentages are computed within each pie. With the default NULL
#             a single pie over all rows is drawn.
#
# Returns a ggplot object.
create_pie_chart <- function(data, title, group_var = NULL) {
  # One row per slice: add up the counts for each (group, algal type)
  slice_data <- data %>%
    group_by(across(all_of(c(group_var, "Algal_Type")))) %>%
    summarise(count = sum(count, na.rm = TRUE), .groups = "drop")

  # Share of each slice within its pie
  if (is.null(group_var)) {
    slice_data <- mutate(slice_data, share = count / sum(count))
  } else {
    slice_data <- slice_data %>%
      group_by(across(all_of(group_var))) %>%
      mutate(share = count / sum(count)) %>%
      ungroup()
  }

  chart <- ggplot(slice_data, aes(x = "", y = share, fill = Algal_Type)) +
    geom_col(width = 1) +
    # Label each non-empty slice with its percentage, centred in the slice
    geom_text(aes(label = ifelse(share > 0, scales::percent(share, accuracy = 1), "")),
              position = position_stack(vjust = 0.5)) +
    coord_polar("y", start = 0) +
    labs(title = title) +
    theme_void() +
    theme(legend.title = element_blank(),
          legend.position = "right")

  if (!is.null(group_var)) {
    chart <- chart + facet_wrap(group_var)
  }
  chart
}


#theme(plot.title = element_text(hjust = 0.5)) +

# Fill colours, named by algal type so the mapping does not depend on which
# types happen to be present
custom_colors <- c(BROWN = "sienna", ENCRUSTING = "purple", GREEN = "darkgreen",
                   OTHER = "#F0E442", RED = "red")

# Create pie chart for the presence of algal types in each zone
#pie_chart_zone <- create_pie_chart(zone_summary, "Presence of Algal Types in Each Zone", group_var = "ZONATION") +
 # scale_fill_manual(values = custom_colors)

# Create pie chart for the presence of algal types in each section
pie_chart_section <- create_pie_chart(section_summary, "Presence of Algal Types in Each Section",
                                      group_var = "SECTION") +
  scale_fill_manual(values = custom_colors)

# Display the pie charts
#print(pie_chart_zone)
print(pie_chart_section)

```
#single section pie chart attempt
```{r}


# Keep the records of SECTION 1 where an algal type is present
section_data <- algae_presence %>%
  filter(SECTION == 1, Presence_Absence == "Present")

# One row per algal type with its share of the "Present" records.
# The proportions have to live in the plotted data frame: mapping a separate,
# shorter vector inside aes() does not match the number of data rows.
section_props <- section_data %>%
  count(Algal_Type, name = "n_present") %>%
  mutate(Proportion = n_present / sum(n_present))

# Create the pie chart for SECTION 1
pie_chart <- ggplot(section_props, aes(x = "", y = Proportion, fill = Algal_Type)) +
  geom_col(width = 1) +
  coord_polar("y") +
  labs(title = "Algal Cover in SECTION 1",
       fill = "Algal Type") +
  scale_fill_manual(values = algae_colors) +
  theme_minimal()
# Display the pie chart for SECTION 1
print(pie_chart)

```

# pie chart that doesn't like how we calculate values
```{r}

# Build one pie chart per section and collect them in a list.
# sort(unique()) also drops a missing SECTION, and lapply() fills the list
# position by position, so it never contains empty slots.
pie_charts_list <- lapply(sort(unique(algae_presence$SECTION)), function(section) {
  # Share of each algal type among the "Present" records of this section.
  # The proportions are stored in the plotted data frame so they can be mapped
  # to y (there is no `values` aesthetic).
  section_props <- algae_presence %>%
    filter(SECTION == section, Presence_Absence == "Present") %>%
    count(Algal_Type, name = "n_present") %>%
    mutate(Proportion = n_present / sum(n_present))

  ggplot(section_props, aes(x = "", y = Proportion, fill = Algal_Type)) +
    geom_col(width = 1) +
    coord_polar("y") +
    labs(title = paste("Algal Cover in SECTION", section),
         fill = "Algal Type") +
    scale_fill_manual(values = algae_colors) +
    theme_minimal()
})

# Arrange and print the pie charts (skipped when there are no sections)
if (length(pie_charts_list) > 0) {
  gridExtra::grid.arrange(grobs = pie_charts_list, ncol = 2)
}

```
# this isn't working for the stacked bar - each stack is 25%, that is incorrect
```{r}
# stacked bar by zone
# Number of "Present" records of each algal type by ZONATION, and each type's
# share within its zone.
# Counting rows with n() gave every algal type the same share, because
# algae_presence has one row per algal type for every record whether the type
# is present or absent.
zonal_counts <- algae_presence %>%
  group_by(ZONATION, Algal_Type) %>%
  summarise(count = sum(Presence_Absence == "Present", na.rm = TRUE), .groups = "drop") %>%
  group_by(ZONATION) %>%
  mutate(algal_proportion = count / sum(count)) %>%
  ungroup()

# Create the stacked bar plot
# OTHER is included in the colour scale so that it is not drawn as an
# unlabelled NA colour.
stacked_bar_plot <- ggplot(zonal_counts, aes(x = ZONATION, y = algal_proportion, fill = Algal_Type)) +
  geom_bar(stat = "identity") +
  labs(title = "Algal Cover Across Zonation",
       x = "Zonation",
       y = "Proportion",
       fill = "Algal Type") +
  scale_fill_manual(values = c(GREEN = "#00FF00", BROWN = "#A52A2A", RED = "#FF0000",
                               ENCRUSTING = "#FFD700", OTHER = "#808080")) +
  theme_minimal()

# Display the stacked bar plot
print(stacked_bar_plot)

```


```{r}
library(ggplot2)

# `plot_data` is only created in a later chunk, so it does not exist yet when
# the document is knitted from top to bottom. Use the long presence/absence
# table `algae_presence` instead, keeping the rows where an algal type is
# present.

# Plot the bar plot with stacked bars to show types of algae present at each section
ggplot(data = dplyr::filter(algae_presence, Presence_Absence == "Present"),
       aes(x = factor(SECTION), fill = Algal_Type)) +
  geom_bar(position = "stack") +
  labs(title = "Types of Algae Present at Each Section",
       x = "Section", fill = "Algal Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(. ~ ZONATION)  # Create separate facets for each zonation

```

```{r}


# Add one Presence_Absence_<column> column per algal column:
# "Present" where the value is "P", "Absent" otherwise
algae_presence <- algae %>%
  mutate(across(
    all_of(c("BROWN_FL", "RED_FL", "GREEN_FL", "ENCRUSTING_FL", "OTHER_FL",
             "BROWN_SL", "RED_SL", "GREEN_SL", "ENCRUSTING_SL", "OTHER_SL")),
    ~ ifelse(.x == "P", "Present", "Absent"),
    .names = "Presence_Absence_{.col}"
  ))

# Stack the new Presence_Absence_* columns into rows: the column name goes to
# Algal_Type and the "Present" / "Absent" value to Presence_Absence
plot_data <- pivot_longer(
  algae_presence,
  cols = starts_with("Presence_Absence"),
  names_to = "Algal_Type",
  values_to = "Presence_Absence"
)

# The plot_data data frame now contains three columns: SECTION, Algal_Type, and Presence_Absence
# where Presence_Absence will have "Present" if algae is present (P) or "Absent" if algae is absent (A).

```

#plot data after the P/A mutation
```{r}

# Stacked bar chart: number of plot_data rows per section, split by
# presence / absence
ggplot(plot_data, aes(factor(SECTION), fill = Presence_Absence)) +
  geom_bar(position = "stack") +
  scale_fill_manual(values = c("Present" = "blue", "Absent" = "gray")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Algae Presence-Absence by Section and Algal Type",
    x = "Section",
    fill = "Presence / Absence"
  )

```
#trying by zone and by section
```{r}

# Same presence / absence bar chart, with one facet row per zonation
ggplot(plot_data, aes(factor(SECTION), fill = Presence_Absence)) +
  geom_bar(position = "stack") +
  facet_grid(rows = vars(ZONATION)) +
  scale_fill_manual(values = c("Present" = "blue", "Absent" = "gray")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Algae Presence-Absence by Section and Algal Type",
    x = "Section",
    fill = "Presence / Absence"
  )

```
#more options for algae plotting
```{r}
# Same presence / absence bar chart, with wrapped facets by zonation
ggplot(plot_data, aes(factor(SECTION), fill = Presence_Absence)) +
  geom_bar(position = "stack") +
  facet_wrap(vars(ZONATION)) +
  scale_fill_manual(values = c("Present" = "blue", "Absent" = "gray")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Algae Presence-Absence by Section and Algal Type",
    x = "Section",
    fill = "Presence / Absence"
  )

```
#type of algae by zone maybe
```{r}
library(ggplot2)

# Uses the long-format `plot_data` built with pivot_longer() above

# Stacked bars: number of plot_data rows per section, coloured by algal type,
# with one facet column per zonation
ggplot(plot_data, aes(factor(SECTION), fill = Algal_Type)) +
  geom_bar() +
  facet_grid(cols = vars(ZONATION)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Types of Algae Present at Each Section",
    x = "Section",
    fill = "Algal Type"
  )

```

#trying to plot by zone
```{r}

# Presence / absence bar chart with one facet column per zonation
ggplot(plot_data, aes(factor(SECTION), fill = Presence_Absence)) +
  geom_bar(position = "stack") +
  facet_grid(cols = vars(ZONATION)) +
  scale_fill_manual(values = c("Present" = "blue", "Absent" = "gray")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Algae Presence-Absence by Section and Algal Type",
    x = "Section",
    fill = "Presence / Absence"
  )

```
#plotting by section and zone
```{r}
# Stacked bars per section, blue for "Present" rows and gray for "Absent" rows
ggplot(plot_data, aes(factor(SECTION), fill = Presence_Absence)) +
  geom_bar() +
  scale_fill_manual(values = c("Present" = "blue", "Absent" = "gray")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Algae Presence-Absence by Section and Algal Type",
    x = "Section",
    fill = "Presence / Absence"
  )

```


#not connected to the earlier stuff
```{r}
# Use algae1.csv for this (replaces the `algae` data frame loaded earlier)
algae <- read.csv("algae1.csv")

# Flag each row as "Present" when the BROWN_* columns sum to more than 0,
# otherwise "Absent" (rowSums() needs these columns to be numeric)
algae_presence <- algae %>%
  mutate(presence_absence = ifelse(rowSums(select(., starts_with("BROWN_"))) > 0, "Present", "Absent"))

# Now, use pivot_longer to gather the BROWN_* columns into one column
plot_data <- algae_presence %>%
  pivot_longer(cols = starts_with("BROWN_"), names_to = "Algal_Type", values_to = "Presence_Absence") %>%
  mutate(Algal_Type = gsub("_FL|_SL", "", Algal_Type))

# plot_data keeps every other column of algae_presence and adds:
# Algal_Type: the name of the gathered column (without _FL and _SL)
# Presence_Absence: the original value of that BROWN_* column
# The "Present" / "Absent" label is in the lower-case column presence_absence.

```

```{r}
# Build `plot_data` with the BROWN_* values converted to binary presence.
# The raw `algae` data frame has no Presence_Absence column, so the binary
# value is derived from the BROWN_* columns themselves, using the same rule as
# the previous chunk (a value above 0 means present).

# Convert presence-absence to binary (1 for presence, 0 for absence)
algae_binary <- algae %>%
  mutate(across(starts_with("BROWN_"), ~ ifelse(.x > 0, 1, 0)))

# Create the plot_data with binary data: one row per original row and BROWN_*
# column, with the binary value in Count
plot_data <- algae_binary %>%
  pivot_longer(cols = starts_with("BROWN_"),
               names_to = "Algal_Type", values_to = "Count") %>%
  mutate(Algal_Type = gsub("_FL|_SL", "", Algal_Type),
         ZONATION = factor(ZONATION, levels = c("L", "M", "U")))

```


# Trying a PCA

```{r}
# This is a plot that shows 3 dimensions of data in 2 dimensions

# Create an abundance matrix: one row per section, one column per phylum, each
# cell the summed Count. (counts1 has no `species` column - the taxon column
# is `Phylum` - and table() would only count rows instead of adding counts.)
species_matrix <- as.data.frame.matrix(
  xtabs(Count ~ SECTION + Phylum, data = counts1)
)

# Run PCA (rda() without constraints)
pca_result <- rda(species_matrix)

# Create a PCA plot in base R - do this first and then do it in ggplot
plot(pca_result, type = "points", display = "species", cex = 1.5)

# install.packages("ggplot2") # If not installed previously library(ggplot2)

# Extract PCA scores for each phylum. With a single `display` value scores()
# returns a matrix, so it is converted directly (there is no $species element).
pca_scores <- scores(pca_result, display = "species")

# Convert PCA scores to a data frame
pca_data <- as.data.frame(pca_scores)

# Add the taxon names as a column
pca_data$Species <- rownames(pca_data)

# Create a PCA plot using ggplot2
ggplot(pca_data, aes(x = PC1, y = PC2, label = Species)) +
  geom_point() +
  geom_text(size = 3, hjust = 0, vjust = 0) +
  xlab("Principal Component 1") +
  ylab("Principal Component 2") +
  ggtitle("PCA Plot of Intertidal Species") +
  theme_minimal()
```

# Trying Presence/ Absence Data Manipulation

```{r}

# Different data pivot; using the full sheet.
# No earlier chunk loads the full sheet, so `data` would refer to R's built-in
# data() function. Load data.csv (the full data set without manipulation).
data <- read.csv("data.csv")

# One row per original row and seaweed column (FL_ROCKWEED through FL_OTHER).
# pivot_longer() replaces the superseded gather().
data_long <- data %>%
  pivot_longer(cols = FL_ROCKWEED:FL_OTHER,
               names_to = "Seaweed", values_to = "Presence")

```

```{r}

# Number of rows for each SECTION / Seaweed / Presence combination (column `n`)
seaweed_counts <- tally(group_by(data_long, SECTION, Seaweed, Presence))

```

```{r}

# Bar chart of the seaweed tallies: one panel per section, one bar per
# seaweed column, filled by the recorded presence value
ggplot(seaweed_counts, aes(Seaweed, n, fill = Presence)) +
  geom_col() +
  facet_wrap(vars(SECTION), ncol = 2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Presence/Absence of Seaweeds",
    x = "Seaweed Species",
    y = "Count",
    fill = "Presence"
  )

```

```{R}
# section1 test data to show zonation v substrate - have to alter this to make that happen

# One row per original row and substrate column (BARNACLE, ROCK, ALGAE)
test_long <- data %>%
  pivot_longer(cols = c(BARNACLE, ROCK, ALGAE),
               names_to = "substrate",
               values_to = "percent")

# Print the first few rows of the resulting data frame
# (this used to print all of data_long, a different data frame)
print(head(test_long))
```

# Percent Cover of Phylum Count for MARINe Data Sites - Manchester & Post

Using:

- postpercent.csv
- manpercent.csv

```{r}
# phylum cover

# Stacked bars: sections on the x-axis, summed counts on the y-axis,
# one colour per phylum
ggplot(section_zone, aes(factor(SECTION), Sum_Count, fill = Phylum)) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Phylum Counts by Section",
    x = "Section",
    y = "Count"
  )

```

# Percent cover for stacked bar
```{r}
# Load the combined MARINe percent-cover data
marine <- read.csv(file = "combinedMarine.csv")
# Load necessary libraries
library(ggplot2)


# Stacked percent cover per site, one panel per section (stacked in a single
# column, each panel with its own axes)
ggplot(marine, aes(SITE, Percent_Cover, fill = Phylum)) +
  geom_col() +
  facet_wrap(vars(SECTION), scales = "free", ncol = 1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # slanted x labels for readability
  labs(
    title = "Phylum Percent Cover across Site and Section",
    x = "Site",
    y = "Percent Cover"
  )

```
```{r}
# Percent cover per phylum in a grid of panels: sites as rows, sections as
# columns, free scales and panel sizes, row strips switched to the other side
ggplot(marine, aes(Phylum, Percent_Cover, fill = Phylum)) +
  geom_col() +
  facet_grid(
    rows = vars(SITE),
    cols = vars(SECTION),
    scales = "free",
    space = "free",
    switch = "y"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # slanted x labels for readability
  labs(
    title = "Phylum Percent Cover across Site and Section",
    x = "Phylum",
    y = "Percent Cover"
  )
```
```{r}
# Load necessary libraries
library(ggplot2)

# Dodged (side-by-side) bar plot: number of rows per site for each phylum.
# NOTE(review): the title mentions Section, but the plot is neither faceted
# nor split by SECTION.
ggplot(marine, aes(x = SITE, fill = Phylum)) +
  geom_bar(position = "dodge") +
  labs(title = "Phylum Counts across Site and Section",
       x = "Site",
       y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for better visibility

```
# New stacked bar
```{r}

# Combine Site and Section to form the X-axis label
marine$Site_Section <- paste(marine$SITE, marine$SECTION, sep = "-")

# Define the custom order of Phylum categories
custom_order <- c("Rhodophyta", "Ochrophyta", "Chlorophyta", "Mollusca", "Cnidaria",  "Arthropoda")

# Match colors to phyla from last graph.
# The colours are named by phylum so that each phylum keeps its colour even
# when one of the phyla does not occur in the data (unnamed colours are handed
# out in order to whichever levels are present, which would shift them).
custom_colors <- setNames(
  c("indianred4", "salmon2", "mediumseagreen", "green2", "royalblue", "red"),
  custom_order
)

# Reorder the Phylum variable based on the custom order
marine$Phylum <- factor(marine$Phylum, levels = custom_order)

# Plot the stacked bar graph
ggplot(marine, aes(x = Site_Section, y = Percent_Cover, fill = Phylum)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = custom_colors) +
  labs(title = "Phyla Percent Cover across Site and Section",
       x = "Site - Section",
       y = "Percent Cover") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  guides(fill = guide_legend(reverse = TRUE))  # Reverse the order of the legend

```


# Chi-square to prove highest abundance per section
```{r}

# Chi-square test for each section: are the phylum totals within a section
# evenly distributed? One row per section; the full test object is kept in the
# list column `chi_square_test`, and the statistic, degrees of freedom and
# p-value are pulled out so the printed table is readable.
# summarize() with a list column replaces the superseded do().
section_chi_square_tests <- counts1 %>%
  group_by(SECTION, Phylum) %>%
  summarize(Count = sum(Count, na.rm = TRUE), .groups = "drop") %>%
  group_by(SECTION) %>%
  summarize(chi_square_test = list(chisq.test(Count)), .groups = "drop") %>%
  mutate(
    statistic = vapply(chi_square_test, function(test) unname(test$statistic), numeric(1)),
    df = vapply(chi_square_test, function(test) unname(test$parameter), numeric(1)),
    p_value = vapply(chi_square_test, function(test) test$p.value, numeric(1))
  )
print(section_chi_square_tests)

# Chi-square test for each zonation, reported the same way
zonation_chi_square_tests <- counts1 %>%
  group_by(ZONATION, Phylum) %>%
  summarize(Count = sum(Count, na.rm = TRUE), .groups = "drop") %>%
  group_by(ZONATION) %>%
  summarize(chi_square_test = list(chisq.test(Count)), .groups = "drop") %>%
  mutate(
    statistic = vapply(chi_square_test, function(test) unname(test$statistic), numeric(1)),
    df = vapply(chi_square_test, function(test) unname(test$parameter), numeric(1)),
    p_value = vapply(chi_square_test, function(test) test$p.value, numeric(1))
  )
print(zonation_chi_square_tests)

```

```{r}

# Frequency analysis for ranking phyla within each zonation.
# The zonation is kept in the result (one row per ZONATION and Phylum, ordered
# from most to least abundant inside each zone); summing it away again would
# just reproduce the combined ranking below.
phylum_counts_by_zonation <- counts1 %>%
  group_by(ZONATION, Phylum) %>%
  summarize(Total_Count = sum(Count, na.rm = TRUE), .groups = "drop") %>%
  arrange(ZONATION, desc(Total_Count))

# Frequency analysis for ranking phyla across all sections combined
phylum_counts_combined <- counts1 %>%
  group_by(Phylum) %>%
  summarize(Total_Count = sum(Count, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Total_Count))

```