Step 1: Visit the TidyTuesday website and check out the resources and podcast episodes: https://www.tidytuesday.com/
Step 2: Go to the TidyTuesday GitHub repo and read the README: https://github.com/rfordatascience/tidytuesday
Step 3: Go to the 2023 bird watching dataset and read the README and the data dictionary information: https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-01-10/readme.md
Step 4: Download the data set and subset using this code in R:
# Install the helper package once if it is not already available.
# install.packages("tidytuesdayR")

# Load the 2023-01-10 TidyTuesday release by date.
tuesdata <- tidytuesdayR::tt_load("2023-01-10")
# NOTE(review): element name copied from the original snippet; confirm it
# exists with names(tuesdata).
feederwatch <- tuesdata$feederwatch

# Alternative: load the same release by year and week number.
# tuesdata <- tidytuesdayR::tt_load(2023, week = 2)
# feederwatch <- tuesdata$feederwatch
# Read the two CSV files directly from the TidyTuesday GitHub repository
# (one statement per line, plain ASCII quotes so the code parses).
feederwatch <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-10/PFW_2021_public.csv"
)
## Rows: 100000 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): loc_id, subnational1_code, entry_technique, sub_id, obs_id, PROJ_P...
## dbl (14): latitude, longitude, Month, Day, Year, how_many, valid, reviewed, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
site_data <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-10/PFW_count_site_data_public_2021.csv"
)
## Rows: 254355 Columns: 62
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): loc_id, proj_period_id
## dbl (60): yard_type_pavement, yard_type_garden, yard_type_landsca, yard_type...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Download the full 2021 FeederWatch file from the clo-pfw-prod S3 bucket,
# then inspect its columns and types.
PFW_2021_public <- readr::read_csv(
  "https://clo-pfw-prod.s3.us-west-2.amazonaws.com/data/PFW_2021_public.csv"
)
## Rows: 2897105 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): loc_id, subnational1_code, entry_technique, sub_id, obs_id, PROJ_P...
## dbl (14): latitude, longitude, Month, Day, Year, how_many, valid, reviewed, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dplyr::glimpse(PFW_2021_public)
## Rows: 2,897,105
## Columns: 22
## $ loc_id <chr> "L12782033", "L12782033", "L12782033", "L12755941",…
## $ latitude <dbl> 44.57415, 44.57415, 44.57415, 54.13687, 54.13687, 5…
## $ longitude <dbl> -78.20561, -78.20561, -78.20561, -108.68786, -108.6…
## $ subnational1_code <chr> "CA-ON", "CA-ON", "CA-ON", "CA-SK", "CA-SK", "CA-SK…
## $ entry_technique <chr> "/GOOGLE_MAP/ZOOM:15", "/GOOGLE_MAP/ZOOM:15", "/GOO…
## $ sub_id <chr> "S79876486", "S79876486", "S79876486", "S76634904",…
## $ obs_id <chr> "OBS1059258900", "OBS1059259256", "OBS1059259639", …
## $ Month <dbl> 1, 1, 1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1, 1, …
## $ Day <dbl> 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 23, 23, 20,…
## $ Year <dbl> 2021, 2021, 2021, 2020, 2020, 2020, 2020, 2020, 202…
## $ PROJ_PERIOD_ID <chr> "PFW_2021", "PFW_2021", "PFW_2021", "PFW_2021", "PF…
## $ species_code <chr> "amtspa", "blujay", "bkcchi", "dowwoo", "whbnut", "…
## $ how_many <dbl> 1, 2, 2, 2, 2, 20, 1, 2, 14, 2, 22, 20, 2, 10, 1, 2…
## $ valid <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ reviewed <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ day1_am <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ day1_pm <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ day2_am <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ day2_pm <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ effort_hrs_atleast <dbl> 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.…
## $ snow_dep_atleast <dbl> 5.000, 5.000, 5.000, 15.001, 15.001, 15.001, 15.001…
## $ Data_Entry_Method <chr> "PFW Mobile App v1.1.17", "PFW Mobile App v1.1.17",…
# Draw a reproducible random sample of 1e5 rows from the full data set.
set.seed(424242)
PFW_2021_public_subset <- dplyr::slice_sample(PFW_2021_public, n = 1e5)

# Save the sample under misc/data in the project.
# An earlier pasted version of this step targeted
# here::here("data", "2023", "2023-01-10", "PFW_2021_public.csv") instead.
readr::write_csv(
  PFW_2021_public_subset,
  here::here("misc", "data", "PFW_2021_public.csv")
)
Step 5: Now that you know what types of data are available, write down 2-3 questions you want to answer using this dataset (e.g., how does the number of birds sighted relate to the habitat type?)
How many birds were counted, broken down by place, date, and type of bird?
Step 6: Manipulate the data (make it tidy!) and make some fun plots! Post your favorite plot to this discussion thread. In lab meeting we will go around the room and talk about what questions we had, how we manipulated the data and any issues we ran into, and the plots we made.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.0
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Preview: sum how_many for every subnational code / month / species
# combination and print only the combinations with more than 500 birds.
PFW_2021_public_subset %>%
  group_by(subnational1_code, Month, species_code) %>%
  summarise(total_count = sum(how_many, na.rm = TRUE)) %>%
  filter(total_count > 500)
## `summarise()` has grouped output by 'subnational1_code', 'Month'. You can
## override using the `.groups` argument.
## # A tibble: 37 × 4
## # Groups: subnational1_code, Month [17]
## subnational1_code Month species_code total_count
## <chr> <dbl> <chr> <dbl>
## 1 CA-BC 1 pinsis 607
## 2 CA-ON 1 amegfi 549
## 3 CA-ON 1 bkcchi 668
## 4 CA-ON 1 blujay 529
## 5 CA-ON 1 comred 1085
## 6 CA-ON 1 daejun 729
## 7 CA-ON 1 houspa 1018
## 8 CA-ON 1 moudov 942
## 9 CA-ON 2 amegfi 739
## 10 CA-ON 2 bkcchi 549
## # … with 27 more rows
# Sum how_many for every subnational code / month / species combination and
# keep only the combinations with more than 500 birds for plotting.
# `.groups = "drop"` makes summarise() return an ungrouped tibble, so `df`
# carries no leftover grouping into later steps and no grouping message is
# emitted.
df <- PFW_2021_public_subset %>%
  group_by(subnational1_code, Month, species_code) %>%
  summarise(total_count = sum(how_many, na.rm = TRUE), .groups = "drop") %>%
  filter(total_count > 500)
# Histogram of total_count, one panel per species code.
ggplot(data = df, mapping = aes(x = total_count)) +
  geom_histogram() +
  facet_wrap(vars(species_code))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Bar chart over total_count, one panel per month.
# NOTE(review): geom_bar() counts rows at each distinct total_count value
# rather than plotting the totals themselves; confirm this is intended.
ggplot(data = df, mapping = aes(x = total_count)) +
  geom_bar() +
  facet_wrap(vars(Month))
# Box plot of total_count, one panel per subnational code.
ggplot(data = df, mapping = aes(x = total_count)) +
  geom_boxplot() +
  facet_wrap(vars(subnational1_code))
# Single box plot of total_count across all rows of df.
ggplot(data = df, mapping = aes(x = total_count)) +
  geom_boxplot()
Step 7: Keep up with Tidy Tuesday in the future if you want!