Step 1: Visit the TidyTuesday website and check out the resources and podcast episodes: https://www.tidytuesday.com/

Step 2: Go to the TidyTuesday GitHub repo and read the README: https://github.com/rfordatascience/tidytuesday

Step 3: Go to the 2023 bird watching dataset and read the README and the data dictionary information: https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-01-10/readme.md

Step 4: Download the data set and subset using this code in R:

Get the Data

Read in with tidytuesdayR package

Install from CRAN via: install.packages("tidytuesdayR")

This loads the readme and all the datasets for the week of interest

#install.packages("tidytuesdayR")

Either ISO-8601 date or year/week works!

tuesdata <- tidytuesdayR::tt_load("2023-01-10")
tuesdata <- tidytuesdayR::tt_load(2023, week = 02)

feederwatch <- tuesdata$feederwatch

#tuesdata <- tidytuesdayR::tt_load(2023, week = 02)

#feederwatch <- tuesdata$feederwatch

Or read in the data manually

feederwatch <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-10/PFW_2021_public.csv")
site_data <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-10/PFW_count_site_data_public_2021.csv")

# Read the FeederWatch observations file hosted in the TidyTuesday repository.
feederwatch <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-10/PFW_2021_public.csv"
)

## Rows: 100000 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): loc_id, subnational1_code, entry_technique, sub_id, obs_id, PROJ_P...
## dbl (14): latitude, longitude, Month, Day, Year, how_many, valid, reviewed, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read the count-site description file hosted in the TidyTuesday repository.
site_data <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-10/PFW_count_site_data_public_2021.csv"
)

## Rows: 254355 Columns: 62
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): loc_id, proj_period_id
## dbl (60): yard_type_pavement, yard_type_garden, yard_type_landsca, yard_type...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Download the raw data.

PFW_2021_public <- readr::read_csv("https://clo-pfw-prod.s3.us-west-2.amazonaws.com/data/PFW_2021_public.csv")
dplyr::glimpse(PFW_2021_public)

# Read the full, unsampled 2021 FeederWatch file from its original host.
PFW_2021_public <- readr::read_csv(
  file = "https://clo-pfw-prod.s3.us-west-2.amazonaws.com/data/PFW_2021_public.csv"
)

## Rows: 2897105 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): loc_id, subnational1_code, entry_technique, sub_id, obs_id, PROJ_P...
## dbl (14): latitude, longitude, Month, Day, Year, how_many, valid, reviewed, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show every column of the raw data with its type and first few values.
dplyr::glimpse(x = PFW_2021_public)

## Rows: 2,897,105
## Columns: 22
## $ loc_id             <chr> "L12782033", "L12782033", "L12782033", "L12755941",…
## $ latitude           <dbl> 44.57415, 44.57415, 44.57415, 54.13687, 54.13687, 5…
## $ longitude          <dbl> -78.20561, -78.20561, -78.20561, -108.68786, -108.6…
## $ subnational1_code  <chr> "CA-ON", "CA-ON", "CA-ON", "CA-SK", "CA-SK", "CA-SK…
## $ entry_technique    <chr> "/GOOGLE_MAP/ZOOM:15", "/GOOGLE_MAP/ZOOM:15", "/GOO…
## $ sub_id             <chr> "S79876486", "S79876486", "S79876486", "S76634904",…
## $ obs_id             <chr> "OBS1059258900", "OBS1059259256", "OBS1059259639", …
## $ Month              <dbl> 1, 1, 1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1, 1, …
## $ Day                <dbl> 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 23, 23, 20,…
## $ Year               <dbl> 2021, 2021, 2021, 2020, 2020, 2020, 2020, 2020, 202…
## $ PROJ_PERIOD_ID     <chr> "PFW_2021", "PFW_2021", "PFW_2021", "PFW_2021", "PF…
## $ species_code       <chr> "amtspa", "blujay", "bkcchi", "dowwoo", "whbnut", "…
## $ how_many           <dbl> 1, 2, 2, 2, 2, 20, 1, 2, 14, 2, 22, 20, 2, 10, 1, 2…
## $ valid              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ reviewed           <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ day1_am            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ day1_pm            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ day2_am            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ day2_pm            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ effort_hrs_atleast <dbl> 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.…
## $ snow_dep_atleast   <dbl> 5.000, 5.000, 5.000, 15.001, 15.001, 15.001, 15.001…
## $ Data_Entry_Method  <chr> "PFW Mobile App v1.1.17", "PFW Mobile App v1.1.17",…

There are almost three million rows! The file is too big for GitHub, so let’s subsample.

set.seed(424242)
PFW_2021_public_subset <- dplyr::slice_sample(PFW_2021_public, n = 1e5)

readr::write_csv(PFW_2021_public_subset, here::here("data", "2023", "2023-01-10", "PFW_2021_public.csv"))

# Draw a reproducible random sample of 100,000 rows from the raw data; the
# fixed seed makes slice_sample() pick the same rows on every run.
set.seed(424242)
PFW_2021_public_subset <- dplyr::slice_sample(PFW_2021_public, n = 1e5)

# write_csv() does not create missing folders, so make sure the target
# directory exists before saving the subsample.
subset_path <- here::here("misc", "data", "PFW_2021_public.csv")
dir.create(dirname(subset_path), recursive = TRUE, showWarnings = FALSE)
readr::write_csv(PFW_2021_public_subset, subset_path)

Step 5: Now that you know what types of data are available, write down 2-3 questions you want to answer using this dataset (e.g., how does the number of birds sighted relate to the habitat type?)

How many birds were counted, broken down by place, date, and type of bird?

Step 6: Manipulate the data (make it tidy!) and make some fun plots! Post your favorite plot to this discussion thread. In lab meeting we will go around the room and talk about what questions we had, how we manipulated the data and any issues we ran into, and the plots we made.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Preview: add up the birds counted for each subnational code / month /
# species combination and print the combinations with more than 500 birds.
PFW_2021_public_subset %>%
  dplyr::group_by(subnational1_code, Month, species_code) %>%
  dplyr::summarise(total_count = sum(how_many, na.rm = TRUE)) %>%
  dplyr::filter(500 < total_count)

## `summarise()` has grouped output by 'subnational1_code', 'Month'. You can
## override using the `.groups` argument.

## # A tibble: 37 × 4
## # Groups:   subnational1_code, Month [17]
##    subnational1_code Month species_code total_count
##    <chr>             <dbl> <chr>              <dbl>
##  1 CA-BC                 1 pinsis               607
##  2 CA-ON                 1 amegfi               549
##  3 CA-ON                 1 bkcchi               668
##  4 CA-ON                 1 blujay               529
##  5 CA-ON                 1 comred              1085
##  6 CA-ON                 1 daejun               729
##  7 CA-ON                 1 houspa              1018
##  8 CA-ON                 1 moudov               942
##  9 CA-ON                 2 amegfi               739
## 10 CA-ON                 2 bkcchi               549
## # … with 27 more rows

# Total birds counted for each subnational code / month / species combination,
# keeping only combinations with more than 500 birds.
# `.groups = "drop"` returns an ungrouped tibble, so later steps are not
# affected by leftover grouping and summarise() no longer prints its
# regrouping message.
df <- PFW_2021_public_subset %>%
  group_by(subnational1_code, Month, species_code) %>%
  summarize(total_count = sum(how_many, na.rm = TRUE), .groups = "drop") %>%
  filter(total_count > 500)

## `summarise()` has grouped output by 'subnational1_code', 'Month'. You can
## override using the `.groups` argument.

# Distribution of the summed counts, one panel per species.
# The number of bins is stated explicitly (30, the value stat_bin() would
# otherwise fall back to) so the plot is unchanged but the choice is visible
# and the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(df, aes(x = total_count)) +
  geom_histogram(bins = 30) +
  facet_wrap(~species_code)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart counting how often each total_count value occurs, one panel
# per month.
ggplot(data = df, mapping = aes(x = total_count)) +
  geom_bar() +
  facet_wrap(facets = vars(Month))

# Horizontal boxplot of the summed counts, one panel per subnational code.
ggplot(data = df, mapping = aes(x = total_count)) +
  geom_boxplot() +
  facet_wrap(facets = vars(subnational1_code))

# Single horizontal boxplot of the summed counts across all rows of df.
ggplot(data = df) +
  geom_boxplot(mapping = aes(x = total_count))

Step 7: Keep up with Tidy Tuesday in the future if you want!

01-tt

2023-02-07