tidymodels
diff --git a/‎inst/doc/chisq_test.R‎
Lines changed: 77 additions & 0 deletions b/‎inst/doc/chisq_test.R‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎inst/doc/chisq_test.Rmd‎
Lines changed: 143 additions & 0 deletions b/‎inst/doc/chisq_test.Rmd‎
Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,77 @@
+## ----include=FALSE-------------------------------------------------------
+knitr::opts_chunk$set(fig.width = 8, fig.height = 3) 
+
+## ----message=FALSE, warning=FALSE----------------------------------------
+library(nycflights13)
+library(dplyr)
+library(ggplot2)
+library(stringr)
+library(infer)
+set.seed(2017)
+fli_small <- flights %>% 
+  na.omit() %>% 
+  sample_n(size = 500) %>% 
+  mutate(season = case_when(
+    month %in% c(10:12, 1:3) ~ "winter",
+    month %in% c(4:9) ~ "summer"
+  )) %>% 
+  mutate(day_hour = case_when(
+    between(hour, 1, 12) ~ "morning",
+    between(hour, 13, 24) ~ "not morning"
+  )) %>% 
+  select(arr_delay, dep_delay, season, 
+         day_hour, origin, carrier)
+
+## ------------------------------------------------------------------------
+obs_chisq <- fli_small %>%
+  specify(origin ~ season) %>% # alt: response = origin, explanatory = season
+  calculate(stat = "Chisq")
+
+## ------------------------------------------------------------------------
+obs_chisq <- fli_small %>% 
+  chisq_test(formula = origin ~ season) %>% 
+  dplyr::select(statistic)
+
+## ------------------------------------------------------------------------
+obs_chisq <- fli_small %>% 
+  chisq_stat(formula = origin ~ season)
+
+## ------------------------------------------------------------------------
+chisq_null_distn <- fli_small %>%
+  specify(origin ~ season) %>% # alt: response = origin, explanatory = season
+  hypothesize(null = "independence") %>%
+  generate(reps = 1000, type = "permute") %>%
+  calculate(stat = "Chisq")
+chisq_null_distn %>% visualize(obs_stat = obs_chisq, direction = "greater")
+
+## ------------------------------------------------------------------------
+chisq_null_distn %>% 
+  get_pvalue(obs_stat = obs_chisq, direction = "greater")
+
+## ------------------------------------------------------------------------
+fli_small %>%
+  specify(origin ~ season) %>% 
+  hypothesize(null = "independence") %>%
+  # generate() ## Not used for theoretical
+  calculate(stat = "Chisq") %>%
+  visualize(method = "theoretical", obs_stat = obs_chisq, direction = "right")
+
+## ----eval=FALSE----------------------------------------------------------
+#  fli_small %>%
+#    specify(origin ~ season) %>%  %>% # alt: response = origin, explanatory = season
+#    hypothesize(null = "independence") %>%
+#    generate(reps = 1000, type = "permute") %>%
+#    calculate(stat = "Chisq") %>%
+#    visualize(method = "both", obs_stat = obs_chisq, direction = "right")
+
+## ----echo=FALSE----------------------------------------------------------
+# To use same distribution calculated above
+chisq_null_distn %>% 
+  visualize(method = "both", obs_stat = obs_chisq, direction = "right")
+
+## ------------------------------------------------------------------------
+fli_small %>% 
+  chisq_test(formula = origin ~ season) %>% 
+  dplyr::select(p_value) %>% 
+  dplyr::pull()
+
@@ -0,0 +1,143 @@
+---
+title: "Chi-squared test example using `nycflights13` `flights` data"
+author: "Chester Ismay"
+date: "`r Sys.Date()`"
+output: 
+  rmarkdown::html_vignette:
+    df_print: kable
+vignette: |
+  %\VignetteIndexEntry{Chi-squared test flights example}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r include=FALSE}
+knitr::opts_chunk$set(fig.width = 8, fig.height = 3) 
+```
+
+**Note**: The `type` argument in `generate()` is automatically filled based on the entries for `specify()` and
+`hypothesize()`. It can be removed throughout the examples that follow. It is left in to reiterate the type of generation process being performed.
+
+## Data preparation
+
+```{r message=FALSE, warning=FALSE}
+library(nycflights13)
+library(dplyr)
+library(ggplot2)
+library(stringr)
+library(infer)
+set.seed(2017)
+fli_small <- flights %>% 
+  na.omit() %>% 
+  sample_n(size = 500) %>% 
+  mutate(season = case_when(
+    month %in% c(10:12, 1:3) ~ "winter",
+    month %in% c(4:9) ~ "summer"
+  )) %>% 
+  mutate(day_hour = case_when(
+    between(hour, 1, 12) ~ "morning",
+    between(hour, 13, 24) ~ "not morning"
+  )) %>% 
+  select(arr_delay, dep_delay, season, 
+         day_hour, origin, carrier)
+```
+
+* Two numeric - `arr_delay`, `dep_delay`
+* Two categories 
+    - `season` (`"winter"`, `"summer"`), 
+    - `day_hour` (`"morning"`, `"not morning"`)
+* Three categories - `origin` (`"EWR"`, `"JFK"`, `"LGA"`)
+* Sixteen categories - `carrier`
+
+***
+
+# One numerical variable, one categorical (2 levels)
+
+## Calculate observed statistic 
+
+The recommended approach is to use `specify() %>% calculate()`:
+
+```{r}
+obs_chisq <- fli_small %>%
+  specify(origin ~ season) %>% # alt: response = origin, explanatory = season
+  calculate(stat = "Chisq")
+```
+
+The observed $\chi^2$ statistic is `r obs_chisq`.
+
+Or using `chisq_test` in `infer`
+
+```{r}
+obs_chisq <- fli_small %>% 
+  chisq_test(formula = origin ~ season) %>% 
+  dplyr::select(statistic)
+```
+
+Again, the observed $\chi^2$ statistic is `r obs_chisq`.
+
+Or using another shortcut function in `infer`:
+
+```{r}
+obs_chisq <- fli_small %>% 
+  chisq_stat(formula = origin ~ season)
+```
+
+Lastly, the observed $\chi^2$ statistic is `r obs_chisq`.
+
+## Randomization approach to $\chi^2$-statistic
+
+```{r}
+chisq_null_distn <- fli_small %>%
+  specify(origin ~ season) %>% # alt: response = origin, explanatory = season
+  hypothesize(null = "independence") %>%
+  generate(reps = 1000, type = "permute") %>%
+  calculate(stat = "Chisq")
+chisq_null_distn %>% visualize(obs_stat = obs_chisq, direction = "greater")
+```
+
+## Calculate the randomization-based $p$-value
+
+```{r}
+chisq_null_distn %>% 
+  get_pvalue(obs_stat = obs_chisq, direction = "greater")
+```
+
+
+## Theoretical distribution
+
+```{r }
+fli_small %>%
+  specify(origin ~ season) %>% 
+  hypothesize(null = "independence") %>%
+  # generate() ## Not used for theoretical
+  calculate(stat = "Chisq") %>%
+  visualize(method = "theoretical", obs_stat = obs_chisq, direction = "right")
+```
+
+## Overlay appropriate $\chi^2$ distribution on top of permuted statistics
+
+```{r eval=FALSE}
+fli_small %>%
+  specify(origin ~ season) %>%  %>% # alt: response = origin, explanatory = season
+  hypothesize(null = "independence") %>%
+  generate(reps = 1000, type = "permute") %>%
+  calculate(stat = "Chisq") %>% 
+  visualize(method = "both", obs_stat = obs_chisq, direction = "right")
+```
+
+```{r echo=FALSE}
+# To use same distribution calculated above
+chisq_null_distn %>% 
+  visualize(method = "both", obs_stat = obs_chisq, direction = "right")
+```
+
+
+## Compute theoretical p-value
+
+```{r}
+fli_small %>% 
+  chisq_test(formula = origin ~ season) %>% 
+  dplyr::select(p_value) %>% 
+  dplyr::pull()
+```
+