Skip to content

Commit 4dec548

Browse files
hfrick and topepo authored
Iceland classwork (#101)
* update classwork * remove empty leading lines * fix numbering * re-render Co-authored-by: Max Kuhn <[email protected]>
1 parent a6fd18d commit 4dec548

11 files changed

+169
-166
lines changed

classwork/05-classwork.qmd

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ nhl_train <- analysis(nhl_val$splits[[1]])
5252
set.seed(100)
5353
nhl_train %>%
5454
sample_n(200) %>%
55-
plot_nhl_shots(emphasis = position)
55+
plot_nhl_shots(emphasis = shooter_type)
5656
5757
# Your code here!
5858
@@ -191,14 +191,37 @@ autoplot(roc_curve_points)
191191

192192
Compute and plot an ROC curve for your current model.
193193

194+
What data is being used for this ROC curve plot?
195+
194196
```{r}
195197
# Your code here!
196198
197199
```
198200

199-
## Your turn
201+
## Collapsing factor levels
200202

201-
What data is being used for this ROC curve plot?
203+
```{r}
204+
nhl_other_rec <-
205+
recipe(on_goal ~ ., data = nhl_train) %>%
206+
# Any player with < 0.1% of shots is set to "other"
207+
step_other(shooter, threshold = 0.001) %>%
208+
step_dummy(all_nominal_predictors()) %>%
209+
step_zv(all_predictors())
210+
```
211+
212+
## Does othering help?
213+
214+
```{r}
215+
nhl_other_wflow <-
216+
nhl_glm_wflow %>%
217+
update_recipe(nhl_other_rec)
218+
219+
nhl_other_res <-
220+
nhl_other_wflow %>%
221+
fit_resamples(nhl_val, control = ctrl)
222+
223+
collect_metrics(nhl_other_res)
224+
```
202225

203226
## Player effects
204227

@@ -207,7 +230,7 @@ library(embed)
207230
208231
nhl_effect_rec <-
209232
recipe(on_goal ~ ., data = nhl_train) %>%
210-
step_lencode_mixed(player, outcome = vars(on_goal)) %>%
233+
step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
211234
step_dummy(all_nominal_predictors()) %>%
212235
step_zv(all_predictors())
213236
```
@@ -221,7 +244,7 @@ nhl_effect_wflow <-
221244
222245
nhl_effect_res <-
223246
nhl_effect_wflow %>%
224-
fit_resamples(nhl_val)
247+
fit_resamples(nhl_val, control = ctrl)
225248
226249
collect_metrics(nhl_effect_res)
227250
```
@@ -231,36 +254,40 @@ collect_metrics(nhl_effect_res)
231254
```{r}
232255
# angle
233256
nhl_angle_rec <-
234-
nhl_indicators %>%
257+
nhl_effect_rec %>%
235258
step_mutate(
236-
angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi))
259+
angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) )
237260
)
238261
239-
# distance
240-
nhl_distance_rec <-
262+
# defensive zone
263+
nhl_zone_rec <-
241264
nhl_angle_rec %>%
242265
step_mutate(
243-
distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
244-
distance = log(distance)
266+
defensive_zone = ifelse(coord_x <= -25.5, 1, 0)
245267
)
246268
247269
# behind goal line
248270
nhl_behind_rec <-
249-
nhl_distance_rec %>%
271+
nhl_zone_rec %>%
250272
step_mutate(
251-
behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
273+
behind_goal_line = ifelse(coord_x >= 89, 1, 0)
252274
)
253275
```
254276

255277
## Fit different recipes
256278

257279
```{r}
280+
no_coord_rec <-
281+
nhl_indicators %>%
282+
step_rm(starts_with("coord"))
283+
258284
set.seed(9)
259285
260286
nhl_glm_set_res <-
261287
workflow_set(
262-
list(`1_dummy` = nhl_indicators, `2_angle` = nhl_angle_rec,
263-
`3_dist` = nhl_distance_rec, `4_bgl` = nhl_behind_rec),
288+
list(`1_no_coord` = no_coord_rec, `2_other` = nhl_other_rec,
289+
`3_effects` = nhl_effect_rec, `4_angle` = nhl_angle_rec,
290+
`5_zone` = nhl_zone_rec, `6_bgl` = nhl_behind_rec),
264291
list(logistic = logistic_reg())
265292
) %>%
266293
workflow_map(fn = "fit_resamples", resamples = nhl_val, verbose = TRUE, control = ctrl)
@@ -294,3 +321,13 @@ collect_metrics(nhl_glm_set_res) %>%
294321
geom_point(size = 3) +
295322
labs(y = NULL, x = "ROC AUC (validation set)")
296323
```
324+
325+
## Debugging a recipe
326+
327+
```{r}
328+
nhl_angle_fit <- prep(nhl_angle_rec)
329+
330+
tidy(nhl_angle_fit, number = 1) %>% slice(1:4)
331+
332+
bake(nhl_angle_fit, nhl_train %>% slice(1:3), starts_with("coord"), angle, shooter)
333+
```

classwork/06-classwork.qmd

Lines changed: 37 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
title: "3 - Tuning Hyperparameters - Classwork"
2+
title: "6 - Tuning Hyperparameters - Classwork"
33
subtitle: "Machine learning with tidymodels"
44
editor_options:
55
chunk_output_type: console
@@ -30,25 +30,24 @@ nhl_val <- validation_split(nhl_train_and_val, prop = 0.80)
3030
3131
nhl_train <- analysis(nhl_val$splits[[1]])
3232
33-
nhl_distance_rec <-
33+
nhl_position_rec <-
3434
recipe(on_goal ~ ., data = nhl_train) %>%
35-
step_lencode_mixed(player, outcome = vars(on_goal)) %>%
35+
step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
3636
step_other(all_nominal_predictors()) %>% # TODO: keep this?
3737
step_dummy(all_nominal_predictors()) %>%
3838
step_zv(all_predictors()) %>%
3939
step_mutate(
40-
angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
41-
distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
42-
distance = log(distance)
40+
angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi)),
41+
behind_goal_line = ifelse(coord_x >= 89, 1, 0)
4342
)
4443
45-
nhl_distance_wflow <-
44+
nhl_position_wflow <-
4645
workflow() %>%
47-
add_recipe(nhl_distance_rec) %>%
46+
add_recipe(nhl_position_rec) %>%
4847
add_model(logistic_reg())
4948
50-
nhl_distance_res <-
51-
nhl_distance_wflow %>%
49+
nhl_position_res <-
50+
nhl_position_wflow %>%
5251
fit_resamples(nhl_val)
5352
```
5453

@@ -57,18 +56,16 @@ nhl_distance_res <-
5756
```{r}
5857
glm_rec <-
5958
recipe(on_goal ~ ., data = nhl_train) %>%
60-
step_lencode_mixed(player, outcome = vars(on_goal)) %>%
59+
step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
6160
step_dummy(all_nominal_predictors()) %>%
6261
step_mutate(
63-
angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
64-
distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
65-
distance = log(distance),
66-
behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
62+
angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) ),
63+
defensive_zone = ifelse(coord_x <= -25.5, 1, 0),
64+
behind_goal_line = ifelse(coord_x >= 89, 1, 0)
6765
) %>%
68-
step_rm(coord_x, coord_y) %>%
6966
step_zv(all_predictors()) %>%
7067
step_ns(angle, deg_free = tune("angle")) %>%
71-
step_ns(distance, deg_free = tune("distance")) %>%
68+
step_ns(coord_x, deg_free = tune("coord_x")) %>%
7269
step_normalize(all_numeric_predictors())
7370
7471
glm_spline_wflow <-
@@ -80,7 +77,7 @@ glm_spline_wflow <-
8077
## Create a grid
8178

8279
```{r}
83-
set.seed(2)
80+
set.seed(12)
8481
grid <-
8582
glm_spline_wflow %>%
8683
extract_parameter_set_dials() %>%
@@ -101,16 +98,16 @@ Try creating a regular grid.
10198
## Update parameter ranges
10299

103100
```{r}
104-
set.seed(2)
101+
set.seed(12)
105102
grid <-
106103
glm_spline_wflow %>%
107104
extract_parameter_set_dials() %>%
108-
update(angle = spline_degree(c(2L, 20L)),
109-
distance = spline_degree(c(2L, 20L))) %>%
105+
update(angle = spline_degree(c(2L, 50L)),
106+
coord_x = spline_degree(c(2L, 50L))) %>%
110107
grid_latin_hypercube(size = 25)
111108
112109
grid %>%
113-
ggplot(aes(angle, distance)) +
110+
ggplot(aes(angle, coord_x)) +
114111
geom_point(size = 4)
115112
```
116113

@@ -123,6 +120,7 @@ ctrl <- control_grid(save_pred = TRUE, parallel_over = "everything")
123120
glm_spline_res <-
124121
glm_spline_wflow %>%
125122
tune_grid(resamples = nhl_val, grid = grid, control = ctrl)
123+
126124
glm_spline_res
127125
```
128126

@@ -157,33 +155,20 @@ show_best(glm_spline_res, metric = "roc_auc")
157155
select_best(glm_spline_res, metric = "roc_auc")
158156
```
159157

160-
## Your turn
161-
162-
Try an alternative selection strategy.
163-
164-
Read the docs for `select_by_pct_loss()`.
165-
166-
Try choosing a model that has a simpler (less "wiggly") relationship for `distance`.
167-
168-
```{r}
169-
# Your code here!
170-
171-
```
172-
173158
## Boosted trees
174159

175160
```{r}
176161
xgb_spec <-
177162
boost_tree(
178-
trees = 500, min_n = tune(), stop_iter = tune(), tree_depth = tune(),
163+
trees = tune(), min_n = tune(), tree_depth = tune(),
179164
learn_rate = tune(), loss_reduction = tune()
180165
) %>%
181166
set_mode("classification") %>%
182-
set_engine("xgboost", validation = 1/10) # <- for better early stopping
167+
set_engine("xgboost")
183168
184169
xgb_rec <-
185170
recipe(on_goal ~ ., data = nhl_train) %>%
186-
step_lencode_mixed(player, outcome = vars(on_goal)) %>%
171+
step_lencode_mixed(shooter, goaltender, outcome = vars(on_goal)) %>%
187172
step_dummy(all_nominal_predictors()) %>%
188173
step_zv(all_predictors())
189174
@@ -218,7 +203,7 @@ set.seed(9)
218203
219204
xgb_res <-
220205
xgb_wflow %>%
221-
tune_grid(resamples = nhl_val, grid = 15, control = ctrl) # automatic grid now!
206+
tune_grid(resamples = nhl_val, grid = 30, control = ctrl) # automatic grid now!
222207
```
223208

224209
## Your turn
@@ -246,12 +231,10 @@ autoplot(xgb_res)
246231
coord_rec <-
247232
xgb_rec %>%
248233
step_mutate(
249-
angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
250-
distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
251-
distance = log(distance),
252-
behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
253-
) %>%
254-
step_rm(coord_x, coord_y)
234+
angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) ),
235+
defensive_zone = ifelse(coord_x <= -25.5, 1, 0),
236+
behind_goal_line = ifelse(coord_x >= 89, 1, 0)
237+
)
255238
256239
xgb_coord_wflow <-
257240
workflow() %>%
@@ -261,14 +244,16 @@ xgb_coord_wflow <-
261244
set.seed(9)
262245
xgb_coord_res <-
263246
xgb_coord_wflow %>%
264-
tune_grid(resamples = nhl_val, grid = 20, control = ctrl)
247+
tune_grid(resamples = nhl_val, grid = 30, control = ctrl)
265248
```
266249

267250
## Did the machine figure it out?
268251

269252
```{r}
270-
show_best(xgb_res, metric = "roc_auc")
271-
show_best(xgb_coord_res, metric = "roc_auc")
253+
# no extra features
254+
show_best(xgb_res, metric = "roc_auc", n = 3)
255+
# with additional coordinate features
256+
show_best(xgb_coord_res, metric = "roc_auc", n = 3)
272257
```
273258

274259
## Compare models
@@ -282,22 +267,11 @@ glm_spline_res %>%
282267

283268
```{r}
284269
# Best boosting results
285-
xgb_coord_res %>%
270+
xgb_res %>%
286271
show_best(metric = "roc_auc", n = 1) %>%
287272
select(.metric, .estimator, mean, n, std_err, .config)
288273
```
289274

290-
## Your turn
291-
292-
Can you get better ROC results with xgboost?
293-
294-
Try increasing `learn_rate` beyond the original range.
295-
296-
```{r}
297-
# Your code here!
298-
299-
```
300-
301275
## Updating the workflow
302276

303277
```{r}
@@ -366,7 +340,7 @@ glm_explainer <- explain_tidymodels(
366340
final_glm_spline_wflow,
367341
data = dplyr::select(nhl_train, -on_goal),
368342
# DALEX required an integer for factors:
369-
y = as.integer(nhl_train$on_goal),
343+
y = as.integer(nhl_train$on_goal) - 1,
370344
verbose = FALSE
371345
)
372346
```
@@ -381,13 +355,13 @@ pdp_coord_x <- model_profile(
381355
glm_explainer,
382356
variables = "coord_x",
383357
N = 500,
384-
groups = "position"
358+
groups = "strength"
385359
)
386360
```
387361

388362
## Your turn
389363

390-
Try grouping by another variable, like `game_type` or `dow`.
364+
Try grouping by another variable, like `extra_attacker` or `game_seconds`.
391365

392366
```{r}
393367
# Your code here!

0 commit comments

Comments
 (0)