11---
2- title : " 3 - Tuning Hyperparameters - Classwork"
2+ title : " 6 - Tuning Hyperparameters - Classwork"
33subtitle : " Machine learning with tidymodels"
44editor_options :
55 chunk_output_type : console
@@ -30,25 +30,24 @@ nhl_val <- validation_split(nhl_train_and_val, prop = 0.80)
3030
3131nhl_train <- analysis(nhl_val$splits[[1]])
3232
33- nhl_distance_rec <-
33+ nhl_position_rec <-
3434 recipe(on_goal ~ ., data = nhl_train) %>%
35- step_lencode_mixed(player , outcome = vars(on_goal)) %>%
35+ step_lencode_mixed(shooter, goaltender , outcome = vars(on_goal)) %>%
3636 step_other(all_nominal_predictors()) %>% # TODO: keep this?
3737 step_dummy(all_nominal_predictors()) %>%
3838 step_zv(all_predictors()) %>%
3939 step_mutate(
40- angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
41- distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
42- distance = log(distance)
40+ angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi)),
41+ behind_goal_line = ifelse(coord_x >= 89, 1, 0)
4342 )
4443
45- nhl_distance_wflow <-
44+ nhl_position_wflow <-
4645 workflow() %>%
47- add_recipe(nhl_distance_rec ) %>%
46+ add_recipe(nhl_position_rec ) %>%
4847 add_model(logistic_reg())
4948
50- nhl_distance_res <-
51- nhl_distance_wflow %>%
49+ nhl_position_res <-
50+ nhl_position_wflow %>%
5251 fit_resamples(nhl_val)
5352```
5453
@@ -57,18 +56,16 @@ nhl_distance_res <-
5756``` {r}
5857glm_rec <-
5958 recipe(on_goal ~ ., data = nhl_train) %>%
60- step_lencode_mixed(player , outcome = vars(on_goal)) %>%
59+ step_lencode_mixed(shooter, goaltender , outcome = vars(on_goal)) %>%
6160 step_dummy(all_nominal_predictors()) %>%
6261 step_mutate(
63- angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
64- distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
65- distance = log(distance),
66- behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
62+ angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) ),
63+ defensive_zone = ifelse(coord_x <= -25.5, 1, 0),
64+ behind_goal_line = ifelse(coord_x >= 89, 1, 0)
6765 ) %>%
68- step_rm(coord_x, coord_y) %>%
6966 step_zv(all_predictors()) %>%
7067 step_ns(angle, deg_free = tune("angle")) %>%
71- step_ns(distance , deg_free = tune("distance ")) %>%
68+ step_ns(coord_x , deg_free = tune("coord_x ")) %>%
7269 step_normalize(all_numeric_predictors())
7370
7471glm_spline_wflow <-
@@ -80,7 +77,7 @@ glm_spline_wflow <-
8077## Create a grid
8178
8279``` {r}
83- set.seed(2 )
80+ set.seed(12 )
8481grid <-
8582 glm_spline_wflow %>%
8683 extract_parameter_set_dials() %>%
@@ -101,16 +98,16 @@ Try creating a regular grid.
10198## Update parameter ranges
10299
103100``` {r}
104- set.seed(2 )
101+ set.seed(12 )
105102grid <-
106103 glm_spline_wflow %>%
107104 extract_parameter_set_dials() %>%
108- update(angle = spline_degree(c(2L, 20L )),
109- distance = spline_degree(c(2L, 20L ))) %>%
105+ update(angle = spline_degree(c(2L, 50L )),
106+ coord_x = spline_degree(c(2L, 50L ))) %>%
110107 grid_latin_hypercube(size = 25)
111108
112109grid %>%
113- ggplot(aes(angle, distance )) +
110+ ggplot(aes(angle, coord_x )) +
114111 geom_point(size = 4)
115112```
116113
@@ -123,6 +120,7 @@ ctrl <- control_grid(save_pred = TRUE, parallel_over = "everything")
123120glm_spline_res <-
124121 glm_spline_wflow %>%
125122 tune_grid(resamples = nhl_val, grid = grid, control = ctrl)
123+
126124glm_spline_res
127125```
128126
@@ -157,33 +155,20 @@ show_best(glm_spline_res, metric = "roc_auc")
157155select_best(glm_spline_res, metric = "roc_auc")
158156```
159157
160- ## Your turn
161-
162- Try an alternative selection strategy.
163-
164- Read the docs for ` select_by_pct_loss() ` .
165-
166- Try choosing a model that has a simpler (less "wiggly") relationship for ` distance ` .
167-
168- ``` {r}
169- # Your code here!
170-
171- ```
172-
173158## Boosted trees
174159
175160``` {r}
176161xgb_spec <-
177162 boost_tree(
178- trees = 500, min_n = tune(), stop_iter = tune(), tree_depth = tune(),
163+ trees = tune(), min_n = tune(), tree_depth = tune(),
179164 learn_rate = tune(), loss_reduction = tune()
180165 ) %>%
181166 set_mode("classification") %>%
182- set_engine("xgboost", validation = 1/10) # <- for better early stopping
167+ set_engine("xgboost")
183168
184169xgb_rec <-
185170 recipe(on_goal ~ ., data = nhl_train) %>%
186- step_lencode_mixed(player , outcome = vars(on_goal)) %>%
171+ step_lencode_mixed(shooter, goaltender , outcome = vars(on_goal)) %>%
187172 step_dummy(all_nominal_predictors()) %>%
188173 step_zv(all_predictors())
189174
@@ -218,7 +203,7 @@ set.seed(9)
218203
219204xgb_res <-
220205 xgb_wflow %>%
221- tune_grid(resamples = nhl_val, grid = 15 , control = ctrl) # automatic grid now!
206+ tune_grid(resamples = nhl_val, grid = 30 , control = ctrl) # automatic grid now!
222207```
223208
224209## Your turn
@@ -246,12 +231,10 @@ autoplot(xgb_res)
246231coord_rec <-
247232 xgb_rec %>%
248233 step_mutate(
249- angle = abs(atan2(abs(coord_y), (89 - abs(coord_x))) * (180 / pi)),
250- distance = sqrt((89 - abs(coord_x))^2 + abs(coord_y)^2),
251- distance = log(distance),
252- behind_goal_line = ifelse(abs(coord_x) >= 89, 1, 0)
253- ) %>%
254- step_rm(coord_x, coord_y)
234+ angle = abs( atan2(abs(coord_y), (89 - coord_x) ) * (180 / pi) ),
235+ defensive_zone = ifelse(coord_x <= -25.5, 1, 0),
236+ behind_goal_line = ifelse(coord_x >= 89, 1, 0)
237+ )
255238
256239xgb_coord_wflow <-
257240 workflow() %>%
@@ -261,14 +244,16 @@ xgb_coord_wflow <-
261244set.seed(9)
262245xgb_coord_res <-
263246 xgb_coord_wflow %>%
264- tune_grid(resamples = nhl_val, grid = 20 , control = ctrl)
247+ tune_grid(resamples = nhl_val, grid = 30 , control = ctrl)
265248```
266249
267250## Did the machine figure it out?
268251
269252``` {r}
270- show_best(xgb_res, metric = "roc_auc")
271- show_best(xgb_coord_res, metric = "roc_auc")
253+ # no extra features
254+ show_best(xgb_res, metric = "roc_auc", n = 3)
255+ # with additional coordinate features
256+ show_best(xgb_coord_res, metric = "roc_auc", n = 3)
272257```
273258
274259## Compare models
@@ -282,22 +267,11 @@ glm_spline_res %>%
282267
283268``` {r}
284269# Best boosting results
285- xgb_coord_res %>%
270+ xgb_res %>%
286271 show_best(metric = "roc_auc", n = 1) %>%
287272 select(.metric, .estimator, mean, n, std_err, .config)
288273```
289274
290- ## Your turn
291-
292- Can you get better ROC results with xgboost?
293-
294- Try increasing ` learn_rate ` beyond the original range.
295-
296- ``` {r}
297- # Your code here!
298-
299- ```
300-
301275## Updating the workflow
302276
303277``` {r}
@@ -366,7 +340,7 @@ glm_explainer <- explain_tidymodels(
366340 final_glm_spline_wflow,
367341 data = dplyr::select(nhl_train, -on_goal),
368342 # DALEX requires a numeric (integer) outcome rather than a factor:
369- y = as.integer(nhl_train$on_goal),
343+ y = as.integer(nhl_train$on_goal) - 1 ,
370344 verbose = FALSE
371345)
372346```
@@ -381,13 +355,13 @@ pdp_coord_x <- model_profile(
381355 glm_explainer,
382356 variables = "coord_x",
383357 N = 500,
384- groups = "position "
358+ groups = "strength "
385359)
386360```
387361
388362## Your turn
389363
390- Try grouping by another variable, like ` game_type ` or ` dow ` .
364+ Try grouping by another variable, like `extra_attacker` or `game_seconds`.
391365
392366``` {r}
393367# Your code here!
0 commit comments