
huangapple go评论79阅读模式

Using Yardstick to calculate RMSE for aggregate of predictions per group





  # Question script: tune MARS and random-forest models on the Ames housing
  # data using neighborhood-grouped resampling, then rank the candidates by
  # per-house RMSE. The closing comments state what is actually wanted:
  # a ranking by RMSE of neighborhood-level totals.

  # Load the data and transform the Neighborhood variable a little
  # (step_other() with a 4% threshold).
  library(tidymodels)
  df <- ames
  df <- recipe(Sale_Price ~ ., data = df) %>%
    step_other(Neighborhood, threshold = 0.04) %>%
    prep() %>%
    bake(new_data = df)

  # Split the data by neighborhood.
  set.seed(1)
  df_splits <- group_initial_split(df, group = Neighborhood)
  df_train <- training(df_splits)
  df_test <- testing(df_splits)

  set.seed(2)
  df_folds <- group_vfold_cv(
    df_train,
    group = Neighborhood,
    v = 5,
    repeats = 1
  )

  # Simple recipe for modeling Sale_Price.
  rec <- recipe(
    Sale_Price ~ Lot_Area + Year_Built + Gr_Liv_Area,
    data = df_train
  )

  # Model specifications for MARS and random forest.
  mars_earth_spec <-
    mars(prod_degree = tune()) %>%
    set_engine("earth") %>%
    set_mode("regression")

  rand_forest_ranger_spec <-
    rand_forest(mtry = tune(), min_n = tune()) %>%
    set_engine("ranger") %>%
    set_mode("regression")

  # Workflow set that pairs the recipe with each model.
  no_pre_proc <-
    workflow_set(
      preproc = list(simple = rec),
      models = list(MARS = mars_earth_spec, RF = rand_forest_ranger_spec)
    )

  # Tune the models. save_pred = TRUE keeps the out-of-sample predictions.
  grid_ctrl <-
    control_grid(
      save_pred = TRUE,
      parallel_over = "everything",
      save_workflow = TRUE
    )

  grid_results <-
    no_pre_proc %>%
    workflow_map(
      seed = 1503,
      resamples = df_folds,
      grid = 5,
      control = grid_ctrl
    )

  # Rank the models by RMSE, based on how well they estimate individual houses.
  grid_results %>%
    rank_results() %>%
    filter(.metric == "rmse") %>%
    select(model, .config, rmse = mean, rank)

  # This is not what I want.
  # I want to rank the models by the RMSE of the aggregate predictions per
  # neighborhood against the aggregate sale price.
  # Maybe I need something like:
  #   truth = sum(Sale_Price, by = Neighborhood),
  #   estimate = sum(.pred, by = Neighborhood)



Sometimes I don't want to assess my models on their performance on predicting single observations, but rather I want to assess how a model performs for predictions in aggregate for groups. The group resampling tools in rsample, like group_vfold_cv, are great for ensuring all data splitting keeps groups together. But I want to assess models on group performance rather than performance for single observations.

For example, maybe I want to use a model that predicts individual housing prices, but I'm ultimately going to use the model to estimate how much a neighborhood is worth.
Take the Ames dataset as an example. We can build models to predict a house's sale price, but instead of tuning the model based on its performance in predicting individual houses, I want to tune it on its performance in predicting the sum of housing prices for a neighborhood. (I'm imagining that the Ames dataset is "complete" for each neighborhood.)

I have provided sample code below. For speed reasons, I kept the resampling and grid minimal.

  # Question script: tune MARS and random-forest models on the Ames housing
  # data using neighborhood-grouped resampling, then rank the candidates by
  # per-house RMSE. The closing comments state what is actually wanted:
  # a ranking by RMSE of neighborhood-level totals.

  # Load in data and transform the Neighborhood variable a little
  # (step_other() with a 4% threshold).
  library(tidymodels)
  df <- ames
  df <- recipe(Sale_Price ~ ., data = df) %>%
    step_other(Neighborhood, threshold = 0.04) %>%
    prep() %>%
    bake(new_data = df)

  # Split data based on neighborhoods.
  set.seed(1)
  df_splits <- group_initial_split(df, group = Neighborhood)
  df_train <- training(df_splits)
  df_test <- testing(df_splits)

  set.seed(2)
  df_folds <- group_vfold_cv(
    df_train,
    group = Neighborhood,
    v = 5,
    repeats = 1
  )

  # Simple recipe for modeling Sale_Price.
  rec <- recipe(
    Sale_Price ~ Lot_Area + Year_Built + Gr_Liv_Area,
    data = df_train
  )

  # Set up the specifications for MARS and RF.
  mars_earth_spec <-
    mars(prod_degree = tune()) %>%
    set_engine("earth") %>%
    set_mode("regression")

  rand_forest_ranger_spec <-
    rand_forest(mtry = tune(), min_n = tune()) %>%
    set_engine("ranger") %>%
    set_mode("regression")

  # Set up the workflow set that pairs our recipe with the models.
  no_pre_proc <-
    workflow_set(
      preproc = list(simple = rec),
      models = list(MARS = mars_earth_spec, RF = rand_forest_ranger_spec)
    )

  # Tune the models. save_pred = TRUE keeps the out-of-sample predictions.
  grid_ctrl <-
    control_grid(
      save_pred = TRUE,
      parallel_over = "everything",
      save_workflow = TRUE
    )

  grid_results <-
    no_pre_proc %>%
    workflow_map(
      seed = 1503,
      resamples = df_folds,
      grid = 5,
      control = grid_ctrl
    )

  # Rank the models by RMSE, based on their performance estimating
  # individual houses.
  grid_results %>%
    rank_results() %>%
    filter(.metric == "rmse") %>%
    select(model, .config, rmse = mean, rank)

  # This is not what I want.
  # I want to rank the models by RMSE of aggregate predictions per
  # neighborhood against the aggregate sale price.
  # Maybe I need something like:
  #   truth = sum(Sale_Price, by = Neighborhood),
  #   estimate = sum(.pred, by = Neighborhood)

I can assess a model's RMSE for individual houses, but I want to assess a model's RMSE for neighborhood worth.


得分: 1


由于我们在 control_grid() 中设置了 save_pred = TRUE,我们可以使用 collect_predictions() 并设置 summarize = FALSE 来获取所有这些预测。

然后,一系列 {dplyr} 函数和可以应用于分组数据框的 rmse() 应该可以得到你想要的结果。

  # Answer script: same setup and tuning as the question, followed by a manual
  # computation of RMSE on neighborhood totals, per resample, averaged for
  # each model configuration.

  # Load the data and transform the Neighborhood variable a little
  # (step_other() with a 4% threshold).
  library(tidymodels)
  df <- ames
  df <- recipe(Sale_Price ~ ., data = df) %>%
    step_other(Neighborhood, threshold = 0.04) %>%
    prep() %>%
    bake(new_data = df)

  # Split the data by neighborhood.
  set.seed(1)
  df_splits <- group_initial_split(df, group = Neighborhood)
  df_train <- training(df_splits)
  df_test <- testing(df_splits)

  set.seed(2)
  df_folds <- group_vfold_cv(
    df_train,
    group = Neighborhood,
    v = 5,
    repeats = 1
  )

  # Simple recipe for modeling Sale_Price.
  rec <- recipe(
    Sale_Price ~ Lot_Area + Year_Built + Gr_Liv_Area,
    data = df_train
  )

  # Model specifications for MARS and random forest.
  mars_earth_spec <-
    mars(prod_degree = tune()) %>%
    set_engine("earth") %>%
    set_mode("regression")

  rand_forest_ranger_spec <-
    rand_forest(mtry = tune(), min_n = tune()) %>%
    set_engine("ranger") %>%
    set_mode("regression")

  # Workflow set that pairs the recipe with each model.
  no_pre_proc <-
    workflow_set(
      preproc = list(simple = rec),
      models = list(MARS = mars_earth_spec, RF = rand_forest_ranger_spec)
    )

  # Tune the models. save_pred = TRUE is what makes the predictions
  # retrievable with collect_predictions() below.
  grid_ctrl <-
    control_grid(
      save_pred = TRUE,
      parallel_over = "everything",
      save_workflow = TRUE
    )

  grid_results <-
    no_pre_proc %>%
    workflow_map(
      seed = 1503,
      resamples = df_folds,
      grid = 5,
      control = grid_ctrl
    )
  #> i Creating pre-processing data to finalize unknown parameter: mtry

  # Score every candidate on neighborhood totals instead of single houses.
  grid_results %>%
    # Retrieve the saved predictions without summarizing them.
    collect_predictions(summarize = FALSE) %>%
    # Use .row to look up each prediction's neighborhood in df_train.
    mutate(Neighborhood = df_train$Neighborhood[.row]) %>%
    # Sum observed and predicted prices per neighborhood within each
    # resample and model configuration.
    group_by(id, model, .config, Neighborhood) %>%
    summarise(
      Sale_Price = sum(Sale_Price),
      .pred = sum(.pred),
      .groups = "drop"
    ) %>%
    # RMSE across the neighborhood totals, one value per resample and
    # configuration (rmse() is applied to the grouped data frame).
    group_by(id, model, .config) %>%
    rmse(truth = Sale_Price, estimate = .pred) %>%
    # Average over resamples and sort by the mean, smallest first.
    group_by(model, .config) %>%
    summarise(mean_rmse = mean(.estimate), .groups = "drop") %>%
    arrange(mean_rmse)
  #> # A tibble: 7 × 3
  #>   model       .config              mean_rmse
  #>   <chr>       <chr>                    <dbl>
  #> 1 rand_forest Preprocessor1_Model1  2667177.
  #> 2 mars        Preprocessor1_Model2  2695526.
  #> 3 rand_forest Preprocessor1_Model4  2819628.
  #> 4 rand_forest Preprocessor1_Model5  2824109.
  #> 5 rand_forest Preprocessor1_Model3  2845252.
  #> 6 rand_forest Preprocessor1_Model2  3059321.
  #> 7 mars        Preprocessor1_Model1  3563432.

There isn't built-in support for that goal, but you should be able to do it manually.

Since we have save_pred = TRUE in control_grid(), we can get all of those predictions using collect_predictions() with summarize = FALSE.

Then a series of {dplyr} functions and rmse() which can be applied to grouped data.frames should give you what you want.

  # Answer script: same setup and tuning as the question, followed by a manual
  # computation of RMSE on neighborhood totals, per resample, averaged for
  # each model configuration.

  # Load in data and transform the Neighborhood variable a little
  # (step_other() with a 4% threshold).
  library(tidymodels)
  df <- ames
  df <- recipe(Sale_Price ~ ., data = df) %>%
    step_other(Neighborhood, threshold = 0.04) %>%
    prep() %>%
    bake(new_data = df)

  # Split data based on neighborhoods.
  set.seed(1)
  df_splits <- group_initial_split(df, group = Neighborhood)
  df_train <- training(df_splits)
  df_test <- testing(df_splits)

  set.seed(2)
  df_folds <- group_vfold_cv(
    df_train,
    group = Neighborhood,
    v = 5,
    repeats = 1
  )

  # Simple recipe for modeling Sale_Price.
  rec <- recipe(
    Sale_Price ~ Lot_Area + Year_Built + Gr_Liv_Area,
    data = df_train
  )

  # Set up the specifications for MARS and RF.
  mars_earth_spec <-
    mars(prod_degree = tune()) %>%
    set_engine("earth") %>%
    set_mode("regression")

  rand_forest_ranger_spec <-
    rand_forest(mtry = tune(), min_n = tune()) %>%
    set_engine("ranger") %>%
    set_mode("regression")

  # Set up the workflow set that pairs our recipe with the models.
  no_pre_proc <-
    workflow_set(
      preproc = list(simple = rec),
      models = list(MARS = mars_earth_spec, RF = rand_forest_ranger_spec)
    )

  # Tune the models. save_pred = TRUE is what makes the predictions
  # retrievable with collect_predictions() below.
  grid_ctrl <-
    control_grid(
      save_pred = TRUE,
      parallel_over = "everything",
      save_workflow = TRUE
    )

  grid_results <-
    no_pre_proc %>%
    workflow_map(
      seed = 1503,
      resamples = df_folds,
      grid = 5,
      control = grid_ctrl
    )
  #> i Creating pre-processing data to finalize unknown parameter: mtry

  # Score every candidate on neighborhood totals instead of single houses.
  grid_results %>%
    # Retrieve the saved predictions without summarizing them.
    collect_predictions(summarize = FALSE) %>%
    # Use .row to look up each prediction's neighborhood in df_train.
    mutate(Neighborhood = df_train$Neighborhood[.row]) %>%
    # Sum observed and predicted prices per neighborhood within each
    # resample and model configuration.
    group_by(id, model, .config, Neighborhood) %>%
    summarise(
      Sale_Price = sum(Sale_Price),
      .pred = sum(.pred),
      .groups = "drop"
    ) %>%
    # RMSE across the neighborhood totals, one value per resample and
    # configuration (rmse() is applied to the grouped data frame).
    group_by(id, model, .config) %>%
    rmse(truth = Sale_Price, estimate = .pred) %>%
    # Average over resamples and sort by the mean, smallest first.
    group_by(model, .config) %>%
    summarise(mean_rmse = mean(.estimate), .groups = "drop") %>%
    arrange(mean_rmse)
  #> # A tibble: 7 × 3
  #>   model       .config              mean_rmse
  #>   <chr>       <chr>                    <dbl>
  #> 1 rand_forest Preprocessor1_Model1  2667177.
  #> 2 mars        Preprocessor1_Model2  2695526.
  #> 3 rand_forest Preprocessor1_Model4  2819628.
  #> 4 rand_forest Preprocessor1_Model5  2824109.
  #> 5 rand_forest Preprocessor1_Model3  2845252.
  #> 6 rand_forest Preprocessor1_Model2  3059321.
  #> 7 mars        Preprocessor1_Model1  3563432.

  • 本文由 发表于 2023年3月3日 23:38:30
  • 转载请务必保留本文链接:https://go.coder-hub.com/75629097.html



:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:
