英文:
tidymodels: loss_accuracy provides no variable importance results
问题
使用鸢尾花数据集,通过迭代搜索对knn分类器进行了调优,以进行多类别分类。然而,在DALEX::model_parts()中使用loss accuracy来计算变量重要性时,结果为空。
我会感激任何想法。非常感谢您的支持!
英文:
Using the iris dataset, a knn classifier was tuned with iterative search for multiclass classification. However, using loss_accuracy in DALEX::model_parts() to compute variable importance yields empty results.
I would appreciate any ideas. Thank you so much for your support!
# Reproducible example: tune a k-nearest-neighbours classifier on iris, fit
# the finalized workflow, then ask DALEX for variable importance using
# loss_accuracy as the loss function.
library(tidyverse)
library(tidymodels)
library(DALEXtra)
tidymodels_prefer()
df <- iris 
# Train/test split: 4/5 of the rows go to training, stratified by Species.
set.seed(2023)
splits <- initial_split(df, strata = Species, prop = 4/5)
df_train <- training(splits)
df_test  <-  testing(splits)
# Workflow: a recipe with Species as outcome and all other columns as
# predictors, plus a kknn nearest-neighbour classifier whose `neighbors`
# parameter is left as tune() so it can be chosen by the search below.
df_rec <- recipe(Species ~ ., data = df_train) 
knn_model <- nearest_neighbor(neighbors = tune()) %>% 
  set_engine("kknn") %>% 
  set_mode("classification")
df_wflow <- workflow() %>%
  add_model(knn_model) %>%
  add_recipe(df_rec) 
# Tuning: tune_bayes() scored by accuracy on a 2-fold cross-validation of the
# training set, stratified by Species; predictions are saved (save_pred).
set.seed(2023)
knn_res <-
  df_wflow %>%
  tune_bayes(
    metrics = metric_set(accuracy),
    resamples = vfold_cv(df_train, strata = "Species", v = 2),
    control = control_bayes(verbose = TRUE, save_pred = TRUE))
# Final fit: plug the best `neighbors` value (by accuracy) into the workflow
# and fit it on the full training set.
best_k <- knn_res %>%
  select_best("accuracy")
knn_mod <- df_wflow %>%
  finalize_workflow(best_k) %>%
  fit(df_train)
# Variable importance: build a DALEX explainer around the fitted parsnip model.
# `data` is the prepped recipe baked with new_data = NULL and restricted to
# predictor columns (presumably the processed training predictors); `y` is the
# observed Species factor from the training set.
knn_exp <- explain_tidymodels(extract_fit_parsnip(knn_mod), 
                   data = df_rec %>% prep() %>% bake(new_data = NULL, all_predictors()),
                   y = df_train$Species)
set.seed(2023)
vip <- model_parts(knn_exp, type = "variable_importance", loss_function = loss_accuracy)
plot(vip) # empty plot: this is the problem being asked about
答案1
得分: 1
以下是您要翻译的内容:
You are getting 0 for all your results because the model type according to {DALEX} is "multiclass".
These calculations would have worked if the type were "classification".
knn_exp$model_info$type
# [1] "multiclass"
This means that the predict function returns predicted class probabilities rather than class labels (here we get only 1s and 0s because the model is quite overfit)
predicted <- knn_exp$predict_function(knn_exp$model, newdata = df_train)
predicted
#      setosa versicolor virginica
# [1,]      1          0         0
# [2,]      1          0         0
# [3,]      1          0         0
# [4,]      1          0         0
# [5,]      1          0         0
# [6,]      1          0         0
# ...
When you use loss_accuracy() as your loss function, it performs the following calculation
loss_accuracy
# function (observed, predicted, na.rm = TRUE) 
# mean(observed == predicted, na.rm = na.rm)
# <bytecode: 0x159276bb8>
# <environment: namespace:DALEX>
# attr(,"loss_name")
# [1] "Accuracy"
And we can see why this becomes an issue if we do the calculations step by step. First we define observed as the outcome factor
observed <- df_train$Species
observed
#   [1] setosa     setosa     setosa     setosa     setosa     setosa    
#   [7] setosa     setosa     setosa     setosa     setosa     setosa    
#  [13] setosa     setosa     setosa     setosa     setosa     setosa    
#  [19] setosa     setosa     setosa     setosa     setosa     setosa    
#  [25] setosa     setosa     setosa     setosa     setosa     setosa    
#  [31] setosa     setosa     setosa     setosa     setosa     setosa    
#  [37] setosa     setosa     setosa     setosa     versicolor versicolor
#  [43] versicolor versicolor versicolor versicolor versicolor versicolor
#  [49] versicolor versicolor versicolor versicolor versicolor versicolor
#  [55] versicolor versicolor versicolor versicolor versicolor versicolor
#  [61] versicolor versicolor versicolor versicolor versicolor versicolor
#  [67] versicolor versicolor versicolor versicolor versicolor versicolor
#  [73] versicolor versicolor versicolor versicolor versicolor versicolor
#  [79] versicolor versicolor virginica  virginica  virginica  virginica 
#  [85] virginica  virginica  virginica  virginica  virginica  virginica 
#  [91] virginica  virginica  virginica  virginica  virginica  virginica 
#  [97] virginica  virginica  virginica  virginica  virginica  virginica 
# [103] virginica  virginica  virginica  virginica  virginica  virginica 
# [109] virginica  virginica  virginica  virginica  virginica  virginica 
# [115] virginica  virginica  virginica  virginica  virginica  virginica 
# Levels: setosa versicolor virginica
Since observed is a factor vector and predicted is a numeric matrix, the comparison returns a logical matrix that is entirely FALSE, because the values are never the same.
head(observed == predicted)
#      setosa versicolor virginica
# [1,]  FALSE      FALSE     FALSE
# [2,]  FALSE      FALSE     FALSE
# [3,]  FALSE      FALSE     FALSE
# [4,]  FALSE      FALSE     FALSE
# [5,]  FALSE      FALSE     FALSE
# [6,]  FALSE      FALSE     FALSE
So when we take the mean of this we get the expected 0.
mean(observed == predicted)
# [1] 0
英文:
You are getting 0 for all your results because the model type according to {DALEX} is "multiclass".
These calculations would have worked if the type were "classification".
knn_exp$model_info$type
#> [1] "multiclass"
This means that the predict function returns predicted class probabilities rather than class labels (here we get only 1s and 0s because the model is quite overfit)
predicted <- knn_exp$predict_function(knn_exp$model, newdata = df_train)
predicted
#>      setosa versicolor virginica
#> [1,]      1          0         0
#> [2,]      1          0         0
#> [3,]      1          0         0
#> [4,]      1          0         0
#> [5,]      1          0         0
#> [6,]      1          0         0
#> ...
When you use loss_accuracy() as your loss function, it performs the following calculation
loss_accuracy
#> function (observed, predicted, na.rm = TRUE) 
#> mean(observed == predicted, na.rm = na.rm)
#> <bytecode: 0x159276bb8>
#> <environment: namespace:DALEX>
#> attr(,"loss_name")
#> [1] "Accuracy"
And we can see why this becomes an issue if we do the calculations step by step. First we define observed as the outcome factor
observed <- df_train$Species
observed
#>   [1] setosa     setosa     setosa     setosa     setosa     setosa    
#>   [7] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [13] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [19] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [25] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [31] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [37] setosa     setosa     setosa     setosa     versicolor versicolor
#>  [43] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [49] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [55] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [61] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [67] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [73] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [79] versicolor versicolor virginica  virginica  virginica  virginica 
#>  [85] virginica  virginica  virginica  virginica  virginica  virginica 
#>  [91] virginica  virginica  virginica  virginica  virginica  virginica 
#>  [97] virginica  virginica  virginica  virginica  virginica  virginica 
#> [103] virginica  virginica  virginica  virginica  virginica  virginica 
#> [109] virginica  virginica  virginica  virginica  virginica  virginica 
#> [115] virginica  virginica  virginica  virginica  virginica  virginica 
#> Levels: setosa versicolor virginica
Since observed is a factor vector and predicted is a numeric matrix, the comparison returns a logical matrix that is entirely FALSE, because the values are never the same.
head(observed == predicted)
#>      setosa versicolor virginica
#> [1,]  FALSE      FALSE     FALSE
#> [2,]  FALSE      FALSE     FALSE
#> [3,]  FALSE      FALSE     FALSE
#> [4,]  FALSE      FALSE     FALSE
#> [5,]  FALSE      FALSE     FALSE
#> [6,]  FALSE      FALSE     FALSE
So when we take the mean of this we get the expected 0.
mean(observed == predicted)
#> [1] 0
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。


评论