Shap summary plots for XGBoost with categorical data inputs

Question

XGBoost supports inputting features as categories directly, which is very useful when there are a lot of categorical variables. This doesn't seem to be compatible with Shap:

```python
import pandas as pd
import xgboost
import shap

# Test data
test_data = pd.DataFrame({'target': [23, 42, 58, 29, 28],
                          'feature_1': [38, 83, 38, 28, 57],
                          'feature_2': ['A', 'B', 'A', 'C', 'A']})
test_data['feature_2'] = test_data['feature_2'].astype('category')

# Fit xgboost
model = xgboost.XGBRegressor(enable_categorical=True,
                             tree_method='hist')
model.fit(test_data.drop('target', axis=1), test_data['target'])

# Explain with Shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test_data)
```

Throws an error: ValueError: DataFrame.dtypes for data must be int, float, bool or category.

Is it possible to use Shap in this situation?

Answer 1 (Score: 1)

I used GradientBoostingRegressor and reshaped the array into two features per element:

```python
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap

df = pd.DataFrame({'target': [23, 42, 58, 29, 28],
                   'feature_1': [38, 83, 38, 28, 57],
                   'feature_2': ['A', 'B', 'A', 'C', 'A']})

df["feature_1"] = df["feature_1"].astype(int)
df["target"] = df["target"].astype(int)

# Label-encode the categorical feature to plain integers
encoder = preprocessing.LabelEncoder()
df["feature_2"] = encoder.fit_transform(df["feature_2"])

print(df)
SEED = 42
model = GradientBoostingRegressor(n_estimators=300, max_depth=8, random_state=SEED)

scale = StandardScaler()

columns = ["feature_1", "feature_2"]
n_features = len(columns)
X = np.array(scale.fit_transform(df[columns])).reshape(-1, n_features)
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model.fit(X_train, y_train)

explainer = shap.TreeExplainer(model)
# Note: the full df (including the target column) is passed here,
# which is why the output below has three columns
shap_values = explainer.shap_values(df)

print(shap_values)

y_pred = model.predict(X_test)

x = np.arange(len(X_test))
plt.bar(x, y_test)
plt.bar(x, y_pred, color='green')
plt.show()
```

output:

```
   target  feature_1  feature_2
0      23         38          0
1      42         83          1
2      58         38          0
3      29         28          2
4      28         57          0
```

Shap values:

```
[[-4.65720266 -3.00946401  0.        ]
 [ 2.32860133 -3.00946401  0.        ]
 [ 2.32860133 -3.00946401  0.        ]
 [-4.65720266 -3.00946401  0.        ]
 [-4.65720266 -3.00946401  0.        ]]
```

Or, using xgboost:

```python
# (reuses the imports from the previous snippet)
import xgboost

df = pd.DataFrame({'target': [23, 42, 58, 29, 28],
                   'feature_1': [38, 83, 38, 28, 57],
                   'feature_2': ['A', 'B', 'A', 'C', 'A']})

df["feature_1"] = df["feature_1"].astype(int)
df["target"] = df["target"].astype(int)

encoder = preprocessing.LabelEncoder()
df["feature_2"] = encoder.fit_transform(df["feature_2"])

SEED = 42
model = xgboost.XGBRegressor(enable_categorical=True, tree_method='hist')
#model = GradientBoostingRegressor(n_estimators=100, max_depth=2, random_state=SEED)

scale = StandardScaler()

columns = ["feature_1", "feature_2"]
n_features = len(columns)
# Note: after label encoding and scaling, X is a plain numeric array,
# so enable_categorical has no effect here
X = np.array(scale.fit_transform(df[columns])).reshape(-1, n_features)
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42)
model.fit(X_train, y_train)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
print(shap_values)

y_pred = model.predict(X_test)

x = np.arange(len(X_test))
plt.bar(x, y_test)
plt.bar(x, y_pred, color='green')
plt.show()
```

Answer 2 (Score: 1)

Unfortunately, generating shap values with xgboost using categorical variables is an open issue. See, e.g., https://github.com/slundberg/shap/issues/2662

Given your specific example, I made it run by using a DMatrix as the input to shap (the DMatrix is the basic data type input of xgboost models; see the Learning API. The sklearn API, which you are using, doesn't need a DMatrix, at least for training):

```python
import pandas as pd
import xgboost as xgb
import shap

# Test data
test_data = pd.DataFrame({'target': [23, 42, 58, 29, 28],
                          'feature_1': [38, 83, 38, 28, 57],
                          'feature_2': ['A', 'B', 'A', 'C', 'A']})
test_data['feature_2'] = test_data['feature_2'].astype('category')
print(test_data.info())

# Fit xgboost
model = xgb.XGBRegressor(enable_categorical=True,
                         tree_method='hist')
model.fit(test_data.drop('target', axis=1), test_data['target'])

# Explain with Shap
test_data_dm = xgb.DMatrix(data=test_data.drop('target', axis=1),
                           label=test_data['target'],
                           enable_categorical=True)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test_data_dm)
print(shap_values)
```
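
For reference, a minimal sketch of the same workflow through the Learning API mentioned above, where the one DMatrix is reused for both training and explaining (assumes a recent xgboost with categorical support for `hist`; `num_boost_round` is illustrative, and the same instability caveat below applies):

```python
import pandas as pd
import xgboost as xgb
import shap

test_data = pd.DataFrame({'target': [23, 42, 58, 29, 28],
                          'feature_1': [38, 83, 38, 28, 57],
                          'feature_2': ['A', 'B', 'A', 'C', 'A']})
test_data['feature_2'] = test_data['feature_2'].astype('category')

# Build the DMatrix once and reuse it for training and explaining
dtrain = xgb.DMatrix(data=test_data.drop('target', axis=1),
                     label=test_data['target'],
                     enable_categorical=True)

# Train through the Learning API instead of the sklearn wrapper
booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=100)

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(dtrain)
print(shap_values)
```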


But the ability to generate shap values when there are categorical variables is very unstable: for example, if you add other parameters to the xgboost model you get the error "Check failed: !HasCategoricalSplit()", which is the error referenced in my first link:

```python
import pandas as pd
import xgboost as xgb
import shap

# Test data
test_data = pd.DataFrame({'target': [23, 42, 58, 29, 28],
                          'feature_1': [38, 83, 38, 28, 57],
                          'feature_2': ['A', 'B', 'A', 'C', 'A']})
test_data['feature_2'] = test_data['feature_2'].astype('category')
print(test_data.info())

# Fit xgboost -- adding colsample_bylevel triggers the failure
model = xgb.XGBRegressor(colsample_bylevel=0.7,
                         enable_categorical=True,
                         tree_method='hist')
model.fit(test_data.drop('target', axis=1), test_data['target'])

# Explain with Shap
test_data_dm = xgb.DMatrix(data=test_data.drop('target', axis=1),
                           label=test_data['target'],
                           enable_categorical=True)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test_data_dm)  # raises: Check failed: !HasCategoricalSplit()
```


I've searched for a solution for months but, to conclude, as far as I understand, it is not really possible yet to generate shap values with xgboost and categorical variables (I hope someone can contradict me with a reproducible example). I suggest you try CatBoost.

########################## EDIT ############################

An example with CatBoost:

```python
import pandas as pd
import catboost as cb
import shap

# Test data
test_data = pd.DataFrame({'target': [23, 42, 58, 29, 28],
                          'feature_1': [38, 83, 38, 28, 57],
                          'feature_2': ['A', 'B', 'A', 'C', 'A']})
test_data['feature_2'] = test_data['feature_2'].astype('category')
print(test_data.info())

# CatBoost handles the categorical column natively via cat_features
model = cb.CatBoostRegressor(iterations=100)
model.fit(test_data.drop('target', axis=1), test_data['target'],
          cat_features=['feature_2'], verbose=False)

# Explain with Shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test_data.drop('target', axis=1))
print('shap values: \n', shap_values)
```


