Note
Go to the end to download the full example code.
Getting started with categorical data
Experimental support for categorical data.
Previously, users needed to run an encoder themselves before passing the data into XGBoost, which creates a sparse matrix and potentially increases memory usage. This demo showcases the experimental categorical data support; more advanced features are planned.
Added in version 1.5.0.
See Also
from typing import Tuple

import numpy as np
import pandas as pd
import xgboost as xgb


def make_categorical(
    n_samples: int, n_features: int, n_categories: int, onehot: bool
) -> Tuple[pd.DataFrame, pd.Series]:
    """Make some random data for demo.

    Parameters
    ----------
    n_samples :
        Number of rows to generate.
    n_features :
        Number of feature columns in the returned frame.
    n_categories :
        Number of distinct category codes; values are drawn from
        ``[0, n_categories)``.
    onehot :
        When True, return the features one-hot encoded with
        ``pd.get_dummies`` instead of as categorical columns.

    Returns
    -------
    A ``(features, label)`` pair.  The label of each row is the sum of the
    integer codes of all ``n_features + 1`` generated columns, plus one.  The
    first generated column contributes to the label only and is not part of
    the returned features.
    """
    # Fixed seed: every call with the same arguments yields the same data, so
    # the categorical and the one-hot variants describe the same samples.
    rng = np.random.RandomState(1994)

    pd_dict = {}
    for i in range(n_features + 1):
        c = rng.randint(low=0, high=n_categories, size=n_samples)
        pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

    df = pd.DataFrame(pd_dict)
    # Take a copy so that the in-place additions below operate on an
    # independent Series rather than on a slice of the frame.
    label = df.iloc[:, 0].copy()
    df = df.iloc[:, 1:]
    for i in range(n_features):
        label += df.iloc[:, i]
    label += 1

    df = df.astype("category")
    # Give every column the full category set, including codes that did not
    # happen to be drawn for that column.
    categories = np.arange(n_categories)
    for col in df.columns:
        df[col] = df[col].cat.set_categories(categories)

    if onehot:
        return pd.get_dummies(df), label
    return df, label


def main() -> None:
    """Compare builtin categorical support against pre-encoded input.

    Fits one regressor on categorical columns and one on the one-hot encoded
    version of the same data, asserts that both report the same training RMSE
    history, then checks that the SHAP contributions of the categorical model
    sum to its raw margin prediction.

    Both regressors are created with ``device="cuda"``.
    """
    # Use builtin categorical data support
    # For scikit-learn interface, the input data should be pandas DataFrame or cudf
    # DataFrame with categorical features. If a numpy/cupy array is used instead, the
    # `feature_types` for `XGBRegressor` should be set accordingly.
    X, y = make_categorical(100, 10, 4, False)
    # Specify `enable_categorical` to True, also we use onehot-encoding-based split here
    # for demonstration. For details see the document of `max_cat_to_onehot`.
    reg = xgb.XGBRegressor(
        tree_method="hist", enable_categorical=True, max_cat_to_onehot=5, device="cuda"
    )
    reg.fit(X, y, eval_set=[(X, y)])

    # Pass in already encoded data
    X_enc, y_enc = make_categorical(100, 10, 4, True)
    reg_enc = xgb.XGBRegressor(tree_method="hist", device="cuda")
    reg_enc.fit(X_enc, y_enc, eval_set=[(X_enc, y_enc)])

    reg_results = np.array(reg.evals_result()["validation_0"]["rmse"])
    reg_enc_results = np.array(reg_enc.evals_result()["validation_0"]["rmse"])

    # Check that they have same results
    np.testing.assert_allclose(reg_results, reg_enc_results)

    # Convert to DMatrix for SHAP value
    booster: xgb.Booster = reg.get_booster()
    m = xgb.DMatrix(X, enable_categorical=True)  # specify categorical data support.

    SHAP = booster.predict(m, pred_contribs=True)
    margin = booster.predict(m, output_margin=True)
    # Summing the contributions along the last axis should reproduce the margin.
    np.testing.assert_allclose(np.sum(SHAP, axis=-1), margin, rtol=1e-3)


if __name__ == "__main__":
    main()