Note
Go to the end to download the full example code.
Train XGBoost with cat_in_the_dat dataset
A simple demo of categorical data support, using the dataset from the Kaggle categorical data tutorial.
The excellent tutorial is at: https://www.kaggle.com/shahules/an-overview-of-encoding-techniques
And the data can be found at: https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data
Added in version 1.6.0.
See Also
from __future__ import annotations

import os
from tempfile import TemporaryDirectory
from time import time

import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import xgboost as xgb

# Seed shared by both models so that they are trained and scored on the very
# same train/test split; otherwise the two AUC values are not comparable.
SPLIT_SEED = 1994


def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
    """Load the cat-in-the-dat training data and prepare its feature columns.

    Assumes you have already downloaded the data into the ``input`` directory.

    Returns
    -------
    X :
        Feature frame (every column except ``target``). The ``bin_0``-``bin_4``,
        ``nom_0``-``nom_4`` and ``ord_0``-``ord_5`` columns are converted to the
        pandas ``category`` dtype; ``nom_5``-``nom_9`` are parsed as base-16
        integers.
    y :
        The ``target`` column.
    """
    df_train = pd.read_csv("./input/cat-in-the-dat/train.csv")
    print(
        f"train data set has got {df_train.shape[0]} rows "
        f"and {df_train.shape[1]} columns"
    )
    X = df_train.drop(["target"], axis=1)
    y = df_train["target"]

    for i in range(0, 5):
        X[f"bin_{i}"] = X[f"bin_{i}"].astype("category")
        X[f"nom_{i}"] = X[f"nom_{i}"].astype("category")

    for i in range(5, 10):
        # These columns hold hexadecimal strings; turn them into plain integers.
        X[f"nom_{i}"] = X[f"nom_{i}"].apply(int, base=16)

    for i in range(0, 6):
        X[f"ord_{i}"] = X[f"ord_{i}"].astype("category")

    print(f"train data set has got {X.shape[0]} rows and {X.shape[1]} columns")
    return X, y


# Hyper-parameters shared by both classifiers.
params = {
    "tree_method": "hist",
    "device": "cuda",
    "n_estimators": 32,
    "colsample_bylevel": 0.7,
}


def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
    """Train using builtin categorical data support from XGBoost.

    Parameters
    ----------
    X :
        Features, with categorical columns stored as the ``category`` dtype.
    y :
        Binary target.
    output_dir :
        Directory in which ``categorical.json`` (the trained model) is saved.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=SPLIT_SEED, test_size=0.2
    )
    # Be aware that the encoding for X_train and X_test are the same here. In
    # practice, we should try to use an encoder like (sklearn OrdinalEncoder)
    # to obtain the categorical values.

    # Specify `enable_categorical` to True.
    clf = xgb.XGBClassifier(
        **params,
        eval_metric="auc",
        enable_categorical=True,
        max_cat_to_onehot=1,  # We use optimal partitioning exclusively
    )
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test), (X_train, y_train)])
    clf.save_model(os.path.join(output_dir, "categorical.json"))

    y_score = clf.predict_proba(X_test)[:, 1]  # proba of positive samples
    auc = roc_auc_score(y_test, y_score)
    print("AUC of using builtin categorical data support:", auc)


def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
    """Train using one-hot encoded data.

    Parameters
    ----------
    X :
        Features that have already been one-hot encoded by the caller.
    y :
        Binary target.
    output_dir :
        Directory in which ``one-hot.json`` (the trained model) is saved.
    """
    # Same seed as `categorical_model`, so both models see an identical split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=SPLIT_SEED, test_size=0.2
    )
    # Specify `enable_categorical` to False as we are using encoded data.
    clf = xgb.XGBClassifier(**params, eval_metric="auc", enable_categorical=False)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test), (X_train, y_train)],
    )
    clf.save_model(os.path.join(output_dir, "one-hot.json"))

    y_score = clf.predict_proba(X_test)[:, 1]  # proba of positive samples
    auc = roc_auc_score(y_test, y_score)
    print("AUC of using onehot encoding:", auc)


def main() -> None:
    """Train both variants on the same data and report AUC and wall time."""
    X, y = load_cat_in_the_dat()

    with TemporaryDirectory() as tmpdir:
        start = time()
        categorical_model(X, y, tmpdir)
        end = time()
        print("Duration:categorical", end - start)

        X_onehot = pd.get_dummies(X)
        start = time()
        onehot_encoding_model(X_onehot, y, tmpdir)
        end = time()
        print("Duration:onehot", end - start)


if __name__ == "__main__":
    main()