Note
Go to the end to download the full example code.
Experimental support for external memory
This is similar to the one in quantile_data_iterator.py, but for external memory instead of Quantile DMatrix. The feature is not ready for production use yet.
Added in version 1.5.0.
See the tutorial for more details.
Changed in version 3.0.0: Added ExtMemQuantileDMatrix.
To run the example, the following packages in addition to XGBoost native dependencies are required:
scikit-learn
If device is cuda, the following are also needed:
cupy
rmm
cuda-python
Not shown in this example, but you should pay attention to NUMA configuration as discussed in the tutorial.
import argparse
import os
import tempfile
from typing import Callable, List, Literal, Tuple

import numpy as np
from sklearn.datasets import make_regression

import xgboost


def device_mem_total() -> int:
    """The total number of bytes of memory this GPU has."""
    # Imported lazily: cuda-python is only needed on the CUDA path.
    import cuda.bindings.runtime as cudart

    status, free, total = cudart.cudaMemGetInfo()
    if status != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(cudart.cudaGetErrorString(status))
    return total


def make_batches(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    tmpdir: str,
) -> List[Tuple[str, str]]:
    """Generate random regression batches and save each one to disk.

    Returns a list of ``(X_path, y_path)`` pairs, one per batch.
    """
    files: List[Tuple[str, str]] = []
    # Fixed seed so the demo is reproducible across runs.
    rng = np.random.RandomState(1994)
    for i in range(n_batches):
        X, y = make_regression(n_samples_per_batch, n_features, random_state=rng)
        X_path = os.path.join(tmpdir, "X-" + str(i) + ".npy")
        y_path = os.path.join(tmpdir, "y-" + str(i) + ".npy")
        np.save(X_path, X)
        np.save(y_path, y)
        files.append((X_path, y_path))
    return files


class Iterator(xgboost.DataIter):
    """A custom iterator for loading files in batches."""

    def __init__(
        self, device: Literal["cpu", "cuda"], file_paths: List[Tuple[str, str]]
    ) -> None:
        self.device = device
        self._file_paths = file_paths
        self._it = 0
        # XGBoost will generate some cache files under the current directory with the
        # prefix "cache"
        super().__init__(cache_prefix=os.path.join(".", "cache"))

    def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
        """Load a single batch of data."""
        X_path, y_path = self._file_paths[self._it]
        # When the `ExtMemQuantileDMatrix` is used, the device must match. GPU cannot
        # consume CPU input data and vice-versa.
        if self.device == "cpu":
            X = np.load(X_path)
            y = np.load(y_path)
        else:
            # `cp` (cupy) is imported in the __main__ guard when device == "cuda".
            X = cp.load(X_path)
            y = cp.load(y_path)
        assert X.shape[0] == y.shape[0]
        return X, y

    def next(self, input_data: Callable) -> bool:
        """Advance the iterator by 1 step and pass the data to XGBoost.

        This function is called by XGBoost during the construction of ``DMatrix``

        """
        if self._it == len(self._file_paths):
            # return False to let XGBoost know this is the end of iteration
            return False

        # input_data is a keyword-only function passed in by XGBoost and has the similar
        # signature to the ``DMatrix`` constructor.
        X, y = self.load_file()
        input_data(data=X, label=y)
        self._it += 1
        return True

    def reset(self) -> None:
        """Reset the iterator to its beginning"""
        self._it = 0


def hist_train(it: Iterator) -> None:
    """The hist tree method can use a special data structure `ExtMemQuantileDMatrix`
    for faster initialization and lower memory usage (recommended).

    .. versionadded:: 3.0.0

    """
    # For non-data arguments, specify it here once instead of passing them by the `next`
    # method.
    Xy = xgboost.ExtMemQuantileDMatrix(it, missing=np.nan, enable_categorical=False)
    booster = xgboost.train(
        {"tree_method": "hist", "max_depth": 4, "device": it.device},
        Xy,
        evals=[(Xy, "Train")],
        num_boost_round=10,
    )
    booster.predict(Xy)


def approx_train(it: Iterator) -> None:
    """The approx tree method uses the basic `DMatrix` (not recommended)."""
    # For non-data arguments, specify it here once instead of passing them by the `next`
    # method.
    Xy = xgboost.DMatrix(it, missing=np.nan, enable_categorical=False)
    # ``approx`` is also supported, but less efficient due to sketching. It's
    # recommended to use `hist` instead.
    booster = xgboost.train(
        {"tree_method": "approx", "max_depth": 4, "device": it.device},
        Xy,
        evals=[(Xy, "Train")],
        num_boost_round=10,
    )
    booster.predict(Xy)


def main(tmpdir: str, args: argparse.Namespace) -> None:
    """Entry point for training."""
    # generate some random data for demo
    files = make_batches(
        n_samples_per_batch=1024, n_features=17, n_batches=31, tmpdir=tmpdir
    )
    it = Iterator(args.device, files)

    hist_train(it)
    approx_train(it)


def setup_rmm() -> None:
    """Setup RMM for GPU-based external memory training.

    It's important to use RMM with `CudaAsyncMemoryResource` or `ArenaMemoryResource`
    for GPU-based external memory to improve performance.

    If XGBoost is not built with RMM support, a warning is raised when constructing
    the `DMatrix`.

    """
    import rmm
    from rmm.allocators.cupy import rmm_cupy_allocator
    from rmm.mr import ArenaMemoryResource

    if not xgboost.build_info()["USE_RMM"]:
        return

    total = device_mem_total()
    mr = rmm.mr.CudaMemoryResource()
    mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
    rmm.mr.set_current_device_resource(mr)
    # Set the allocator for cupy as well.
    cp.cuda.set_allocator(rmm_cupy_allocator)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", choices=["cpu", "cuda"], default="cpu")
    args = parser.parse_args()

    if args.device == "cuda":
        import cupy as cp

        setup_rmm()
        # Make sure XGBoost is using RMM for all allocations.
        with xgboost.config_context(use_rmm=True):
            with tempfile.TemporaryDirectory() as tmpdir:
                main(tmpdir, args)
    else:
        with tempfile.TemporaryDirectory() as tmpdir:
            main(tmpdir, args)