DMatrix or DeviceQuantileDMatrix when using Dask XGB

krissyfond · January 21, 2021, 5:15pm

In this YouTube video from RAPIDS, they convert a pandas DataFrame into a dask_cudf DataFrame, load it onto the GPU, and then feed it directly into the model. They never convert the data into a DMatrix, a DeviceQuantileDMatrix, etc. Why does that work?

When I try to replicate this, I get the following error:

---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-13-b76ffb08a7eb> in <module>
      6 params = {'tree_method':'gpu_hist','objective':'rank:pairwise','min_child_weight':mcw,'max_depth':3,'eta':learnRate}
      7 watchlist = [(xTrainDC, 'train')]
----> 8 regLong = dask_xgboost.train(client, params, xTrainDC, yTrainDC, num_boost_round=numRounds,evals=watchlist,verbose_eval=verbosity)

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/dask_xgboost/core.py in train(client, params, data, labels, dmatrix_kwargs, **kwargs)
    233     """
    234     return client.sync(_train, client, params, data,
--> 235                        labels, dmatrix_kwargs, **kwargs)
    236 
    237 

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    835         else:
    836             return sync(
--> 837                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    838             )
    839 

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/tornado/gen.py in run(self)
    767                     if exc_info is not None:
    768                         try:
--> 769                             yielded = self.gen.throw(*exc_info)  # type: ignore
    770                         finally:
    771                             # Break up a reference to itself

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/dask_xgboost/core.py in _train(client, params, data, labels, dmatrix_kwargs, **kwargs)
    193 
    194     # Get the results, only one will be non-None
--> 195     results = yield client._gather(futures)
    196     result = [v for v in results if v]
    197     if not params.get('dask_all_models', False):

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1853                             exc = CancelledError(key)
   1854                         else:
-> 1855                             raise exception.with_traceback(traceback)
   1856                         raise exc
   1857                     if errors == "skip":

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/dask_xgboost/core.py in train_part()
    116         logger.info("Starting Rabit, Rank %d", xgb.rabit.get_rank())
    117 
--> 118         bst = xgb.train(param, dtrain, **kwargs)
    119         result = bst
    120         if xgb.rabit.get_rank() > 0 and not param.get('dask_all_models', False):

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/xgboost/training.py in train()
    210                            evals=evals,
    211                            obj=obj, feval=feval,
--> 212                            xgb_model=xgb_model, callbacks=callbacks)
    213 
    214 

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/xgboost/training.py in _train_internal()
     29             params += [('eval_metric', eval_metric)]
     30 
---> 31     bst = Booster(params, [dtrain] + [d[0] for d in evals])
     32     nboost = 0
     33     num_parallel_tree = 1

/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/xgboost/core.py in __init__()
    934         for d in cache:
    935             if not isinstance(d, DMatrix):
--> 936                 raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
    937             self._validate_features(d)
    938 

Exception: ('invalid cache item: DataFrame', [<xgboost.core.DMatrix object at 0x7f45a83ddf10>, <dask_cudf.DataFrame | 4 tasks | 4 npartitions>])

hcho3 · January 27, 2021, 2:22pm

The video uses the dask_xgboost package, which depends on a very old version of XGBoost (0.90). With the latest XGBoost version, please use the xgboost.dask module instead. Take a look at the tutorial: https://xgboost.readthedocs.io/en/latest/tutorials/dask.html