In this YouTube video from RAPIDS, they convert a pandas DataFrame into a dask_cudf DataFrame, load it onto the GPU, and then feed it directly into the model. They never convert the data into a DMatrix, a DeviceQuantileDMatrix, or anything similar. Why does that work?
When I try to replicate this, I get the following error:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-13-b76ffb08a7eb> in <module>
6 params = {'tree_method':'gpu_hist','objective':'rank:pairwise','min_child_weight':mcw,'max_depth':3,'eta':learnRate}
7 watchlist = [(xTrainDC, 'train')]
----> 8 regLong = dask_xgboost.train(client, params, xTrainDC, yTrainDC, num_boost_round=numRounds,evals=watchlist,verbose_eval=verbosity)
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/dask_xgboost/core.py in train(client, params, data, labels, dmatrix_kwargs, **kwargs)
233 """
234 return client.sync(_train, client, params, data,
--> 235 labels, dmatrix_kwargs, **kwargs)
236
237
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
835 else:
836 return sync(
--> 837 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
838 )
839
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/tornado/gen.py in run(self)
767 if exc_info is not None:
768 try:
--> 769 yielded = self.gen.throw(*exc_info) # type: ignore
770 finally:
771 # Break up a reference to itself
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/dask_xgboost/core.py in _train(client, params, data, labels, dmatrix_kwargs, **kwargs)
193
194 # Get the results, only one will be non-None
--> 195 results = yield client._gather(futures)
196 result = [v for v in results if v]
197 if not params.get('dask_all_models', False):
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1853 exc = CancelledError(key)
1854 else:
-> 1855 raise exception.with_traceback(traceback)
1856 raise exc
1857 if errors == "skip":
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/dask_xgboost/core.py in train_part()
116 logger.info("Starting Rabit, Rank %d", xgb.rabit.get_rank())
117
--> 118 bst = xgb.train(param, dtrain, **kwargs)
119 result = bst
120 if xgb.rabit.get_rank() > 0 and not param.get('dask_all_models', False):
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/xgboost/training.py in train()
210 evals=evals,
211 obj=obj, feval=feval,
--> 212 xgb_model=xgb_model, callbacks=callbacks)
213
214
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/xgboost/training.py in _train_internal()
29 params += [('eval_metric', eval_metric)]
30
---> 31 bst = Booster(params, [dtrain] + [d[0] for d in evals])
32 nboost = 0
33 num_parallel_tree = 1
/usr/local/share/anaconda3/envs/rapidsai/lib/python3.7/site-packages/xgboost/core.py in __init__()
934 for d in cache:
935 if not isinstance(d, DMatrix):
--> 936 raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
937 self._validate_features(d)
938
Exception: ('invalid cache item: DataFrame', [<xgboost.core.DMatrix object at 0x7f45a83ddf10>, <dask_cudf.DataFrame | 4 tasks | 4 npartitions>])