Hi everyone! This is my first post in this community, so apologies if this question has been asked before.
I am trying to train a model with XGBoost + Dask and I am getting an error message that I am having difficulty interpreting. The full notebook output is available here.
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
Cell In[12], line 10
2 params = {
3 'objective': 'reg:squarederror',
4 'learning_rate': 0.1,
5 'max_depth': 3,
6 'n_estimators': 10
7 }
9 # Train the XGBoost model with Dask
---> 10 model = xgb.dask.train(client, params, dtrain, num_boost_round=params['n_estimators'])
12 # Make predictions on the validation dataset
13 predictions = xgb.dask.predict(client, model, dval)
File /opt/conda/lib/python3.10/site-packages/xgboost/core.py:620, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
618 for k, arg in zip(sig.parameters, args):
619 kwargs[k] = arg
--> 620 return func(**kwargs)
File /opt/conda/lib/python3.10/site-packages/xgboost/dask.py:1057, in train(client, params, dtrain, num_boost_round, evals, obj, feval, early_stopping_rounds, xgb_model, verbose_eval, callbacks, custom_metric)
1055 client = _xgb_get_client(client)
1056 args = locals()
-> 1057 return client.sync(
1058 _train_async,
1059 global_config=config.get_config(),
1060 dconfig=_get_dask_config(),
1061 **args,
1062 )
File /opt/conda/lib/python3.10/site-packages/distributed/utils.py:349, in SyncMethodMixin.sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
347 return future
348 else:
--> 349 return sync(
350 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
351 )
File /opt/conda/lib/python3.10/site-packages/distributed/utils.py:416, in sync(loop, func, callback_timeout, *args, **kwargs)
414 if error:
415 typ, exc, tb = error
--> 416 raise exc.with_traceback(tb)
417 else:
418 return result
File /opt/conda/lib/python3.10/site-packages/distributed/utils.py:389, in sync.<locals>.f()
387 future = wait_for(future, callback_timeout)
388 future = asyncio.ensure_future(future)
--> 389 result = yield future
390 except Exception:
391 error = sys.exc_info()
File /opt/conda/lib/python3.10/site-packages/tornado/gen.py:767, in Runner.run(self)
765 try:
766 try:
--> 767 value = future.result()
768 except Exception as e:
769 # Save the exception for later. It's important that
770 # gen.throw() not be called inside this try/except block
771 # because that makes sys.exc_info behave unexpectedly.
772 exc: Optional[Exception] = e
File /opt/conda/lib/python3.10/site-packages/xgboost/dask.py:993, in _train_async(client, global_config, dconfig, params, dtrain, num_boost_round, evals, obj, feval, early_stopping_rounds, verbose_eval, xgb_model, callbacks, custom_metric)
990 evals_name = []
991 evals_id = []
--> 993 results = await map_worker_partitions(
994 client,
995 dispatched_train,
996 # extra function parameters
997 params,
998 _rabit_args,
999 id(dtrain),
1000 evals_name,
1001 evals_id,
1002 *([dtrain] + evals_data),
1003 # workers to be used for training
1004 workers=workers,
1005 )
1006 return list(filter(lambda ret: ret is not None, results))[0]
File /opt/conda/lib/python3.10/site-packages/xgboost/dask.py:529, in map_worker_partitions(client, func, workers, *refs)
525 fut = client.submit(
526 func, *args, pure=False, workers=[addr], allow_other_workers=False
527 )
528 futures.append(fut)
--> 529 results = await client.gather(futures)
530 return results
File /opt/conda/lib/python3.10/site-packages/distributed/client.py:2208, in Client._gather(self, futures, errors, direct, local_worker)
2206 exc = CancelledError(key)
2207 else:
-> 2208 raise exception.with_traceback(traceback)
2209 raise exc
2210 if errors == "skip":
File /opt/conda/lib/python3.10/site-packages/xgboost/dask.py:960, in dispatched_train()
957 eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads)
958 evals.append((eval_Xy, evals_name[i]))
--> 960 booster = worker_train(
961 params=local_param,
962 dtrain=Xy,
963 num_boost_round=num_boost_round,
964 evals_result=local_history,
965 evals=evals if len(evals) != 0 else None,
966 obj=obj,
967 feval=feval,
968 custom_metric=custom_metric,
969 early_stopping_rounds=early_stopping_rounds,
970 verbose_eval=verbose_eval,
971 xgb_model=xgb_model,
972 callbacks=callbacks,
973 )
974 if Xy.num_row() != 0:
975 ret: Optional[TrainReturnT] = {
976 "booster": booster,
977 "history": local_history,
978 }
File /opt/conda/lib/python3.10/site-packages/xgboost/core.py:620, in inner_f()
618 for k, arg in zip(sig.parameters, args):
619 kwargs[k] = arg
--> 620 return func(**kwargs)
File /opt/conda/lib/python3.10/site-packages/xgboost/training.py:185, in train()
183 if cb_container.before_iteration(bst, i, dtrain, evals):
184 break
--> 185 bst.update(dtrain, i, obj)
186 if cb_container.after_iteration(bst, i, dtrain, evals):
187 break
File /opt/conda/lib/python3.10/site-packages/xgboost/core.py:1918, in update()
1915 self._validate_dmatrix_features(dtrain)
1917 if fobj is None:
-> 1918 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1919 ctypes.c_int(iteration),
1920 dtrain.handle))
1921 else:
1922 pred = self.predict(dtrain, output_margin=True, training=True)
File /opt/conda/lib/python3.10/site-packages/xgboost/core.py:279, in _check_call()
268 """Check the return value of C API call
269
270 This function will raise exception when error occurs.
(...)
276 return value from API calls
277 """
278 if ret != 0:
--> 279 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
XGBoostError: [19:05:24] /opt/conda/conda-bld/work/src/common/quantile.cc:345: Check failed: worker_feature.data():
Stack trace:
[bt] (0) /opt/conda/lib/libxgboost.so(+0x20f254) [0x2b0f7a710254]
[bt] (1) /opt/conda/lib/libxgboost.so(+0x29cd9d) [0x2b0f7a79dd9d]
[bt] (2) /opt/conda/lib/libxgboost.so(+0x29df55) [0x2b0f7a79ef55]
[bt] (3) /opt/conda/lib/libxgboost.so(+0x29e391) [0x2b0f7a79f391]
[bt] (4) /opt/conda/lib/python3.10/site-packages/sklearn/utils/../../../../libgomp.so.1(GOMP_parallel+0x46) [0x2b0eca5ab54f]
[bt] (5) /opt/conda/lib/libxgboost.so(void xgboost::common::ParallelFor<unsigned long, xgboost::common::SketchContainerImpl<xgboost::common::WQuantileSketch<float, float> >::AllReduce(std::vector<xgboost::common::QuantileSketchTemplate<float, float, xgboost::common::WQSummary<float, float> >::SummaryContainer, std::allocator<xgboost::common::QuantileSketchTemplate<float, float, xgboost::common::WQSummary<float, float> >::SummaryContainer> >*, std::vector<int, std::allocator<int> >*)::{lambda(auto:1)#2}>(unsigned long, int, xgboost::common::Sched, xgboost::common::SketchContainerImpl<xgboost::common::WQuantileSketch<float, float> >::AllReduce(std::vector<xgboost::common::QuantileSketchTemplate<float, float, xgboost::common::WQSummary<float, float> >::SummaryContainer, std::allocator<xgboost::common::QuantileSketchTemplate<float, float, xgboost::common::WQSummary<float, float> >::SummaryContainer> >*, std::vector<int, std::allocator<int> >*)::{lambda(auto:1)#2})+0x273) [0x2b0f7a7959c3]
[bt] (6) /opt/conda/lib/libxgboost.so(xgboost::common::SketchContainerImpl<xgboost::common::WQuantileSketch<float, float> >::AllReduce(std::vector<xgboost::common::QuantileSketchTemplate<float, float, xgboost::common::WQSummary<float, float> >::SummaryContainer, std::allocator<xgboost::common::QuantileSketchTemplate<float, float, xgboost::common::WQSummary<float, float> >::SummaryContainer> >*, std::vector<int, std::allocator<int> >*)+0x8f7) [0x2b0f7a7a4077]
[bt] (7) /opt/conda/lib/libxgboost.so(xgboost::common::SketchContainerImpl<xgboost::common::WQuantileSketch<float, float> >::MakeCuts(xgboost::common::HistogramCuts*)+0xd1) [0x2b0f7a7a4581]
[bt] (8) /opt/conda/lib/libxgboost.so(xgboost::common::SketchOnDMatrix(xgboost::DMatrix*, int, int, bool, xgboost::common::Span<float, 18446744073709551615ul>)+0x1b6f) [0x2b0f7a755c5f]
Any ideas on what could cause this `Check failed: worker_feature.data()` error in `quantile.cc` during distributed training?