XGBoost4J-Spark fails on training xgboost 0.90


#1

I’m trying to get XGBoost running on a PySpark server. The setup I have is:

  • Spark v 2.3.1
  • Scala v 2.11.8
  • Java 64-bit server vm v 1.8.0_121
  • Python 2.7

I am using the XGBoost Jar Files from here.

I’m trying to just train a model right now with the following code. This is done in a Jupyter Notebook which has PySpark on the back end. Normal PySpark usage seems to work fine.

import os

os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars file:///home/.../xgboost4j-spark-0.90.jar,file:///home/.../xgboost4j-0.90.jar pyspark-shell'

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.appName('XGB_trainer').getOrCreate()

spark.sparkContext.addPyFile("file:///home/.../sparkxgb_090.zip")

from sparkxgb.xgboost import XGBoostRegressor

training = spark.read.format("libsvm").load("file:///home/.../example_data.txt")

xgbr = XGBoostRegressor(objective="reg:tweedie", featuresCol="features", labelCol="label")

xgbr.fit(training)

However, when I try to run my code, I’m getting an error, which can be seen below.

I don’t think this is a PySpark specific error since normal PySpark works fine and it seems to load in the function calls for XGBoost fine as well. Does anyone have any insight into how to fix this error?

Py4JJavaErrorTraceback (most recent call last)
<ipython-input-12-58cf61278e4b> in <module>()
----> 1 xgbr.fit(training)

/opt/spark-2.3.1/python/pyspark/ml/base.py in fit(self, dataset, params)
    130                 return self.copy(params)._fit(dataset)
    131             else:
--> 132                 return self._fit(dataset)
    133         else:
    134             raise ValueError("Params must be either a param map or a list/tuple of param maps, "

/opt/spark-2.3.1/python/pyspark/ml/wrapper.py in _fit(self, dataset)
    286
    287     def _fit(self, dataset):
--> 288         java_model = self._fit_java(dataset)
    289         model = self._create_model(java_model)
    290         return self._copyValues(model)

/opt/spark-2.3.1/python/pyspark/ml/wrapper.py in _fit_java(self, dataset)
    283         """
    284         self._transfer_params_to_java()
--> 285         return self._java_obj.fit(dataset._jdf)
    286
    287     def _fit(self, dataset):

/opt/spark-2.3.1/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258
   1259         for temp_arg in temp_args:

/opt/spark-2.3.1/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/opt/spark-2.3.1/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling o91.fit.
: ml.dmlc.xgboost4j.java.XGBoostError: XGBoostModel training failed
	at ml.dmlc.xgboost4j.scala.spark.XGBoost$.ml$dmlc$xgboost4j$scala$spark$XGBoost$$postTrackerReturnProcessing(XGBoost.scala:582)
	at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$2.apply(XGBoost.scala:459)
	at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$2.apply(XGBoost.scala:435)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.immutable.List.map(List.scala:285)
	at ml.dmlc.xgboost4j.scala.spark.XGBoost$.trainDistributed(XGBoost.scala:434)
	at ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor.train(XGBoostRegressor.scala:190)
	at ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor.train(XGBoostRegressor.scala:48)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)