What's relations among cpus, cores, executors, nThread and num_workers


#1

I am struggled in Xgboost job which halted or timeouted for messy configuration: spark.task.cpus, executor-cores, executor-num and xgboost parameters: nThread and num_workers. When I config executor-num equal to num_workers, my program will fail for timeout. I have to set executor-num much more than num_workers in a big dataset training. Could you help me clarify their relations or give some advice for these configuration? Thanks in advance.


#2

19/07/12 09:55:09 ERROR YarnSchedulerBackend$YarnSchedulerEndpoint: Error requesting driver to remove executor 633 after disconnection.
org.apache.spark.rpc.RpcEnvStoppedException: RpcEnv already stopped.
at org.apache.spark.rpc.netty.Dispatcher.postMessage(Dispatcher.scala:155)
at org.apache.spark.rpc.netty.Dispatcher.postLocalMessage(Dispatcher.scala:132)
at org.apache.spark.rpc.netty.NettyRpcEnv.ask(NettyRpcEnv.scala:228)
at org.apache.spark.rpc.netty.NettyRpcEndpointRef.ask(NettyRpcEnv.scala:515)
at org.apache.spark.rpc.RpcEndpointRef.ask(RpcEndpointRef.scala:63)
at org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint$$anonfun$org$apache$spark$scheduler$cluster$YarnSchedulerBackend$$handleExecutorDisconnectedFromDriver$2.apply(YarnSchedulerBackend.scala:253)
at org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint$$anonfun$org$apache$spark$scheduler$cluster$YarnSchedulerBackend$$handleExecutorDisconnectedFromDriver$2.apply(YarnSchedulerBackend.scala:252)
at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:253)
at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:251)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:326)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:326)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:326)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:326)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.processBatch$1(BatchingExecutor.scala:63)
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply$mcV$sp(BatchingExecutor.scala:78)
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply(BatchingExecutor.scala:55)
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply(BatchingExecutor.scala:55)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
at scala.concurrent.BatchingExecutor$Batch.run(BatchingExecutor.scala:54)
at scala.concurrent.Future$InternalCallbackExecutor$.unbatchedExecute(Future.scala:601)
at scala.concurrent.BatchingExecutor$class.execute(BatchingExecutor.scala:106)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:599)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.trySuccess(Promise.scala:94)
at scala.concurrent.impl.Promise$DefaultPromise.trySuccess(Promise.scala:153)
at org.apache.spark.rpc.netty.NettyRpcEnv.org$apache$spark$rpc$netty$NettyRpcEnv$$onSuccess$1(NettyRpcEnv.scala:216)
at org.apache.spark.rpc.netty.NettyRpcEnv$$anonfun$ask$2.apply(NettyRpcEnv.scala:225)
at org.apache.spark.rpc.netty.NettyRpcEnv$$anonfun$ask$2.apply(NettyRpcEnv.scala:224)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Promise$class.success(Promise.scala:86)
at scala.concurrent.impl.Promise$DefaultPromise.success(Promise.scala:153)
at org.apache.spark.rpc.netty.LocalNettyRpcCallContext.send(NettyRpcCallContext.scala:50)
at org.apache.spark.rpc.netty.NettyRpcCallContext.reply(NettyRpcCallContext.scala:32)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1$$anonfun$apply$6$$anonfun$apply$7.apply(YarnAllocator.scala:639)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1$$anonfun$apply$6$$anonfun$apply$7.apply(YarnAllocator.scala:639)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1$$anonfun$apply$6.apply(YarnAllocator.scala:639)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1$$anonfun$apply$6.apply(YarnAllocator.scala:634)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1.apply(YarnAllocator.scala:634)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1.apply(YarnAllocator.scala:563)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at org.apache.spark.deploy.yarn.YarnAllocator.processCompletedContainers(YarnAllocator.scala:563)
at org.apache.spark.deploy.yarn.YarnAllocator.allocateResources(YarnAllocator.scala:286)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$1.run(ApplicationMaster.scala:458)
19/07/12 09:55:09 ERROR YarnSchedulerBackend$YarnSchedulerEndpoint: Error requesting driver to remove executor 4632 after disconnection.
org.apache.spark.rpc.RpcEnvStoppedException: RpcEnv already stopped.
at org.apache.spark.rpc.netty.Dispatcher.postMessage(Dispatcher.scala:155)
at org.apache.spark.rpc.netty.Dispatcher.postLocalMessage(Dispatcher.scala:132)
at org.apache.spark.rpc.netty.NettyRpcEnv.ask(NettyRpcEnv.scala:228)
at org.apache.spark.rpc.netty.NettyRpcEndpointRef.ask(NettyRpcEnv.scala:515)
at org.apache.spark.rpc.RpcEndpointRef.ask(RpcEndpointRef.scala:63)
at org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint$$anonfun$org$apache$spark$scheduler$cluster$YarnSchedulerBackend$$handleExecutorDisconnectedFromDriver$2.apply(YarnSchedulerBackend.scala:253)
at org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint$$anonfun$org$apache$spark$scheduler$cluster$YarnSchedulerBackend$$handleExecutorDisconnectedFromDriver$2.apply(YarnSchedulerBackend.scala:252)
at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:253)
at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:251)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:326)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:326)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:326)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:326)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.processBatch$1(BatchingExecutor.scala:63)
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply$mcV$sp(BatchingExecutor.scala:78)
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply(BatchingExecutor.scala:55)
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply(BatchingExecutor.scala:55)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
at scala.concurrent.BatchingExecutor$Batch.run(BatchingExecutor.scala:54)
at scala.concurrent.Future$InternalCallbackExecutor$.unbatchedExecute(Future.scala:601)
at scala.concurrent.BatchingExecutor$class.execute(BatchingExecutor.scala:106)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:599)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.trySuccess(Promise.scala:94)
at scala.concurrent.impl.Promise$DefaultPromise.trySuccess(Promise.scala:153)
at org.apache.spark.rpc.netty.NettyRpcEnv.org$apache$spark$rpc$netty$NettyRpcEnv$$onSuccess$1(NettyRpcEnv.scala:216)
at org.apache.spark.rpc.netty.NettyRpcEnv$$anonfun$ask$2.apply(NettyRpcEnv.scala:225)
at org.apache.spark.rpc.netty.NettyRpcEnv$$anonfun$ask$2.apply(NettyRpcEnv.scala:224)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark_project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:136)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Promise$class.success(Promise.scala:86)
at scala.concurrent.impl.Promise$DefaultPromise.success(Promise.scala:153)
at org.apache.spark.rpc.netty.LocalNettyRpcCallContext.send(NettyRpcCallContext.scala:50)
at org.apache.spark.rpc.netty.NettyRpcCallContext.reply(NettyRpcCallContext.scala:32)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1$$anonfun$apply$6$$anonfun$apply$7.apply(YarnAllocator.scala:639)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1$$anonfun$apply$6$$anonfun$apply$7.apply(YarnAllocator.scala:639)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1$$anonfun$apply$6.apply(YarnAllocator.scala:639)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1$$anonfun$apply$6.apply(YarnAllocator.scala:634)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1.apply(YarnAllocator.scala:634)
at org.apache.spark.deploy.yarn.YarnAllocator$$anonfun$processCompletedContainers$1.apply(YarnAllocator.scala:563)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at org.apache.spark.deploy.yarn.YarnAllocator.processCompletedContainers(YarnAllocator.scala:563)
at org.apache.spark.deploy.yarn.YarnAllocator.allocateResources(YarnAllocator.scala:286)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$1.run(ApplicationMaster.scala:458)
19/07/12 09:55:09 ERROR ApplicationMaster: User class threw exception: ml.dmlc.xgboost4j.java.XGBoostError: XGBoostModel training failed
ml.dmlc.xgboost4j.java.XGBoostError: XGBoostModel training failed
at ml.dmlc.xgboost4j.scala.spark.XGBoost$.ml$dmlc$xgboost4j$scala$spark$XGBoost$$postTrackerReturnProcessing(XGBoost.scala:511)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$1.apply(XGBoost.scala:404)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$1.apply(XGBoost.scala:381)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.immutable.List.map(List.scala:285)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$.trainDistributed(XGBoost.scala:380)
at ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier.train(XGBoostClassifier.scala:196)
at ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier.train(XGBoostClassifier.scala:48)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
at com.didi.ecr.model.BaseTrain$.train(BaseTrain.scala:145)
at com.didi.ecr.model.BaseTrain$.main(BaseTrain.scala:99)
at com.didi.ecr.model.BaseTrain.main(BaseTrain.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:675)
19/07/12 09:55:09 INFO ApplicationMaster: Final app status: FAILED, exitCode: 15, (reason: User class threw exception: ml.dmlc.xgboost4j.java.XGBoostError: XGBoostModel training failed) @hcho3 Could you help me debug this problem? Thanks.