XGBoost single node Multi GPU DaskDeviceQuantileDMatrix Freezes

import argparse

import cupy as cp
import dask_cudf
from dask import array as da
from dask import dataframe as dd
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import xgboost as xgb
from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix


def using_dask_matrix(client: Client, X, y):
    """Train with a plain ``DaskDMatrix`` and return the predictions.

    ``DaskDMatrix`` acts like a normal ``DMatrix``: it is a proxy for the
    local ``DMatrix`` objects scattered across the workers.
    """
    dtrain = DaskDMatrix(client, X, y)

    # `xgboost.dask.train` (the distributed counterpart of `xgboost.train`)
    # returns a dictionary containing the resulting booster and the
    # evaluation history collected from the evaluation metrics.
    params = {
        'verbosity': 2,
        # Golden line for GPU training
        'tree_method': 'gpu_hist',
    }
    result = xgb.dask.train(client, params, dtrain,
                            num_boost_round=4, evals=[(dtrain, 'train')])

    # Note: the whole `result` dict could be passed into `predict` as well.
    prediction = xgb.dask.predict(client, result['booster'], dtrain)
    print('Evaluation history:', result['history'])
    return prediction


def using_quantile_device_dmatrix(client: Client, X, y):
    '''Train with ``DaskDeviceQuantileDMatrix`` and return the predictions.

    `DaskDeviceQuantileDMatrix` is a data type specialized for the `gpu_hist`
    tree method that reduces memory overhead.  When training on a GPU
    pipeline, it's preferred over `DaskDMatrix`.

    .. versionadded:: 1.2.0

    '''
    # Input must be on GPU for `DaskDeviceQuantileDMatrix`.
    # NOTE(review): `y` built from a 1-D dask array is a Series, not a
    # DataFrame — confirm `from_dask_dataframe` accepts it on the installed
    # dask_cudf version.
    X = dask_cudf.from_dask_dataframe(X)
    y = dask_cudf.from_dask_dataframe(y)

    # `DaskDeviceQuantileDMatrix` is used instead of `DaskDMatrix`; be careful
    # that it cannot be used for anything other than training.
    dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
    output = xgb.dask.train(client,
                            {'verbosity': 2,
                             'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=4)

    # The dict returned by `train` can be passed directly into `predict`.
    prediction = xgb.dask.predict(client, output, X)
    return prediction


# Toggle: 1 selects `DaskDeviceQuantileDMatrix`, anything else `DaskDMatrix`.
ddqdm = 1
# `LocalCUDACluster` is used for assigning GPU to XGBoost processes.  Here
# `n_workers` represents the number of GPUs since we use one GPU per worker
# process.
with LocalCUDACluster() as cluster:
    with Client(cluster) as client:
        # Generate some random data for demonstration.
        m = 100000
        n = 100
        X = dd.from_dask_array(da.random.random(size=(m, n), chunks=100))
        y = dd.from_dask_array(da.random.random(size=(m, ), chunks=100))

        if ddqdm == 1:
            print('Using DaskDeviceQuantileDMatrix')
            from_ddqdm = using_quantile_device_dmatrix(client, X, y)
            print(from_ddqdm)
        else:
            print('Using DMatrix')
            from_dmatrix = using_dask_matrix(client, X, y)
            # Fix: print the result here too; previously this branch computed
            # the prediction and silently discarded it, unlike the branch above.
            print(from_dmatrix)

This code gets stuck indefinitely until the distributed library kills the process for not communicating.
Library version:
xgboost == 1.5.0
dask == 2021.11.1
dask_cudf == 21.10.01

It runs fine if I use DaskDMatrix though. Any clue to this will really help a lot.

This is the official example. I have just modified it to use dask_cudf dataframe instead of cupy arrays as in the example.

This works fine after all. The issue was on my end when I replicated it. There is no problem with the script, which runs correctly.