Hi, I am trying to train a model with XGBClassifier on a large dataset (tens of millions of observations with dozens to hundreds of columns), so it is crucial to use GPUs to speed up the computation. I am using cloud instances with 4 A10G Tensor Core GPUs (23 GB each). The right driver is installed and the devices are correctly recognized. The server also has 192 CPUs and 384 GB of RAM. However, I am unable to train on more data than what fits in a single GPU (so around 23 GB). I am trying two approaches. The first runs XGBoost directly with the following code:
import xgboost as xgb
import numpy as np
import os
import subprocess
import time
# Get the number of available GPUs using nvidia-smi
num_gpus = int(subprocess.getoutput("nvidia-smi --query-gpu=name --format=csv,noheader | wc -l"))
# Generate a list of GPU indices (e.g., '0,1,2,...')
gpu_indices = ",".join(map(str, range(num_gpus)))
# Set the CUDA_VISIBLE_DEVICES environment variable
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_indices
print(f'There are {num_gpus} gpus')
print(gpu_indices)
# Example: Create a large dataset
n_samples = 10_000_000 # 10 million rows
n_features = 100
X = np.random.rand(n_samples, n_features)
y = np.random.randint(0, 2, size=n_samples)
# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X, label=y)
print('Data ready')
# XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'tree_method': 'hist',  # Use GPU-accelerated histogram algorithm
    'device': 'cuda',
    'max_depth': 3,
    'eta': 0.1,
    'eval_metric': 'auc',
}
# Train the model using multiple GPUs
t0 = time.time()
model = xgb.train(params, dtrain, num_boost_round=100)
print(f"Training complete in {time.time()-t0} seconds.")
The second approach uses Dask, with the following code:
import numpy as np
import subprocess
import os
from xgboost import dask as dxgb
import xgboost as xgb
import dask
import dask.distributed
import dask.array as da
import dask.dataframe as dd
import cudf
import dask_cudf
from dask_cuda import LocalCUDACluster
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
import polars as pl
def get_cluster():
    # Create a LocalCUDACluster with one worker per GPU
    cluster = LocalCUDACluster(n_workers=4, jit_unspill=True, log_spilling=True)
    client = Client(cluster)
    return client
def generate_data(n_samples=5 * 10**5, n_features=100, n_partitions=4):
    # Generate random data directly as dask_cudf collections
    print("Generating data")
    X = dask_cudf.from_cudf(cudf.DataFrame(np.random.rand(n_samples, n_features)), npartitions=n_partitions)
    y = dask_cudf.from_cudf(cudf.DataFrame(np.random.randint(0, 2, size=(n_samples, 1))), npartitions=n_partitions)
    print("Sending data to the GPUs")
    X = X.persist()  # persist() returns a new collection, so assign it back
    y = y.persist()
    return X, y
if __name__ == "__main__":
    # Get the number of available GPUs using nvidia-smi
    num_gpus = int(subprocess.getoutput("nvidia-smi --query-gpu=name --format=csv,noheader | wc -l"))
    # Generate a list of GPU indices (e.g., '0,1,2,...')
    gpu_indices = ",".join(map(str, range(num_gpus)))
    # Set the CUDA_VISIBLE_DEVICES environment variable
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_indices
    # Create the Dask client
    client = get_cluster()
    print(client.scheduler_info())
    dask_X, dask_y = generate_data(8 * 10**6)
    print(
        f"Memory usage: X:{dask_X.memory_usage().sum().compute() / 2**30}, y:{dask_y.memory_usage().sum().compute() / 2**30}"
    )
    dtrain = xgb.dask.DaskQuantileDMatrix(client, dask_X, dask_y)
    # Set up parameters for XGBoost
    params = {
        'objective': 'binary:logistic',  # Objective function for binary classification
        'learning_rate': 0.1,            # Step size shrinkage used to prevent overfitting
        'max_depth': 6,                  # Maximum depth of a tree
        'eval_metric': 'logloss',        # Evaluation metric for validation data
        'subsample': 0.8,                # Subsample ratio of the training instances
        'colsample_bytree': 0.8,         # Subsample ratio of columns when constructing each tree
        'tree_method': 'hist',           # Use GPU-accelerated histogram algorithm
        'device': 'cuda'
    }
    num_boost_round = 100
    print('Starting the training')
    # Train the model
    output = xgb.dask.train(
        client,
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, 'train')],
    )
    # Output contains the trained booster and the evaluation history
    bst = output['booster']      # The trained model
    history = output['history']  # Training history
    print(f"Training logloss: {history['train']['logloss'][-1]}")
In both cases, my code crashes with an out-of-memory error as soon as the training data exceeds the memory of a single GPU, which I monitor with nvidia-smi (as above). With the first snippet I see that only the first device is used. With the Dask version I see the data being partially distributed across the devices, but the script still crashes as soon as I exceed the capacity of a single device. Using Dask I also tried to distribute the data more evenly with dask_X, dask_y = client.scatter([X, y], broadcast=True), but the script just hangs while scattering (see the sketch below). How can I train this model on more data than fits in a single GPU? I really don't understand why I cannot train on more than 23 GB of data when there are 96 GB of GPU memory available in total.
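Concretely, the scatter attempt replaces the DaskQuantileDMatrix construction in the Dask script above with roughly this (names as defined in that script):
# Variant I tried: broadcast-scatter the Dask collections to all workers before
# building the DMatrix; the script never gets past the scatter call.
dask_X, dask_y = client.scatter([dask_X, dask_y], broadcast=True)
dtrain = xgb.dask.DaskQuantileDMatrix(client, dask_X, dask_y)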
Thanks!