Theoretically, a tree-based model should be invariant to simple monotonic transformations of X, such as "a * X - b" (with a > 0). However, I ran some simple tests and was surprised to find that different versions of xgboost exhibit different odd behavior: every simple transformation I tried leads to unequal predictions. Here is the R code:
Experiment 1 — xgboost 0.71.2: train on X, then train on the shifted data X - 8, and compare test predictions.
library(xgboost)

## Simulate a noiseless linear signal: N rows, p uniform features in [0, 1].
set.seed(111)
N <- 80000
p <- 50
X <- matrix(runif(N * p, 0, 1), ncol = p)
colnames(X) <- paste0("x", seq_len(p))
beta <- runif(p)
y <- X %*% beta # + rnorm(N, mean = 0, sd = 0.1)
tr <- sample.int(N, N * 0.75)

## Shared booster settings for both fits (exact greedy split finding,
## no row/column subsampling, so results should be deterministic).
params <- list(
  nrounds = 10, num_parallel_tree = 1, nthread = 10L, eta = 0.3,
  max_depth = 30, seed = 2018, colsample_bytree = 1, subsample = 1,
  min_child_weight = 10, tree_method = "exact"
)

## Fit 1: original features.
set.seed(2019)
bst1 <- do.call(xgboost::xgboost,
                c(params, list(data = X[tr, ], label = y[tr])))
test_pred1 <- predict(bst1, newdata = X[-tr, ])

## Fit 2: identical settings, features shifted by a constant.
## A tree ensemble should be invariant to this shift.
newX <- X - 8
set.seed(2019)
bst2 <- do.call(xgboost::xgboost,
                c(params, list(data = newX[tr, ], label = y[tr])))
test_pred2 <- predict(bst2, newdata = newX[-tr, ])

## Predictions differ substantially even though only the shift changed.
summary(test_pred1 - test_pred2)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#-1.784631 -0.316670 -0.001692 0.002795 0.321040 1.831196
Output of R `sessionInfo()`:
> sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.5 LTS
Matrix products: default
BLAS: /usr/lib/openblas-base/libblas.so.3
LAPACK: /usr/lib/libopenblasp-r0.2.18.so
locale:
[1] LC_CTYPE=C LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8
[8] LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] xgboost_0.71.2
loaded via a namespace (and not attached):
[1] compiler_3.5.2 magrittr_1.5 Matrix_1.2-15 tools_3.5.2 stringi_1.2.4 grid_3.5.2 data.table_1.11.8 lattice_0.20-38
Experiment 2 — xgboost compiled from master (0.81.0.1): same comparison, but with the transformation X - 1 (the same behavior occurs with X / 10).
library(xgboost)

## Simulate a noiseless linear signal: N rows, p uniform features in [0, 1].
set.seed(111)
N <- 80000
p <- 50
X <- matrix(runif(N * p, 0, 1), ncol = p)
colnames(X) <- paste0("x", seq_len(p))
beta <- runif(p)
y <- X %*% beta # + rnorm(N, mean = 0, sd = 0.1)
tr <- sample.int(N, N * 0.75)

## Shared booster settings for both fits (exact greedy split finding,
## no row/column subsampling, so results should be deterministic).
params <- list(
  nrounds = 10, num_parallel_tree = 1, nthread = 10L, eta = 0.3,
  max_depth = 30, seed = 2018, colsample_bytree = 1, subsample = 1,
  min_child_weight = 10, tree_method = "exact"
)

## Fit 1: original features.
set.seed(2019)
bst1 <- do.call(xgboost::xgboost,
                c(params, list(data = X[tr, ], label = y[tr])))
test_pred1 <- predict(bst1, newdata = X[-tr, ])

## Fit 2: identical settings, features shifted by a constant.
## A tree ensemble should be invariant to this shift.
newX <- X - 1
set.seed(2019)
bst2 <- do.call(xgboost::xgboost,
                c(params, list(data = newX[tr, ], label = y[tr])))
test_pred2 <- predict(bst2, newdata = newX[-tr, ])

## Predictions differ substantially even though only the shift changed.
summary(test_pred1 - test_pred2)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#-0.714748 -0.109097 -0.003238 -0.002930 0.105858 0.726057