I think I might have found a bug, but I wanted to check here to see whether I'm just doing something stupid before creating an issue. Basically, I'm trying to use early stopping inside RandomizedSearchCV, where my model is a scikit-learn Pipeline. My pipeline handles data type conversion so that xgboost only sees ints and floats. Everything works fine with a plain fit, but when I add early stopping I get type errors. Here is a minimal example that showcases the issue:
from typing import *

import numpy as np
import pandas as pd
import xgboost.sklearn as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
class ItemSelectorTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column from a DataFrame-like input.

    Parameters
    ----------
    key : hashable
        Label used to index the input in ``transform`` (``data_dict[key]``).
    reshape : bool, default=False
        When true, the selected values are returned with an extra trailing
        axis (``values[:, None]``); otherwise they are returned unchanged.
    """

    def __init__(self, key, reshape=False):
        self.key = key
        self.reshape = reshape

    def fit(self, x, y=None):
        """Return ``self`` unchanged; nothing is learned from ``x`` or ``y``."""
        return self

    def transform(self, data_dict: "pd.DataFrame"):
        """Return the values stored under ``self.key`` in ``data_dict``.

        ``data_dict[self.key]`` must expose a ``.values`` attribute (as a
        pandas column does); a plain dict of lists would fail here.
        """
        # TODO MANAGE PANDAS TYPES HERE!!!
        selected = data_dict[self.key].values
        # Indexing with ``[:, None]`` appends a new axis to the selected values.
        return selected[:, None] if self.reshape else selected
# Number of cross-validation splits.
cv = 2

# Toy data: a single string-valued feature column and a numeric target.
trainX = pd.DataFrame([["a"], ["b"], ["c"]], columns=["feature"])
trainY = np.array([1, 2, 3])

# these are the evaluation sets
testX = trainX
testY = trainY

paramGrid = {"classifier__subsample": [0.5, 0.8]}

# Early-stopping arguments addressed to the "classifier" step.
# NOTE: eval_set holds testX as-is, i.e. the raw DataFrame with its string
# column rather than the output of the "selector"/"encoder" steps; this is
# presumably what triggers the ValueError reported below.
fit_params = {
    "classifier__early_stopping_rounds": 42,
    "classifier__eval_metric": "mae",
    "classifier__eval_set": [[testX, testY]],
}

clf = xgb.XGBRegressor()
model = Pipeline(
    [
        ("selector", ItemSelectorTransformer(key="feature", reshape=True)),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("classifier", clf),
    ]
)

# Pass the splitter object itself. ``get_n_splits()`` only returns the number
# of splits as an int, which would make the search use its default k-fold
# strategy instead of a time-series split.
gridsearch = RandomizedSearchCV(
    model,
    paramGrid,
    verbose=1,
    cv=TimeSeriesSplit(n_splits=cv),
)

# This works fine
gridsearch.fit(trainX, trainY)

# This throws a ValueError: DataFrame.dtypes for data must be int, float or bool
gridsearch.fit(trainX, trainY, **fit_params)
You can make this work by performing the transformations up front, but then you have to pre-fit things like the one-hot encoder. This introduces a little leakage into the cross-validation steps, which would be nice to avoid:
import xgboost.sklearn as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from typing import *
class ItemSelectorTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column from a DataFrame-like input.

    Parameters
    ----------
    key : hashable
        Label used to index the input in ``transform`` (``data_dict[key]``).
    reshape : bool, default=False
        When true, the selected values are returned with an extra trailing
        axis (``values[:, None]``); otherwise they are returned unchanged.
    """

    def __init__(self, key, reshape=False):
        self.key = key
        self.reshape = reshape

    def fit(self, x, y=None):
        """Return ``self`` unchanged; nothing is learned from ``x`` or ``y``."""
        return self

    def transform(self, data_dict: "pd.DataFrame"):
        """Return the values stored under ``self.key`` in ``data_dict``.

        ``data_dict[self.key]`` must expose a ``.values`` attribute (as a
        pandas column does); a plain dict of lists would fail here.
        """
        # TODO MANAGE PANDAS TYPES HERE!!!
        selected = data_dict[self.key].values
        # Indexing with ``[:, None]`` appends a new axis to the selected values.
        return selected[:, None] if self.reshape else selected
# Number of cross-validation splits.
cv = 2

# Toy data: a single string-valued feature column and a numeric target.
trainX = pd.DataFrame([["a"], ["b"], ["c"]], columns=["feature"])
trainY = np.array([1, 2, 3])

# these are the evaluation sets
testX = trainX
testY = trainY

paramGrid = {"classifier__subsample": [0.5, 0.8]}

clf = xgb.XGBRegressor()
model = Pipeline(
    [
        ("selector", ItemSelectorTransformer(key="feature", reshape=True)),
        # The encoder is fitted up front so it can be used below to encode the
        # evaluation set before the search runs.
        ("encoder", OneHotEncoder(handle_unknown="ignore").fit([["a"], ["b"], ["c"]])),
        ("classifier", clf),
    ]
)

# Run the evaluation features through the selector and the pre-fitted encoder
# by hand, so eval_set no longer contains the raw string column.
encoded_testX = model.named_steps["encoder"].transform(
    model.named_steps["selector"].transform(testX)
)
fit_params = {
    "classifier__early_stopping_rounds": 42,
    "classifier__eval_metric": "mae",
    "classifier__eval_set": [[encoded_testX, testY]],
}

# Pass the splitter object itself. ``get_n_splits()`` only returns the number
# of splits as an int, which would make the search use its default k-fold
# strategy instead of a time-series split.
gridsearch = RandomizedSearchCV(
    model,
    paramGrid,
    verbose=1,
    cv=TimeSeriesSplit(n_splits=cv),
)

# This works fine
gridsearch.fit(trainX, trainY)
print("done")

# With eval_set already encoded, this call is meant to avoid the
# "DataFrame.dtypes for data must be int, float or bool" ValueError.
gridsearch.fit(trainX, trainY, **fit_params)