How to use Early Stopping in an sklearn pipeline when the pipeline handles type conversion

nbertagnolli · June 25, 2020, 7:45pm

I think I might have found a bug, but I wanted to check here to see if I’m just doing something stupid before creating an issue. Basically, I’m trying to do early stopping inside of RandomizedSearchCV where my model is a scikit-learn pipeline. My pipeline handles data type conversion so that XGBoost only sees ints and floats. The system works fine when doing simple fitting, but when I add early stopping I get type errors. Here is a minimal example that showcases the issue:

from typing import *
import xgboost.sklearn as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin


class ItemSelectorTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that picks one keyed entry out of its input.

    ``transform`` looks up ``data_dict[self.key]`` and returns that entry's
    ``.values``; nothing is learned in ``fit``.

    Parameters
    ----------
    key
        Key used to index the object passed to ``transform``.
    reshape
        When true, the selected values are indexed with ``[:, None]`` before
        being returned (adds a trailing axis, assuming ``.values`` is a NumPy
        array -- TODO confirm).

    NOTE(review): ``transform`` is annotated as taking a ``Dict``, but it reads
    ``.values`` as an attribute, so the input is presumably a pandas
    DataFrame -- confirm against callers.
    """

    def __init__(self, key, reshape=False):
        self.reshape = reshape
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, data_dict: Dict[Any, List[Any]]):
        """Return the values stored under ``self.key``, optionally reshaped."""
        selected = data_dict[self.key].values
        # TODO MANAGE PANDAS TYPES HERE!!!
        return selected[:, None] if self.reshape else selected

# pandas and numpy are used below but were missing from the snippet's imports.
import numpy as np
import pandas as pd

# Number of splits for the time-series cross-validator.
cv = 2

trainX = pd.DataFrame([["a"], ["b"], ["c"]], columns=["feature"])
trainY = np.array([1, 2, 3])

# these are the evaluation sets
testX = trainX
testY = trainY

paramGrid = {"classifier__subsample": [0.5, 0.8]}

# Extra keyword arguments for fit(), addressed to the "classifier" step.
# Note that the eval set here is the raw, untransformed DataFrame.
fit_params = {
    "classifier__early_stopping_rounds": 42,
    "classifier__eval_metric": "mae",
    "classifier__eval_set": [[testX, testY]],
}

clf = xgb.XGBRegressor()
model = Pipeline(
    [
        ("selector", ItemSelectorTransformer(key="feature", reshape=True)),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("classifier", clf),
    ]
)

# Pass the splitter object itself. get_n_splits() only returns the integer
# number of splits, which would make the search use its default integer-cv
# strategy instead of time-ordered splits.
gridsearch = RandomizedSearchCV(
    model,
    paramGrid,
    verbose=1,
    cv=TimeSeriesSplit(n_splits=cv),
)

# This works fine
gridsearch.fit(trainX, trainY)

# This throws a ValueError: DataFrame.dtypes for data must be int, float or bool
gridsearch.fit(trainX, trainY, **fit_params)

You can make this work by performing the transformations up front, but then you have to pre-fit things like the one-hot encoder. This introduces a little leakage across the cross-validation folds, which would be nice to avoid.

import xgboost.sklearn as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from typing import *


class ItemSelectorTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that picks one keyed entry out of its input.

    ``transform`` looks up ``data_dict[self.key]`` and returns that entry's
    ``.values``; nothing is learned in ``fit``.

    Parameters
    ----------
    key
        Key used to index the object passed to ``transform``.
    reshape
        When true, the selected values are indexed with ``[:, None]`` before
        being returned (adds a trailing axis, assuming ``.values`` is a NumPy
        array -- TODO confirm).

    NOTE(review): ``transform`` is annotated as taking a ``Dict``, but it reads
    ``.values`` as an attribute, so the input is presumably a pandas
    DataFrame -- confirm against callers.
    """

    def __init__(self, key, reshape=False):
        self.reshape = reshape
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, data_dict: Dict[Any, List[Any]]):
        """Return the values stored under ``self.key``, optionally reshaped."""
        selected = data_dict[self.key].values
        # TODO MANAGE PANDAS TYPES HERE!!!
        return selected[:, None] if self.reshape else selected

# pandas and numpy are used below but were missing from the snippet's imports.
import numpy as np
import pandas as pd

# Number of splits for the time-series cross-validator.
cv = 2

trainX = pd.DataFrame([["a"], ["b"], ["c"]], columns=["feature"])
trainY = np.array([1, 2, 3])

# these are the evaluation sets
testX = trainX
testY = trainY

paramGrid = {"classifier__subsample": [0.5, 0.8]}

clf = xgb.XGBRegressor()
model = Pipeline(
    [
        ("selector", ItemSelectorTransformer(key="feature", reshape=True)),
        # Fitted up front so the encoder can transform the eval set below.
        ("encoder", OneHotEncoder(handle_unknown="ignore").fit([["a"], ["b"], ["c"]])),
        ("classifier", clf),
    ]
)

# Run the eval features through the selector and the pre-fitted encoder by
# hand, so the "classifier" step receives encoded data rather than the raw
# string column.
encoded_testX = model.named_steps["encoder"].transform(
    model.named_steps["selector"].transform(testX)
)

fit_params = {
    "classifier__early_stopping_rounds": 42,
    "classifier__eval_metric": "mae",
    "classifier__eval_set": [[encoded_testX, testY]],
}

# Pass the splitter object itself. get_n_splits() only returns the integer
# number of splits, which would make the search use its default integer-cv
# strategy instead of time-ordered splits.
gridsearch = RandomizedSearchCV(
    model,
    paramGrid,
    verbose=1,
    cv=TimeSeriesSplit(n_splits=cv),
)

# This works fine
gridsearch.fit(trainX, trainY)
print("done")

# Early stopping with the pre-transformed eval set. This is the workaround
# variant, so the "DataFrame.dtypes for data must be int, float or bool"
# ValueError is not expected here -- TODO confirm on your xgboost version.
gridsearch.fit(trainX, trainY, **fit_params)

john-hawkins · September 2, 2021, 12:13am

Hi @nbertagnolli

I am facing this same issue.

Did you find a way around it?