- Notifications
You must be signed in to change notification settings - Fork 1.3k
Description
Describe the bug
`SMOTENC` fails to `fit_resample` if input columns of both `bool` and `category` types are present.
In fact, it looks like the use of a `bool` column, or of a `categorical` column with underlying `bool` data, forces the conversion of the other `categorical` input columns into `float`, which may end in an error if the conversion is not supported.
Steps/Code to Reproduce
import pandas as pd
from imblearn.over_sampling import SMOTENC

Xx = pd.DataFrame({'c': pd.Categorical([x for x in "abbacaba"*3]),
                   'f': [.3, .5, .1, .2]*6,
                   'b': [False, False, True]*8})
y = pd.DataFrame({'out': [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0]*2})
# Xx.info(), y.info()

SMOTENC(categorical_features=[0]).fit_resample(Xx.drop(columns=['b']), y)  # OK cat + float
SMOTENC(categorical_features=[0]).fit_resample(Xx.astype({'b': float}), y)  # OK cat + float + float
SMOTENC(categorical_features=[0]).fit_resample(Xx.astype({'c': str}), y)  # OK 2str/cat + float + bool
SMOTENC(categorical_features=[0, 2]).fit_resample(Xx.astype({'b': str}), y)  # OK cat + float + 2str/cat
SMOTENC(categorical_features=[0, 2]).fit_resample(Xx.astype({'b': str}).astype({'b': 'category'}), y)  # OK cat + float + 2str2cat/cat

SMOTENC(categorical_features=[0]).fit_resample(Xx, y)  # ERROR cat + float + bool
# SMOTENC(categorical_features=[0,2]).fit_resample(Xx, y) # ERROR cat + float + bool/cat
# SMOTENC(categorical_features=[0,2]).fit_resample(Xx.astype({'b':'category'}), y) # ERROR cat + float + 2cat/cat
If column `'c'` is defined from a string of digits instead (e.g. `'c': pd.Categorical([x for x in "01102010"*3])`), no exceptions are raised, but the categorical columns in the output of the problematic calls contain only `NaN` values.
Expected Results
No error; a balanced `X`, `y` is returned.
Actual Results
Traceback of the first faulty call in the example; the other two are very similar.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File ~/miniconda3/lib/python3.10/site-packages/pandas/core/arrays/categorical.py:551, in Categorical.astype(self, dtype, copy)
    550 try:
--> 551     new_cats = new_cats.astype(dtype=dtype, copy=copy)
    552     fill_value = self.categories._na_value

ValueError: could not convert string to float: 'a'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
Cell In[50], line 23
     20 SMOTENC(categorical_features=[0,2]).fit_resample(Xx.astype({'b':str}), y) # OK cat + float + 2str/cat
     21 SMOTENC(categorical_features=[0,2]).fit_resample(Xx.astype({'b':str}).astype({'b':'category'}), y) # OK cat + float + 2str2cat/cat
---> 23 SMOTENC(categorical_features=[0]).fit_resample(Xx, y) # ERROR cat + float + bool
     24 # SMOTENC(categorical_features=[0,2]).fit_resample(Xx, y) # ERROR cat + float + bool/cat
     25 # SMOTENC(categorical_features=[0,2]).fit_resample(Xx.astype({'b':'category'}), y) # ERROR cat + float + 2cat/cat

File ~/miniconda3/lib/python3.10/site-packages/imblearn/base.py:203, in BaseSampler.fit_resample(self, X, y)
    182 """Resample the dataset.
    183
    184 Parameters
   (...)
    200     The corresponding label of `X_resampled`.
    201 """
    202 self._validate_params()
--> 203 return super().fit_resample(X, y)

File ~/miniconda3/lib/python3.10/site-packages/imblearn/base.py:82, in SamplerMixin.fit_resample(self, X, y)
     80 check_classification_targets(y)
     81 arrays_transformer = ArraysTransformer(X, y)
---> 82 X, y, binarize_y = self._check_X_y(X, y)
     84 self.sampling_strategy_ = check_sampling_strategy(
     85     self.sampling_strategy, y, self._sampling_type
     86 )
     88 output = self._fit_resample(X, y)

File ~/miniconda3/lib/python3.10/site-packages/imblearn/over_sampling/_smote/base.py:540, in SMOTENC._check_X_y(self, X, y)
    536 """Overwrite the checking to let pass some string for categorical
    537 features.
    538 """
    539 y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
--> 540 X, y = self._validate_data(
    541     X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"]
    542 )
    543 return X, y, binarize_y

File ~/miniconda3/lib/python3.10/site-packages/sklearn/base.py:554, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    552     y = check_array(y, input_name="y", **check_y_params)
    553 else:
--> 554     X, y = check_X_y(X, y, **check_params)
    555 out = X, y
    557 if not no_val_X and check_params.get("ensure_2d", True):

File ~/miniconda3/lib/python3.10/site-packages/sklearn/utils/validation.py:1104, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1099     estimator_name = _check_estimator_name(estimator)
   1100     raise ValueError(
   1101         f"{estimator_name} requires y to be passed, but the target y is None"
   1102     )
-> 1104 X = check_array(
   1105     X,
   1106     accept_sparse=accept_sparse,
   1107     accept_large_sparse=accept_large_sparse,
   1108     dtype=dtype,
   1109     order=order,
   1110     copy=copy,
   1111     force_all_finite=force_all_finite,
   1112     ensure_2d=ensure_2d,
   1113     allow_nd=allow_nd,
   1114     ensure_min_samples=ensure_min_samples,
   1115     ensure_min_features=ensure_min_features,
   1116     estimator=estimator,
   1117     input_name="X",
   1118 )
   1120 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1122 check_consistent_length(X, y)

File ~/miniconda3/lib/python3.10/site-packages/sklearn/utils/validation.py:808, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    803 if pandas_requires_conversion:
    804     # pandas dataframe requires conversion earlier to handle extension dtypes with
    805     # nans
    806     # Use the original dtype for conversion if dtype is None
    807     new_dtype = dtype_orig if dtype is None else dtype
--> 808     array = array.astype(new_dtype)
    809     # Since we converted here, we do not need to convert again later
    810     dtype = None

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/generic.py:6240, in NDFrame.astype(self, dtype, copy, errors)
   6233     results = [
   6234         self.iloc[:, i].astype(dtype, copy=copy)
   6235         for i in range(len(self.columns))
   6236     ]
   6238 else:
   6239     # else, only a single dtype is given
-> 6240     new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   6241     return self._constructor(new_data).__finalize__(self, method="astype")
   6243 # GH 33113: handle empty frame or series

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:450, in BaseBlockManager.astype(self, dtype, copy, errors)
    449 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 450     return self.apply("astype", dtype=dtype, copy=copy, errors=errors)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:352, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
    350         applied = b.apply(f, **kwargs)
    351     else:
--> 352         applied = getattr(b, f)(**kwargs)
    353 except (TypeError, NotImplementedError):
    354     if not ignore_failures:

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/blocks.py:526, in Block.astype(self, dtype, copy, errors)
    508 """
    509 Coerce to the new dtype.
    510
   (...)
    522 Block
    523 """
    524 values = self.values
--> 526 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    528 new_values = maybe_coerce_values(new_values)
    529 newb = self.make_block(new_values)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:299, in astype_array_safe(values, dtype, copy, errors)
    296     return values.copy()
    298 try:
--> 299     new_values = astype_array(values, dtype, copy=copy)
    300 except (ValueError, TypeError):
    301     # e.g. astype_nansafe can fail on object-dtype of strings
    302     # trying to convert to float
    303     if errors == "ignore":

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:227, in astype_array(values, dtype, copy)
    223     return values
    225 if not isinstance(values, np.ndarray):
    226     # i.e. ExtensionArray
--> 227     values = values.astype(dtype, copy=copy)
    229 else:
    230     values = astype_nansafe(values, dtype, copy=copy)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/arrays/categorical.py:562, in Categorical.astype(self, dtype, copy)
    557 except (
    558     TypeError,  # downstream error msg for CategoricalIndex is misleading
    559     ValueError,
    560 ):
    561     msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
--> 562     raise ValueError(msg)
    564 result = take_nd(
    565     new_cats, ensure_platform_int(self._codes), fill_value=fill_value
    566 )
    568 return result

ValueError: Cannot cast object dtype to float64
Versions
Linux-5.4.0-135-generic-x86_64-with-glibc2.31
Python 3.10.8 (main, Nov 24 2022, 14:13:03) [GCC 11.2.0]
NumPy 1.22.3
SciPy 1.8.1
Scikit-Learn 1.2.0
Pandas 1.5.2
Imbalanced-Learn 0.10.1