Jul 8, 2023 · Jul 8, 2023 · Jul 8, 2023
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst

 In this data set, the first and last features are considered as categorical
 features. One needs to provide this information to :class:`SMOTENC` via the
 parameters ``categorical_features`` either by passing the indices of these
 features or a boolean mask marking these features::
 parameters ``categorical_features`` either by passing the indices, the feature
 names when `X` is a pandas DataFrame, or a boolean mask marking these features::

  >>> from imblearn.over_sampling import SMOTENC
  >>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst
  :class:`~imblearn.over_sampling.RandomOverSampler` (when `shrinkage is not
  None`) now accept any data types and will not attempt any data conversion.
  :pr:`1004` by :user:`Guillaume Lemaitre <glemaitre>`.

 - :class:`~imblearn.over_sampling.SMOTENC` now supports passing array-like of `str`
  when passing the `categorical_features` parameter.
  :pr:`1008` by :user:`Guillaume Lemaitre <glemaitre>`.
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
 fromsklearn.baseimportclone
 fromsklearn.exceptionsimportDataConversionWarning
 fromsklearn.preprocessingimportOneHotEncoder,OrdinalEncoder
 fromsklearn.utilsimport_safe_indexing,check_array,check_random_state
 fromsklearn.utilsimport (
 _get_column_indices,
 _safe_indexing,
 check_array,
 check_random_state,
 )
 fromsklearn.utils.sparsefuncs_fastimport (
 csc_mean_variance_axis0,
 csr_mean_variance_axis0,

    Parameters
    ----------
    categorical_features : array-like of shape (n_cat_features,) or (n_features,)
    categorical_features : array-like of shape (n_cat_features,) or (n_features,),\
            dtype={{bool, int, str}}
        Specifies which features are categorical. Can either be:

        - array of indices specifying the categorical features;
        - array of `int` corresponding to the indices specifying the categorical
          features;
        - array of `str` corresponding to the feature names. `X` should be a
          :class:`pandas.DataFrame` in this case.
        - mask array of shape (n_features, ) and ``bool`` dtype for which
          ``True`` indicates the categorical features.

 self._check_feature_names(X,reset=True)
 returnX,y,binarize_y

 def_validate_estimator(self):
 super()._validate_estimator()
 categorical_features=np.asarray(self.categorical_features)
 ifcategorical_features.dtype.name=="bool":
 self.categorical_features_=np.flatnonzero(categorical_features)
 else:
 ifany(
                [catnotinnp.arange(self.n_features_)forcatincategorical_features]
            ):
 raiseValueError(
 f"Some of the categorical indices are out of range. Indices"
 f" should be between 0 and{self.n_features_-1}"
                )
 self.categorical_features_=categorical_features
 def_validate_column_types(self,X):
 self.categorical_features_=np.array(
 _get_column_indices(X,self.categorical_features)
        )
 self.continuous_features_=np.setdiff1d(
 np.arange(self.n_features_),self.categorical_features_
        )

 def_validate_estimator(self):
 super()._validate_estimator()
 ifself.categorical_features_.size==self.n_features_in_:
 raiseValueError(
 "SMOTE-NC is not designed to work only with categorical "
            )

 self.n_features_=_num_features(X)
 self._validate_column_types(X)
 self._validate_estimator()

 # compute the median of the standard deviation of the minority class
diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
    X[:, 3] = rng.randint(3, size=30)
    y = np.array([0] * 10 + [1] * 20)
    # return the categories
    return X, y, [True, False, True]
    return X, y, [True, False,False,True]


 def data_heterogneous_unordered_multiclass():
    X, y, _ = data_heterogneous_unordered()
    categorical_features = [0, 10]
    smote = SMOTENC(random_state=0, categorical_features=categorical_features)
    with pytest.raises(ValueError, match="indices are out of range"):
    with pytest.raises(ValueError, match="all features must be in"):
        smote.fit_resample(X, y)


    X_res, y_res = smote.fit_resample(X, y)
    pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
    assert len(X_res) == len(y_res)


 def test_smotenc_categorical_features_str():
    """Check that we support array-like of strings for `categorical_features` using
    pandas dataframe.
    """
    # skip the test entirely when pandas is not installed
    pd = pytest.importorskip("pandas")

    # one numeric column ("A") and two string-valued columns ("B", "C")
    X = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "B": ["a", "b"] * 5,
            "C": ["a", "b", "c"] * 3 + ["a"],
        }
    )
    # tile the 10-row frame ten times -> 100 rows, matching the length of `y`
    X = pd.concat([X] * 10, ignore_index=True)
    # imbalanced target: 70 samples of class 0 and 30 samples of class 1
    y = np.array([0] * 70 + [1] * 30)
    # categorical columns are designated by name rather than by index or mask
    smote = SMOTENC(categorical_features=["B", "C"], random_state=0)
    X_res, y_res = smote.fit_resample(X, y)
    # resampled categorical columns must only contain categories seen in the input
    assert X_res["B"].isin(["a", "b"]).all()
    assert X_res["C"].isin(["a", "b", "c"]).all()
    # both classes are expected to end up with 70 samples
    counter = Counter(y_res)
    assert counter[0] == counter[1] == 70
    # names "B" and "C" must resolve to column positions 1 and 2, leaving
    # column 0 ("A") as the only continuous feature
    assert_array_equal(smote.categorical_features_, [1, 2])
    assert_array_equal(smote.continuous_features_, [0])
Original file line number	Diff line number	Diff line change
Expand Up		@@ -192,8 +192,8 @@ which categorical data are treated differently::

		In this data set, the first and last features are considered as categorical
		features. One needs to provide this information to :class:`SMOTENC` via the
		parameters ``categorical_features`` either by passing the indices of these
		features or a boolean mask marking these features::
		parameters ``categorical_features`` either by passing the indices, the feature
		names when `X` is a pandas DataFrame, or a boolean mask marking these features::

		>>> from imblearn.over_sampling import SMOTENC
		>>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -53,3 +53,7 @@ Enhancements
		:class:`~imblearn.over_sampling.RandomOverSampler` (when `shrinkage is not
		None`) now accept any data types and will not attempt any data conversion.
		:pr:`1004` by :user:`Guillaume Lemaitre <glemaitre>`.

		- :class:`~imblearn.over_sampling.SMOTENC` now supports passing array-like of `str`
		when passing the `categorical_features` parameter.
		:pr:`1008` by :user:`Guillaume Lemaitre <glemaitre>`.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -16,7 +16,12 @@
		fromsklearn.baseimportclone
		fromsklearn.exceptionsimportDataConversionWarning
		fromsklearn.preprocessingimportOneHotEncoder,OrdinalEncoder
		fromsklearn.utilsimport_safe_indexing,check_array,check_random_state
		fromsklearn.utilsimport (
		_get_column_indices,
		_safe_indexing,
		check_array,
		check_random_state,
		)
		fromsklearn.utils.sparsefuncs_fastimport (
		csc_mean_variance_axis0,
		csr_mean_variance_axis0,
Expand DownExpand Up		@@ -390,10 +395,14 @@ class SMOTENC(SMOTE):

		Parameters
		----------
		categorical_features : array-like of shape (n_cat_features,) or (n_features,)
		categorical_features : array-like of shape (n_cat_features,) or (n_features,),\
		dtype={{bool, int, str}}
		Specifies which features are categorical. Can either be:

		- array of indices specifying the categorical features;
		- array of `int` corresponding to the indices specifying the categorical
		features;
		- array of `str` corresponding to the feature names. `X` should be a
		:class:`pandas.DataFrame` in this case.
		- mask array of shape (n_features, ) and ``bool`` dtype for which
		``True`` indicates the categorical features.

Expand DownExpand Up		@@ -565,24 +574,16 @@ def _check_X_y(self, X, y):
		self._check_feature_names(X,reset=True)
		returnX,y,binarize_y

		def_validate_estimator(self):
		super()._validate_estimator()
		categorical_features=np.asarray(self.categorical_features)
		ifcategorical_features.dtype.name=="bool":
		self.categorical_features_=np.flatnonzero(categorical_features)
		else:
		ifany(
		[catnotinnp.arange(self.n_features_)forcatincategorical_features]
		):
		raiseValueError(
		f"Some of the categorical indices are out of range. Indices"
		f" should be between 0 and{self.n_features_-1}"
		)
		self.categorical_features_=categorical_features
		def_validate_column_types(self,X):
		self.categorical_features_=np.array(
		_get_column_indices(X,self.categorical_features)
		)
		self.continuous_features_=np.setdiff1d(
		np.arange(self.n_features_),self.categorical_features_
		)

		def_validate_estimator(self):
		super()._validate_estimator()
		ifself.categorical_features_.size==self.n_features_in_:
		raiseValueError(
		"SMOTE-NC is not designed to work only with categorical "
Expand All		@@ -600,6 +601,7 @@ def _fit_resample(self, X, y):
		)

		self.n_features_=_num_features(X)
		self._validate_column_types(X)
		self._validate_estimator()

		# compute the median of the standard deviation of the minority class
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -63,7 +63,7 @@ def data_heterogneous_masked():
		X[:, 3] = rng.randint(3, size=30)
		y = np.array([0] * 10 + [1] * 20)
		# return the categories
		return X, y, [True, False, True]
		return X, y, [True, False,False,True]


		def data_heterogneous_unordered_multiclass():
Expand DownExpand Up		@@ -98,7 +98,7 @@ def test_smotenc_error():
		X, y, _ = data_heterogneous_unordered()
		categorical_features = [0, 10]
		smote = SMOTENC(random_state=0, categorical_features=categorical_features)
		with pytest.raises(ValueError, match="indices are out of range"):
		with pytest.raises(ValueError, match="all features must be in"):
		smote.fit_resample(X, y)


Expand DownExpand Up		@@ -324,3 +324,28 @@ def test_smotenc_bool_categorical():
		X_res, y_res = smote.fit_resample(X, y)
		pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
		assert len(X_res) == len(y_res)


		def test_smotenc_categorical_features_str():
		"""Check that we support array-like of strings for `categorical_features` using
		pandas dataframe.
		"""
		pd = pytest.importorskip("pandas")

		X = pd.DataFrame(
		{
		"A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
		"B": ["a", "b"] * 5,
		"C": ["a", "b", "c"] * 3 + ["a"],
		}
		)
		X = pd.concat([X] * 10, ignore_index=True)
		y = np.array([0] * 70 + [1] * 30)
		smote = SMOTENC(categorical_features=["B", "C"], random_state=0)
		X_res, y_res = smote.fit_resample(X, y)
		assert X_res["B"].isin(["a", "b"]).all()
		assert X_res["C"].isin(["a", "b", "c"]).all()
		counter = Counter(y_res)
		assert counter[0] == counter[1] == 70
		assert_array_equal(smote.categorical_features_, [1, 2])
		assert_array_equal(smote.continuous_features_, [0])