Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

ENH support array-like of str for categorical_features in SMOTENC#1008

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/over_sampling.rst
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -192,8 +192,8 @@ which categorical data are treated differently::

In this data set, the first and last features are considered as categorical
features. One needs to provide this information to :class:`SMOTENC` via the
parameters ``categorical_features`` either by passing the indices of these
features or a boolean mask marking these features::
parameters ``categorical_features`` either by passing the indices, the feature
names when `X` is a pandas DataFrame, or a boolean mask marking these features::

>>> from imblearn.over_sampling import SMOTENC
>>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
Expand Down
4 changes: 4 additions & 0 deletions doc/whats_new/v0.11.rst
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -53,3 +53,7 @@ Enhancements
:class:`~imblearn.over_sampling.RandomOverSampler` (when `shrinkage is not
None`) now accept any data types and will not attempt any data conversion.
:pr:`1004` by :user:`Guillaume Lemaitre <glemaitre>`.

- :class:`~imblearn.over_sampling.SMOTENC` now supports passing an array-like of `str`
when passing the `categorical_features` parameter.
:pr:`1008` by :user:`Guillaume Lemaitre <glemaitre>`.
36 changes: 19 additions & 17 deletions imblearn/over_sampling/_smote/base.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -16,7 +16,12 @@
from sklearn.base import clone
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils import _safe_indexing, check_array, check_random_state
from sklearn.utils import (
_get_column_indices,
_safe_indexing,
check_array,
check_random_state,
)
from sklearn.utils.sparsefuncs_fast import (
csc_mean_variance_axis0,
csr_mean_variance_axis0,
Expand DownExpand Up@@ -390,10 +395,14 @@ class SMOTENC(SMOTE):

Parameters
----------
categorical_features : array-like of shape (n_cat_features,) or (n_features,)
categorical_features : array-like of shape (n_cat_features,) or (n_features,), \
dtype={{bool, int, str}}
Specifies which features are categorical. Can either be:

- array of indices specifying the categorical features;
- array of `int` corresponding to the indices specifying the categorical
features;
- array of `str` corresponding to the feature names. `X` should be a
:class:`pandas.DataFrame` in this case.
- mask array of shape (n_features, ) and ``bool`` dtype for which
``True`` indicates the categorical features.

Expand DownExpand Up@@ -565,24 +574,16 @@ def _check_X_y(self, X, y):
self._check_feature_names(X, reset=True)
return X, y, binarize_y

def _validate_estimator(self):
super()._validate_estimator()
categorical_features = np.asarray(self.categorical_features)
if categorical_features.dtype.name == "bool":
self.categorical_features_ = np.flatnonzero(categorical_features)
else:
if any(
[cat not in np.arange(self.n_features_) for cat in categorical_features]
):
raise ValueError(
f"Some of the categorical indices are out of range. Indices"
f" should be between 0 and {self.n_features_ - 1}"
)
self.categorical_features_ = categorical_features
def _validate_column_types(self, X):
    """Split the columns of `X` into categorical and continuous indices.

    The ``categorical_features`` parameter is resolved against `X` through
    ``_get_column_indices`` and stored as an array in
    ``categorical_features_``. Every index in ``range(self.n_features_)``
    that is not categorical is stored in ``continuous_features_``.

    ``self.n_features_`` must already be set when this method is called.

    Parameters
    ----------
    X : array-like or dataframe
        Input data against which ``categorical_features`` is resolved.
    """
    resolved_categorical = _get_column_indices(X, self.categorical_features)
    self.categorical_features_ = np.array(resolved_categorical)

    every_column = np.arange(self.n_features_)
    self.continuous_features_ = np.setdiff1d(
        every_column, self.categorical_features_
    )

def _validate_estimator(self):
super()._validate_estimator()
if self.categorical_features_.size == self.n_features_in_:
raise ValueError(
"SMOTE-NC is not designed to work only with categorical "
Expand All@@ -600,6 +601,7 @@ def _fit_resample(self, X, y):
)

self.n_features_ = _num_features(X)
self._validate_column_types(X)
self._validate_estimator()

# compute the median of the standard deviation of the minority class
Expand Down
29 changes: 27 additions & 2 deletions imblearn/over_sampling/_smote/tests/test_smote_nc.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -63,7 +63,7 @@ def data_heterogneous_masked():
X[:, 3] = rng.randint(3, size=30)
y = np.array([0] * 10 + [1] * 20)
# return the categories
return X, y, [True, False, True]
return X, y, [True, False,False,True]


def data_heterogneous_unordered_multiclass():
Expand DownExpand Up@@ -98,7 +98,7 @@ def test_smotenc_error():
X, y, _ = data_heterogneous_unordered()
categorical_features = [0, 10]
smote = SMOTENC(random_state=0, categorical_features=categorical_features)
with pytest.raises(ValueError, match="indices are out of range"):
with pytest.raises(ValueError, match="all features must be in"):
smote.fit_resample(X, y)


Expand DownExpand Up@@ -324,3 +324,28 @@ def test_smotenc_bool_categorical():
X_res, y_res = smote.fit_resample(X, y)
pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
assert len(X_res) == len(y_res)


def test_smotenc_categorical_features_str():
    """Check that `categorical_features` accepts an array-like of strings.

    The input is a pandas dataframe and the categorical columns are selected
    by name. The resampled data must only contain the original categories,
    both classes must end up with 70 samples, and the fitted attributes must
    hold the positional indices of the categorical and continuous columns.
    """
    pd = pytest.importorskip("pandas")

    # One block of 10 rows, repeated 10 times to get 100 samples.
    block = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "B": ["a", "b"] * 5,
            "C": ["a", "b", "c"] * 3 + ["a"],
        }
    )
    X = pd.concat([block] * 10, ignore_index=True)
    y = np.array([0] * 70 + [1] * 30)

    sampler = SMOTENC(categorical_features=["B", "C"], random_state=0)
    X_res, y_res = sampler.fit_resample(X, y)

    # No new category may be created for the categorical columns.
    assert X_res["B"].isin(["a", "b"]).all()
    assert X_res["C"].isin(["a", "b", "c"]).all()

    # The minority class is over-sampled up to the majority class size.
    class_counts = Counter(y_res)
    assert class_counts[0] == class_counts[1] == 70

    # Column names are resolved to positional indices.
    assert_array_equal(sampler.categorical_features_, [1, 2])
    assert_array_equal(sampler.continuous_features_, [0])

[8]ページ先頭

©2009-2025 Movatter.jp