MAINT compatibility scikit-learn 1.3 #999

Merged

38 commits
6caa659  MAINT compatibility scikit-learn 1.3  (glemaitre, Jul 6, 2023)
8fbf75c  iter  (glemaitre, Jul 6, 2023)
204c094  iter  (glemaitre, Jul 6, 2023)
9595d6a  iter  (glemaitre, Jul 7, 2023)
1895749  iter  (glemaitre, Jul 7, 2023)
50ad67f  iter  (glemaitre, Jul 7, 2023)
f7c8bf8  iter  (glemaitre, Jul 7, 2023)
c66c1d1  blackify  (glemaitre, Jul 7, 2023)
9393bde  iter  (glemaitre, Jul 7, 2023)
1611716  bump version doc  (glemaitre, Jul 7, 2023)
25c1133  iter  (glemaitre, Jul 7, 2023)
bbab55c  iter  (glemaitre, Jul 7, 2023)
a3b6fb3  iter  (glemaitre, Jul 7, 2023)
e8fa9d7  iter  (glemaitre, Jul 7, 2023)
9d8a8b0  iter  (glemaitre, Jul 7, 2023)
f953aab  iter  (glemaitre, Jul 7, 2023)
c8912ec  iter  (glemaitre, Jul 7, 2023)
50b4bf8  iter  (glemaitre, Jul 7, 2023)
1eccb31  oter  (glemaitre, Jul 7, 2023)
a8cc055  iter  (glemaitre, Jul 7, 2023)
eec32b3  iter  (glemaitre, Jul 7, 2023)
5055d8a  iter  (glemaitre, Jul 7, 2023)
254607b  iter  (glemaitre, Jul 7, 2023)
cc63ccd  iter  (glemaitre, Jul 7, 2023)
8caf4cb  iter  (glemaitre, Jul 7, 2023)
00015d4  iter  (glemaitre, Jul 7, 2023)
a6a7848  iter  (glemaitre, Jul 7, 2023)
f0ead59  iter  (glemaitre, Jul 7, 2023)
dd4554c  iter  (glemaitre, Jul 7, 2023)
d83a637  iter  (glemaitre, Jul 7, 2023)
fa1c747  iter  (glemaitre, Jul 7, 2023)
ce8e1ff  iter  (glemaitre, Jul 7, 2023)
71a7e9f  iter  (glemaitre, Jul 7, 2023)
a10202d  iter  (glemaitre, Jul 7, 2023)
40d42a3  iter  (glemaitre, Jul 7, 2023)
d217649  iter  (glemaitre, Jul 7, 2023)
13f2ea6  iter  (glemaitre, Jul 7, 2023)
9034631  iter  (glemaitre, Jul 7, 2023)
.circleci/config.yml (2 changes: 1 addition & 1 deletion)

@@ -3,7 +3,7 @@ version: 2
 jobs:
   doc:
     docker:
-      - image: circleci/python:3.7.7-buster
+      - image: cimg/python:3.8.12
     environment:
       - USERNAME: "glemaitre"
       - ORGANIZATION: "imbalanced-learn"
.pre-commit-config.yaml (21 changes: 9 additions & 12 deletions)

@@ -1,26 +1,23 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v2.3.0
+    rev: v4.3.0
     hooks:
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace
   - repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 23.3.0
     hooks:
       - id: black
-  - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.0.272
     hooks:
-      - id: flake8
-        types: [file, python]
+      - id: ruff
+        args: ["--fix", "--show-source"]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.782
+    rev: v1.3.0
     hooks:
       - id: mypy
-        files: sklearn/
+        files: imblearn/
         additional_dependencies: [pytest==6.2.4]
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.10.1
-    hooks:
-      - id: isort
README.rst (2 changes: 1 addition & 1 deletion)

@@ -29,7 +29,7 @@

 .. |PythonMinVersion| replace:: 3.8
 .. |NumPyMinVersion| replace:: 1.17.3
-.. |SciPyMinVersion| replace:: 1.3.2
+.. |SciPyMinVersion| replace:: 1.5.0
 .. |ScikitLearnMinVersion| replace:: 1.0.2
 .. |MatplotlibMinVersion| replace:: 3.1.2
 .. |PandasMinVersion| replace:: 1.0.5
azure-pipelines.yml (42 changes: 29 additions & 13 deletions)

@@ -11,7 +11,7 @@ jobs:
 - job: git_commit
   displayName: Get Git Commit
   pool:
-    vmImage: ubuntu-20.04
+    vmImage: ubuntu-22.04
   steps:
   - bash: |
       set -ex

@@ -38,21 +38,21 @@ jobs:
     )
   displayName: Linting
   pool:
-    vmImage: ubuntu-20.04
+    vmImage: ubuntu-22.04
   steps:
   - task: UsePythonVersion@0
     inputs:
       versionSpec: '3.9'
   - bash: |
       # Include pytest compatibility with mypy
-      pip install pytest flake8 mypy==0.782 black==22.3 isort
+      pip install flake8 pytest mypy==1.3.0 black==23.3 ruff==0.0.272
     displayName: Install linters
   - bash: |
       black --check --diff .
     displayName: Run black
   - bash: |
-      isort --check --diff .
-    displayName: Run isort
+      ruff check --show-source .
+    displayName: Run ruff
   - bash: |
       ./build_tools/azure/linting.sh
     displayName: Run linting

@@ -63,7 +63,7 @@ jobs:
 - template: build_tools/azure/posix.yml
   parameters:
     name: Linux_Nightly
-    vmImage: ubuntu-20.04
+    vmImage: ubuntu-22.04
    dependsOn: [git_commit, linting]
    condition: |
      and(

@@ -86,7 +86,7 @@ jobs:
 - template: build_tools/azure/posix.yml
   parameters:
     name: Linux_Runs
-    vmImage: ubuntu-20.04
+    vmImage: ubuntu-22.04
    dependsOn: [git_commit]
    condition: |
      and(

@@ -125,7 +125,7 @@ jobs:
 - template: build_tools/azure/posix.yml
   parameters:
     name: Linux
-    vmImage: ubuntu-20.04
+    vmImage: ubuntu-22.04
    dependsOn: [linting, git_commit]
    condition: |
      and(

@@ -144,7 +144,7 @@
       THREADPOOLCTL_VERSION: 'min'
       COVERAGE: 'false'
       # Linux + Python 3.8 build with OpenBLAS and without SITE_JOBLIB
-      py38_conda_defaults_openblas:
+      py38_conda_conda_forge_openblas:
         DISTRIB: 'conda'
         CONDA_CHANNEL: 'conda-forge'
         PYTHON_VERSION: '3.8'

@@ -170,6 +170,13 @@
         TEST_DOCSTRINGS: 'true'
         CHECK_WARNINGS: 'false'
         SKLEARN_VERSION: '1.1.3'
+      pylatest_pip_openblas_sklearn_intermediate_bis:
+        DISTRIB: 'conda-pip-latest'
+        PYTHON_VERSION: '3.10'
+        TEST_DOCS: 'true'
+        TEST_DOCSTRINGS: 'true'
+        CHECK_WARNINGS: 'false'
+        SKLEARN_VERSION: '1.2.2'
       pylatest_pip_tensorflow:
         DISTRIB: 'conda-pip-latest-tensorflow'
         CONDA_CHANNEL: 'conda-forge'

@@ -263,12 +270,21 @@
         CONDA_CHANNEL: 'conda-forge'
         CPU_COUNT: '3'
         TEST_DOCS: 'true'
-      pylatest_conda_mkl_no_openmp:
-        DISTRIB: 'conda'
-        BLAS: 'mkl'
-        SKLEARN_SKIP_OPENMP_TEST: 'true'
-        CPU_COUNT: '3'
-        TEST_DOCS: 'true'
+      # TODO: re-enable when we find out why MKL on defaults segfaults
+      # It seems that scikit-learn from defaults channel is built with LLVM/CLANG OMP
+      # while we use MKL OMP. This could be the cause of the segfaults.
+      # pylatest_conda_mkl_no_openmp:
+      #   DISTRIB: 'conda'
+      #   BLAS: 'mkl'
+      #   SKLEARN_SKIP_OPENMP_TEST: 'true'
+      #   CPU_COUNT: '3'
+      #   TEST_DOCS: 'true'
+      conda_conda_forge_openblas:
+        DISTRIB: 'conda'
+        CONDA_CHANNEL: 'conda-forge'
+        BLAS: 'openblas'
+        TEST_DOCS: 'true'
+        CPU_COUNT: '3'

 - template: build_tools/azure/windows.yml
   parameters:
build_tools/azure/linting.sh (11 changes: 0 additions & 11 deletions)

@@ -4,9 +4,6 @@ set -e
 # pipefail is necessary to propagate exit codes
 set -o pipefail

-flake8 --show-source .
-echo -e "No problem detected by flake8\n"
-
 # For docstrings and warnings of deprecated attributes to be rendered
 # properly, the property decorator must come before the deprecated decorator
 # (else they are treated as functions)

@@ -33,11 +30,3 @@
 then
     echo "$doctest_directive"
     exit 1
 fi
-
-joblib_import="$(git grep -l -A 10 -E "joblib import.+delayed" -- "*.py" ":!sklearn/utils/_joblib.py" ":!sklearn/utils/fixes.py")"
-
-if [ ! -z "$joblib_import" ]; then
-    echo "Use from sklearn.utils.fixes import delayed instead of joblib delayed. The following files contains imports to joblib.delayed:"
-    echo "$joblib_import"
-    exit 1
-fi
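The `set -o pipefail` comment near the top of this script is worth unpacking: without `pipefail`, a pipeline's exit status is that of its last command only, so a failing linter piped into `tee` or a pager would be silently swallowed and the CI step would pass. A minimal, project-independent sketch:

```shell
# Exit status of `false | true` without pipefail: the last command wins,
# so the failure of `false` is swallowed and the pipeline reports success.
bash -c 'false | true'
echo "without pipefail: $?"

# With pipefail, any failing stage makes the whole pipeline fail,
# which is what lets a CI step propagate linter errors through pipes.
bash -c 'set -o pipefail; false | true'
echo "with pipefail: $?"
```

This is why the flag stays even though the flake8 invocation itself was removed in this change.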
build_tools/azure/test_script.sh (3 changes: 3 additions & 0 deletions)

@@ -12,6 +12,9 @@ mkdir -p $TEST_DIR
 cp setup.cfg $TEST_DIR
 cd $TEST_DIR

+# python -c "import joblib; print(f'Number of cores (physical): \
+#           {joblib.cpu_count()} ({joblib.cpu_count(only_physical_cores=True)})')"
+# python -c "import sklearn; sklearn.show_versions()"
 python -c "import imblearn; imblearn.show_versions()"

 if ! command -v conda &> /dev/null
build_tools/circle/build_doc.sh (13 changes: 7 additions & 6 deletions)

@@ -89,12 +89,13 @@ if [[ `type -t deactivate` ]]; then
     deactivate
 fi

-# Install dependencies with miniconda
-wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \
-    -O miniconda.sh
-chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH
-export PATH="$MINICONDA_PATH/bin:$PATH"
-conda update --yes --quiet conda
+MAMBAFORGE_PATH=$HOME/mambaforge
+# Install dependencies with mamba
+wget -q https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \
+    -O mambaforge.sh
+chmod +x mambaforge.sh && ./mambaforge.sh -b -p $MAMBAFORGE_PATH
+export PATH="$MAMBAFORGE_PATH/bin:$PATH"
+mamba update --yes --quiet conda

 # imports get_dep
 source build_tools/shared.sh
doc/Makefile (2 changes: 1 addition & 1 deletion)

@@ -2,7 +2,7 @@
 #

 # You can set these variables from the command line.
-SPHINXOPTS    =
+SPHINXOPTS    = -v
 SPHINXBUILD   = sphinx-build
 PAPER         =
 BUILDDIR      = _build
doc/_templates/breadcrumbs.html (4 changes: 0 additions & 4 deletions)

This file was deleted.
doc/common_pitfalls.rst (16 changes: 7 additions & 9 deletions)

@@ -53,10 +53,9 @@ increase the effect of the wrongdoings::

 Let's first check the balancing ratio on this dataset::

-    >>> y.value_counts(normalize=True)
-    <=50K    0.98801
-    >50K     0.01199
-    Name: class, dtype: float64
+    >>> from collections import Counter
+    >>> {key: value / len(y) for key, value in Counter(y).items()}
+    {'<=50K': 0.988..., '>50K': 0.011...}

 To later highlight some of the issue, we will keep aside a left-out set that we
 will not use for the evaluation of the model::

@@ -72,7 +71,6 @@ classifier, without any preprocessing to alleviate the bias toward the majority
 class. We evaluate the generalization performance of the classifier via
 cross-validation::

-    >>> from sklearn.experimental import enable_hist_gradient_boosting
     >>> from sklearn.ensemble import HistGradientBoostingClassifier
     >>> from sklearn.model_selection import cross_validate
     >>> model = HistGradientBoostingClassifier(random_state=0)

@@ -130,9 +128,9 @@ cross-validation::
     ...     f"{cv_results['test_score'].std():.3f}"
     ... )
     Balanced accuracy mean +/- std. dev.: 0.724 +/- 0.042
-The cross-validation performance looks good, but evaluating the classifiers
-on the left-out data shows a different picture::

+The cross-validation performance looks good, but evaluating the classifiers
+on the left-out data shows a different picture::

     >>> scores = []
     >>> for fold_id, cv_model in enumerate(cv_results["estimator"]):

@@ -147,7 +145,7 @@ on the left-out data shows a different picture::
     ...     )
     Balanced accuracy mean +/- std. dev.: 0.698 +/- 0.014

-We see that the performance is now worse than the cross-validated performance. 
+We see that the performance is now worse than the cross-validated performance.
 Indeed, the data leakage gave us too optimistic results due to the reason
 stated earlier in this section.
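The new doctest above replaces pandas' `value_counts(normalize=True)` with a stdlib `Counter` idiom. A self-contained sketch of the same computation, using hypothetical toy labels that mimic the roughly 0.988 / 0.012 imbalance described in the doc:

```python
from collections import Counter

# Hypothetical labels standing in for the census income target in the doc.
y = ["<=50K"] * 988 + [">50K"] * 12

# Same idiom as the updated doctest: per-class counts from Counter,
# normalized by the total number of samples.
ratios = {key: value / len(y) for key, value in Counter(y).items()}
print(ratios)  # {'<=50K': 0.988, '>50K': 0.012}
```

The upside of this idiom is that it works on any sequence of labels (lists, NumPy arrays, pandas Series), so the documentation no longer depends on pandas-specific output formatting.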
doc/over_sampling.rst (10 changes: 5 additions & 5 deletions)

@@ -38,10 +38,10 @@ randomly sampling with replacement the current available samples. The
 The augmented data set should be used instead of the original data set to train
 a classifier::

-    >>> from sklearn.svm import LinearSVC
-    >>> clf = LinearSVC()
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> clf = LogisticRegression()
     >>> clf.fit(X_resampled, y_resampled)
-    LinearSVC(...)
+    LogisticRegression(...)

 In the figure below, we compare the decision functions of a classifier trained
 using the over-sampled data set and the original data set.

@@ -108,11 +108,11 @@ the same manner::
     >>> X_resampled, y_resampled = SMOTE().fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 4674), (1, 4674), (2, 4674)]
-    >>> clf_smote = LinearSVC().fit(X_resampled, y_resampled)
+    >>> clf_smote = LogisticRegression().fit(X_resampled, y_resampled)
     >>> X_resampled, y_resampled = ADASYN().fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 4673), (1, 4662), (2, 4674)]
-    >>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled)
+    >>> clf_adasyn = LogisticRegression().fit(X_resampled, y_resampled)

 The figure below illustrates the major difference of the different
 over-sampling methods.
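The context lines above describe random over-sampling as "randomly sampling with replacement the current available samples". A stdlib-only sketch of that idea, for illustration only (the real API is `imblearn.over_sampling.RandomOverSampler`; the function name and toy data here are hypothetical):

```python
import random
from collections import Counter

random.seed(0)

def random_over_sample(X, y):
    """Naive random over-sampling: duplicate minority-class samples,
    drawn with replacement, until every class matches the majority count.
    Illustrative sketch only, not imblearn's implementation."""
    counts = Counter(y)
    target = max(counts.values())
    X_res, y_res = list(X), list(y)
    for label, n in counts.items():
        idx = [i for i, lab in enumerate(y) if lab == label]
        for _ in range(target - n):
            i = random.choice(idx)  # sample an existing row with replacement
            X_res.append(X[i])
            y_res.append(label)
    return X_res, y_res

# Toy imbalanced data: three samples of class 0, one of class 1.
X = [[0.0], [0.1], [0.2], [1.0]]
y = [0, 0, 0, 1]
X_res, y_res = random_over_sample(X, y)
print(sorted(Counter(y_res).items()))  # [(0, 3), (1, 3)]
```

Because the resampled set contains exact duplicates, any classifier trained on it (the docs now use `LogisticRegression`) effectively sees re-weighted classes.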
doc/sphinxext/sphinx_issues.py (1 change: 0 additions & 1 deletion)

@@ -76,7 +76,6 @@ def cve_role(name, rawtext, text, lineno, inliner, options=None, content=None):


 class IssueRole(object):
-
     EXTERNAL_REPO_REGEX = re.compile(r"^(\w+)/(.+)([#@])([\w]+)$")

     def __init__(
(file name not captured in this view)

@@ -338,7 +338,6 @@
 # classifier within a :class:`~imblearn.ensemble.BalancedBaggingClassifier`.

 from sklearn.ensemble import HistGradientBoostingClassifier
-from sklearn.experimental import enable_hist_gradient_boosting  # noqa

 from imblearn.ensemble import BalancedBaggingClassifier
(file name not captured in this view)

@@ -43,9 +43,7 @@
 print(f"Testing target statistics: {Counter(y_test)}")

 # Create a pipeline
-pipeline = make_pipeline(
-    NearMiss(version=2), StandardScaler(), LogisticRegression(random_state=RANDOM_STATE)
-)
+pipeline = make_pipeline(NearMiss(version=2), StandardScaler(), LogisticRegression())
 pipeline.fit(X_train, y_train)

 # Classify and report the results
examples/combine/plot_comparison_combine.py (4 changes: 2 additions & 2 deletions)

@@ -102,7 +102,7 @@ def plot_decision_function(X, y, clf, ax):
 # :class:`~imblearn.combine.SMOTEENN` cleans more noisy data than
 # :class:`~imblearn.combine.SMOTETomek`.

-from sklearn.svm import LinearSVC
+from sklearn.linear_model import LogisticRegression

 from imblearn.combine import SMOTEENN, SMOTETomek

@@ -114,7 +114,7 @@ def plot_decision_function(X, y, clf, ax):

 fig, axs = plt.subplots(3, 2, figsize=(15, 25))
 for ax, sampler in zip(axs, samplers):
-    clf = make_pipeline(sampler, LinearSVC()).fit(X, y)
+    clf = make_pipeline(sampler, LogisticRegression()).fit(X, y)
     plot_decision_function(X, y, clf, ax[0])
     plot_resampling(X, y, sampler, ax[1])
 fig.tight_layout()
examples/evaluation/plot_classification_report.py (4 changes: 2 additions & 2 deletions)

@@ -14,9 +14,9 @@


 from sklearn import datasets
+from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
-from sklearn.svm import LinearSVC

 from imblearn import over_sampling as os
 from imblearn import pipeline as pl

@@ -43,7 +43,7 @@
 pipeline = pl.make_pipeline(
     StandardScaler(),
     os.SMOTE(random_state=RANDOM_STATE),
-    LinearSVC(max_iter=10_000, random_state=RANDOM_STATE),
+    LogisticRegression(max_iter=10_000),
 )

 # Split the data
examples/evaluation/plot_metrics.py (7 changes: 4 additions & 3 deletions)

@@ -48,10 +48,11 @@

 # %% [markdown]
 # We will create a pipeline made of a :class:`~imblearn.over_sampling.SMOTE`
-# over-sampler followed by a :class:`~sklearn.svm.LinearSVC` classifier.
+# over-sampler followed by a :class:`~sklearn.linear_model.LogisticRegression`
+# classifier.

+from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import StandardScaler
-from sklearn.svm import LinearSVC

 from imblearn.over_sampling import SMOTE

@@ -61,7 +62,7 @@
 model = make_pipeline(
     StandardScaler(),
     SMOTE(random_state=RANDOM_STATE),
-    LinearSVC(max_iter=10_000, random_state=RANDOM_STATE),
+    LogisticRegression(max_iter=10_000, random_state=RANDOM_STATE),
 )