Source code for smote_variants._smote_variants

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 15 11:15:24 2018

@author: gykovacs
"""

# import system packages
import os
import pickle
import itertools
import logging
import re
import time
import glob
import inspect

# used to parallelize evaluation
from joblib import Parallel, delayed

# numerical methods and arrays
import numpy as np
import pandas as pd

# import packages used for the implementation of sampling methods
from sklearn.model_selection import (RepeatedStratifiedKFold, KFold,
                                     cross_val_score, StratifiedKFold)
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (log_loss, roc_auc_score, accuracy_score,
                             confusion_matrix, f1_score)
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.manifold import LocallyLinearEmbedding, TSNE, Isomap
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone, BaseEstimator, ClassifierMixin

# some statistical methods
from scipy.stats import skew
import scipy.signal as ssignal
import scipy.spatial as sspatial
import scipy.optimize as soptimize
import scipy.special as sspecial
from scipy.stats.mstats import gmean

from ._version import __version__

__author__ = "György Kovács"
__license__ = "MIT"
__email__ = "gyuriofkovacs@gmail.com"

# for handler in _logger.root.handlers[:]:
#    _logger.root.removeHandler(handler)

# setting the _logger format
_logger = logging.getLogger('smote_variants')
_logger.setLevel(logging.DEBUG)
_logger_ch = logging.StreamHandler()
_logger_ch.setFormatter(logging.Formatter(
    "%(asctime)s:%(levelname)s:%(message)s"))
_logger.addHandler(_logger_ch)

# exported names
__all__ = ['__author__',
           '__license__',
           '__version__',
           '__email__',
           'get_all_oversamplers',
           'get_all_noisefilters',
           'get_n_quickest_oversamplers',
           'get_all_oversamplers_multiclass',
           'get_n_quickest_oversamplers_multiclass',
           'evaluate_oversamplers',
           'read_oversampling_results',
           'model_selection',
           'cross_validate',
           'MLPClassifierWrapper',
           'OverSampling',
           'NoiseFilter',
           'TomekLinkRemoval',
           'CondensedNearestNeighbors',
           'OneSidedSelection',
           'CNNTomekLinks',
           'NeighborhoodCleaningRule',
           'EditedNearestNeighbors',
           'SMOTE',
           'SMOTE_TomekLinks',
           'SMOTE_ENN',
           'Borderline_SMOTE1',
           'Borderline_SMOTE2',
           'ADASYN',
           'AHC',
           'LLE_SMOTE',
           'distance_SMOTE',
           'SMMO',
           'polynom_fit_SMOTE',
           'Stefanowski',
           'ADOMS',
           'Safe_Level_SMOTE',
           'MSMOTE',
           'DE_oversampling',
           'SMOBD',
           'SUNDO',
           'MSYN',
           'SVM_balance',
           'TRIM_SMOTE',
           'SMOTE_RSB',
           'ProWSyn',
           'SL_graph_SMOTE',
           'NRSBoundary_SMOTE',
           'LVQ_SMOTE',
           'SOI_CJ',
           'ROSE',
           'SMOTE_OUT',
           'SMOTE_Cosine',
           'Selected_SMOTE',
           'LN_SMOTE',
           'MWMOTE',
           'PDFOS',
           'IPADE_ID',
           'RWO_sampling',
           'NEATER',
           'DEAGO',
           'Gazzah',
           'MCT',
           'ADG',
           'SMOTE_IPF',
           'KernelADASYN',
           'MOT2LD',
           'V_SYNTH',
           'OUPS',
           'SMOTE_D',
           'SMOTE_PSO',
           'CURE_SMOTE',
           'SOMO',
           'ISOMAP_Hybrid',
           'CE_SMOTE',
           'Edge_Det_SMOTE',
           'CBSO',
           'E_SMOTE',
           'DBSMOTE',
           'ASMOBD',
           'Assembled_SMOTE',
           'SDSMOTE',
           'DSMOTE',
           'G_SMOTE',
           'NT_SMOTE',
           'Lee',
           'SPY',
           'SMOTE_PSOBAT',
           'MDO',
           'Random_SMOTE',
           'ISMOTE',
           'VIS_RST',
           'GASMOTE',
           'A_SUWO',
           'SMOTE_FRST_2T',
           'AND_SMOTE',
           'NRAS',
           'AMSCO',
           'SSO',
           'NDO_sampling',
           'DSRBF',
           'Gaussian_SMOTE',
           'kmeans_SMOTE',
           'Supervised_SMOTE',
           'SN_SMOTE',
           'CCR',
           'ANS',
           'cluster_SMOTE',
           'NoSMOTE',
           'MulticlassOversampling',
           'OversamplingClassifier']


def get_all_oversamplers():
    """
    Returns all oversampling classes

    Returns:
        list(OverSampling): list of all oversampling classes

    Example::

        import smote_variants as sv

        oversamplers = sv.get_all_oversamplers()
    """
    return OverSampling.__subclasses__()

def get_n_quickest_oversamplers(n=10):
    """
    Returns the n quickest oversamplers based on testing on the datasets of
    the imbalanced_databases package.

    Args:
        n (int): number of oversamplers to return

    Returns:
        list(OverSampling): list of the n quickest oversampling classes

    Example::

        import smote_variants as sv

        oversamplers = sv.get_n_quickest_oversamplers(10)
    """
    runtimes = {'SPY': 0.11, 'OUPS': 0.16, 'SMOTE_D': 0.20, 'NT_SMOTE': 0.20,
                'Gazzah': 0.21, 'ROSE': 0.25, 'NDO_sampling': 0.27,
                'Borderline_SMOTE1': 0.28, 'SMOTE': 0.28,
                'Borderline_SMOTE2': 0.29, 'ISMOTE': 0.30, 'SMMO': 0.31,
                'SMOTE_OUT': 0.37, 'SN_SMOTE': 0.44, 'Selected_SMOTE': 0.47,
                'distance_SMOTE': 0.47, 'Gaussian_SMOTE': 0.48, 'MCT': 0.51,
                'Random_SMOTE': 0.57, 'ADASYN': 0.58, 'SL_graph_SMOTE': 0.58,
                'CURE_SMOTE': 0.59, 'ANS': 0.63, 'MSMOTE': 0.72,
                'Safe_Level_SMOTE': 0.79, 'SMOBD': 0.80, 'CBSO': 0.81,
                'Assembled_SMOTE': 0.82, 'SDSMOTE': 0.88,
                'SMOTE_TomekLinks': 0.91, 'Edge_Det_SMOTE': 0.94,
                'ProWSyn': 1.00, 'Stefanowski': 1.04, 'NRAS': 1.06,
                'AND_SMOTE': 1.13, 'DBSMOTE': 1.17, 'polynom_fit_SMOTE': 1.18,
                'ASMOBD': 1.18, 'MDO': 1.18, 'SOI_CJ': 1.24, 'LN_SMOTE': 1.26,
                'VIS_RST': 1.34, 'TRIM_SMOTE': 1.36, 'LLE_SMOTE': 1.62,
                'SMOTE_ENN': 1.86, 'SMOTE_Cosine': 2.00, 'kmeans_SMOTE': 2.43,
                'MWMOTE': 2.45, 'V_SYNTH': 2.59, 'A_SUWO': 2.81,
                'RWO_sampling': 2.91, 'SMOTE_RSB': 3.88, 'ADOMS': 3.89,
                'SMOTE_IPF': 4.10, 'Lee': 4.16, 'SMOTE_FRST_2T': 4.18,
                'cluster_SMOTE': 4.19, 'SOMO': 4.30, 'DE_oversampling': 4.67,
                'CCR': 4.72, 'NRSBoundary_SMOTE': 5.26, 'AHC': 5.27,
                'ISOMAP_Hybrid': 6.11, 'LVQ_SMOTE': 6.99, 'CE_SMOTE': 7.45,
                'MSYN': 11.92, 'PDFOS': 15.14, 'KernelADASYN': 17.87,
                'G_SMOTE': 19.23, 'E_SMOTE': 19.50, 'SVM_balance': 24.05,
                'SUNDO': 26.21, 'GASMOTE': 31.38, 'DEAGO': 33.39,
                'NEATER': 41.39, 'SMOTE_PSO': 45.12, 'IPADE_ID': 90.01,
                'DSMOTE': 146.73, 'MOT2LD': 149.42,
                'Supervised_SMOTE': 195.74, 'SSO': 215.27, 'DSRBF': 272.11,
                'SMOTE_PSOBAT': 324.31, 'ADG': 493.64, 'AMSCO': 1502.36}

    samplers = get_all_oversamplers()
    samplers = sorted(samplers, key=lambda x: runtimes.get(x.__name__, 1e8))

    return samplers[:n]

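# Illustrative usage of the two getters above (a sketch, not part of the
# library source; X and y are assumed to be a feature matrix and a binary
# label vector provided by the caller):
#
#     import smote_variants as sv
#
#     # instantiate the five oversamplers with the lowest measured runtimes
#     # and apply them with their default parameters
#     for sampler_class in sv.get_n_quickest_oversamplers(5):
#         X_samp, y_samp = sampler_class().sample(X, y)
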
def get_all_oversamplers_multiclass(strategy="eq_1_vs_many_successive"):
    """
    Returns all oversampling classes which can be used with the multiclass
    strategy specified

    Args:
        strategy (str): the multiclass oversampling strategy -
                        'eq_1_vs_many_successive'/'equalize_1_vs_many'

    Returns:
        list(OverSampling): list of all oversampling classes which can be
                            used with the multiclass strategy specified

    Example::

        import smote_variants as sv

        oversamplers = sv.get_all_oversamplers_multiclass()
    """
    oversamplers = get_all_oversamplers()

    if (strategy == 'eq_1_vs_many_successive'
            or strategy == 'equalize_1_vs_many'):

        def multiclass_filter(o):
            return ((OverSampling.cat_changes_majority not in o.categories)
                    or ('proportion' in o().get_params()))

        return [o for o in oversamplers if multiclass_filter(o)]
    else:
        raise ValueError(("It is not known which oversamplers work with the"
                          " strategy %s") % strategy)


def get_n_quickest_oversamplers_multiclass(n,
                                           strategy="eq_1_vs_many_successive"):
    """
    Returns the n quickest oversamplers based on testing on the datasets of
    the imbalanced_databases package, and suitable for using the multiclass
    strategy specified.

    Args:
        n (int): number of oversamplers to return
        strategy (str): the multiclass oversampling strategy -
                        'eq_1_vs_many_successive'/'equalize_1_vs_many'

    Returns:
        list(OverSampling): list of the n quickest oversampling classes which
                            can be used with the multiclass strategy specified

    Example::

        import smote_variants as sv

        oversamplers = sv.get_n_quickest_oversamplers_multiclass(10)
    """
    oversamplers = get_all_oversamplers()
    quickest_oversamplers = get_n_quickest_oversamplers(len(oversamplers))

    if (strategy == 'eq_1_vs_many_successive'
            or strategy == 'equalize_1_vs_many'):

        def multiclass_filter(o):
            return ((OverSampling.cat_changes_majority not in o.categories)
                    or ('proportion' in o().get_params()))

        return [o for o in quickest_oversamplers if multiclass_filter(o)][:n]
    else:
        raise ValueError("It is not known which oversamplers work with the"
                         " strategy %s" % strategy)

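# Sketch of combining the multiclass-aware getters with the
# MulticlassOversampling wrapper defined later in this module (X and y are
# assumed to be a feature matrix and a multiclass label vector supplied by
# the caller):
#
#     import smote_variants as sv
#
#     sampler_class = sv.get_n_quickest_oversamplers_multiclass(1)[0]
#     mc_sampler = sv.MulticlassOversampling(sampler_class())
#     X_samp, y_samp = mc_sampler.sample(X, y)
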
def get_all_noisefilters():
    """
    Returns all noise filters

    Returns:
        list(NoiseFilter): list of all noise filter classes
    """
    return NoiseFilter.__subclasses__()


def mode(data):
    values, counts = np.unique(data, return_counts=True)
    return values[np.where(counts == max(counts))[0][0]]


class StatisticsMixin:
    """
    Mixin to compute class statistics and determine minority/majority labels
    """

    def class_label_statistics(self, X, y):
        """
        determines class sizes and minority and majority labels

        Args:
            X (np.array): features
            y (np.array): target labels
        """
        unique, counts = np.unique(y, return_counts=True)
        self.class_stats = dict(zip(unique, counts))
        self.min_label = unique[0] if counts[0] < counts[1] else unique[1]
        self.maj_label = unique[1] if counts[0] < counts[1] else unique[0]

    def check_enough_min_samples_for_sampling(self, threshold=2):
        if self.class_stats[self.min_label] < threshold:
            m = ("The number of minority samples (%d) is not enough "
                 "for sampling")
            m = m % self.class_stats[self.min_label]
            _logger.warning(self.__class__.__name__ + ": " + m)
            return False
        return True


class RandomStateMixin:
    """
    Mixin to set random state
    """

    def set_random_state(self, random_state):
        """
        sets the random_state member of the object

        Args:
            random_state (int/np.random.RandomState/None): the random state
                initializer
        """
        self._random_state_init = random_state

        if random_state is None:
            self.random_state = np.random
        elif isinstance(random_state, int):
            self.random_state = np.random.RandomState(random_state)
        elif isinstance(random_state, np.random.RandomState):
            self.random_state = random_state
        elif random_state is np.random:
            self.random_state = random_state
        else:
            raise ValueError(
                "random state cannot be initialized by " + str(random_state))


class ParameterCheckingMixin:
    """
    Mixin to check if parameters come from a valid range
    """

    def check_in_range(self, x, name, r):
        """
        Check if parameter is in range

        Args:
            x (numeric): the parameter value
            name (str): the parameter name
            r (list-like(2)): the lower and upper bound of a range

        Throws:
            ValueError
        """
        if x < r[0] or x > r[1]:
            m = ("Value for parameter %s outside the range [%f,%f] not"
                 " allowed: %f")
            m = m % (name, r[0], r[1], x)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_out_range(self, x, name, r):
        """
        Check if parameter is outside of range

        Args:
            x (numeric): the parameter value
            name (str): the parameter name
            r (list-like(2)): the lower and upper bound of a range

        Throws:
            ValueError
        """
        if x >= r[0] and x <= r[1]:
            m = "Value for parameter %s in the range [%f,%f] not allowed: %f"
            m = m % (name, r[0], r[1], x)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_less_or_equal(self, x, name, val):
        """
        Check if parameter is less than or equal to value

        Args:
            x (numeric): the parameter value
            name (str): the parameter name
            val (numeric): value to compare to

        Throws:
            ValueError
        """
        if x > val:
            m = "Value for parameter %s greater than %f not allowed: %f > %f"
            m = m % (name, val, x, val)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_less_or_equal_par(self, x, name_x, y, name_y):
        """
        Check if parameter is less than or equal to another parameter

        Args:
            x (numeric): the parameter value
            name_x (str): the parameter name
            y (numeric): the other parameter value
            name_y (str): the other parameter name

        Throws:
            ValueError
        """
        if x > y:
            m = ("Value for parameter %s greater than parameter %s not"
                 " allowed: %f > %f")
            m = m % (name_x, name_y, x, y)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_less(self, x, name, val):
        """
        Check if parameter is less than value

        Args:
            x (numeric): the parameter value
            name (str): the parameter name
            val (numeric): value to compare to

        Throws:
            ValueError
        """
        if x >= val:
            m = ("Value for parameter %s greater than or equal to %f"
                 " not allowed: %f >= %f")
            m = m % (name, val, x, val)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_less_par(self, x, name_x, y, name_y):
        """
        Check if parameter is less than another parameter

        Args:
            x (numeric): the parameter value
            name_x (str): the parameter name
            y (numeric): the other parameter value
            name_y (str): the other parameter name

        Throws:
            ValueError
        """
        if x >= y:
            m = ("Value for parameter %s greater than or equal to parameter"
                 " %s not allowed: %f >= %f")
            m = m % (name_x, name_y, x, y)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_greater_or_equal(self, x, name, val):
        """
        Check if parameter is greater than or equal to value

        Args:
            x (numeric): the parameter value
            name (str): the parameter name
            val (numeric): value to compare to

        Throws:
            ValueError
        """
        if x < val:
            m = "Value for parameter %s less than %f is not allowed: %f < %f"
            m = m % (name, val, x, val)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_greater_or_equal_par(self, x, name_x, y, name_y):
        """
        Check if parameter is greater than or equal to another parameter

        Args:
            x (numeric): the parameter value
            name_x (str): the parameter name
            y (numeric): the other parameter value
            name_y (str): the other parameter name

        Throws:
            ValueError
        """
        if x < y:
            m = ("Value for parameter %s less than parameter %s is not"
                 " allowed: %f < %f")
            m = m % (name_x, name_y, x, y)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_greater(self, x, name, val):
        """
        Check if parameter is greater than value

        Args:
            x (numeric): the parameter value
            name (str): the parameter name
            val (numeric): value to compare to

        Throws:
            ValueError
        """
        if x <= val:
            m = ("Value for parameter %s less than or equal to %f not"
                 " allowed: %f <= %f")
            m = m % (name, val, x, val)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_greater_par(self, x, name_x, y, name_y):
        """
        Check if parameter is greater than another parameter

        Args:
            x (numeric): the parameter value
            name_x (str): the parameter name
            y (numeric): the other parameter value
            name_y (str): the other parameter name

        Throws:
            ValueError
        """
        if x <= y:
            m = ("Value for parameter %s less than or equal to parameter %s"
                 " not allowed: %f <= %f")
            m = m % (name_x, name_y, x, y)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_equal(self, x, name, val):
        """
        Check that the parameter is not equal to a value

        Args:
            x (numeric): the parameter value
            name (str): the parameter name
            val (numeric): value to compare to

        Throws:
            ValueError
        """
        if x == val:
            m = ("Value for parameter %s equal to %f is not allowed:"
                 " %f == %f")
            m = m % (name, val, x, val)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_equal_par(self, x, name_x, y, name_y):
        """
        Check that the parameter is not equal to another parameter

        Args:
            x (numeric): the parameter value
            name_x (str): the parameter name
            y (numeric): the other parameter value
            name_y (str): the other parameter name

        Throws:
            ValueError
        """
        if x == y:
            m = ("Value for parameter %s equal to parameter %s is not "
                 "allowed: %f == %f")
            m = m % (name_x, name_y, x, y)
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_isin(self, x, name, li):
        """
        Check if parameter is in list

        Args:
            x (object): the parameter value
            name (str): the parameter name
            li (list): list to check if parameter is in it

        Throws:
            ValueError
        """
        if x not in li:
            m = "Value for parameter %s not in list %s is not allowed: %s"
            m = m % (name, str(li), str(x))
            raise ValueError(self.__class__.__name__ + ": " + m)

    def check_n_jobs(self, x, name):
        """
        Check n_jobs parameter

        Args:
            x (int/None): number of jobs
            name (str): the parameter name

        Throws:
            ValueError
        """
        if not ((x is None)
                or (x is not None and isinstance(x, int) and not x == 0)):
            m = "Value for parameter n_jobs is not allowed: %s" % str(x)
            raise ValueError(self.__class__.__name__ + ": " + m)


class ParameterCombinationsMixin:
    """
    Mixin to generate parameter combinations
    """

    @classmethod
    def generate_parameter_combinations(cls, dictionary, raw):
        """
        Generates reasonable parameter combinations

        Args:
            dictionary (dict): dictionary of parameter ranges
            raw (bool): if True, the dictionary of ranges is returned
                unchanged, otherwise the grid of all combinations is
                generated
        """
        if raw:
            return dictionary
        keys = sorted(list(dictionary.keys()))
        values = [dictionary[k] for k in keys]
        combinations = [dict(zip(keys, p))
                        for p in list(itertools.product(*values))]
        return combinations


class NoiseFilter(StatisticsMixin,
                  ParameterCheckingMixin,
                  ParameterCombinationsMixin):
    """
    Parent class of noise filtering methods
    """

    def __init__(self):
        """
        Constructor
        """
        pass

    def remove_noise(self, X, y):
        """
        Removes noise

        Args:
            X (np.array): features
            y (np.array): target labels
        """
        pass

    def get_params(self, deep=False):
        """
        Return parameters

        Returns:
            dict: dictionary of parameters
        """
        return {}

    def set_params(self, **params):
        """
        Set parameters

        Args:
            params (dict): dictionary of parameters
        """
        for key, value in params.items():
            setattr(self, key, value)

        return self

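# A minimal sketch of how ParameterCombinationsMixin expands parameter
# ranges into a grid (the ranges below are made up for illustration):
#
#     ranges = {'proportion': [0.5, 1.0], 'n_neighbors': [3, 5]}
#     combos = ParameterCombinationsMixin.generate_parameter_combinations(
#         ranges, raw=False)
#     # combos == [{'n_neighbors': 3, 'proportion': 0.5},
#     #            {'n_neighbors': 3, 'proportion': 1.0},
#     #            {'n_neighbors': 5, 'proportion': 0.5},
#     #            {'n_neighbors': 5, 'proportion': 1.0}]
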
class TomekLinkRemoval(NoiseFilter):
    """
    Tomek link removal

    References:
        * BibTex::

            @article{smoteNoise0,
                     author = {Batista, Gustavo E. A. P. A. and Prati,
                               Ronaldo C. and Monard, Maria Carolina},
                     title = {A Study of the Behavior of Several Methods for
                              Balancing Machine Learning Training Data},
                     journal = {SIGKDD Explor. Newsl.},
                     issue_date = {June 2004}, volume = {6}, number = {1},
                     month = jun, year = {2004}, issn = {1931-0145},
                     pages = {20--29}, numpages = {10},
                     url = {http://doi.acm.org/10.1145/1007730.1007735},
                     doi = {10.1145/1007730.1007735}, acmid = {1007735},
                     publisher = {ACM}, address = {New York, NY, USA}}
    """

    def __init__(self, strategy='remove_majority', n_jobs=1):
        """
        Constructor of the noise filter.

        Args:
            strategy (str): noise removal strategy:
                'remove_majority'/'remove_both'
            n_jobs (int): number of jobs
        """
        super().__init__()

        self.check_isin(strategy, 'strategy',
                        ['remove_majority', 'remove_both'])
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.strategy = strategy
        self.n_jobs = n_jobs

    def remove_noise(self, X, y):
        """
        Removes noise from dataset

        Args:
            X (np.matrix): features
            y (np.array): target labels

        Returns:
            np.matrix, np.array: dataset after noise removal
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running noise removal via %s" % self.__class__.__name__)
        self.class_label_statistics(X, y)

        # using 2 neighbors because the first neighbor is the point itself
        nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs)
        distances, indices = nn.fit(X).kneighbors(X)

        # identify links
        links = []
        for i in range(len(indices)):
            if indices[indices[i][1]][1] == i:
                if not y[indices[i][1]] == y[indices[indices[i][1]][1]]:
                    links.append((i, indices[i][1]))

        # determine links to be removed
        to_remove = []
        for li in links:
            if self.strategy == 'remove_majority':
                if y[li[0]] == self.min_label:
                    to_remove.append(li[1])
                else:
                    to_remove.append(li[0])
            elif self.strategy == 'remove_both':
                to_remove.append(li[0])
                to_remove.append(li[1])
            else:
                m = 'No Tomek link strategy %s implemented' % self.strategy
                raise ValueError(self.__class__.__name__ + ": " + m)

        to_remove = list(set(to_remove))

        return np.delete(X, to_remove, axis=0), np.delete(y, to_remove)

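# Illustrative sketch of Tomek link removal on a small, made-up dataset
# (here points 1 and 2 are mutual nearest neighbors from opposite classes,
# so they form a Tomek link and the majority member is dropped):
#
#     import numpy as np
#
#     X = np.array([[0.0, 0.0], [0.4, 0.4], [0.5, 0.5],
#                   [1.0, 1.0], [1.1, 1.1]])
#     y = np.array([0, 0, 1, 1, 1])
#     tl = TomekLinkRemoval(strategy='remove_majority')
#     X_clean, y_clean = tl.remove_noise(X, y)
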
class CondensedNearestNeighbors(NoiseFilter):
    """
    Condensed nearest neighbors

    References:
        * BibTex::

            @ARTICLE{condensed_nn,
                     author={Hart, P.},
                     journal={IEEE Transactions on Information Theory},
                     title={The condensed nearest neighbor rule (Corresp.)},
                     year={1968}, volume={14}, number={3}, pages={515-516},
                     keywords={Pattern classification},
                     doi={10.1109/TIT.1968.1054155},
                     ISSN={0018-9448}, month={May}}
    """

    def __init__(self, n_jobs=1):
        """
        Constructor of the noise removing object

        Args:
            n_jobs (int): number of jobs
        """
        super().__init__()

        self.check_n_jobs(n_jobs, 'n_jobs')

        self.n_jobs = n_jobs

    def remove_noise(self, X, y):
        """
        Removes noise from dataset

        Args:
            X (np.matrix): features
            y (np.array): target labels

        Returns:
            np.matrix, np.array: dataset after noise removal
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running noise removal via %s" % self.__class__.__name__)
        self.class_label_statistics(X, y)

        # Initial result set consists of all minority samples and
        # 1 majority sample
        X_maj = X[y == self.maj_label]
        X_hat = np.vstack([X[y == self.min_label], X_maj[0]])
        y_hat = np.hstack([np.repeat(self.min_label, len(X_hat)-1),
                           [self.maj_label]])
        X_maj = X_maj[1:]

        # Adding misclassified majority elements repeatedly
        while True:
            knn = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs)
            knn.fit(X_hat, y_hat)
            pred = knn.predict(X_maj)

            if np.all(pred == self.maj_label):
                break
            else:
                X_hat = np.vstack([X_hat, X_maj[pred != self.maj_label]])
                y_hat = np.hstack(
                    [y_hat,
                     np.repeat(self.maj_label, len(X_hat) - len(y_hat))])
                X_maj = np.delete(X_maj, np.where(
                    pred != self.maj_label)[0], axis=0)
                if len(X_maj) == 0:
                    break

        return X_hat, y_hat

class OneSidedSelection(NoiseFilter):
    """
    References:
        * BibTex::

            @article{smoteNoise0,
                     author = {Batista, Gustavo E. A. P. A. and Prati,
                               Ronaldo C. and Monard, Maria Carolina},
                     title = {A Study of the Behavior of Several Methods for
                              Balancing Machine Learning Training Data},
                     journal = {SIGKDD Explor. Newsl.},
                     issue_date = {June 2004}, volume = {6}, number = {1},
                     month = jun, year = {2004}, issn = {1931-0145},
                     pages = {20--29}, numpages = {10},
                     url = {http://doi.acm.org/10.1145/1007730.1007735},
                     doi = {10.1145/1007730.1007735}, acmid = {1007735},
                     publisher = {ACM}, address = {New York, NY, USA}}
    """

    def __init__(self, n_jobs=1):
        """
        Constructor of the noise removal object

        Args:
            n_jobs (int): number of jobs
        """
        super().__init__()

        self.check_n_jobs(n_jobs, 'n_jobs')

        self.n_jobs = n_jobs

    def remove_noise(self, X, y):
        """
        Removes noise

        Args:
            X (np.matrix): features
            y (np.array): target labels

        Returns:
            np.matrix, np.array: cleaned features and target labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running noise removal via %s" % self.__class__.__name__)
        self.class_label_statistics(X, y)

        t = TomekLinkRemoval(n_jobs=self.n_jobs)
        X0, y0 = t.remove_noise(X, y)
        cnn = CondensedNearestNeighbors(n_jobs=self.n_jobs)

        return cnn.remove_noise(X0, y0)

class NeighborhoodCleaningRule(NoiseFilter):
    """
    References:
        * BibTex::

            @article{smoteNoise0,
                     author = {Batista, Gustavo E. A. P. A. and Prati,
                               Ronaldo C. and Monard, Maria Carolina},
                     title = {A Study of the Behavior of Several Methods for
                              Balancing Machine Learning Training Data},
                     journal = {SIGKDD Explor. Newsl.},
                     issue_date = {June 2004}, volume = {6}, number = {1},
                     month = jun, year = {2004}, issn = {1931-0145},
                     pages = {20--29}, numpages = {10},
                     url = {http://doi.acm.org/10.1145/1007730.1007735},
                     doi = {10.1145/1007730.1007735}, acmid = {1007735},
                     publisher = {ACM}, address = {New York, NY, USA}}
    """

    def __init__(self, n_jobs=1):
        """
        Constructor of the noise removal object

        Args:
            n_jobs (int): number of parallel jobs
        """
        super().__init__()

        self.check_n_jobs(n_jobs, 'n_jobs')

        self.n_jobs = n_jobs

    def remove_noise(self, X, y):
        """
        Removes noise

        Args:
            X (np.matrix): features
            y (np.array): target labels

        Returns:
            np.matrix, np.array: cleaned features and target labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running noise removal via %s" % self.__class__.__name__)
        self.class_label_statistics(X, y)

        # fitting nearest neighbors with proposed parameter
        # using 4 neighbors because the first neighbor is the point itself
        nn = NearestNeighbors(n_neighbors=4, n_jobs=self.n_jobs)
        nn.fit(X)
        distances, indices = nn.kneighbors(X)

        # identifying the samples to be removed
        to_remove = []
        for i in range(len(X)):
            if (y[i] == self.maj_label and
                    mode(y[indices[i][1:]]) == self.min_label):
                # if sample i is majority and the decision based on
                # neighbors is minority
                to_remove.append(i)
            elif (y[i] == self.min_label and
                    mode(y[indices[i][1:]]) == self.maj_label):
                # if sample i is minority and the decision based on
                # neighbors is majority
                for j in indices[i][1:]:
                    if y[j] == self.maj_label:
                        to_remove.append(j)

        # removing the noisy samples and returning the results
        to_remove = list(set(to_remove))
        return np.delete(X, to_remove, axis=0), np.delete(y, to_remove)

class EditedNearestNeighbors(NoiseFilter):
    """
    References:
        * BibTex::

            @article{smoteNoise0,
                     author = {Batista, Gustavo E. A. P. A. and Prati,
                               Ronaldo C. and Monard, Maria Carolina},
                     title = {A Study of the Behavior of Several Methods for
                              Balancing Machine Learning Training Data},
                     journal = {SIGKDD Explor. Newsl.},
                     issue_date = {June 2004}, volume = {6}, number = {1},
                     month = jun, year = {2004}, issn = {1931-0145},
                     pages = {20--29}, numpages = {10},
                     url = {http://doi.acm.org/10.1145/1007730.1007735},
                     doi = {10.1145/1007730.1007735}, acmid = {1007735},
                     publisher = {ACM}, address = {New York, NY, USA}}
    """

    def __init__(self, remove='both', n_jobs=1):
        """
        Constructor of the noise removal object

        Args:
            remove (str): class to remove from 'both'/'min'/'maj'
            n_jobs (int): number of parallel jobs
        """
        super().__init__()

        self.check_isin(remove, 'remove', ['both', 'min', 'maj'])
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.remove = remove
        self.n_jobs = n_jobs

    def remove_noise(self, X, y):
        """
        Removes noise

        Args:
            X (np.matrix): features
            y (np.array): target labels

        Returns:
            np.matrix, np.array: cleaned features and target labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running noise removal via %s" % self.__class__.__name__)
        self.class_label_statistics(X, y)

        if len(X) < 4:
            _logger.info(self.__class__.__name__ + ': ' +
                         "Not enough samples for noise removal")
            return X.copy(), y.copy()

        nn = NearestNeighbors(n_neighbors=4, n_jobs=self.n_jobs)
        nn.fit(X)
        distances, indices = nn.kneighbors(X)

        to_remove = []
        for i in range(len(X)):
            if not y[i] == mode(y[indices[i][1:]]):
                if (self.remove == 'both'
                        or (self.remove == 'min'
                            and y[i] == self.min_label)
                        or (self.remove == 'maj'
                            and y[i] == self.maj_label)):
                    to_remove.append(i)

        return np.delete(X, to_remove, axis=0), np.delete(y, to_remove)

    def get_params(self):
        """
        Get noise removal parameters

        Returns:
            dict: dictionary of parameters
        """
        return {'remove': self.remove}

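# Sketch of chaining the noise filters above; X and y are assumed to come
# from the caller, and each filter returns a cleaned copy of the dataset:
#
#     enn = EditedNearestNeighbors(remove='both')
#     X_clean, y_clean = enn.remove_noise(X, y)
#
#     oss = OneSidedSelection()
#     X_clean2, y_clean2 = oss.remove_noise(X_clean, y_clean)
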
class OverSampling(StatisticsMixin,
                   ParameterCheckingMixin,
                   ParameterCombinationsMixin,
                   RandomStateMixin):
    """
    Base class of oversampling methods
    """

    categories = []

    cat_noise_removal = 'NR'
    cat_dim_reduction = 'DR'
    cat_uses_classifier = 'Clas'
    cat_sample_componentwise = 'SCmp'
    cat_sample_ordinary = 'SO'
    cat_sample_copy = 'SCpy'
    cat_memetic = 'M'
    cat_density_estimation = 'DE'
    cat_density_based = 'DB'
    cat_extensive = 'Ex'
    cat_changes_majority = 'CM'
    cat_uses_clustering = 'Clus'
    cat_borderline = 'BL'
    cat_application = 'A'

    def __init__(self):
        pass

    def det_n_to_sample(self, strategy, n_maj, n_min):
        """
        Determines the number of samples to generate

        Args:
            strategy (float/int): the fraction of the difference of the
                minority and majority counts to generate, e.g. 0.1 means
                that 10% of the difference will be generated; string
                strategies (like 'min2maj') are not implemented and raise
                a ValueError
            n_maj (int): number of majority samples
            n_min (int): number of minority samples
        """
        if isinstance(strategy, float) or isinstance(strategy, int):
            return max([0, int((n_maj - n_min)*strategy)])
        else:
            m = "Value %s for parameter strategy is not supported" % strategy
            raise ValueError(self.__class__.__name__ + ": " + m)

    def sample_between_points(self, x, y):
        """
        Sample randomly along the line between two points.

        Args:
            x (np.array): point 1
            y (np.array): point 2

        Returns:
            np.array: the new sample
        """
        return x + (y - x)*self.random_state.random_sample()

    def sample_between_points_componentwise(self, x, y, mask=None):
        """
        Sample each dimension separately between the two points.

        Args:
            x (np.array): point 1
            y (np.array): point 2
            mask (np.array): array of 0,1s - specifies which dimensions
                to sample

        Returns:
            np.array: the new sample being generated
        """
        if mask is None:
            return x + (y - x)*self.random_state.random_sample()
        else:
            return x + (y - x)*self.random_state.random_sample()*mask

    def sample_by_jittering(self, x, std):
        """
        Sample by jittering.

        Args:
            x (np.array): base point
            std (float): standard deviation

        Returns:
            np.array: the new sample
        """
        return x + (self.random_state.random_sample() - 0.5)*2.0*std

    def sample_by_jittering_componentwise(self, x, std):
        """
        Sample by jittering componentwise.

        Args:
            x (np.array): base point
            std (np.array): standard deviation

        Returns:
            np.array: the new sample
        """
        return x + (self.random_state.random_sample(len(x)) - 0.5)*2.0*std

    def sample_by_gaussian_jittering(self, x, std):
        """
        Sample by Gaussian jittering

        Args:
            x (np.array): base point
            std (np.array): standard deviation

        Returns:
            np.array: the new sample
        """
        return self.random_state.normal(x, std)

    def sample(self, X, y):
        """
        The sampling function reimplemented in child classes

        Args:
            X (np.matrix): features
            y (np.array): labels

        Returns:
            np.matrix, np.array: sampled X and y
        """
        return X, y

    def fit_resample(self, X, y):
        """
        Alias of the function "sample" for compatibility with
        imbalanced-learn pipelines
        """
        return self.sample(X, y)

    def sample_with_timing(self, X, y):
        begin = time.time()
        X_samp, y_samp = self.sample(X, y)
        _logger.info(self.__class__.__name__ + ": " +
                     ("runtime: %f" % (time.time() - begin)))
        return X_samp, y_samp

    def preprocessing_transform(self, X):
        """
        Transforms new data according to the possible transformation
        implemented by the function "sample".

        Args:
            X (np.matrix): features

        Returns:
            np.matrix: transformed features
        """
        return X

    def get_params(self, deep=False):
        """
        Returns the parameters of the object as a dictionary.

        Returns:
            dict: the parameters of the object
        """
        pass

    def set_params(self, **params):
        """
        Set parameters

        Args:
            params (dict): dictionary of parameters
        """
        for key, value in params.items():
            setattr(self, key, value)

        return self

    def descriptor(self):
        """
        Returns:
            str: JSON description of the current sampling object
        """
        return str((self.__class__.__name__, str(self.get_params())))

    def __str__(self):
        return self.descriptor()


class UnderSampling(StatisticsMixin,
                    ParameterCheckingMixin,
                    ParameterCombinationsMixin):
    """
    Base class of undersampling approaches.
    """

    def __init__(self):
        """
        Constructor
        """
        super().__init__()

    def sample(self, X, y):
        """
        Carry out undersampling

        Args:
            X (np.matrix): features
            y (np.array): labels

        Returns:
            np.matrix, np.array: sampled X and y
        """
        pass

    def get_params(self, deep=False):
        """
        Returns the parameters of the object as a dictionary.

        Returns:
            dict: the parameters of the object
        """
        pass

    def descriptor(self):
        """
        Returns:
            str: JSON description of the current sampling object
        """
        return str((self.__class__.__name__, str(self.get_params())))

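# Worked example for OverSampling.det_n_to_sample: with 90 majority and 10
# minority samples, proportion 1.0 requests the full difference and 0.5
# half of it (a sketch using the base class directly):
#
#     base = OverSampling()
#     base.det_n_to_sample(1.0, n_maj=90, n_min=10)   # -> 80
#     base.det_n_to_sample(0.5, n_maj=90, n_min=10)   # -> 40
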
class NoSMOTE(OverSampling):
    """
    The goal of this class is to provide a functionality to send data through
    on any model selection/evaluation pipeline with no oversampling carried
    out. It can be used to get baseline estimates on performance.
    """

    categories = []

    def __init__(self, random_state=None):
        """
        Constructor of the NoSMOTE object.

        Args:
            random_state (int/np.random.RandomState/None): dummy parameter
                for the compatibility of interfaces
        """
        super().__init__()

    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        return cls.generate_parameter_combinations({}, raw=raw)

    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        return X.copy(), y.copy()

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {}

class SMOTE(OverSampling):
    """
    References:
        * BibTex::

            @article{smote,
                     author={Chawla, N. V. and Bowyer, K. W. and Hall, L. O.
                             and Kegelmeyer, W. P.},
                     title={{SMOTE}: synthetic minority over-sampling
                            technique},
                     journal={Journal of Artificial Intelligence Research},
                     volume={16}, year={2002}, pages={321--357}}
    """

    categories = [OverSampling.cat_sample_ordinary,
                  OverSampling.cat_extensive]

    def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1,
                 random_state=None):
        """
        Constructor of the SMOTE object

        Args:
            proportion (float): proportion of the difference of n_maj and
                n_min to sample e.g. 1.0 means that after sampling the
                number of minority samples will be equal to the number of
                majority samples
            n_neighbors (int): control parameter of the nearest neighbor
                technique
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                random_state, like in sklearn
        """
        super().__init__()

        self.check_greater_or_equal(proportion, "proportion", 0)
        self.check_greater_or_equal(n_neighbors, "n_neighbors", 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

        self.set_random_state(random_state)

    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)

    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        # determining the number of samples to generate
        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])

        if n_to_sample == 0:
            # _logger.warning(self.__class__.__name__ +
            #                 ": " + "Sampling is not needed")
            return X.copy(), y.copy()

        X_min = X[y == self.min_label]

        # fitting the model
        n_neigh = min([len(X_min), self.n_neighbors + 1])
        nn = NearestNeighbors(n_neighbors=n_neigh, n_jobs=self.n_jobs)
        nn.fit(X_min)
        dist, ind = nn.kneighbors(X_min)

        # generating samples
        base_indices = self.random_state.choice(list(range(len(X_min))),
                                                n_to_sample)
        neighbor_indices = self.random_state.choice(list(range(1, n_neigh)),
                                                    n_to_sample)

        X_base = X_min[base_indices]
        X_neighbor = X_min[ind[base_indices, neighbor_indices]]

        samples = X_base + np.multiply(self.random_state.rand(n_to_sample, 1),
                                       X_neighbor - X_base)

        return (np.vstack([X, samples]),
                np.hstack([y, np.hstack([self.min_label]*n_to_sample)]))

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}

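# Usage sketch for SMOTE on a synthetic imbalanced dataset (the data below
# is made up purely for illustration):
#
#     import numpy as np
#
#     rng = np.random.RandomState(42)
#     X = np.vstack([rng.normal(0.0, 1.0, size=(90, 2)),
#                    rng.normal(3.0, 1.0, size=(10, 2))])
#     y = np.hstack([np.repeat(0, 90), np.repeat(1, 10)])
#
#     X_samp, y_samp = SMOTE(proportion=1.0, n_neighbors=5,
#                            random_state=5).sample(X, y)
#     # with proportion=1.0 the minority class is upsampled to the size of
#     # the majority class
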
class SMOTE_ENN(OverSampling):
    """
    References:
        * BibTex::

            @article{smote_tomeklinks_enn,
                     author = {Batista, Gustavo E. A. P. A. and Prati,
                               Ronaldo C. and Monard, Maria Carolina},
                     title = {A Study of the Behavior of Several Methods for
                              Balancing Machine Learning Training Data},
                     journal = {SIGKDD Explor. Newsl.},
                     issue_date = {June 2004}, volume = {6}, number = {1},
                     month = jun, year = {2004}, issn = {1931-0145},
                     pages = {20--29}, numpages = {10},
                     url = {http://doi.acm.org/10.1145/1007730.1007735},
                     doi = {10.1145/1007730.1007735}, acmid = {1007735},
                     publisher = {ACM}, address = {New York, NY, USA}}

    Notes:
        * Can remove too many of the minority samples.
    """

    categories = [OverSampling.cat_sample_ordinary,
                  OverSampling.cat_noise_removal,
                  OverSampling.cat_changes_majority]

    def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1,
                 random_state=None):
        """
        Constructor of the SMOTE_ENN object

        Args:
            proportion (float): proportion of the difference of n_maj and
                n_min to sample e.g. 1.0 means that after sampling the
                number of minority samples will be equal to the number of
                majority samples
            n_neighbors (int): control parameter of the nearest neighbor
                technique
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                random_state, like in sklearn
        """
        super().__init__()

        self.check_greater_or_equal(proportion, "proportion", 0)
        self.check_greater_or_equal(n_neighbors, "n_neighbors", 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

        self.set_random_state(random_state)

    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        return SMOTE.parameter_combinations(raw)

    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        smote = SMOTE(self.proportion, self.n_neighbors,
                      n_jobs=self.n_jobs, random_state=self.random_state)
        X_new, y_new = smote.sample(X, y)

        enn = EditedNearestNeighbors(n_jobs=self.n_jobs)

        return enn.remove_noise(X_new, y_new)

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}

class Borderline_SMOTE1(OverSampling):
    """
    References:
        * BibTex::

            @InProceedings{borderlineSMOTE,
                           author="Han, Hui and Wang, Wen-Yuan and Mao,
                                   Bing-Huan",
                           editor="Huang, De-Shuang and Zhang, Xiao-Ping and
                                   Huang, Guang-Bin",
                           title="Borderline-SMOTE: A New Over-Sampling
                                  Method in Imbalanced Data Sets Learning",
                           booktitle="Advances in Intelligent Computing",
                           year="2005",
                           publisher="Springer Berlin Heidelberg",
                           address="Berlin, Heidelberg",
                           pages="878--887",
                           isbn="978-3-540-31902-3"}
    """

    categories = [OverSampling.cat_sample_ordinary,
                  OverSampling.cat_extensive,
                  OverSampling.cat_borderline]

    def __init__(self, proportion=1.0, n_neighbors=5, k_neighbors=5,
                 n_jobs=1, random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                n_min to sample e.g. 1.0 means that after sampling the
                number of minority samples will be equal to the number of
                majority samples
            n_neighbors (int): control parameter of the nearest neighbor
                technique for determining the borderline
            k_neighbors (int): control parameter of the nearest neighbor
                technique for sampling
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                random_state, like in sklearn
        """
        super().__init__()

        self.check_greater_or_equal(proportion, 'proportion', 0)
        self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1)
        self.check_greater_or_equal(k_neighbors, 'k_neighbors', 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.k_neighbors = k_neighbors
        self.n_jobs = n_jobs

        self.set_random_state(random_state)

    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'k_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)

    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        # determining number of samples to be generated
        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])

        if n_to_sample == 0:
            _logger.warning(self.__class__.__name__ +
                            ": " + "Sampling is not needed")
            return X.copy(), y.copy()

        # fitting model
        X_min = X[y == self.min_label]

        n_neighbors = min([len(X), self.n_neighbors + 1])
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs)
        nn.fit(X)
        distances, indices = nn.kneighbors(X_min)

        # determining minority samples in danger
        noise = []
        danger = []
        for i in range(len(indices)):
            if self.n_neighbors == sum(y[indices[i][1:]] == self.maj_label):
                noise.append(i)
            elif mode(y[indices[i][1:]]) == self.maj_label:
                danger.append(i)
        X_danger = X_min[danger]
        X_min = np.delete(X_min, np.array(noise).astype(int), axis=0)

        if len(X_danger) == 0:
            _logger.info(self.__class__.__name__ +
                         ": " + "No samples in danger")
            return X.copy(), y.copy()

        # fitting nearest neighbors model to minority samples
        k_neigh = min([len(X_min), self.k_neighbors + 1])
        nn = NearestNeighbors(n_neighbors=k_neigh, n_jobs=self.n_jobs)
        nn.fit(X_min)
        # extracting neighbors of samples in danger
        distances, indices = nn.kneighbors(X_danger)

        # generating samples near points in danger
        base_indices = self.random_state.choice(list(range(len(X_danger))),
                                                n_to_sample)
        neighbor_indices = self.random_state.choice(list(range(1, k_neigh)),
                                                    n_to_sample)

        X_base = X_danger[base_indices]
        X_neighbor = X_min[indices[base_indices, neighbor_indices]]

        samples = X_base + np.multiply(self.random_state.rand(n_to_sample, 1),
                                       X_neighbor - X_base)

        return (np.vstack([X, samples]),
                np.hstack([y, np.hstack([self.min_label]*n_to_sample)]))

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'k_neighbors': self.k_neighbors,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}

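# Sketch contrasting Borderline_SMOTE1 with plain SMOTE: only minority
# points whose neighborhood is dominated by the majority class ("danger"
# points) serve as bases for synthetic samples; if no such points exist,
# the data is returned unchanged. Assuming X and y as in the SMOTE sketch:
#
#     X_samp, y_samp = Borderline_SMOTE1(proportion=1.0, n_neighbors=5,
#                                        k_neighbors=5,
#                                        random_state=5).sample(X, y)
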
class Borderline_SMOTE2(OverSampling):
    """
    References:
        * BibTex::

            @InProceedings{borderlineSMOTE,
                           author="Han, Hui and Wang, Wen-Yuan and Mao,
                                   Bing-Huan",
                           editor="Huang, De-Shuang and Zhang, Xiao-Ping and
                                   Huang, Guang-Bin",
                           title="Borderline-SMOTE: A New Over-Sampling
                                  Method in Imbalanced Data Sets Learning",
                           booktitle="Advances in Intelligent Computing",
                           year="2005",
                           publisher="Springer Berlin Heidelberg",
                           address="Berlin, Heidelberg",
                           pages="878--887",
                           isbn="978-3-540-31902-3"}
    """

    categories = [OverSampling.cat_sample_ordinary,
                  OverSampling.cat_extensive,
                  OverSampling.cat_borderline]

    def __init__(self, proportion=1.0, n_neighbors=5, k_neighbors=5,
                 n_jobs=1, random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                n_min to sample e.g. 1.0 means that after sampling the
                number of minority samples will be equal to the number of
                majority samples
            n_neighbors (int): control parameter of the nearest neighbor
                technique for determining the borderline
            k_neighbors (int): control parameter of the nearest neighbor
                technique for sampling
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                random_state, like in sklearn
        """
        super().__init__()

        self.check_greater_or_equal(proportion, 'proportion', 0)
        self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1)
        self.check_greater_or_equal(k_neighbors, 'k_neighbors', 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.k_neighbors = k_neighbors
        self.n_jobs = n_jobs

        self.set_random_state(random_state)

    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'k_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)

    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        # determining number of samples to be generated
        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])

        if n_to_sample == 0:
            _logger.warning(self.__class__.__name__ +
                            ": " + "Sampling is not needed")
            return X.copy(), y.copy()

        # fitting nearest neighbors model
        X_min = X[y == self.min_label]

        n_neighbors = min([self.n_neighbors + 1, len(X)])
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs)
        nn.fit(X)
        distances, indices = nn.kneighbors(X_min)

        # determining minority samples in danger
        noise = []
        danger = []
        for i in range(len(indices)):
            if self.n_neighbors == sum(y[indices[i][1:]] == self.maj_label):
                noise.append(i)
            elif mode(y[indices[i][1:]]) == self.maj_label:
                danger.append(i)
        X_danger = X_min[danger]
        X_min = np.delete(X_min, np.array(noise).astype(int), axis=0)

        if len(X_min) < 2:
            m = ("The number of minority samples after preprocessing (%d) is "
                 "not enough for sampling")
            m = m % (len(X_min))
            _logger.warning(self.__class__.__name__ + ": " + m)
            return X.copy(), y.copy()

        if len(X_danger) == 0:
            m = "No samples in danger"
            _logger.info(self.__class__.__name__ + ": " + m)
            return X.copy(), y.copy()

        # fitting nearest neighbors model to the entire dataset
        k_neigh = self.k_neighbors + 1
        k_neigh = min([k_neigh, len(X)])
        nn = NearestNeighbors(n_neighbors=k_neigh, n_jobs=self.n_jobs)
        nn.fit(X)
        distances, indices = nn.kneighbors(X_danger)

        # generating the samples
        base_indices = self.random_state.choice(
            list(range(len(X_danger))), n_to_sample)
        neighbor_indices = self.random_state.choice(
            list(range(1, k_neigh)), n_to_sample)

        X_base = X_danger[base_indices]
        X_neighbor = X[indices[base_indices, neighbor_indices]]
        diff = X_neighbor - X_base
        r = self.random_state.rand(n_to_sample, 1)
        # the step is halved when the selected neighbor is a majority sample
        mask = y[indices[base_indices, neighbor_indices]] == self.maj_label
        r[mask] = r[mask]*0.5

        samples = X_base + np.multiply(r, diff)

        return (np.vstack([X, samples]),
                np.hstack([y, np.hstack([self.min_label]*n_to_sample)]))

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'k_neighbors': self.k_neighbors,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}

class ADASYN(OverSampling):
    """
    References:
        * BibTex::

            @inproceedings{adasyn,
                           author={He, H. and Bai, Y. and Garcia, E. A.
                                   and Li, S.},
                           title={{ADASYN}: adaptive synthetic sampling
                                  approach for imbalanced learning},
                           booktitle={Proceedings of IJCNN},
                           year={2008},
                           pages={1322--1328}}
    """

    categories = [OverSampling.cat_sample_ordinary,
                  OverSampling.cat_extensive,
                  OverSampling.cat_borderline,
                  OverSampling.cat_density_based]

    def __init__(self, n_neighbors=5, d_th=0.9, beta=1.0, n_jobs=1,
                 random_state=None):
        """
        Constructor of the sampling object

        Args:
            n_neighbors (int): control parameter of the nearest neighbor
                component
            d_th (float): tolerated deviation level from balancedness
            beta (float): target level of balancedness, same as proportion
                in other techniques
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                random_state, like in sklearn
        """
        super().__init__()

        self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1)
        self.check_greater_or_equal(d_th, 'd_th', 0)
        self.check_greater_or_equal(beta, 'beta', 0)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.n_neighbors = n_neighbors
        self.d_th = d_th
        self.beta = beta
        self.n_jobs = n_jobs

        self.set_random_state(random_state)

    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'n_neighbors': [3, 5, 7, 9],
                                  'd_th': [0.9],
                                  'beta': [1.0, 0.75, 0.5, 0.25]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)

    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        # extracting minority samples
        X_min = X[y == self.min_label]

        # checking if sampling is needed
        m_min = len(X_min)
        m_maj = len(X) - m_min

        n_to_sample = (m_maj - m_min)*self.beta

        if n_to_sample == 0:
            _logger.warning(self.__class__.__name__ +
                            ": " + "Sampling is not needed")
            return X.copy(), y.copy()

        d = float(m_min)/m_maj
        if d > self.d_th:
            return X.copy(), y.copy()

        # fitting nearest neighbors model to all samples
        n_neighbors = min([len(X_min), self.n_neighbors + 1])
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs)
        nn.fit(X)
        distances, indices = nn.kneighbors(X_min)

        # determining the distribution of points to be generated
        r = []
        for i in range(len(indices)):
            r.append(sum(y[indices[i][1:]] ==
                         self.maj_label)/self.n_neighbors)
        r = np.array(r)
        if sum(r) > 0:
            r = r/sum(r)

        if any(np.isnan(r)) or sum(r) == 0:
            _logger.warning(self.__class__.__name__ + ": " +
                            "not enough non-noise samples for oversampling")
            return X.copy(), y.copy()

        # fitting nearest neighbors models to minority samples
        n_neigh = min([len(X_min), self.n_neighbors + 1])
        nn = NearestNeighbors(n_neighbors=n_neigh, n_jobs=self.n_jobs)
        nn.fit(X_min)
        distances, indices = nn.kneighbors(X_min)

        # sampling points
        base_indices = self.random_state.choice(list(range(len(X_min))),
                                                size=int(n_to_sample), p=r)
        neighbor_indices = self.random_state.choice(list(range(1, n_neigh)),
                                                    int(n_to_sample))

        X_base = X_min[base_indices]
        X_neighbor = X_min[indices[base_indices, neighbor_indices]]
        diff = X_neighbor - X_base
        r = self.random_state.rand(int(n_to_sample), 1)

        samples = X_base + np.multiply(r, diff)

        return (np.vstack([X, samples]),
                np.hstack([y,
                           np.hstack([self.min_label]*int(n_to_sample))]))

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'n_neighbors': self.n_neighbors,
                'd_th': self.d_th,
                'beta': self.beta,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}

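# Sketch for ADASYN: the sampling density r is proportional to the share
# of majority neighbors around each minority point, so harder regions
# receive more synthetic samples. Assuming X and y as in the SMOTE sketch:
#
#     X_samp, y_samp = ADASYN(n_neighbors=5, d_th=0.9, beta=1.0,
#                             random_state=5).sample(X, y)
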
class AHC(OverSampling):
    """
    References:
        * BibTex::

            @article{AHC,
                     title = "Learning from imbalanced data in surveillance
                              of nosocomial infection",
                     journal = "Artificial Intelligence in Medicine",
                     volume = "37", number = "1", pages = "7 - 18",
                     year = "2006",
                     note = "Intelligent Data Analysis in Medicine",
                     issn = "0933-3657",
                     doi = "https://doi.org/10.1016/j.artmed.2005.03.002",
                     url = {http://www.sciencedirect.com/science/article/
                            pii/S0933365705000850},
                     author = "Gilles Cohen and Mélanie Hilario and Hugo Sax
                               and Stéphane Hugonnet and Antoine
                               Geissbuhler",
                     keywords = "Nosocomial infection, Machine learning,
                                 Support vector machines, Data imbalance"}
    """

    categories = [OverSampling.cat_changes_majority,
                  OverSampling.cat_uses_clustering,
                  OverSampling.cat_application]

    def __init__(self, strategy='min', n_jobs=1, random_state=None):
        """
        Constructor of the sampling object

        Args:
            strategy (str): which class to sample (min/maj/minmaj)
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                random_state, like in sklearn
        """
        super().__init__()

        self.check_isin(strategy, 'strategy', ['min', 'maj', 'minmaj'])
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.strategy = strategy
        self.n_jobs = n_jobs

        self.set_random_state(random_state)

    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'strategy': ['min', 'maj', 'minmaj']}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)

    def sample_majority(self, X, n_clusters):
        """
        Sample the majority class

        Args:
            X (np.ndarray): majority samples
            n_clusters (int): number of clusters to find

        Returns:
            np.ndarray: downsampled vectors
        """
        kmeans = KMeans(n_clusters=n_clusters,
                        random_state=self.random_state)
        kmeans.fit(X)
        return kmeans.cluster_centers_

    def sample_minority(self, X):
        """
        Sampling the minority class

        Args:
            X (np.ndarray): minority samples

        Returns:
            np.ndarray: the oversampled set of vectors
        """
        ac = AgglomerativeClustering(n_clusters=1)
        ac.fit(X)
        n_samples = len(X)

        cc = [None]*len(ac.children_)
        weights = [None]*len(ac.children_)

        def cluster_centers(children, i, cc, weights):
            """
            Extract cluster centers

            Args:
                children (np.array): indices of children
                i (int): index to process
                cc (np.array): cluster centers
                weights (np.array): cluster weights

            Returns:
                int, float: new cluster center, new weight
            """
            if i < n_samples:
                return X[i], 1.0

            if cc[i - n_samples] is None:
                a, w_a = cluster_centers(
                    children, children[i - n_samples][0], cc, weights)
                b, w_b = cluster_centers(
                    children, children[i - n_samples][1], cc, weights)
                cc[i - n_samples] = (w_a*a + w_b*b)/(w_a + w_b)
                weights[i - n_samples] = w_a + w_b

            return cc[i - n_samples], weights[i - n_samples]

        cluster_centers(ac.children_, ac.children_[-1][-1] + 1, cc, weights)

        return np.vstack(cc)

    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        # extracting minority samples
        X_min = X[y == self.min_label]
        X_maj = X[y == self.maj_label]

        if self.strategy == 'maj':
            X_maj_resampled = self.sample_majority(X_maj, len(X_min))
            return (np.vstack([X_min, X_maj_resampled]),
                    np.hstack([np.repeat(self.min_label, len(X_min)),
                               np.repeat(self.maj_label,
                                         len(X_maj_resampled))]))
        elif self.strategy == 'min':
            X_min_resampled = self.sample_minority(X_min)
            return (np.vstack([X_min_resampled, X_min, X_maj]),
                    np.hstack([np.repeat(self.min_label,
                                         (len(X_min_resampled)
                                          + len(X_min))),
                               np.repeat(self.maj_label, len(X_maj))]))
        elif self.strategy == 'minmaj':
            X_min_resampled = self.sample_minority(X_min)
            n_maj_sample = min([len(X_maj),
                                len(X_min_resampled) + len(X_min)])
            X_maj_resampled = self.sample_majority(X_maj, n_maj_sample)
            return (np.vstack([X_min_resampled, X_min, X_maj_resampled]),
                    np.hstack([np.repeat(self.min_label,
                                         (len(X_min_resampled)
                                          + len(X_min))),
                               np.repeat(self.maj_label,
                                         len(X_maj_resampled))]))

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'strategy': self.strategy,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}

[docs]class LLE_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{lle_smote, author={Wang, J. and Xu, M. and Wang, H. and Zhang, J.}, booktitle={2006 8th international Conference on Signal Processing}, title={Classification of Imbalanced Data by Using the SMOTE Algorithm and Locally Linear Embedding}, year={2006}, volume={3}, number={}, pages={}, keywords={artificial intelligence; biomedical imaging;medical computing; imbalanced data classification; SMOTE algorithm; locally linear embedding; medical imaging intelligence; synthetic minority oversampling technique; high-dimensional data; low-dimensional space; Biomedical imaging; Back;Training data; Data mining;Biomedical engineering; Research and development; Electronic mail;Pattern recognition; Performance analysis; Classification algorithms}, doi={10.1109/ICOSP.2006.345752}, ISSN={2164-5221}, month={Nov}} Notes: * There might be numerical issues if the nearest neighbors contain some element multiple times. """ categories = [OverSampling.cat_extensive, OverSampling.cat_dim_reduction]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_components=2, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): control parameter of the nearest neighbor component n_components (int): dimensionality of the embedding space n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 2) self.check_greater_or_equal(n_components, 'n_components', 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_components = n_components self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'n_components': [2, 3, 5]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # extracting minority samples X_min = X[y == self.min_label] # do the locally linear embedding lle = LocallyLinearEmbedding( self.n_neighbors, self.n_components, n_jobs=self.n_jobs) try: lle.fit(X_min) except Exception as e: return X.copy(), y.copy() X_min_transformed = lle.transform(X_min) # fitting the nearest neighbors model for sampling n_neighbors = min([self.n_neighbors+1, len(X_min_transformed)]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs).fit(X_min_transformed) dist, ind = nn.kneighbors(X_min_transformed) def solve_for_weights(xi, Z): """ Solve for locally linear embedding weights Args: xi (np.array): vector Z (np.matrix): matrix of neighbors in rows Returns: np.array: reconstruction weights Following https://cs.nyu.edu/~roweis/lle/algorithm.html """ Z = Z - xi Z = Z.T C = np.dot(Z.T, Z) try: w = np.linalg.solve(C, np.repeat(1.0, len(C))) if np.linalg.norm(w) > 1e8: w = np.repeat(1.0, len(C)) except Exception as e: w = np.repeat(1.0, len(C)) return w/np.sum(w) # generating samples samples = [] for _ in range(n_to_sample): idx = self.random_state.randint(len(X_min)) random_coords = self.random_state.choice(ind[idx][1:]) xi = self.sample_between_points(X_min_transformed[idx], X_min_transformed[random_coords]) Z = X_min_transformed[ind[idx][1:]] w = solve_for_weights(xi, Z) samples.append(np.dot(w, X_min[ind[idx][1:]])) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'n_components': self.n_components,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
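# Illustrative sketch (not part of the original module) of the
# reconstruction-weight step in LLE_SMOTE.sample above: a synthetic point xi
# generated in the embedding space is expressed as an affine combination of
# its neighbours, and the weights are later reused on the original-space
# neighbours.  A small ridge term is added here for numerical stability,
# whereas the method above instead falls back to uniform weights when the
# solve fails.
import numpy as np

rng = np.random.RandomState(5)
Z = rng.rand(4, 3)                        # 4 neighbours in a 3d embedding
xi = np.dot([0.1, 0.2, 0.3, 0.4], Z)      # a point in their affine hull

G = np.dot(Z - xi, (Z - xi).T)            # local Gram matrix (4 x 4)
G = G + 1e-6*np.trace(G)*np.eye(len(G))   # regularisation
w = np.linalg.solve(G, np.ones(len(G)))
w = w/np.sum(w)                           # enforce sum(w) == 1

print(np.round(w, 3))                     # approximately [0.1 0.2 0.3 0.4]
print(np.linalg.norm(np.dot(w, Z) - xi))  # near-zero reconstruction error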
[docs]class distance_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{distance_smote, author={de la Calleja, J. and Fuentes, O.}, booktitle={Proceedings of the Twentieth International Florida Artificial Intelligence}, title={A distance-based over-sampling method for learning from imbalanced data sets}, year={2007}, volume={3}, pages={634--635} } Notes: * It is not clear what the authors mean by "weighted distance". """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): control parameter of the nearest neighbor component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # extracting minority samples X_min = X[y == self.min_label] # fitting the model n_neighbors = min([len(X_min), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) samples = [] for _ in range(n_to_sample): idx = self.random_state.randint(len(X_min)) mean_vector = np.mean(X_min[ind[idx][1:]], axis=0) samples.append(self.sample_between_points(X_min[idx], mean_vector)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
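# Illustrative usage sketch (not part of the original module), assuming the
# class above is re-exported at the package level as
# smote_variants.distance_SMOTE; sklearn's make_classification is used only
# to fabricate an imbalanced toy dataset.  New points are generated between a
# random minority sample and the mean of its minority neighbours.
import numpy as np
import smote_variants as sv
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=5)
X_samp, y_samp = sv.distance_SMOTE(proportion=1.0, n_neighbors=5,
                                   random_state=5).sample(X, y)
print('before:', np.bincount(y), 'after:', np.bincount(y_samp))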
class SMMO(OverSampling):
    """
    References:
        * BibTex::

            @InProceedings{smmo,
                author = {de la Calleja, Jorge and Fuentes, Olac
                          and González, Jesús},
                booktitle = {Proceedings of the Twenty-First International
                             Florida Artificial Intelligence Research
                             Society Conference},
                year = {2008},
                month = {01},
                pages = {276-281},
                title = {Selecting Minority Examples from Misclassified
                         Data for Over-Sampling.}
                }

    Notes:
        * In this paper the ensemble is not specified. I have selected
            some very fast, basic classifiers.
        * Also, it is not clear what the authors mean by "weighted distance".
        * The original technique is not prepared for the case when no
            minority samples are classified correctly by the ensemble.
    """

    categories = [OverSampling.cat_borderline,
                  OverSampling.cat_extensive,
                  OverSampling.cat_uses_classifier]
    def __init__(self,
                 proportion=1.0,
                 n_neighbors=5,
                 ensemble=[QuadraticDiscriminantAnalysis(),
                           DecisionTreeClassifier(random_state=2),
                           GaussianNB()],
                 n_jobs=1,
                 random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                                n_min to sample e.g. 1.0 means that after
                                sampling the number of minority samples will
                                be equal to the number of majority samples
            n_neighbors (int): control parameter of the nearest neighbor
                                component
            ensemble (list): list of classifiers, if None, default list of
                                classifiers is used
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of random_state,
                                                    like in sklearn
        """
        super().__init__()
        self.check_greater_or_equal(proportion, 'proportion', 0)
        self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1)

        try:
            len_ens = len(ensemble)
        except Exception as e:
            raise ValueError('The ensemble needs to be a list-like object')
        if len_ens == 0:
            raise ValueError('At least 1 classifier needs to be specified')
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.ensemble = ensemble
        self.n_jobs = n_jobs

        self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        ensembles = [[QuadraticDiscriminantAnalysis(),
                      DecisionTreeClassifier(random_state=2),
                      GaussianNB()]]
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'ensemble': ensembles}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # training and in-sample prediction (out-of-sample by k-fold cross # validation might be better) predictions = [] for e in self.ensemble: predictions.append(e.fit(X, y).predict(X)) # constructing ensemble prediction pred = np.where(np.sum(np.vstack(predictions), axis=0) > len(self.ensemble)/2, 1, 0) # create mask of minority samples to sample mask_to_sample = np.where(np.logical_and(np.logical_not( np.equal(pred, y)), y == self.min_label))[0] if len(mask_to_sample) < 2: m = "Not enough minority samples selected %d" % len(mask_to_sample) _logger.warning(self.__class__.__name__ + ": " + m) return X.copy(), y.copy() X_min = X[y == self.min_label] X_min_to_sample = X[mask_to_sample] # fitting nearest neighbors model for sampling n_neighbors = min([len(X_min), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min_to_sample) # doing the sampling samples = [] while len(samples) < n_to_sample: idx = self.random_state.randint(len(X_min_to_sample)) mean = np.mean(X_min[ind[idx][1:]], axis=0) samples.append(self.sample_between_points( X_min_to_sample[idx], mean)) return (np.vstack([X, np.vstack([samples])]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'ensemble': self.ensemble,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
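# Illustrative sketch (not part of the original module) of the
# sample-selection step in SMMO.sample above: minority points misclassified
# by a majority vote of the ensemble are the ones chosen as seeds for
# over-sampling.  Labels are assumed to be 0/1, as the voting rule in the
# method above also assumes.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=3)
ensemble = [QuadraticDiscriminantAnalysis(),
            DecisionTreeClassifier(random_state=2),
            GaussianNB()]

# in-sample predictions of each member, stacked row-wise
predictions = np.vstack([clf.fit(X, y).predict(X) for clf in ensemble])
# majority vote of the ensemble
vote = np.where(predictions.sum(axis=0) > len(ensemble)/2, 1, 0)

# minority samples (label 1 here) misclassified by the vote
to_sample = np.where((vote != y) & (y == 1))[0]
print(len(to_sample), 'minority samples selected for over-sampling')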
[docs]class polynom_fit_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{polynomial_fit_smote, author={Gazzah, S. and Amara, N. E. B.}, booktitle={2008 The Eighth IAPR International Workshop on Document Analysis Systems}, title={New Oversampling Approaches Based on Polynomial Fitting for Imbalanced Data Sets}, year={2008}, volume={}, number={}, pages={677-684}, keywords={curve fitting;learning (artificial intelligence);mesh generation;pattern classification;polynomials;sampling methods;support vector machines; oversampling approach;polynomial fitting function;imbalanced data set;pattern classification task; class-modular strategy;support vector machine;true negative rate; true positive rate;star topology; bus topology;polynomial curve topology;mesh topology;Polynomials; Topology;Support vector machines; Support vector machine classification; Pattern classification;Performance evaluation;Training data;Text analysis;Data engineering;Convergence; writer identification system;majority class;minority class;imbalanced data sets;polynomial fitting functions; class-modular strategy}, doi={10.1109/DAS.2008.74}, ISSN={}, month={Sept},} """ categories = [OverSampling.cat_extensive]
    def __init__(self, proportion=1.0, topology='star', random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                                n_min to sample e.g. 1.0 means that after
                                sampling the number of minority samples will
                                be equal to the number of majority samples
            topology (str): 'star'/'bus'/'mesh'/'poly_<d>', where <d> is the
                                degree of the fitted polynomials
            random_state (int/RandomState/None): initializer of random_state,
                                                    like in sklearn
        """
        super().__init__()
        self.check_greater_or_equal(proportion, "proportion", 0.0)
        if topology.startswith('poly'):
            self.check_greater_or_equal(
                int(topology.split('_')[-1]), 'topology', 1)
        else:
            self.check_isin(topology, "topology", ['star', 'bus', 'mesh'])

        self.proportion = proportion
        self.topology = topology

        self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'topology': ['star', 'bus', 'mesh',
                                               'poly_1', 'poly_2', 'poly_3']}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) # extracting minority samples X_min = X[y == self.min_label] # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() samples = [] if self.topology == 'star': # Implementation of the star topology X_mean = np.mean(X_min, axis=0) k = max([1, int(np.rint(n_to_sample/len(X_min)))]) for x in X_min: diff = X_mean - x for i in range(1, k+1): samples.append(x + float(i)/(k+1)*diff) elif self.topology == 'bus': # Implementation of the bus topology k = max([1, int(np.rint(n_to_sample/len(X_min)))]) for i in range(1, len(X_min)): diff = X_min[i-1] - X_min[i] for j in range(1, k+1): samples.append(X_min[i] + float(j)/(k+1)*diff) elif self.topology == 'mesh': # Implementation of the mesh topology if len(X_min)**2 > n_to_sample: while len(samples) < n_to_sample: random_i = self.random_state.randint(len(X_min)) random_j = self.random_state.randint(len(X_min)) diff = X_min[random_i] - X_min[random_j] samples.append(X_min[random_i] + 0.5*diff) else: n_combs = (len(X_min)*(len(X_min)-1)/2) k = max([1, int(np.rint(n_to_sample/n_combs))]) for i in range(len(X_min)): for j in range(len(X_min)): diff = X_min[i] - X_min[j] for li in range(1, k+1): samples.append(X_min[j] + float(li)/(k+1)*diff) elif self.topology.startswith('poly'): # Implementation of the polynomial topology deg = int(self.topology.split('_')[1]) dim = len(X_min[0]) def fit_poly(d): return np.poly1d(np.polyfit(np.arange(len(X_min)), X_min[:, d], deg)) polys = [fit_poly(d) for d in range(dim)] for d in range(dim): random_sample = self.random_state.random_sample()*len(X_min) samples_gen = [polys[d](random_sample) for _ in range(n_to_sample)] samples.append(np.array(samples_gen)) samples = np.vstack(samples).T return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'topology': self.topology,
                'random_state': self._random_state_init}
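# Illustrative numpy sketch (not part of the original module) of the 'star'
# topology used by polynom_fit_SMOTE.sample above: k synthetic points are
# placed evenly on the segment between each minority sample and the minority
# mean.
import numpy as np

rng = np.random.RandomState(7)
X_min = rng.rand(6, 2)                 # toy minority samples
X_mean = X_min.mean(axis=0)
k = 3                                  # synthetic points per minority sample

samples = [x + float(i)/(k + 1)*(X_mean - x)
           for x in X_min
           for i in range(1, k + 1)]
samples = np.vstack(samples)
print(samples.shape)                   # (6*k, 2)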
[docs]class Stefanowski(OverSampling): """ References: * BibTex:: @inproceedings{stefanowski, author = {Stefanowski, Jerzy and Wilk, Szymon}, title = {Selective Pre-processing of Imbalanced Data for Improving Classification Performance}, booktitle = {Proceedings of the 10th International Conference on Data Warehousing and Knowledge Discovery}, series = {DaWaK '08}, year = {2008}, isbn = {978-3-540-85835-5}, location = {Turin, Italy}, pages = {283--292}, numpages = {10}, url = {http://dx.doi.org/10.1007/978-3-540-85836-2_27}, doi = {10.1007/978-3-540-85836-2_27}, acmid = {1430591}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, } """ categories = [OverSampling.cat_changes_majority, OverSampling.cat_noise_removal, OverSampling.cat_sample_copy, OverSampling.cat_borderline]
[docs] def __init__(self, strategy='weak_amp', n_jobs=1, random_state=None): """ Constructor of the sampling object Args: strategy (str): 'weak_amp'/'weak_amp_relabel'/'strong_amp' n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_isin(strategy, 'strategy', ['weak_amp', 'weak_amp_relabel', 'strong_amp']) self.check_n_jobs(n_jobs, 'n_jobs') self.strategy = strategy self.n_jobs = n_jobs # this method does not maintain randomness, the parameter is # introduced for the compatibility of interfaces self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        if not raw:
            return [{'strategy': 'weak_amp'},
                    {'strategy': 'weak_amp_relabel'},
                    {'strategy': 'strong_amp'}, ]
        else:
            return {'strategy': ['weak_amp',
                                 'weak_amp_relabel',
                                 'strong_amp']}
    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                                    labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if self.class_stats[self.min_label] < 6:
            m = ("The number of minority samples (%d) is not"
                 " enough for sampling")
            m = m % (self.class_stats[self.min_label])
            _logger.warning(self.__class__.__name__ + ": " + m)
            return X.copy(), y.copy()

        # copying y as its values will change
        y = y.copy()

        # fitting the nearest neighbors model for noise filtering, 4 neighbors
        # instead of 3 as the closest neighbor to a point is itself
        nn = NearestNeighbors(n_neighbors=min(4, len(X)), n_jobs=self.n_jobs)
        nn.fit(X)
        distance, indices = nn.kneighbors(X)

        # fitting the nearest neighbors model for sample generation,
        # 6 neighbors instead of 5 for the same reason
        nn5 = NearestNeighbors(n_neighbors=min(6, len(X)), n_jobs=self.n_jobs)
        nn5.fit(X)
        distance5, indices5 = nn5.kneighbors(X)

        # determining noisy and safe flags
        flags = []
        for i in range(len(indices)):
            if mode(y[indices[i][1:]]) == y[i]:
                flags.append('safe')
            else:
                flags.append('noisy')
        flags = np.array(flags)

        D = (y == self.maj_label) & (flags == 'noisy')
        minority_indices = np.where(y == self.min_label)[0]

        samples = []
        if self.strategy == 'weak_amp' or self.strategy == 'weak_amp_relabel':
            # weak amplification - the number of copies is the number of
            # majority nearest neighbors
            for i in minority_indices:
                if flags[i] == 'noisy':
                    k = np.sum(np.logical_and(
                        y[indices[i][1:]] == self.maj_label,
                        flags[indices[i][1:]] == 'safe'))
                    for _ in range(k):
                        samples.append(X[i])
        if self.strategy == 'weak_amp_relabel':
            # relabelling - noisy majority neighbors are relabelled to
            # minority
            for i in minority_indices:
                if flags[i] == 'noisy':
                    for j in indices[i][1:]:
                        if y[j] == self.maj_label and flags[j] == 'noisy':
                            y[j] = self.min_label
                            D[j] = False
        if self.strategy == 'strong_amp':
            # safe minority samples are copied as many times as the number
            # of safe majority samples among their nearest neighbors
            for i in minority_indices:
                if flags[i] == 'safe':
                    k = np.sum(np.logical_and(
                        y[indices[i][1:]] == self.maj_label,
                        flags[indices[i][1:]] == 'safe'))
                    for _ in range(k):
                        samples.append(X[i])
            # noisy minority samples classified correctly by knn(5) are
            # amplified by as many copies as the number of safe majority
            # samples in their 3-neighborhood, otherwise the amplification
            # is based on the 5-neighborhood
            for i in minority_indices:
                if flags[i] == 'noisy':
                    if mode(y[indices5[i][1:]]) == y[i]:
                        k = np.sum(np.logical_and(
                            y[indices[i][1:]] == self.maj_label,
                            flags[indices[i][1:]] == 'safe'))
                    else:
                        k = np.sum(np.logical_and(
                            y[indices5[i][1:]] == self.maj_label,
                            flags[indices5[i][1:]] == 'safe'))
                    for _ in range(k):
                        samples.append(X[i])

        to_remove = np.where(D)[0]

        X_noise_removed = np.delete(X, to_remove, axis=0)
        y_noise_removed = np.delete(y, to_remove, axis=0)

        if len(samples) == 0 and len(X_noise_removed) > 10:
            m = "no samples to add"
            _logger.warning(self.__class__.__name__ + ": " + m)
            return X_noise_removed, y_noise_removed
        elif len(samples) == 0:
            m = "all samples removed as noise, returning the original dataset"
            _logger.warning(self.__class__.__name__ + ": " + m)
            return X.copy(), y.copy()

        return (np.vstack([X_noise_removed,
                           np.vstack(samples)]),
                np.hstack([y_noise_removed,
                           np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'strategy': self.strategy,
                'n_jobs': self.n_jobs}
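# Illustrative usage sketch (not part of the original module), assuming the
# class above is re-exported at the package level as
# smote_variants.Stefanowski.  Note that this technique copies and relabels
# samples rather than interpolating, so the returned set can also be smaller
# than the input when noisy majority samples are removed.
import numpy as np
import smote_variants as sv
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=5)
X_samp, y_samp = sv.Stefanowski(strategy='strong_amp').sample(X, y)
print('before:', np.bincount(y), 'after:', np.bincount(y_samp))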
[docs]class ADOMS(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{adoms, author={Tang, S. and Chen, S.}, booktitle={2008 International Conference on Information Technology and Applications in Biomedicine}, title={The generation mechanism of synthetic minority class examples}, year={2008}, volume={}, number={}, pages={444-447}, keywords={medical image processing; generation mechanism;synthetic minority class examples;class imbalance problem;medical image analysis;oversampling algorithm; Principal component analysis; Biomedical imaging;Medical diagnostic imaging;Information technology;Biomedical engineering; Noise generators;Concrete;Nearest neighbor searches;Data analysis; Image analysis}, doi={10.1109/ITAB.2008.4570642}, ISSN={2168-2194}, month={May}} """ categories = [OverSampling.cat_dim_reduction, OverSampling.cat_extensive]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): parameter of the nearest neighbor component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0.0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting nearest neighbors model n_neighbors = min([len(X_min), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) distance, indices = nn.kneighbors(X_min) samples = [] for _ in range(n_to_sample): index = self.random_state.randint(len(X_min)) neighbors = X_min[indices[index]] # fitting the PCA pca = PCA(n_components=1) pca.fit(neighbors) # extracting the principal direction principal_direction = pca.components_[0] # do the sampling according to the description in the paper random_index = self.random_state.randint(1, len(neighbors)) random_neighbor = neighbors[random_index] d = np.linalg.norm(random_neighbor - X_min[index]) r = self.random_state.random_sample() inner_product = np.dot(random_neighbor - X_min[index], principal_direction) sign = 1.0 if inner_product > 0.0 else -1.0 samples.append(X_min[index] + sign*r*d*principal_direction) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
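# Illustrative sketch (not part of the original module) of the generation
# step in ADOMS.sample above: the synthetic point is placed along the first
# principal direction of a local neighbourhood, at the distance of a randomly
# chosen neighbour and on the side that neighbour lies on.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(11)
center = rng.rand(2)                              # the seed minority point
neighborhood = np.vstack([center, center + 0.1*rng.randn(5, 2)])

direction = PCA(n_components=1).fit(neighborhood).components_[0]

chosen = neighborhood[rng.randint(1, len(neighborhood))]
d = np.linalg.norm(chosen - center)
sign = 1.0 if np.dot(chosen - center, direction) > 0.0 else -1.0
r = rng.random_sample()

synthetic = center + sign*r*d*direction
print(synthetic)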
[docs]class Safe_Level_SMOTE(OverSampling): """ References: * BibTex:: @inproceedings{safe_level_smote, author = { Bunkhumpornpat, Chumphol and Sinapiromsaran, Krung and Lursinsap, Chidchanok}, title = {Safe-Level-SMOTE: Safe-Level-Synthetic Minority Over-Sampling TEchnique for Handling the Class Imbalanced Problem}, booktitle = {Proceedings of the 13th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining}, series = {PAKDD '09}, year = {2009}, isbn = {978-3-642-01306-5}, location = {Bangkok, Thailand}, pages = {475--482}, numpages = {8}, url = {http://dx.doi.org/10.1007/978-3-642-01307-2_43}, doi = {10.1007/978-3-642-01307-2_43}, acmid = {1533904}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, keywords = {Class Imbalanced Problem, Over-sampling, SMOTE, Safe Level}, } Notes: * The original method was not prepared for the case when no minority sample has minority neighbors. """ categories = [OverSampling.cat_borderline, OverSampling.cat_extensive, OverSampling.cat_sample_componentwise]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): control parameter of the nearest neighbor component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1.0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # fitting nearest neighbors model n_neighbors = min([self.n_neighbors+1, len(X)]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) distance, indices = nn.kneighbors(X) minority_labels = (y == self.min_label) minority_indices = np.where(minority_labels)[0] # do the sampling numattrs = len(X[0]) samples = [] for _ in range(n_to_sample): index = self.random_state.randint(len(minority_indices)) neighbor_index = self.random_state.choice(indices[index][1:]) p = X[index] n = X[neighbor_index] # find safe levels sl_p = np.sum(y[indices[index][1:]] == self.min_label) sl_n = np.sum(y[indices[neighbor_index][1:]] == self.min_label) if sl_n > 0: sl_ratio = float(sl_p)/sl_n else: sl_ratio = np.inf if sl_ratio == np.inf and sl_p == 0: pass else: s = np.zeros(numattrs) for atti in range(numattrs): # iterate through attributes and do sampling according to # safe level if sl_ratio == np.inf and sl_p > 0: gap = 0.0 elif sl_ratio == 1: gap = self.random_state.random_sample() elif sl_ratio > 1: gap = self.random_state.random_sample()*1.0/sl_ratio elif sl_ratio < 1: gap = (1 - sl_ratio) + \ self.random_state.random_sample()*sl_ratio dif = n[atti] - p[atti] s[atti] = p[atti] + gap*dif samples.append(s) if len(samples) == 0: _logger.warning(self.__class__.__name__ + ": " + "No samples generated") return X.copy(), y.copy() else: return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
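# Illustrative sketch (not part of the original module): the gap-selection
# rule of Safe_Level_SMOTE.sample above, factored into a stand-alone helper.
# sl_p and sl_n are the numbers of minority neighbours of the seed point p
# and of its selected neighbour n, respectively.
import numpy as np

def safe_level_gap(sl_p, sl_n, random_state):
    """Return the interpolation gap in [0, 1], or None if the pair is skipped."""
    if sl_n == 0:                    # the neighbour has no minority neighbours
        return 0.0 if sl_p > 0 else None
    sl_ratio = sl_p/sl_n
    if sl_ratio == 1:                # equally safe: anywhere on the segment
        return random_state.random_sample()
    if sl_ratio > 1:                 # p is safer: stay close to p
        return random_state.random_sample()/sl_ratio
    # p is less safe: stay close to the neighbour n
    return (1 - sl_ratio) + random_state.random_sample()*sl_ratio

rng = np.random.RandomState(3)
print(round(safe_level_gap(4, 2, rng), 3), safe_level_gap(0, 0, rng))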
[docs]class MSMOTE(OverSampling): """ References: * BibTex:: @inproceedings{msmote, author = {Hu, Shengguo and Liang, Yanfeng and Ma, Lintao and He, Ying}, title = {MSMOTE: Improving Classification Performance When Training Data is Imbalanced}, booktitle = {Proceedings of the 2009 Second International Workshop on Computer Science and Engineering - Volume 02}, series = {IWCSE '09}, year = {2009}, isbn = {978-0-7695-3881-5}, pages = {13--17}, numpages = {5}, url = {https://doi.org/10.1109/WCSE.2009.756}, doi = {10.1109/WCSE.2009.756}, acmid = {1682710}, publisher = {IEEE Computer Society}, address = {Washington, DC, USA}, keywords = {imbalanced data, over-sampling, SMOTE, AdaBoost, samples groups, SMOTEBoost}, } Notes: * The original method was not prepared for the case when all minority samples are noise. """ categories = [OverSampling.cat_extensive, OverSampling.cat_noise_removal, OverSampling.cat_borderline]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): control parameter of the nearest neighbor component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting the nearest neighbors model n_neighbors = min([len(X), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) distance, indices = nn.kneighbors(X_min) noise_mask = np.repeat(False, len(X_min)) # generating samples samples = [] while len(samples) < n_to_sample: index = self.random_state.randint(len(X_min)) n_p = np.sum(y[indices[index][1:]] == self.min_label) if n_p == self.n_neighbors: sample_type = 'security' elif n_p == 0: sample_type = 'noise' noise_mask[index] = True if np.all(noise_mask): _logger.info("All minority samples are noise") return X.copy(), y.copy() else: sample_type = 'border' if sample_type == 'security': neighbor_index = self.random_state.choice(indices[index][1:]) elif sample_type == 'border': neighbor_index = indices[index][1] else: continue s_gen = self.sample_between_points_componentwise(X_min[index], X[neighbor_index]) samples.append(s_gen) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
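# Illustrative usage sketch (not part of the original module), assuming the
# class above is re-exported at the package level as smote_variants.MSMOTE.
# Seed points are treated as 'security', 'border' or 'noise' depending on how
# many of their neighbours are minority samples, and noise points are never
# used as seeds.
import numpy as np
import smote_variants as sv
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.85, 0.15],
                           random_state=9)
X_samp, y_samp = sv.MSMOTE(proportion=1.0, n_neighbors=5,
                           random_state=9).sample(X, y)
print('before:', np.bincount(y), 'after:', np.bincount(y_samp))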
[docs]class DE_oversampling(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{de_oversampling, author={Chen, L. and Cai, Z. and Chen, L. and Gu, Q.}, booktitle={2010 Third International Conference on Knowledge Discovery and Data Mining}, title={A Novel Differential Evolution-Clustering Hybrid Resampling Algorithm on Imbalanced Datasets}, year={2010}, volume={}, number={}, pages={81-85}, keywords={pattern clustering;sampling methods; support vector machines;differential evolution;clustering algorithm;hybrid resampling algorithm;imbalanced datasets;support vector machine; minority class;mutation operators; crossover operators;data cleaning method;F-measure criterion;ROC area criterion;Support vector machines; Intrusion detection;Support vector machine classification;Cleaning; Electronic mail;Clustering algorithms; Signal to noise ratio;Learning systems;Data mining;Geology;imbalanced datasets;hybrid resampling;clustering; differential evolution;support vector machine}, doi={10.1109/WKDD.2010.48}, ISSN={}, month={Jan},} """ categories = [OverSampling.cat_changes_majority, OverSampling.cat_uses_clustering]
    def __init__(self,
                 proportion=1.0,
                 n_neighbors=5,
                 crossover_rate=0.5,
                 similarity_threshold=0.5,
                 n_clusters=30,
                 n_jobs=1,
                 random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                                n_min to sample e.g. 1.0 means that after
                                sampling the number of minority samples will
                                be equal to the number of majority samples
            n_neighbors (int): control parameter of the nearest neighbor
                                component
            crossover_rate (float): crossover rate of the evolution
            similarity_threshold (float): similarity threshold parameter
            n_clusters (int): number of clusters for cleansing
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of random_state,
                                                    like in sklearn
        """
        super().__init__()
        self.check_greater_or_equal(proportion, 'proportion', 0)
        self.check_greater_or_equal(n_neighbors, 'n_neighbors', 2)
        self.check_in_range(crossover_rate, 'crossover_rate', [0, 1])
        self.check_in_range(similarity_threshold,
                            'similarity_threshold', [0, 1])
        self.check_greater_or_equal(n_clusters, 'n_clusters', 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.crossover_rate = crossover_rate
        self.similarity_threshold = similarity_threshold
        self.n_clusters = n_clusters
        self.n_jobs = n_jobs

        self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable paramter combinations. Returns: list(dict): a list of meaningful paramter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'crossover_rate': [0.1, 0.5, 0.9], 'similarity_threshold': [0.5, 0.9], 'n_clusters': [10, 20, 50]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(3): return X.copy(), y.copy() # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() d = len(X[0]) X_min = X[y == self.min_label] n_neighbors = min([len(X_min), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) distance, indices = nn.kneighbors(X_min) # generating samples samples = [] for _ in range(n_to_sample): # mutation according to the description in the paper random_index = self.random_state.randint(len(X_min)) random_point = X_min[random_index] random_neighbor_indices = self.random_state.choice( indices[random_index][1:], 2, replace=False) random_neighbor_1 = X_min[random_neighbor_indices[0]] random_neighbor_2 = X_min[random_neighbor_indices[1]] mutated = random_point + \ (random_neighbor_1 - random_neighbor_2) * \ self.random_state.random_sample() # crossover - updates the vector 'mutated' rand_s = self.random_state.randint(d) for i in range(d): random_value = self.random_state.random_sample() if random_value >= self.crossover_rate and not i == rand_s: mutated[i] = random_point[i] elif random_value < self.crossover_rate or i == rand_s: pass samples.append(mutated) # assembling all data for clearning X, y = np.vstack([X, np.vstack(samples)]), np.hstack( [y, np.repeat(self.min_label, len(samples))]) X_min = X[y == self.min_label] # cleansing based on clustering n_clusters = min([len(X), self.n_clusters]) kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state) kmeans.fit(X) unique_labels = np.unique(kmeans.labels_) def cluster_filter(li): return len(np.unique(y[np.where(kmeans.labels_ == li)[0]])) == 1 one_label_clusters = [li for li in unique_labels if cluster_filter(li)] to_remove = [] # going through the clusters having one label only for li in one_label_clusters: cluster_indices = np.where(kmeans.labels_ == li)[0] mean_of_cluster = kmeans.cluster_centers_[li] # finding center-like sample center_like_index = None center_like_dist = np.inf for i in cluster_indices: dist = np.linalg.norm(X[i] - mean_of_cluster) if dist < center_like_dist: center_like_dist = dist center_like_index = i # removing the samples similar to the center-like sample for i in cluster_indices: if i != center_like_index: d = np.inner(X[i], X[center_like_index]) / \ (np.linalg.norm(X[i]) * np.linalg.norm(X[center_like_index])) if d > self.similarity_threshold: to_remove.append(i) return np.delete(X, to_remove, axis=0), np.delete(y, to_remove)
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'crossover_rate': self.crossover_rate,
                'similarity_threshold': self.similarity_threshold,
                'n_clusters': self.n_clusters,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
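# Illustrative sketch (not part of the original module) of the differential
# evolution style mutation and crossover used in DE_oversampling.sample
# above, operating on plain numpy vectors.
import numpy as np

rng = np.random.RandomState(13)
point = rng.rand(4)                                 # a minority sample
neighbor_1, neighbor_2 = rng.rand(4), rng.rand(4)   # two of its neighbours

# mutation: shift the point along the difference of two neighbours
mutated = point + (neighbor_1 - neighbor_2)*rng.random_sample()

# binomial crossover: each coordinate keeps the mutated value with
# probability crossover_rate; one randomly chosen coordinate always does
crossover_rate = 0.5
rand_s = rng.randint(len(point))
for i in range(len(point)):
    if rng.random_sample() >= crossover_rate and i != rand_s:
        mutated[i] = point[i]

print(mutated)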
# Borrowed from sklearn-dev, will be removed once the sklearn implementation # becomes stable class OPTICS: def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, min_maxima_ratio=0.001, algorithm='ball_tree', leaf_size=30, n_jobs=1): self.max_eps = max_eps self.min_samples = min_samples self.maxima_ratio = maxima_ratio self.rejection_ratio = rejection_ratio self.similarity_threshold = similarity_threshold self.significant_min = significant_min self.min_cluster_size = min_cluster_size self.min_maxima_ratio = min_maxima_ratio self.algorithm = algorithm self.metric = metric self.metric_params = metric_params self.p = p self.leaf_size = leaf_size self.n_jobs = n_jobs def fit(self, X, y=None): """Perform OPTICS clustering Extracts an ordered list of points and reachability distances, and performs initial clustering using `max_eps` distance specified at OPTICS object instantiation. Parameters ---------- X : array, shape (n_samples, n_features) The data. y : ignored Returns ------- self : instance of OPTICS The instance. """ n_samples = len(X) if self.min_samples > n_samples: m = ("Number of training samples (n_samples=%d) must " "be greater than min_samples (min_samples=%d) " "used for clustering.") m = m % (n_samples, self.min_samples) raise ValueError(self.__class__.__name__ + ": " + m) if self.min_cluster_size <= 0 or (self.min_cluster_size != int(self.min_cluster_size) and self.min_cluster_size > 1): m = ('min_cluster_size must be a positive integer or ' 'a float between 0 and 1. Got %r') m = m % self.min_cluster_size raise ValueError(self.__class__.__name__ + ": " + m) elif self.min_cluster_size > n_samples: m = ('min_cluster_size must be no greater than the ' 'number of samples (%d). Got %d') m = m % (n_samples, self.min_cluster_size) raise ValueError(self.__class__.__name__ + ": " + m) # Start all points as 'unprocessed' ## self.reachability_ = np.empty(n_samples) self.reachability_.fill(np.inf) self.core_distances_ = np.empty(n_samples) self.core_distances_.fill(np.nan) # Start all points as noise ## self.labels_ = np.full(n_samples, -1, dtype=int) nbrs = NearestNeighbors(n_neighbors=self.min_samples, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs) nbrs.fit(X) self.core_distances_[:] = nbrs.kneighbors(X, self.min_samples)[0][:, -1] self.ordering_ = self._calculate_optics_order(X, nbrs) return self # OPTICS helper functions def _calculate_optics_order(self, X, nbrs): # Main OPTICS loop. Not parallelizable. The order that entries are # written to the 'ordering_' list is important! 
processed = np.zeros(X.shape[0], dtype=bool) ordering = np.zeros(X.shape[0], dtype=int) ordering_idx = 0 for point in range(X.shape[0]): if processed[point]: continue if self.core_distances_[point] <= self.max_eps: while not processed[point]: processed[point] = True ordering[ordering_idx] = point ordering_idx += 1 point = self._set_reach_dist(point, processed, X, nbrs) else: # For very noisy points ordering[ordering_idx] = point ordering_idx += 1 processed[point] = True return ordering def _set_reach_dist(self, point_index, processed, X, nbrs): P = X[point_index:point_index + 1] indices = nbrs.radius_neighbors(P, radius=self.max_eps, return_distance=False)[0] # Getting indices of neighbors that have not been processed unproc = np.compress((~np.take(processed, indices)).ravel(), indices, axis=0) # Keep n_jobs = 1 in the following lines...please if not unproc.size: # Everything is already processed. Return to main loop return point_index dists = pairwise_distances(P, np.take(X, unproc, axis=0), self.metric, n_jobs=1).ravel() rdists = np.maximum(dists, self.core_distances_[point_index]) new_reach = np.minimum(np.take(self.reachability_, unproc), rdists) self.reachability_[unproc] = new_reach # Define return order based on reachability distance return (unproc[self.quick_scan(np.take(self.reachability_, unproc), dists)]) def isclose(self, a, b, rel_tol=1e-09, abs_tol=0.0): return abs(a-b) <= max([rel_tol*max([abs(a), abs(b)]), abs_tol]) def quick_scan(self, rdists, dists): rdist = np.inf dist = np.inf n = len(rdists) for i in range(n): if rdists[i] < rdist: rdist = rdists[i] dist = dists[i] idx = i elif self.isclose(rdists[i], rdist): if dists[i] < dist: dist = dists[i] idx = i return idx
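# Illustrative note and sketch (not part of the original module): the class
# above was vendored while OPTICS was still unstable in scikit-learn; recent
# scikit-learn releases (0.21 and later) ship sklearn.cluster.OPTICS, which
# exposes the core_distances_ and reachability_ attributes that SMOBD below
# relies on.  Version availability is an assumption to verify against the
# installed scikit-learn.
import numpy as np
from sklearn.cluster import OPTICS as SklearnOPTICS

X_demo = np.random.RandomState(1).rand(50, 2)
o = SklearnOPTICS(min_samples=5, max_eps=np.inf).fit(X_demo)
print(o.core_distances_[:3], o.reachability_[:3])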
[docs]class SMOBD(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{smobd, author={Cao, Q. and Wang, S.}, booktitle={2011 International Conference on Information Management, Innovation Management and Industrial Engineering}, title={Applying Over-sampling Technique Based on Data Density and Cost-sensitive SVM to Imbalanced Learning}, year={2011}, volume={2}, number={}, pages={543-548}, keywords={data handling;learning (artificial intelligence);support vector machines; oversampling technique application; data density;cost sensitive SVM; imbalanced learning;SMOTE algorithm; data distribution;density information; Support vector machines;Classification algorithms;Noise measurement;Arrays; Noise;Algorithm design and analysis; Training;imbalanced learning; cost-sensitive SVM;SMOTE;data density; SMOBD}, doi={10.1109/ICIII.2011.276}, ISSN={2155-1456}, month={Nov},} """ categories = [OverSampling.cat_uses_clustering, OverSampling.cat_density_based, OverSampling.cat_extensive, OverSampling.cat_noise_removal]
    def __init__(self,
                 proportion=1.0,
                 eta1=0.5,
                 t=1.8,
                 min_samples=5,
                 max_eps=1.0,
                 n_jobs=1,
                 random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                                n_min to sample e.g. 1.0 means that after
                                sampling the number of minority samples will
                                be equal to the number of majority samples
            eta1 (float): control parameter of density estimation
            t (float): control parameter of noise filtering
            min_samples (int): minimum samples parameter for OPTICS
            max_eps (float): maximum environment radius parameter for OPTICS
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of random_state,
                                                    like in sklearn
        """
        super().__init__()
        self.check_greater_or_equal(proportion, 'proportion', 0)
        self.check_in_range(eta1, 'eta1', [0.0, 1.0])
        self.check_greater_or_equal(t, 't', 0)
        self.check_greater_or_equal(min_samples, 'min_samples', 1)
        self.check_greater_or_equal(max_eps, 'max_eps', 0.0)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.eta1 = eta1
        self.t = t
        self.min_samples = min_samples
        self.max_eps = max_eps
        self.n_jobs = n_jobs

        self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'eta1': [0.1, 0.5, 0.9],
                                  't': [1.5, 2.5],
                                  'min_samples': [5],
                                  'max_eps': [0.1, 0.5, 1.0, 2.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # determine the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # running the OPTICS technique based on the sklearn implementation # TODO: replace to sklearn call once it is stable min_samples = min([len(X_min)-1, self.min_samples]) o = OPTICS(min_samples=min_samples, max_eps=self.max_eps, n_jobs=self.n_jobs) o.fit(X_min) cd = o.core_distances_ rd = o.reachability_ # noise filtering cd_average = np.mean(cd) rd_average = np.mean(rd) noise = np.logical_and(cd > cd_average*self.t, rd > rd_average*self.t) # fitting a nearest neighbor model to be able to find # neighbors in radius n_neighbors = min([len(X_min), self.min_samples+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) distances, indices = nn.kneighbors(X_min) # determining the density factor_1 = cd factor_2 = np.array([len(x) for x in nn.radius_neighbors( X_min, radius=self.max_eps, return_distance=False)]) if max(factor_1) == 0 or max(factor_2) == 0: return X.copy(), y.copy() factor_1 = factor_1/max(factor_1) factor_2 = factor_2/max(factor_2) df = factor_1*self.eta1 + factor_2*(1 - self.eta1) # setting the density at noisy samples to zero for i in range(len(noise)): if noise[i]: df[i] = 0 if sum(df) == 0 or any(np.isnan(df)) or any(np.isinf(df)): return X.copy(), y.copy() # normalizing the density df_dens = df/sum(df) # do the sampling samples = [] while len(samples) < n_to_sample: idx = self.random_state.choice(np.arange(len(X_min)), p=df_dens) neighbor_idx = self.random_state.choice(indices[idx][1:]) samples.append(self.sample_between_points_componentwise( X_min[idx], X_min[neighbor_idx])) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'eta1': self.eta1,
                't': self.t,
                'min_samples': self.min_samples,
                'max_eps': self.max_eps,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
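# Illustrative sketch (not part of the original module) of the
# density-weighted seed selection in SMOBD.sample above: each minority point
# gets a weight mixing its core distance and its neighbour count, noisy
# points get weight zero, and seeds are drawn proportionally to the
# normalised weights.  Random numbers stand in for the OPTICS outputs.
import numpy as np

rng = np.random.RandomState(17)
core_dist = rng.rand(10)              # stand-ins for OPTICS core distances
n_in_radius = rng.randint(1, 6, 10)   # neighbour counts within max_eps
noise = np.zeros(10, dtype=bool)
noise[0] = True                       # pretend the first point is noise

eta1 = 0.5
df = (eta1*core_dist/core_dist.max()
      + (1 - eta1)*n_in_radius/n_in_radius.max())
df[noise] = 0.0
p = df/df.sum()

seeds = rng.choice(np.arange(10), size=5, p=p)
print(seeds)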
[docs]class SUNDO(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{sundo, author={Cateni, S. and Colla, V. and Vannucci, M.}, booktitle={2011 11th International Conference on Intelligent Systems Design and Applications}, title={Novel resampling method for the classification of imbalanced datasets for industrial and other real-world problems}, year={2011}, volume={}, number={}, pages={402-407}, keywords={decision trees;pattern classification; sampling methods;support vector machines;resampling method;imbalanced dataset classification;industrial problem;real world problem; oversampling technique;undersampling technique;support vector machine; decision tree;binary classification; synthetic dataset;public dataset; industrial dataset;Support vector machines;Training;Accuracy;Databases; Intelligent systems;Breast cancer; Decision trees;oversampling; undersampling;imbalanced dataset}, doi={10.1109/ISDA.2011.6121689}, ISSN={2164-7151}, month={Nov}} """ categories = [OverSampling.cat_changes_majority, OverSampling.cat_application]
[docs] def __init__(self, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_n_jobs(n_jobs, 'n_jobs') self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        return [{}]
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] n_1 = len(X_min) n_0 = len(X) - n_1 N = int(np.rint(0.5*n_0 - 0.5*n_1 + 0.5)) if N == 0: return X.copy(), y.copy() # generating minority samples samples = [] nn = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs) nn.fit(X_maj) stds = np.std(X_min, axis=0) # At one point the algorithm says to keep those points which are # the most distant from majority samples, and not leaving any minority # sample isolated. This can be implemented by generating multiple # samples for each point and keep the one most distant from the # majority samples. for _ in range(N): i = self.random_state.randint(len(X_min)) best_sample = None best_sample_dist = 0 for _ in range(3): s = self.random_state.normal(X_min[i], stds) dist, ind = nn.kneighbors(s.reshape(1, -1)) if dist[0][0] > best_sample_dist: best_sample_dist = dist[0][0] best_sample = s samples.append(best_sample) # Extending the minority dataset with the new samples X_min_extended = np.vstack([X_min, np.vstack(samples)]) # Removing N elements from the majority dataset # normalize mms = MinMaxScaler() X_maj_normalized = mms.fit_transform(X_maj) # computing the distance matrix dm = pairwise_distances(X_maj_normalized, X_maj_normalized) # len(X_maj) offsets for the diagonal 0 elements, 2N because # every distances appears twice threshold = sorted(dm.flatten())[min( [len(X_maj) + 2*N, len(dm)*len(dm) - 1])] for i in range(len(dm)): dm[i, i] = np.inf # extracting the coordinates of pairs closer than threshold pairs_to_break = np.where(dm < threshold) pairs_to_break = np.vstack(pairs_to_break) # sorting the pairs, otherwise both points would be removed pairs_to_break.sort(axis=0) # uniqueing the coordinates - the final number might be less than N to_remove = np.unique(pairs_to_break[0]) # removing the selected elements X_maj_cleaned = np.delete(X_maj, to_remove, axis=0) return (np.vstack([X_min_extended, X_maj_cleaned]), np.hstack([np.repeat(self.min_label, len(X_min_extended)), np.repeat(self.maj_label, len(X_maj_cleaned))]))
    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}
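# Illustrative sketch (not part of the original module) of the minority
# generation step in SUNDO.sample above: for a chosen minority point, three
# Gaussian candidates are drawn and the one farthest from the majority class
# is kept.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(19)
X_min = rng.rand(10, 2)
X_maj = rng.rand(40, 2) + 1.0          # toy majority cloud, shifted away

nn = NearestNeighbors(n_neighbors=1).fit(X_maj)
stds = np.std(X_min, axis=0)

i = rng.randint(len(X_min))
candidates = [rng.normal(X_min[i], stds) for _ in range(3)]
dists = [nn.kneighbors(c.reshape(1, -1))[0][0][0] for c in candidates]
best = candidates[int(np.argmax(dists))]
print(best)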
[docs]class MSYN(OverSampling): """ References: * BibTex:: @InProceedings{msyn, author="Fan, Xiannian and Tang, Ke and Weise, Thomas", editor="Huang, Joshua Zhexue and Cao, Longbing and Srivastava, Jaideep", title="Margin-Based Over-Sampling Method for Learning from Imbalanced Datasets", booktitle="Advances in Knowledge Discovery and Data Mining", year="2011", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="309--320", abstract="Learning from imbalanced datasets has drawn more and more attentions from both theoretical and practical aspects. Over- sampling is a popular and simple method for imbalanced learning. In this paper, we show that there is an inherently potential risk associated with the over-sampling algorithms in terms of the large margin principle. Then we propose a new synthetic over sampling method, named Margin-guided Synthetic Over-sampling (MSYN), to reduce this risk. The MSYN improves learning with respect to the data distributions guided by the margin-based rule. Empirical study verities the efficacy of MSYN.", isbn="978-3-642-20847-8" } """ categories = [OverSampling.cat_extensive]
[docs] def __init__(self, pressure=1.5, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: pressure (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in the SMOTE sampling n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(pressure, 'pressure', 0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1) self.check_n_jobs(n_jobs, 'n_jobs') self.pressure = pressure self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'pressure': [2.5, 2.0, 1.5], 'n_neighbors': [3, 5, 7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] min_indices = np.where(y == self.min_label)[0] maj_indices = np.where(y == self.maj_label)[0] # generating samples smote = SMOTE(proportion=self.pressure, n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state) X_res, y_res = smote.sample(X, y) X_new, _ = X_res[len(X):], y_res[len(X):] if len(X_new) == 0: m = "Sampling is not needed" _logger.warning(self.__class__.__name__ + ": " + m) return X.copy(), y.copy() # Compute nearest hit and miss for both classes nn = NearestNeighbors(n_neighbors=len(X), n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X) # computing nearest hit and miss distances, these will be used to # compute thetas nearest_hit_dist = np.array([dist[i][next(j for j in range( 1, len(X)) if y[i] == y[ind[i][j]])] for i in range(len(X))]) nearest_miss_dist = np.array([dist[i][next(j for j in range( 1, len(X)) if y[i] != y[ind[i][j]])] for i in range(len(X))]) # computing the thetas without new samples being involved theta_A_sub_alpha = 0.5*(nearest_miss_dist - nearest_hit_dist) theta_min = theta_A_sub_alpha[min_indices] theta_maj = theta_A_sub_alpha[maj_indices] # computing the f_3 score for all new samples f_3 = [] for x in X_new: # determining the distances of the new sample from the training set distances = np.linalg.norm(X - x, axis=1) # computing nearest hit and miss distances involving the new # elements mask = nearest_hit_dist[min_indices] < distances[min_indices] nearest_hit_dist_min = np.where(mask, nearest_hit_dist[min_indices], distances[min_indices]) nearest_miss_dist_min = nearest_miss_dist[min_indices] nearest_hit_dist_maj = nearest_hit_dist[maj_indices] mask = nearest_miss_dist[maj_indices] < distances[maj_indices] nearest_miss_dist_maj = np.where(mask, nearest_miss_dist[maj_indices], distances[maj_indices]) # computing the thetas incorporating the new elements theta_x_min = 0.5*(nearest_miss_dist_min - nearest_hit_dist_min) theta_x_maj = 0.5*(nearest_miss_dist_maj - nearest_hit_dist_maj) # determining the delta scores and computing f_3 Delta_P = np.sum(theta_x_min - theta_min) Delta_N = np.sum(theta_x_maj - theta_maj) f_3.append(-Delta_N/(Delta_P + 0.01)) f_3 = np.array(f_3) # determining the elements with the minimum f_3 scores to add _, new_ind = zip( *sorted(zip(f_3, np.arange(len(f_3))), key=lambda x: x[0])) new_ind = list(new_ind[:(len(X_maj) - len(X_min))]) return (np.vstack([X, X_new[new_ind]]), np.hstack([y, np.repeat(self.min_label, len(new_ind))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'pressure': self.pressure, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
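# Illustrative usage sketch (hypothetical helper, not part of the library source):
# the call below only relies on the sample(X, y) interface shown above; the toy
# dataset and the name _demo_msyn are assumptions for the example.
def _demo_msyn():
    import numpy as np
    from sklearn.datasets import make_classification

    # an imbalanced toy problem, roughly 9:1
    X, y = make_classification(n_samples=200, n_features=4, n_informative=3,
                               n_redundant=1, weights=[0.9, 0.1], random_state=5)

    # pressure > 1 lets the internal SMOTE generate a surplus of candidates;
    # MSYN then keeps the (n_maj - n_min) candidates with the lowest f_3 scores
    sampler = MSYN(pressure=1.5, n_neighbors=5, random_state=5)
    X_samp, y_samp = sampler.sample(X, y)

    print('class counts before:', np.bincount(y),
          'after:', np.bincount(y_samp.astype(int)))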
[docs]class SVM_balance(OverSampling): """ References: * BibTex:: @article{svm_balance, author = {Farquad, M.A.H. and Bose, Indranil}, title = {Preprocessing Unbalanced Data Using Support Vector Machine}, journal = {Decis. Support Syst.}, issue_date = {April, 2012}, volume = {53}, number = {1}, month = apr, year = {2012}, issn = {0167-9236}, pages = {226--233}, numpages = {8}, url = {http://dx.doi.org/10.1016/j.dss.2012.01.016}, doi = {10.1016/j.dss.2012.01.016}, acmid = {2181554}, publisher = {Elsevier Science Publishers B. V.}, address = {Amsterdam, The Netherlands, The Netherlands}, keywords = {COIL data, Hybrid method, Preprocessor, SVM, Unbalanced data}, } """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_classifier, OverSampling.cat_changes_majority]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in the SMOTE sampling n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() X, y = SMOTE(proportion=self.proportion, n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y) if sum(y == self.min_label) < 2: return X.copy(), y.copy() else: cv = min([5, sum(y == self.min_label)]) ss = StandardScaler() X_norm = ss.fit_transform(X) C_params = [0.01, 0.1, 1.0, 10.0] best_score = 0 best_C = 0.01 for C in C_params: _logger.info(self.__class__.__name__ + ": " + "Evaluating SVM with C=%f" % C) svc = SVC(C=C, kernel='rbf', gamma='auto') score = np.mean(cross_val_score(svc, X_norm, y, cv=cv)) if score > best_score: best_score = score best_C = C svc = SVC(C=best_C, kernel='rbf', gamma='auto') svc.fit(X_norm, y) return X, svc.predict(X_norm)
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
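# Hypothetical sketch (illustrative only): SVM_balance first oversamples with
# SMOTE and then replaces all labels by the predictions of a cross-validated
# RBF SVC, so even some of the original points may come back relabeled.
def _demo_svm_balance():
    import numpy as np
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=5, n_informative=3,
                               n_redundant=2, weights=[0.85, 0.15], random_state=7)

    X_samp, y_samp = SVM_balance(proportion=1.0, random_state=7).sample(X, y)

    # the first len(X) rows of X_samp are the original samples; count how many
    # of their labels were changed by the SVC-based relabeling
    n_flipped = int(np.sum(y_samp[:len(X)] != y))
    print('samples:', len(X), '->', len(X_samp), '| relabeled originals:', n_flipped)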
[docs]class TRIM_SMOTE(OverSampling): """ References: * BibTex:: @InProceedings{trim_smote, author="Puntumapon, Kamthorn and Waiyamai, Kitsana", editor="Tan, Pang-Ning and Chawla, Sanjay and Ho, Chin Kuan and Bailey, James", title="A Pruning-Based Approach for Searching Precise and Generalized Region for Synthetic Minority Over-Sampling", booktitle="Advances in Knowledge Discovery and Data Mining", year="2012", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="371--382", isbn="978-3-642-30220-6" } Notes: * It is not described precisely how the filtered data is used for sample generation. The method is proposed to be a preprocessing step, and it states that it applies sample generation to each group extracted. """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, min_precision=0.3, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in the nearest neighbors component min_precision (float): [0,1] - minimum precision a seed group must reach to be kept for sample generation n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1) self.check_in_range(min_precision, 'min_precision', [0, 1]) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.min_precision = min_precision self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'min_precision': [0.3]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def trim(self, y): """ Determines the trim value. Args: y (np.array): array of target labels Returns: float: the trim value """ return np.sum(y == self.min_label)**2/len(y)
[docs] def precision(self, y): """ Determines the precision value. Args: y (np.array): array of target labels Returns: float: the precision value """ return np.sum(y == self.min_label)/len(y)
[docs] def determine_splitting_point(self, X, y, split_on_border=False): """ Determines the splitting point. Args: X (np.matrix): a subset of the training data y (np.array): an array of target labels split_on_border (bool): whether splitting on class borders is considered Returns: tuple(int, float), bool: (splitting feature, splitting value), make the split """ trim_value = self.trim(y) d = len(X[0]) max_t_minus_gain = 0.0 split = None # checking all dimensions of X for i in range(d): # sort the elements in dimension i sorted_X_y = sorted(zip(X[:, i], y), key=lambda pair: pair[0]) # converting to an array so the elementwise comparison below counts # the minority samples correctly sorted_y = np.array([yy for _, yy in sorted_X_y]) # number of minority samples on the left left_min = 0 # number of minority samples on the right right_min = np.sum(sorted_y == self.min_label) # check all possible splitting points sequentially for j in range(0, len(sorted_y)-1): if sorted_y[j] == self.min_label: # adjusting the number of minority and majority samples left_min = left_min + 1 right_min = right_min - 1 # checking whether we can split on the border and avoid # splitting tied feature values if ((split_on_border is False or (split_on_border is True and not sorted_y[j-1] == sorted_y[j])) and sorted_X_y[j][0] != sorted_X_y[j+1][0]): # compute trim value of the left trim_left = left_min**2/(j+1) # compute trim value of the right trim_right = right_min**2/(len(sorted_y) - j - 1) # let's check the gain if max([trim_left, trim_right]) > max_t_minus_gain: max_t_minus_gain = max([trim_left, trim_right]) split = (i, sorted_X_y[j][0]) # return splitting values and the value of the logical condition # in line 9 if split is not None: return split, max_t_minus_gain > trim_value else: return (0, 0), False
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() leafs = [(X, y)] candidates = [] seeds = [] # executing the trimming # loop in line 2 of the paper _logger.info(self.__class__.__name__ + ": " + "do the trimming process") while len(leafs) > 0 or len(candidates) > 0: add_to_leafs = [] # executing the loop starting in line 3 for leaf in leafs: # the function implements the loop starting in line 6 # splitting on class border is forced split, gain = self.determine_splitting_point( leaf[0], leaf[1], True) if len(leaf[0]) == 1: # small leafs with 1 element (no splitting point) # are dropped as noise continue else: # condition in line 9 if gain: # making the split mask_left = (leaf[0][:, split[0]] <= split[1]) X_left = leaf[0][mask_left] y_left = leaf[1][mask_left] mask_right = np.logical_not(mask_left) X_right = leaf[0][mask_right] y_right = leaf[1][mask_right] # condition in line 11 if np.sum(y_left == self.min_label) > 0: add_to_leafs.append((X_left, y_left)) # condition in line 13 if np.sum(y_right == self.min_label) > 0: add_to_leafs.append((X_right, y_right)) else: # line 16 candidates.append(leaf) # we implement line 15 and 18 by replacing the list of leafs by # the list of new leafs. leafs = add_to_leafs # iterating through all candidates (loop starting in line 21) for c in candidates: # extracting splitting points, this time split on border # is not forced split, gain = self.determine_splitting_point(c[0], c[1], False) if len(c[0]) == 1: # small leafs are dropped as noise continue else: # checking condition in line 27 if gain: # doing the split mask_left = (c[0][:, split[0]] <= split[1]) X_left, y_left = c[0][mask_left], c[1][mask_left] mask_right = np.logical_not(mask_left) X_right, y_right = c[0][mask_right], c[1][mask_right] # checking logic in line 29 if np.sum(y_left == self.min_label) > 0: leafs.append((X_left, y_left)) # checking logic in line 31 if np.sum(y_right == self.min_label) > 0: leafs.append((X_right, y_right)) else: # adding candidate to seeds (line 35) seeds.append(c) # line 33 and line 36 are implemented by emptying the candidates # list candidates = [] # filtering the resulting set filtered_seeds = [s for s in seeds if self.precision( s[1]) > self.min_precision] # handling the situation when no seeds were found if len(seeds) == 0: _logger.warning(self.__class__.__name__ + ": " + "no seeds identified") return X.copy(), y.copy() # fix for bad choice of min_precision multiplier = 0.9 while len(filtered_seeds) == 0: filtered_seeds = [s for s in seeds if self.precision( s[1]) > self.min_precision*multiplier] multiplier = multiplier*0.9 if multiplier < 0.1: _logger.warning(self.__class__.__name__ + ": " + "no clusters passing the filtering") return X.copy(), y.copy() seeds = filtered_seeds X_seed = np.vstack([s[0] for s in seeds]) y_seed = np.hstack([s[1] for s in seeds]) _logger.info(self.__class__.__name__ + ": " + "do the sampling") # generating samples by SMOTE X_seed_min = X_seed[y_seed == self.min_label] if 
len(X_seed_min) <= 1: _logger.warning(self.__class__.__name__ + ": " + "X_seed_min contains less than 2 samples") return X.copy(), y.copy() n_neighbors = min([len(X_seed_min), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_seed_min) distances, indices = nn.kneighbors(X_seed_min) # do the sampling samples = [] for _ in range(n_to_sample): random_idx = self.random_state.randint(len(X_seed_min)) random_neighbor_idx = self.random_state.choice( indices[random_idx][1:]) samples.append(self.sample_between_points( X_seed_min[random_idx], X_seed_min[random_neighbor_idx])) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'min_precision': self.min_precision, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
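# Hypothetical usage sketch: TRIM_SMOTE grows and prunes groups of minority
# points with the trim criterion (n_min**2 / n) implemented above and then runs
# SMOTE-like interpolation within the filtered seed groups; the toy data below
# is an assumption for illustration.
def _demo_trim_smote():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=250, n_features=4, n_informative=3,
                               n_redundant=1, weights=[0.9, 0.1], random_state=3)

    sampler = TRIM_SMOTE(proportion=1.0, n_neighbors=5, min_precision=0.3,
                         random_state=3)
    X_samp, y_samp = sampler.sample(X, y)
    print('synthetic samples added:', len(X_samp) - len(X))

    # the class also exposes a list of suggested parameter settings for
    # model selection
    print('suggested settings:', len(TRIM_SMOTE.parameter_combinations()))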
[docs]class SMOTE_RSB(OverSampling): """ References: * BibTex:: @Article{smote_rsb, author="Ramentol, Enislay and Caballero, Yail{\'e} and Bello, Rafael and Herrera, Francisco", title="SMOTE-RSB*: a hybrid preprocessing approach based on oversampling and undersampling for high imbalanced data-sets using SMOTE and rough sets theory", journal="Knowledge and Information Systems", year="2012", month="Nov", day="01", volume="33", number="2", pages="245--265", issn="0219-3116", doi="10.1007/s10115-011-0465-6", url="https://doi.org/10.1007/s10115-011-0465-6" } Notes: * I think the description of the algorithm in Fig 5 of the paper is not correct. The set "resultSet" is initialized with the original instances, and the While loop in the Algorithm runs until resultSet is empty, a condition which never holds because resultSet is only extended in the loop. Our implementation is changed in the following way: we generate twice as many instances as required to balance the dataset, and repeat the loop until the number of new samples added to the training set is enough to balance the dataset. """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=2.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in the SMOTE sampling n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() X_maj = X[y == self.maj_label] X_min = X[y == self.min_label] # Step 1: do the sampling smote = SMOTE(proportion=self.proportion, n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state) X_samp, y_samp = smote.sample(X, y) X_samp, y_samp = X_samp[len(X):], y_samp[len(X):] if len(X_samp) == 0: return X.copy(), y.copy() # Step 2: (original will be added later) result_set = [] # Step 3: first the data is normalized maximums = np.max(X_samp, axis=0) minimums = np.min(X_samp, axis=0) # normalize X_new and X_maj norm_factor = maximums - minimums null_mask = norm_factor == 0 n_null = np.sum(null_mask) fixed = np.max(np.vstack([maximums[null_mask], np.repeat(1, n_null)]), axis=0) norm_factor[null_mask] = fixed X_samp_norm = X_samp / norm_factor X_maj_norm = X_maj / norm_factor # compute similarity matrix similarity_matrix = 1.0 - pairwise_distances(X_samp_norm, X_maj_norm, metric='minkowski', p=1)/len(X[0]) # Step 4: counting the similar examples similarity_value = 0.4 syn = len(X_samp) cont = np.zeros(syn) already_added = np.repeat(False, len(X_samp)) while (len(result_set) < len(X_maj) - len(X_min) and similarity_value <= 0.9): for i in range(syn): cont[i] = np.sum(similarity_matrix[i, :] > similarity_value) if cont[i] == 0 and not already_added[i]: result_set.append(X_samp[i]) already_added[i] = True similarity_value = similarity_value + 0.05 # Step 5: returning the results depending the number of instances # added to the result set if len(result_set) > 0: return (np.vstack([X, np.vstack(result_set)]), np.hstack([y, np.repeat(self.min_label, len(result_set))])) else: return np.vstack([X, X_samp]), np.hstack([y, y_samp])
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
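# Hypothetical sketch: SMOTE_RSB generates a surplus of SMOTE samples (the
# default proportion is 2.0) and keeps only the synthetic points whose
# similarity to every majority sample stays below a threshold that sample()
# gradually raises from 0.4 towards 0.9.
def _demo_smote_rsb():
    import numpy as np
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=6, n_informative=4,
                               n_redundant=2, weights=[0.9, 0.1], random_state=11)

    X_samp, y_samp = SMOTE_RSB(proportion=2.0, n_neighbors=5,
                               random_state=11).sample(X, y)
    print('class counts:', np.bincount(y), '->', np.bincount(y_samp.astype(int)))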
[docs]class ProWSyn(OverSampling): """ References: * BibTex:: @InProceedings{prowsyn, author="Barua, Sukarna and Islam, Md. Monirul and Murase, Kazuyuki", editor="Pei, Jian and Tseng, Vincent S. and Cao, Longbing and Motoda, Hiroshi and Xu, Guandong", title="ProWSyn: Proximity Weighted Synthetic Oversampling Technique for Imbalanced Data Set Learning", booktitle="Advances in Knowledge Discovery and Data Mining", year="2013", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="317--328", isbn="978-3-642-37456-2" } """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, L=5, theta=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbors component L (int): number of levels theta (float): smoothing factor in weight formula n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(L, "L", 1) self.check_greater_or_equal(theta, "theta", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.L = L self.theta = theta self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'L': [3, 5, 7], 'theta': [0.1, 1.0, 2.0]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) # Step 1 - a bit generalized n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: m = "Sampling is not needed" _logger.warning(self.__class__.__name__ + ": " + m) return X.copy(), y.copy() # Step 2 P = np.where(y == self.min_label)[0] X_maj = X[y == self.maj_label] Ps = [] proximity_levels = [] # Step 3 for i in range(self.L): if len(P) == 0: break # Step 3 a n_neighbors = min([len(P), self.n_neighbors]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X[P]) distances, indices = nn.kneighbors(X_maj) # Step 3 b P_i = np.unique(np.hstack([i for i in indices])) # Step 3 c - proximity levels are encoded in the Ps list index Ps.append(P[P_i]) proximity_levels.append(i+1) # Step 3 d P = np.delete(P, P_i) # Step 4 if len(P) > 0: Ps.append(P) # Step 5 if len(P) > 0: proximity_levels.append(i) proximity_levels = np.array(proximity_levels) # Step 6 weights = np.array([np.exp(-self.theta*(proximity_levels[i] - 1)) for i in range(len(proximity_levels))]) # weights is the probability distribution of sampling in the # clusters identified weights = weights/np.sum(weights) suitable = False for i in range(len(weights)): if weights[i] > 0 and len(Ps[i]) > 1: suitable = True if not suitable: return X.copy(), y.copy() # do the sampling, from each cluster proportionally to the distribution samples = [] while len(samples) < n_to_sample: cluster_idx = self.random_state.choice( np.arange(len(weights)), p=weights) if len(Ps[cluster_idx]) > 1: random_idx1, random_idx2 = self.random_state.choice( Ps[cluster_idx], 2, replace=False) samples.append(self.sample_between_points( X[random_idx1], X[random_idx2])) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'L': self.L, 'theta': self.theta, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
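# Small numeric sketch (standalone illustration, not library code): the
# proximity-level weights used in ProWSyn.sample() above are
# w_l ~ exp(-theta * (l - 1)) for levels l = 1..L, normalized into a sampling
# distribution over the partitions; level 1 (closest to the majority class)
# receives the largest weight.
def _demo_prowsyn_weights(L=5, theta=1.0):
    import numpy as np
    levels = np.arange(1, L + 1)
    weights = np.exp(-theta * (levels - 1))
    weights = weights / np.sum(weights)
    return dict(zip(levels.tolist(), np.round(weights, 3).tolist()))

# e.g. _demo_prowsyn_weights(L=3, theta=1.0) gives roughly {1: 0.665, 2: 0.245, 3: 0.09}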
[docs]class SL_graph_SMOTE(OverSampling): """ References: * BibTex:: @inproceedings{sl_graph_smote, author = {Bunkhumpornpat, Chumpol and Subpaiboonkit, Sitthichoke}, booktitle= {13th International Symposium on Communications and Information Technologies}, year = {2013}, month = {09}, pages = {570-575}, title = {Safe level graph for synthetic minority over-sampling techniques}, isbn = {978-1-4673-5578-0} } """ categories = [OverSampling.cat_extensive, OverSampling.cat_borderline]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbors component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # Fitting nearest neighbors model n_neighbors = min([len(X), self.n_neighbors]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) distances, indices = nn.kneighbors(X[y == self.min_label]) # Computing safe level values safe_level_values = np.array( [np.sum(y[i] == self.min_label) for i in indices]) # Computing skewness skewness = skew(safe_level_values) if skewness < 0: # left skewed s = Safe_Level_SMOTE(self.proportion, self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state) else: # right skewed s = Borderline_SMOTE1(self.proportion, self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state) return s.sample(X, y)
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
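# Hypothetical sketch of the dispatch rule implemented in sample() above: the
# safe levels (number of minority points among the nearest neighbors of each
# minority sample) are collected, and depending on the skewness of their
# distribution either Safe_Level_SMOTE (left skewed) or Borderline_SMOTE1
# (otherwise) is applied.
def _demo_sl_graph_dispatch(safe_levels):
    from scipy.stats import skew
    return 'Safe_Level_SMOTE' if skew(safe_levels) < 0 else 'Borderline_SMOTE1'

# e.g. _demo_sl_graph_dispatch([5, 5, 4, 5, 3, 0]) -> 'Safe_Level_SMOTE'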
[docs]class NRSBoundary_SMOTE(OverSampling): """ References: * BibTex:: @Article{nrsboundary_smote, author= {Feng, Hu and Hang, Li}, title= {A Novel Boundary Oversampling Algorithm Based on Neighborhood Rough Set Model: NRSBoundary-SMOTE}, journal= {Mathematical Problems in Engineering}, year= {2013}, pages= {10}, doi= {10.1155/2013/694809}, url= {http://dx.doi.org/10.1155/694809} } """ categories = [OverSampling.cat_extensive, OverSampling.cat_borderline]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, w=0.005, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbors component w (float): used to set neighborhood radius n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(w, "w", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.w = w self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'w': [0.005, 0.01, 0.05]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # determining the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # step 1 bound_set = [] pos_set = [] # step 2 X_min_indices = np.where(y == self.min_label)[0] X_min = X[X_min_indices] # step 3 dm = pairwise_distances(X, X) d_max = np.max(dm, axis=1) max_dist = np.max(dm) np.fill_diagonal(dm, max_dist) d_min = np.min(dm, axis=1) delta = d_min + self.w*(d_max - d_min) # number of neighbors is not interesting here, as we use the # radius_neighbors function to extract the neighbors in a given radius n_neighbors = min([self.n_neighbors + 1, len(X)]) nn = NearestNeighbors(n_neighbors, n_jobs=self.n_jobs) nn.fit(X) for i in range(len(X)): indices = nn.radius_neighbors(X[i].reshape(1, -1), delta[i], return_distance=False) n_minority = np.sum(y[indices[0]] == self.min_label) n_majority = np.sum(y[indices[0]] == self.maj_label) if y[i] == self.min_label and not n_minority == len(indices[0]): bound_set.append(i) elif y[i] == self.maj_label and n_majority == len(indices[0]): pos_set.append(i) bound_set = np.array(bound_set) pos_set = np.array(pos_set) if len(pos_set) == 0 or len(bound_set) == 0: return X.copy(), y.copy() # step 4 and 5 # computing the nearest neighbors of the bound set from the # minority set n_neighbors = min([len(X_min), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) distances, indices = nn.kneighbors(X[bound_set]) # do the sampling samples = [] trials = 0 w = self.w while len(samples) < n_to_sample: idx = self.random_state.choice(len(bound_set)) random_neighbor_idx = self.random_state.choice(indices[idx][1:]) x_new = self.sample_between_points( X[bound_set[idx]], X_min[random_neighbor_idx]) # checking the conflict dist_from_pos_set = np.linalg.norm(X[pos_set] - x_new, axis=1) if np.all(dist_from_pos_set > delta[pos_set]): # no conflict samples.append(x_new) trials = trials + 1 if trials > 1000 and len(samples) == 0: trials = 0 w = w*0.9 return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'w': self.w, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
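# Hypothetical usage sketch: in NRSBoundary_SMOTE the neighborhood radius of
# sample i is delta_i = d_min_i + w * (d_max_i - d_min_i), so w controls the
# width of the neighborhoods used both to find the boundary set and to check
# conflicts with the positive region; the toy data below is an assumption.
def _demo_nrsboundary():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=4, n_informative=3,
                               n_redundant=1, weights=[0.9, 0.1], random_state=2)

    for w in (0.005, 0.01, 0.05):
        X_samp, y_samp = NRSBoundary_SMOTE(proportion=1.0, n_neighbors=5, w=w,
                                           random_state=2).sample(X, y)
        print('w=%.3f -> %d synthetic samples' % (w, len(X_samp) - len(X)))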
[docs]class LVQ_SMOTE(OverSampling): """ References: * BibTex:: @inproceedings{lvq_smote, title={LVQ-SMOTE – Learning Vector Quantization based Synthetic Minority Over–sampling Technique for biomedical data}, author={Munehiro Nakamura and Yusuke Kajiwara and Atsushi Otsuka and Haruhiko Kimura}, booktitle={BioData Mining}, year={2013} } Notes: * This implementation is only a rough approximation of the method described in the paper. The main problem is that the paper uses many datasets to find similar patterns in the codebooks and replicate patterns appearing in other datasets to the imbalanced datasets based on their relative position compared to the codebook elements. What we do is cluster the minority class to extract a codebook as the kmeans cluster means, then find pairs of codebook elements which have the most similar relative position to a randomly selected pair of codebook elements, and translate nearby minority samples from the neighborhood of one pair of codebook elements to the neighborhood of the other pair. """ categories = [OverSampling.cat_extensive, OverSampling.cat_application]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_clusters=10, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbors component n_clusters (int): number of clusters in vector quantization n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(n_clusters, "n_clusters", 3) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_clusters = n_clusters self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'n_clusters': [4, 8, 12]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(3): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # clustering X_min to extract codebook n_clusters = min([len(X_min), self.n_clusters]) kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state) kmeans.fit(X_min) codebook = kmeans.cluster_centers_ # get nearest neighbors of minority samples to codebook samples n_neighbors = min([len(X_min), self.n_neighbors]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) distances, indices = nn.kneighbors(codebook) # do the sampling samples = [] while len(samples) < n_to_sample: # randomly selecting a pair of codebook elements cb_0, cb_1 = self.random_state.choice( list(range(len(codebook))), 2, replace=False) diff = codebook[cb_0] - codebook[cb_1] min_dist = np.inf min_0 = None # finding another pair of codebook elements with similar offset for i in range(len(codebook)): for j in range(len(codebook)): if cb_0 != i and cb_0 != j and cb_1 != i and cb_1 != j: dd = np.linalg.norm(diff - (codebook[i] - codebook[j])) if dd < min_dist: min_dist = dd min_0 = self.random_state.choice([i, j]) # translating a random neighbor of codebook element min_0 to # the neighborhood of point_0 random_index = self.random_state.randint(len(indices[min_0])) sample = X_min[indices[min_0][random_index]] point_0 = codebook[cb_0] + (sample - codebook[min_0]) samples.append(point_0) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_clusters': self.n_clusters, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
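# Hypothetical sketch: LVQ_SMOTE extracts a codebook of n_clusters KMeans
# centers from the minority class and generates new points by translating
# minority samples between pairs of codebook elements with similar offsets,
# as approximated in sample() above.
def _demo_lvq_smote():
    import numpy as np
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=5, n_informative=4,
                               n_redundant=1, weights=[0.85, 0.15], random_state=9)

    X_samp, y_samp = LVQ_SMOTE(proportion=1.0, n_neighbors=5, n_clusters=4,
                               random_state=9).sample(X, y)
    print('class counts:', np.bincount(y), '->', np.bincount(y_samp.astype(int)))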
[docs]class SOI_CJ(OverSampling): """ References: * BibTex:: @article{soi_cj, author = {Sánchez, Atlántida I. and Morales, Eduardo and Gonzalez, Jesus}, year = {2013}, month = {01}, pages = {}, title = {Synthetic Oversampling of Instances Using Clustering}, volume = {22}, booktitle = {International Journal of Artificial Intelligence Tools} } """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering, OverSampling.cat_sample_componentwise]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, method='interpolation', n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of nearest neighbors in the SMOTE sampling method (str): 'interpolation'/'jittering' n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1) self.check_isin(method, 'method', ['interpolation', 'jittering']) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.method = method self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'method': ['interpolation', 'jittering']} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def clustering(self, X, y): """ Implementation of the clustering technique described in the paper. Args: X (np.matrix): array of training instances y (np.array): target labels Returns: list(set): list of minority clusters """ nn_all = NearestNeighbors(n_jobs=self.n_jobs) nn_all.fit(X) X_min = X[y == self.min_label] # extract nearest neighbors of all samples from the set of # minority samples nn = NearestNeighbors(n_neighbors=len(X_min), n_jobs=self.n_jobs) nn.fit(X) distances, indices = nn.kneighbors(X_min) # initialize clusters by minority samples clusters = [] for i in range(len(X_min)): # empty cluster added clusters.append(set()) # while the closest instance is from the minority class, adding it # to the cluster for j in indices[i]: if y[j] == self.min_label: clusters[i].add(j) else: break # cluster merging phase is_intersection = True while is_intersection: is_intersection = False for i in range(len(clusters)): for j in range(i + 1, len(clusters)): # computing intersection intersection = clusters[i].intersection(clusters[j]) if len(intersection) > 0: is_intersection = True # computing distance matrix dm = pairwise_distances( X[list(clusters[i])], X[list(clusters[j])]) # largest distance max_dist_pair = np.where(dm == np.max(dm)) # elements with the largest distance max_i = X[list(clusters[i])[max_dist_pair[0][0]]] max_j = X[list(clusters[j])[max_dist_pair[1][0]]] # finding midpoint and radius mid_point = (max_i + max_j)/2.0 radius = np.linalg.norm(mid_point - max_i) # extracting points within the hypersphare of # radius "radius" mid_point_reshaped = mid_point.reshape(1, -1) ind = nn_all.radius_neighbors(mid_point_reshaped, radius, return_distance=False) n_min = np.sum(y[ind[0]] == self.min_label) if n_min > len(ind[0])/2: # if most of the covered elements come from the # minority class, merge clusters clusters[i].update(clusters[j]) clusters[j] = set() else: # otherwise move the difference to the # bigger cluster if len(clusters[i]) > len(clusters[j]): clusters[j].difference_update(intersection) else: clusters[i].difference_update(intersection) # returning non-empty clusters return [c for c in clusters if len(c) > 0]
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] std_min = np.std(X_min, axis=0) # do the clustering _logger.info(self.__class__.__name__ + ": " + "Executing clustering") clusters = self.clustering(X, y) # filtering the clusters, at least two points in a cluster are needed # for both interpolation and jittering (due to the standard deviation) clusters_filtered = [list(c) for c in clusters if len(c) > 2] if len(clusters_filtered) > 0: # if there are clusters having at least 2 elements, do the sampling cluster_nums = [len(c) for c in clusters_filtered] cluster_weights = cluster_nums/np.sum(cluster_nums) cluster_stds = [np.std(X[clusters_filtered[i]], axis=0) for i in range(len(clusters_filtered))] _logger.info(self.__class__.__name__ + ": " + "Executing sample generation") samples = [] while len(samples) < n_to_sample: cluster_idx = self.random_state.choice( np.arange(len(clusters_filtered)), p=cluster_weights) if self.method == 'interpolation': clust = clusters_filtered[cluster_idx] idx_0, idx_1 = self.random_state.choice(clust, 2, replace=False) X_0, X_1 = X[idx_0], X[idx_1] samples.append( self.sample_between_points_componentwise(X_0, X_1)) elif self.method == 'jittering': clust_std = cluster_stds[cluster_idx] std = np.min(np.vstack([std_min, clust_std]), axis=0) clust = clusters_filtered[cluster_idx] idx = self.random_state.choice(clust) X_samp = self.sample_by_jittering_componentwise(X[idx], std) samples.append(X_samp) return (np.vstack([X, samples]), np.hstack([y, np.array([self.min_label]*len(samples))])) else: # otherwise fall back to standard smote _logger.warning(self.__class__.__name__ + ": " + "No clusters with more than 2 elements") return X.copy(), y.copy()
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'method': self.method, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
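# Hypothetical sketch comparing the two generation modes of SOI_CJ defined
# above: 'interpolation' draws points between two members of a minority
# cluster, while 'jittering' perturbs a single member with a cluster-limited
# standard deviation.
def _demo_soi_cj():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=250, n_features=4, n_informative=3,
                               n_redundant=1, weights=[0.9, 0.1], random_state=4)

    for method in ('interpolation', 'jittering'):
        X_samp, y_samp = SOI_CJ(proportion=1.0, n_neighbors=5, method=method,
                                random_state=4).sample(X, y)
        print(method, '->', len(X_samp) - len(X), 'new samples')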
[docs]class ROSE(OverSampling): """ References: * BibTex:: @Article{rose, author="Menardi, Giovanna and Torelli, Nicola", title="Training and assessing classification rules with imbalanced data", journal="Data Mining and Knowledge Discovery", year="2014", month="Jan", day="01", volume="28", number="1", pages="92--122", issn="1573-756X", doi="10.1007/s10618-012-0295-5", url="https://doi.org/10.1007/s10618-012-0295-5" } Notes: * It is not entirely clear if the authors propose kernel density estimation or the fitting of simple multivariate Gaussians on the minority samples. The latter seems to be more likely, I implement that approach. """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_componentwise]
[docs] def __init__(self, proportion=1.0, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0.0) self.proportion = proportion self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # Estimating the H matrix std = np.std(X_min, axis=0) d = len(X[0]) n = len(X_min) H = std*(4.0/((d + 1)*n))**(1.0/(d + 4)) # do the sampling samples = [] for _ in range(n_to_sample): random_idx = self.random_state.randint(len(X_min)) samples.append(self.sample_by_gaussian_jittering( X_min[random_idx], H)) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'random_state': self._random_state_init}
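# Small numeric sketch (standalone illustration): the smoothing matrix used in
# ROSE.sample() above is diagonal with per-feature bandwidths
# H_j = std_j * (4 / ((d + 1) * n)) ** (1 / (d + 4)), where d is the number of
# features and n the number of minority samples; each synthetic point is a
# Gaussian jitter of a random minority sample with this bandwidth.
def _demo_rose_bandwidth():
    import numpy as np
    rng = np.random.RandomState(0)
    X_min = rng.normal(size=(40, 3))  # 40 hypothetical minority samples, 3 features
    n, d = X_min.shape
    H = np.std(X_min, axis=0) * (4.0 / ((d + 1) * n)) ** (1.0 / (d + 4))
    return np.round(H, 3)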
[docs]class SMOTE_OUT(OverSampling): """ References: * BibTex:: @article{smote_out_smote_cosine_selected_smote, title={SMOTE-Out, SMOTE-Cosine, and Selected-SMOTE: An enhancement strategy to handle imbalance in data level}, author={Fajri Koto}, journal={2014 International Conference on Advanced Computer Science and Information System}, year={2014}, pages={280-284} } """ categories = [OverSampling.cat_extensive]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): parameter of the NearestNeighbors component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] minority_indices = np.where(y == self.min_label)[0] # nearest neighbors among minority points n_neighbors = min([len(X_min), self.n_neighbors+1]) nn_min = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs).fit(X_min) min_distances, min_indices = nn_min.kneighbors(X_min) # nearest neighbors among majority points n_neighbors = min([len(X_maj), self.n_neighbors+1]) nn_maj = NearestNeighbors( n_neighbors=n_neighbors, n_jobs=self.n_jobs).fit(X_maj) maj_distances, maj_indices = nn_maj.kneighbors(X_min) # generate samples samples = [] for _ in range(n_to_sample): # implementation of Algorithm 1 in the paper random_idx = self.random_state.choice( np.arange(len(minority_indices))) u = X[minority_indices[random_idx]] v = X_maj[self.random_state.choice(maj_indices[random_idx])] dif1 = u - v uu = u + self.random_state.random_sample()*0.3*dif1 x = X_min[self.random_state.choice(min_indices[random_idx][1:])] dif2 = uu - x w = x + self.random_state.random_sample()*0.5*dif2 samples.append(w) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
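# Hypothetical sketch of one SMOTE-OUT style point, following the update rule
# implemented in sample() above: push a minority point u away from a majority
# neighbor v (u' = u + r1 * 0.3 * (u - v)), then interpolate from a minority
# neighbor x towards u' (w = x + r2 * 0.5 * (u' - x)).
def _demo_smote_out_point(u, v, x, rng=None):
    import numpy as np
    if rng is None:
        rng = np.random.RandomState(0)
    uu = u + rng.random_sample() * 0.3 * (u - v)
    return x + rng.random_sample() * 0.5 * (uu - x)

# e.g. _demo_smote_out_point(np.array([0.0, 0.0]), np.array([1.0, 0.0]),
#                            np.array([0.0, 1.0]))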
[docs]class SMOTE_Cosine(OverSampling): """ References: * BibTex:: @article{smote_out_smote_cosine_selected_smote, title={SMOTE-Out, SMOTE-Cosine, and Selected-SMOTE: An enhancement strategy to handle imbalance in data level}, author={Fajri Koto}, journal={2014 International Conference on Advanced Computer Science and Information System}, year={2014}, pages={280-284} } """ categories = [OverSampling.cat_extensive]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): parameter of the NearestNeighbors component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(3): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] minority_indices = np.where(y == self.min_label)[0] # Fitting the nearest neighbors models to the minority and # majority data using two different metrics for the minority nn_min_euc = NearestNeighbors(n_neighbors=len(X_min), n_jobs=self.n_jobs) nn_min_euc.fit(X_min) nn_min_euc_dist, nn_min_euc_ind = nn_min_euc.kneighbors(X_min) nn_min_cos = NearestNeighbors(n_neighbors=len(X_min), metric='cosine', n_jobs=self.n_jobs) nn_min_cos.fit(X_min) nn_min_cos_dist, nn_min_cos_ind = nn_min_cos.kneighbors(X_min) nn_maj = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs) nn_maj.fit(X_maj) nn_maj_dist, nn_maj_ind = nn_maj.kneighbors(X_min) samples = [] for _ in range(n_to_sample): random_idx = self.random_state.choice( np.arange(len(minority_indices))) u = X[minority_indices[random_idx]] # get the rank of each minority sample according to their distance # from u to_sort_euc = zip( nn_min_euc_ind[random_idx], np.arange(len(X_min))) _, sorted_by_euc_ind = zip(*(sorted(to_sort_euc, key=lambda x: x[0]))) to_sort_cos = zip( nn_min_cos_ind[random_idx], np.arange(len(X_min))) _, sorted_by_cos_ind = zip(*(sorted(to_sort_cos, key=lambda x: x[0]))) # adding the ranks to get the composite similarity measure (called # voting in the paper) ranked_min_indices = sorted_by_euc_ind + sorted_by_cos_ind # sorting the ranking to_sort = zip(ranked_min_indices, np.arange(len(X_min))) _, sorted_ranking = zip(*(sorted(to_sort, key=lambda x: x[0]))) # get the indices of the n_neighbors nearest neighbors according # to the composite metrics min_indices = sorted_ranking[1:(self.n_neighbors + 1)] v = X_maj[self.random_state.choice(nn_maj_ind[random_idx])] dif1 = u - v uu = u + self.random_state.random_sample()*0.3*dif1 x = X_min[self.random_state.choice(min_indices[1:])] dif2 = uu - x w = x + self.random_state.random_sample()*0.5*dif2 samples.append(w) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
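# Simplified sketch (not a line-for-line copy of sample() above) of the rank
# "voting" idea behind SMOTE-Cosine: each minority candidate is ranked once by
# Euclidean distance and once by cosine distance from the selected point, the
# two ranks are summed, and neighbors are taken from the combined ordering.
def _demo_rank_voting(euclidean_dists, cosine_dists):
    import numpy as np
    euc_rank = np.argsort(np.argsort(euclidean_dists))
    cos_rank = np.argsort(np.argsort(cosine_dists))
    combined = euc_rank + cos_rank
    return np.argsort(combined)  # candidate indices, best (smallest rank sum) first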
[docs]class Selected_SMOTE(OverSampling): """ References: * BibTex:: @article{smote_out_smote_cosine_selected_smote, title={SMOTE-Out, SMOTE-Cosine, and Selected-SMOTE: An enhancement strategy to handle imbalance in data level}, author={Fajri Koto}, journal={2014 International Conference on Advanced Computer Science and Information System}, year={2014}, pages={280-284} } Notes: * Significant attribute selection was not described in the paper, therefore we have implemented something meaningful. """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_componentwise]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, perc_sign_attr=0.5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): parameter of the NearestNeighbors component perc_sign_attr (float): [0,1] - percentage of significant attributes n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_neighbors, 'n_neighbors', 1) self.check_in_range(perc_sign_attr, 'perc_sign_attr', [0, 1]) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.perc_sign_attr = perc_sign_attr self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'perc_sign_attr': [0.3, 0.5, 0.8]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(3): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] minority_indices = np.where(y == self.min_label)[0] n_neighbors = min([len(X_min), self.n_neighbors + 1]) nn_min_euc = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs).fit(X_min) nn_min_dist, nn_min_ind = nn_min_euc.kneighbors(X_min) # significant attribute selection was not described in the paper # I have implemented significant attribute selection by checking # the overlap between ranges of minority and majority class attributes # the attributes with bigger overlap respecting their ranges # are considered more significant min_ranges_a = np.min(X_min, axis=0) min_ranges_b = np.max(X_min, axis=0) maj_ranges_a = np.min(X_maj, axis=0) maj_ranges_b = np.max(X_maj, axis=0) # end points of overlaps max_a = np.max(np.vstack([min_ranges_a, maj_ranges_a]), axis=0) min_b = np.min(np.vstack([min_ranges_b, maj_ranges_b]), axis=0) # size of overlap overlap = min_b - max_a # replacing negative values (no overlap) by zero overlap = np.where(overlap < 0, 0, overlap) # percentage of overlap compared to the ranges of attributes in the # minority set percentages = overlap/(min_ranges_b - min_ranges_a) # fixing zero division if some attributes have zero range percentages = np.nan_to_num(percentages) # number of significant attributes to determine num_sign_attr = min( [1, int(np.rint(self.perc_sign_attr*len(percentages)))]) significant_attr = (percentages >= sorted( percentages)[-num_sign_attr]).astype(int) samples = [] for _ in range(n_to_sample): random_idx = self.random_state.choice(range(len(minority_indices))) u = X[minority_indices[random_idx]] v = X_min[self.random_state.choice(nn_min_ind[random_idx][1:])] samples.append(self.sample_between_points_componentwise( u, v, significant_attr)) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'perc_sign_attr': self.perc_sign_attr, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
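# Standalone sketch (added for illustration, not part of the original
# module) of the range-overlap heuristic used in the sample() method above
# to select significant attributes: attributes whose minority and majority
# value ranges overlap the most, relative to the minority range, are marked
# as significant. The function name is arbitrary.
def _range_overlap_significance(X_min, X_maj, perc_sign_attr):
    """Return a 0/1 mask marking the 'significant' attributes."""
    min_a, min_b = np.min(X_min, axis=0), np.max(X_min, axis=0)
    maj_a, maj_b = np.min(X_maj, axis=0), np.max(X_maj, axis=0)

    # size of the overlap of the two ranges, zero if they are disjoint
    overlap = np.maximum(np.minimum(min_b, maj_b) - np.maximum(min_a, maj_a),
                         0.0)

    # overlap relative to the minority range, zero-width ranges give zero
    percentages = np.nan_to_num(overlap/(min_b - min_a))

    # keep at least one attribute, at most about perc_sign_attr of them
    num_sign_attr = max([1, int(np.rint(perc_sign_attr*len(percentages)))])
    threshold = sorted(percentages)[-num_sign_attr]
    return (percentages >= threshold).astype(int)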
[docs]class LN_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{ln_smote, author={Maciejewski, T. and Stefanowski, J.}, booktitle={2011 IEEE Symposium on Computational Intelligence and Data Mining (CIDM)}, title={Local neighbourhood extension of SMOTE for mining imbalanced data}, year={2011}, volume={}, number={}, pages={104-111}, keywords={Bayes methods;data mining;pattern classification;local neighbourhood extension;imbalanced data mining; focused resampling technique;SMOTE over-sampling method;naive Bayes classifiers;Noise measurement;Noise; Decision trees;Breast cancer; Sensitivity;Data mining;Training}, doi={10.1109/CIDM.2011.5949434}, ISSN={}, month={April}} """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_componentwise]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): parameter of the NearestNeighbors component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0.0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) # number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() if self.n_neighbors + 2 > len(X): n_neighbors = len(X) - 2 else: n_neighbors = self.n_neighbors if n_neighbors < 2: return X.copy(), y.copy() # nearest neighbors of each instance to each instance in the dataset nn = NearestNeighbors(n_neighbors=n_neighbors + 2, n_jobs=self.n_jobs) nn.fit(X) distances, indices = nn.kneighbors(X) minority_indices = np.where(y == self.min_label)[0] # dimensionality d = len(X[0]) def safe_level(p_idx, n_idx=None): """ computing the safe level of samples Args: p_idx (int): index of positive sample n_idx (int): index of other sample Returns: int: safe level """ if n_idx is None: # implementation for 1 sample only return np.sum(y[indices[p_idx][1:-1]] == self.min_label) else: # implementation for 2 samples if ((not y[n_idx] != self.maj_label) and p_idx in indices[n_idx][1:-1]): # -1 because p_idx will be replaced n_positives = np.sum( y[indices[n_idx][1:-1]] == self.min_label) - 1 if y[indices[n_idx][-1]] == self.min_label: # this is the effect of replacing p_idx by the next # (k+1)th neighbor n_positives = n_positives + 1 return n_positives return np.sum(y[indices[n_idx][1:-1]] == self.min_label) def random_gap(slp, sln, n_label): """ determining random gap Args: slp (int): safe level of p sln (int): safe level of n n_label (int): label of n Returns: float: gap """ delta = 0 if sln == 0 and slp > 0: return delta else: sl_ratio = slp/sln if sl_ratio == 1: delta = self.random_state.random_sample() elif sl_ratio > 1: delta = self.random_state.random_sample()/sl_ratio else: delta = 1.0 - self.random_state.random_sample()*sl_ratio if not n_label == self.min_label: delta = delta*sln/(n_neighbors) return delta # generating samples trials = 0 samples = [] while len(samples) < n_to_sample: p_idx = self.random_state.choice(minority_indices) # extract random neighbor of p n_idx = self.random_state.choice(indices[p_idx][1:-1]) # checking can-create criteria slp = safe_level(p_idx) sln = safe_level(p_idx, n_idx) if (not slp == 0) or (not sln == 0): # can create p = X[p_idx] n = X[n_idx] x_new = p.copy() for a in range(d): delta = random_gap(slp, sln, y[n_idx]) diff = n[a] - p[a] x_new[a] = p[a] + delta*diff samples.append(x_new) trials = trials + 1 if len(samples)/trials < 1.0/n_to_sample: _logger.info(self.__class__.__name__ + ": " + "no instances with slp > 0 and sln > 0 found") return X.copy(), y.copy() return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
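# Illustrative usage sketch (not part of the original module): the helper
# below applies LN_SMOTE to a small synthetic imbalanced dataset. The
# dataset parameters and the name _demo_ln_smote are arbitrary choices made
# for illustration only.
def _demo_ln_smote():
    """Balance a toy 9:1 binary problem with LN_SMOTE (illustration only)."""
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5, n_informative=3,
                               weights=[0.9, 0.1], random_state=5)
    X_samp, y_samp = LN_SMOTE(proportion=1.0, n_neighbors=5,
                              random_state=5).sample(X, y)
    return X_samp, y_samp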
[docs]class MWMOTE(OverSampling): """ References: * BibTex:: @ARTICLE{mwmote, author={Barua, S. and Islam, M. M. and Yao, X. and Murase, K.}, journal={IEEE Transactions on Knowledge and Data Engineering}, title={MWMOTE--Majority Weighted Minority Oversampling Technique for Imbalanced Data Set Learning}, year={2014}, volume={26}, number={2}, pages={405-425}, keywords={learning (artificial intelligence);pattern clustering;sampling methods;AUC;area under curve;ROC;receiver operating curve;G-mean; geometric mean;minority class cluster; clustering approach;weighted informative minority class samples;Euclidean distance; hard-to-learn informative minority class samples;majority class;synthetic minority class samples;synthetic oversampling methods;imbalanced learning problems; imbalanced data set learning; MWMOTE-majority weighted minority oversampling technique;Sampling methods; Noise measurement;Boosting;Simulation; Complexity theory;Interpolation;Abstracts; Imbalanced learning;undersampling; oversampling;synthetic sample generation; clustering}, doi={10.1109/TKDE.2012.232}, ISSN={1041-4347}, month={Feb}} Notes: * The original method was not prepared for the case of having clusters of 1 elements. """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering, OverSampling.cat_borderline]
[docs] def __init__(self, proportion=1.0, k1=5, k2=5, k3=5, M=10, cf_th=5.0, cmax=10.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples k1 (int): parameter of the NearestNeighbors component k2 (int): parameter of the NearestNeighbors component k3 (int): parameter of the NearestNeighbors component M (int): number of clusters cf_th (float): cutoff threshold cmax (float): maximum closeness value n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(k1, 'k1', 1) self.check_greater_or_equal(k2, 'k2', 1) self.check_greater_or_equal(k3, 'k3', 1) self.check_greater_or_equal(M, 'M', 1) self.check_greater_or_equal(cf_th, 'cf_th', 0) self.check_greater_or_equal(cmax, 'cmax', 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.k1 = k1 self.k2 = k2 self.k3 = k3 self.M = M self.cf_th = cf_th self.cmax = cmax self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'k1': [5, 9],
                                  'k2': [5, 9],
                                  'k3': [5, 9],
                                  'M': [4, 10],
                                  'cf_th': [5.0],
                                  'cmax': [10.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] minority = np.where(y == self.min_label)[0] # Step 1 n_neighbors = min([len(X), self.k1 + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist1, ind1 = nn.kneighbors(X) # Step 2 arr = [i for i in minority if np.sum(y[ind1[i][1:]] == self.min_label)] filtered_minority = np.array(arr) if len(filtered_minority) == 0: _logger.info(self.__class__.__name__ + ": " + "filtered_minority array is empty") return X.copy(), y.copy() # Step 3 - ind2 needs to be indexed by indices of the lengh of X_maj nn_maj = NearestNeighbors(n_neighbors=self.k2, n_jobs=self.n_jobs) nn_maj.fit(X_maj) dist2, ind2 = nn_maj.kneighbors(X[filtered_minority]) # Step 4 border_majority = np.unique(ind2.flatten()) # Step 5 - ind3 needs to be indexed by indices of the length of X_min n_neighbors = min([self.k3, len(X_min)]) nn_min = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn_min.fit(X_min) dist3, ind3 = nn_min.kneighbors(X_maj[border_majority]) # Step 6 - informative minority indexes X_min informative_minority = np.unique(ind3.flatten()) def closeness_factor(y, x, cf_th=self.cf_th, cmax=self.cmax): """ Closeness factor according to the Eq (6) Args: y (np.array): training instance (border_majority) x (np.array): training instance (informative_minority) cf_th (float): cutoff threshold cmax (float): maximum values Returns: float: closeness factor """ d = np.linalg.norm(y - x)/len(y) if d == 0.0: d = 0.1 if 1.0/d < cf_th: f = 1.0/d else: f = cf_th return f/cf_th*cmax # Steps 7 - 9 _logger.info(self.__class__.__name__ + ": " + 'computing closeness factors') closeness_factors = np.zeros( shape=(len(border_majority), len(informative_minority))) for i in range(len(border_majority)): bm_i = border_majority[i] for j in range(len(informative_minority)): im_j = informative_minority[j] closeness_factors[i, j] = closeness_factor(X_maj[bm_i], X_min[im_j]) _logger.info(self.__class__.__name__ + ": " + 'computing information weights') information_weights = np.zeros( shape=(len(border_majority), len(informative_minority))) for i in range(len(border_majority)): norm_factor = np.sum(closeness_factors[i, :]) for j in range(len(informative_minority)): cf_ij = closeness_factors[i, j] information_weights[i, j] = cf_ij**2/norm_factor selection_weights = np.sum(information_weights, axis=0) selection_probabilities = selection_weights/np.sum(selection_weights) # Step 10 _logger.info(self.__class__.__name__ + ": " + 'do clustering') n_clusters = min([len(X_min), self.M]) kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state) kmeans.fit(X_min) imin_labels = kmeans.labels_[informative_minority] clusters = [np.where(imin_labels == i)[0] for i in range(np.max(kmeans.labels_)+1)] # Step 11 samples = [] # Step 12 for i in range(n_to_sample): random_index = self.random_state.choice(informative_minority, p=selection_probabilities) 
cluster_label = kmeans.labels_[random_index] cluster = clusters[cluster_label] random_index_in_cluster = self.random_state.choice(cluster) X_random = X_min[random_index] X_random_cluster = X_min[random_index_in_cluster] samples.append(self.sample_between_points(X_random, X_random_cluster)) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'k1': self.k1, 'k2': self.k2, 'k3': self.k3, 'M': self.M, 'cf_th': self.cf_th, 'cmax': self.cmax, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
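# Standalone sketch (added for illustration, not part of the original
# module) of the closeness factor used in MWMOTE.sample() above (Eq. (6)
# of the paper): the inverse of the length-normalized Euclidean distance,
# cut at cf_th and rescaled to the range [0, cmax]. The function name is
# arbitrary; cf_th and cmax default to the values used by MWMOTE.
def _mwmote_closeness_factor(y_maj, x_min, cf_th=5.0, cmax=10.0):
    """Closeness of a border majority point and an informative minority
    point, mirroring the inner helper of MWMOTE.sample()."""
    d = np.linalg.norm(y_maj - x_min)/len(y_maj)
    if d == 0.0:
        # avoid division by zero for coinciding points
        d = 0.1
    f = min(1.0/d, cf_th)
    return f/cf_th*cmax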
[docs]class PDFOS(OverSampling): """ References: * BibTex:: @article{pdfos, title = "PDFOS: PDF estimation based over-sampling for imbalanced two-class problems", journal = "Neurocomputing", volume = "138", pages = "248 - 259", year = "2014", issn = "0925-2312", doi = "https://doi.org/10.1016/j.neucom.2014.02.006", author = "Ming Gao and Xia Hong and Sheng Chen and Chris J. Harris and Emad Khalaf", keywords = "Imbalanced classification, Probability density function based over-sampling, Radial basis function classifier, Orthogonal forward selection, Particle swarm optimisation" } Notes: * Not prepared for low-rank data. """ categories = [OverSampling.cat_extensive, OverSampling.cat_density_estimation]
[docs] def __init__(self, proportion=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
def _sample_by_kernel_density_estimation(self, X, n_to_sample, n_optimize=100): """ Sample n_to_sample instances by kernel density estimation Args: X_min (np.array): minority data n_to_sample (int): number of instances to sample n_optimize (int): number of vectors used for the optimization process """ # dimensionality of the data m = len(X[0]) # computing the covariance matrix of the data S = np.cov(X, rowvar=False) message = "Condition number of covariance matrix: %f" message = message % np.linalg.cond(S) _logger.info(self.__class__.__name__ + ": " + message) message = "Inputs size: %d" % len(X) _logger.info(self.__class__.__name__ + ": " + message) _logger.info(self.__class__.__name__ + ": " + "Input dim: %d" % m) S_mrank = np.linalg.matrix_rank(S, tol=1e-2) message = "Matrix rank of covariance matrix: %d" % S_mrank _logger.info(self.__class__.__name__ + ": " + message) # checking the rank of the matrix if S_mrank < m: message = "The covariance matrix is singular, fixing it by PCA" _logger.info(self.__class__.__name__ + ": " + message) message = "dim: %d, rank: %d, size: %d" % (m, S_mrank, len(X)) _logger.info(self.__class__.__name__ + ": " + message) n_components = max([min([S_mrank, len(X)])-1, 2]) if n_components == len(X[0]): return X.copy() pca = PCA(n_components=n_components) X_low_dim = pca.fit_transform(X) X_samp = self._sample_by_kernel_density_estimation( X_low_dim, n_to_sample, n_optimize) return pca.inverse_transform(X_samp) S_inv = np.linalg.inv(S) det = np.linalg.det(S) _logger.info(self.__class__.__name__ + ": " + "Determinant: %f" % det) def eq_9(i, j, sigma, X): """ Eq (9) in the paper """ tmp = np.dot(np.dot((X[j] - X[i]), S_inv), (X[j] - X[i])) numerator = (np.sqrt(2)*sigma)**(-m)*np.exp(-(1/(4*sigma**2))*tmp) denominator = ((2*np.pi)**(m/2)) return numerator/denominator def eq_5(i, j, sigma, X): """ Eq (5) in the paper """ tmp = np.dot(np.dot((X[j] - X[i]), S_inv), (X[j] - X[i])) numerator = sigma**(-m)*np.exp(-(1/(2*sigma**2))*tmp) denominator = ((2.0*np.pi)**(m/2)) return numerator/denominator def eq_5_0(sigma, X): """ Eq (5) with the same vectors feeded in """ return sigma**(-m)/((2.0*np.pi)**(m/2)) def eq_8(i, j, sigma, X): """ Eq (8) in the paper """ e9 = eq_9(i, j, sigma, X) e5 = eq_5(i, j, sigma, X) return e9 - 2*e5 def M(sigma, X): """ Eq (7) in the paper """ total = 0.0 for i in range(len(X)): for j in range(len(X)): total = total + eq_8(i, j, sigma, X) a = total/len(X)**2 b = 2.0*eq_5_0(sigma, X)/len(X) return a + b # finding the best sigma parameter best_sigma = 0 error = np.inf # the dataset is reduced to make the optimization more efficient domain = range(len(X)) n_to_choose = min([len(X), n_optimize]) X_reduced = X[self.random_state.choice(domain, n_to_choose, replace=False)] # we suppose that the data is normalized, thus, this search space # should be meaningful for sigma in np.logspace(-5, 2, num=20): e = M(sigma, X_reduced) if e < error: error = e best_sigma = sigma _logger.info(self.__class__.__name__ + ": " + "best sigma found: %f" % best_sigma) # generating samples according to the samples = [] for _ in range(n_to_sample): idx = self.random_state.randint(len(X)) samples.append(self.random_state.multivariate_normal( X[idx], best_sigma*S)) return np.vstack(samples)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # scaling the data to aid numerical stability ss = StandardScaler() X_ss = ss.fit_transform(X) X_min = X_ss[y == self.min_label] # generating samples by kernel density estimation samples = self._sample_by_kernel_density_estimation(X_min, n_to_sample, n_optimize=100) return (np.vstack([X, ss.inverse_transform(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
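# Illustrative usage sketch (not part of the original module): PDFOS fits a
# Gaussian kernel density to the standardized minority samples and draws the
# synthetic points from it. The dataset parameters and the name _demo_pdfos
# are arbitrary choices made for illustration.
def _demo_pdfos():
    """Apply PDFOS to a toy imbalanced dataset (illustration only)."""
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=5, n_informative=3,
                               weights=[0.9, 0.1], random_state=5)
    X_samp, y_samp = PDFOS(proportion=1.0, random_state=5).sample(X, y)
    return X_samp, y_samp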
class IPADE_ID(OverSampling):
    """
    References:
        * BibTex::

            @article{ipade_id,
                    title = "Addressing imbalanced classification with
                                instance generation techniques: IPADE-ID",
                    journal = "Neurocomputing",
                    volume = "126",
                    pages = "15 - 28",
                    year = "2014",
                    note = "Recent trends in Intelligent Data Analysis Online
                            Data Processing",
                    issn = "0925-2312",
                    doi = "https://doi.org/10.1016/j.neucom.2013.01.050",
                    author = "Victoria López and Isaac Triguero and
                                Cristóbal J. Carmona and Salvador García and
                                Francisco Herrera",
                    keywords = "Differential evolution, Instance generation,
                                Nearest neighbor, Decision tree, Imbalanced
                                datasets"
                    }

    Notes:
        * According to the algorithm, if the addition of a majority sample
            does not improve the AUC during the DE optimization process, no
            further majority points are added.
        * In the differential evolution the multiplication by a random
            number seems to have a deteriorating effect, so a new scaling
            parameter has been added to fix this.
        * It is not specified how to do the evaluation.
    """

    categories = [OverSampling.cat_changes_majority,
                  OverSampling.cat_memetic,
                  OverSampling.cat_uses_classifier]
[docs] def __init__(self, F=0.1, G=0.1, OT=20, max_it=40, dt_classifier=DecisionTreeClassifier(random_state=2), base_classifier=DecisionTreeClassifier(random_state=2), n_jobs=1, random_state=None): """ Constructor of the sampling object Args: F (float): control parameter of differential evolution G (float): control parameter of the evolution OT (int): number of optimizations max_it (int): maximum number of iterations for DE_optimization dt_classifier (obj): decision tree classifier object base_classifier (obj): classifier object n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater(F, 'F', 0) self.check_greater(G, 'G', 0) self.check_greater(OT, 'OT', 0) self.check_greater(max_it, 'max_it', 0) self.check_n_jobs(n_jobs, 'n_jobs') self.F = F self.G = G self.OT = OT self.max_it = max_it self.dt_classifier = dt_classifier self.base_classifier = base_classifier self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        # as the OT and max_it parameters control the discovery of the
        # feature space, it is enough to try sufficiently large numbers
        dt_classifiers = [DecisionTreeClassifier(random_state=2)]
        base_classifiers = [DecisionTreeClassifier(random_state=2)]
        parameter_combinations = {'F': [0.1, 0.2],
                                  'G': [0.1, 0.2],
                                  'OT': [30],
                                  'max_it': [40],
                                  'dt_classifier': dt_classifiers,
                                  'base_classifier': base_classifiers}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(3): return X.copy(), y.copy() mms = MinMaxScaler() X = mms.fit_transform(X) min_indices = np.where(y == self.min_label)[0] maj_indices = np.where(y == self.maj_label)[0] def DE_optimization(GS, GS_y, X, y, min_indices, maj_indices, classifier, for_validation): """ Implements the DE_optimization method of the paper. Args: GS (np.matrix): actual best training set GS_y (np.array): corresponding class labels X (np.matrix): complete training set y (np.array): all class labels min_indices (np.array): array of minority class labels in y maj_indices (np.array): array of majority class labels in y classifier (object): base classifier for_validation (np.array): array of indices for X used for validation Returns: np.matrix: optimized training set """ # evaluate training set AUC_GS = evaluate_ID( GS, GS_y, X[for_validation], y[for_validation], classifier) # optimizing the training set for _ in range(self.max_it): GS_hat = [] # doing the differential evolution for i in range(len(GS)): if GS_y[i] == self.min_label: r1, r2, r3 = self.random_state.choice(min_indices, 3, replace=False) else: r1, r2, r3 = self.random_state.choice(maj_indices, 3, replace=False) random_value = self.random_state.random_sample() force_G = X[r1] - X[i] force_F = X[r2] - X[r3] value = GS[i] + self.G*random_value * \ force_G + self.F*force_F GS_hat.append(np.clip(value, 0.0, 1.0)) # evaluating the current setting AUC_GS_hat = evaluate_ID(GS_hat, GS_y, X[for_validation], y[for_validation], classifier) if AUC_GS_hat > AUC_GS: GS = GS_hat AUC_GS = AUC_GS_hat return GS def evaluate_ID(GS, GS_y, TR, TR_y, base_classifier): """ Implements the evaluate_ID function of the paper. Args: GS (np.matrix): actual training set GS_y (np.array): list of corresponding class labels TR (np.matrix): complete training set TR_y (np.array): all class labels base_classifier (object): classifier to be used Returns: float: ROC AUC score """ base_classifier.fit(GS, GS_y) pred = base_classifier.predict_proba(TR)[:, np.where( base_classifier.classes_ == self.min_label)[0][0]] if len(np.unique(TR_y)) != 2: return 0.0 return roc_auc_score(TR_y, pred) def evaluate_class(GS, GS_y, TR, TR_y, base_classifier): """ Implements the evaluate_ID function of the paper. 
Args: GS (np.matrix): actual training set GS_y (np.array): list of corresponding class labels TR (np.matrix): complete training set TR_y (np.array): all class labels base_classifier (object): classifier to be used Returns: float: accuracy score """ base_classifier.fit(GS, GS_y) pred = base_classifier.predict(TR) return accuracy_score(TR_y, pred) # Phase 1: Initialization _logger.info(self.__class__.__name__ + ": " + "Initialization") self.dt_classifier.fit(X, y) leafs = self.dt_classifier.apply(X) unique_leafs = np.unique(leafs) used_in_GS = np.repeat(False, len(X)) for_validation = np.where(np.logical_not(used_in_GS))[0] # extracting mean elements of the leafs GS = [] GS_y = [] for u in unique_leafs: indices = np.where(leafs == u)[0] GS.append(np.mean(X[indices], axis=0)) GS_y.append(mode(y[indices])) if len(indices) == 1: used_in_GS[indices[0]] = True # updating the indices of the validation set excluding those used in GS for_validation = np.where(np.logical_not(used_in_GS))[0] _logger.info(self.__class__.__name__ + ": " + "Size of validation set %d" % len(for_validation)) if len(np.unique(y[for_validation])) == 1: _logger.info(self.__class__.__name__ + ": " + "No minority samples in validation set") return X.copy(), y.copy() if len(np.unique(GS_y)) == 1: _logger.info(self.__class__.__name__ + ": " + "No minority samples in reduced dataset") return X.copy(), y.copy() # DE optimization takes place _logger.info(self.__class__.__name__ + ": " + "DE optimization") base_classifier = self.base_classifier.__class__( **(self.base_classifier.get_params())) GS = DE_optimization(GS, GS_y, X, y, min_indices, maj_indices, base_classifier, for_validation) # evaluate results base_classifier = self.base_classifier.__class__( **(self.base_classifier.get_params())) AUC = evaluate_ID(GS, GS_y, X[for_validation], y[for_validation], base_classifier) # Phase 2: Addition of new instances register_class = {self.min_label: 'optimizable', self.maj_label: 'optimizable'} number_of_optimizations = {self.min_label: 0, self.maj_label: 0} accuracy_class = {self.min_label: 0, self.maj_label: 0} _logger.info(self.__class__.__name__ + ": " + "Starting optimization") while (AUC < 1.0 and (register_class[self.min_label] == 'optimizable' or register_class[self.maj_label] == 'optimizable')): less_accuracy = np.inf # loop in line 8 for i in [self.min_label, self.maj_label]: # condition in line 9 if register_class[i] == 'optimizable': y_mask = y[for_validation] == i class_for_validation = for_validation[y_mask] bp = self.base_classifier.get_params() base_classifier = self.base_classifier.__class__(**(bp)) accuracy_class[i] = evaluate_class(GS, GS_y, X[class_for_validation], y[class_for_validation], base_classifier) if accuracy_class[i] < less_accuracy: less_accuracy = accuracy_class[i] target_class = i # conditional in line 17 if (target_class == self.min_label and number_of_optimizations[target_class] > 0): # it is not clear where does GS_trial coming from in line 18 GS = DE_optimization(GS, GS_y, X, y, min_indices, maj_indices, base_classifier, for_validation) else: if target_class == self.min_label: idx = self.random_state.choice(min_indices) else: idx = self.random_state.choice(maj_indices) GS_trial = np.vstack([GS, X[idx]]) GS_trial_y = np.hstack([GS_y, y[idx]]) # removing idx from the validation set in order to keep # the validation fair for_validation_trial = for_validation.tolist() if idx in for_validation: for_validation_trial.remove(idx) for_validation_trial = np.array( for_validation_trial).astype(int) # doing 
optimization GS_trial = DE_optimization(GS_trial, GS_trial_y, X, y, min_indices, maj_indices, base_classifier, for_validation) # line 23 bp = self.base_classifier.get_params() base_classifier = self.base_classifier.__class__(**(bp)) AUC_trial = evaluate_ID(GS_trial, GS_trial_y, X[for_validation], y[for_validation], base_classifier) # conditional in line 24 if AUC_trial > AUC: AUC = AUC_trial GS = GS_trial GS_y = GS_trial_y for_validation = for_validation_trial _logger.info(self.__class__.__name__ + ": " + "Size of validation set %d" % len(for_validation)) if len(np.unique(y[for_validation])) == 1: _logger.info(self.__class__.__name__ + ": " + "No minority samples in validation set") return X.copy(), y.copy() if len(np.unique(GS_y)) == 1: _logger.info(self.__class__.__name__ + ": " + "No minority samples in reduced dataset") return X.copy(), y.copy() number_of_optimizations[target_class] = 0 else: # conditional in line 29 if (target_class == self.min_label and number_of_optimizations[target_class] < self.OT): number_of_optimizations[target_class] += 1 else: register_class[target_class] = 'non-optimizable' return mms.inverse_transform(GS), GS_y
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'F': self.F, 'G': self.G, 'OT': self.OT, 'max_it': self.max_it, 'n_jobs': self.n_jobs, 'dt_classifier': self.dt_classifier, 'base_classifier': self.base_classifier, 'random_state': self._random_state_init}
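# Illustrative usage sketch (not part of the original module): IPADE_ID
# belongs to the cat_changes_majority category, so it returns a small
# generated training set rather than the original data extended with
# synthetic points. The dataset parameters and the name _demo_ipade_id are
# arbitrary choices made for illustration; the classifiers fall back to the
# default decision trees.
def _demo_ipade_id():
    """Run IPADE_ID on a toy imbalanced dataset (illustration only)."""
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5, n_informative=3,
                               weights=[0.85, 0.15], random_state=5)
    sampler = IPADE_ID(F=0.1, G=0.1, OT=20, max_it=40, random_state=5)
    X_gen, y_gen = sampler.sample(X, y)
    return X_gen, y_gen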
class RWO_sampling(OverSampling):
    """
    References:
        * BibTex::

            @article{rwo_sampling,
                    author = {Zhang, Huaxiang and Li, Mingfang},
                    year = {2014},
                    month = {11},
                    pages = {},
                    title = {RWO-Sampling: A Random Walk Over-Sampling
                                Approach to Imbalanced Data Classification},
                    volume = {20},
                    journal = {Information Fusion}
                    }
    """

    categories = [OverSampling.cat_extensive]
[docs] def __init__(self, proportion=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] stds = np.diag(np.std(X_min, axis=0)/np.sqrt(len(X_min))) samples = [] for _ in range(n_to_sample): idx = self.random_state.randint(len(X_min)) samples.append(self.random_state.multivariate_normal(X_min[idx], stds)) return (np.vstack([X, samples]), np.hstack([y, np.array([self.min_label]*len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
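# Standalone sketch (added for illustration, not part of the original
# module) of the random walk step used in RWO_sampling.sample() above: a
# synthetic point is drawn from a Gaussian centred on a randomly chosen
# minority sample, with a diagonal spread built from the per-attribute
# standard deviations scaled by 1/sqrt(n). The function name is arbitrary;
# random_state is expected to be a np.random.RandomState instance.
def _rwo_single_sample(X_min, random_state):
    """Generate one synthetic minority point the way RWO_sampling does."""
    spread = np.diag(np.std(X_min, axis=0)/np.sqrt(len(X_min)))
    idx = random_state.randint(len(X_min))
    return random_state.multivariate_normal(X_min[idx], spread)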
[docs]class NEATER(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{neater, author={Almogahed, B. A. and Kakadiaris, I. A.}, booktitle={2014 22nd International Conference on Pattern Recognition}, title={NEATER: Filtering of Over-sampled Data Using Non-cooperative Game Theory}, year={2014}, volume={}, number={}, pages={1371-1376}, keywords={data handling;game theory;information filtering;NEATER;imbalanced data problem;synthetic data;filtering of over-sampled data using non-cooperative game theory;Games;Game theory;Vectors; Sociology;Statistics;Silicon; Mathematical model}, doi={10.1109/ICPR.2014.245}, ISSN={1051-4651}, month={Aug}} Notes: * Evolving both majority and minority probabilities as nothing ensures that the probabilities remain in the range [0,1], and they need to be normalized. * The inversely weighted function needs to be cut at some value (like the alpha level), otherwise it will overemphasize the utility of having differing neighbors next to each other. """ categories = [OverSampling.cat_extensive, OverSampling.cat_borderline, OverSampling.cat_changes_majority]
[docs] def __init__(self, proportion=1.0, smote_n_neighbors=5, b=5, alpha=0.1, h=20, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples smote_n_neighbors (int): number of neighbors in SMOTE sampling b (int): number of neighbors alpha (float): smoothing term h (int): number of iterations in evolution n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(smote_n_neighbors, "smote_n_neighbors", 1) self.check_greater_or_equal(b, "b", 1) self.check_greater_or_equal(alpha, "alpha", 0) self.check_greater_or_equal(h, "h", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.smote_n_neighbors = smote_n_neighbors self.b = b self.alpha = alpha self.h = h self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'smote_n_neighbors': [3, 5, 7],
                                  'b': [3, 5, 7],
                                  'alpha': [0.1],
                                  'h': [20]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # Applying SMOTE and ADASYN X_0, y_0 = SMOTE(proportion=self.proportion, n_neighbors=self.smote_n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y) X_1, y_1 = ADASYN(n_neighbors=self.b, n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y) X_new = np.vstack([X_0, X_1[len(X):]]) y_new = np.hstack([y_0, y_1[len(y):]]) X_syn = X_new[len(X):] if len(X_syn) == 0: return X.copy(), y.copy() X_all = X_new y_all = y_new # binary indicator indicating synthetic instances synthetic = np.hstack( [np.array([False]*len(X)), np.array([True]*len(X_syn))]) # initializing strategy probabilities prob = np.zeros(shape=(len(X_all), 2)) prob.fill(0.5) for i in range(len(X)): if y[i] == self.min_label: prob[i, 0], prob[i, 1] = 0.0, 1.0 else: prob[i, 0], prob[i, 1] = 1.0, 0.0 # Finding nearest neighbors, +1 as X_syn is part of X_all and nearest # neighbors will be themselves nn = NearestNeighbors(n_neighbors=self.b + 1, n_jobs=self.n_jobs) nn.fit(X_all) distances, indices = nn.kneighbors(X_syn) # computing distances dm = pairwise_distances(X_syn, X_all) dm[dm == 0] = 1e-8 dm = 1.0/dm dm[dm > self.alpha] = self.alpha def wprob_mixed(prob, i): ind = indices[i][1:] term_0 = 1*prob[i][0]*prob[ind, 0] term_1 = dm[i, ind]*(prob[i][1]*prob[ind, 0] + prob[i][0]*prob[ind, 1]) term_2 = 1*prob[i][1]*prob[ind, 1] return np.sum(term_0 + term_1 + term_2) def wprob_min(prob, i): term_0 = 0*prob[indices[i][1:], 0] term_1 = dm[i, indices[i][1:]]*(1*prob[indices[i][1:], 0] + 0*prob[indices[i][1:], 1]) term_2 = 1*prob[indices[i][1:], 1] return np.sum(term_0 + term_1 + term_2) def wprob_maj(prob, i): term_0 = 1*prob[indices[i][1:], 0] term_1 = dm[i, indices[i][1:]]*(0*prob[indices[i][1:], 0] + 1*prob[indices[i][1:], 1]) term_2 = 0*prob[indices[i][1:], 1] return np.sum(term_0 + term_1 + term_2) def utilities(prob): """ Computes the utilit function Args: prob (np.matrix): strategy probabilities Returns: np.array, np.array, np.array: utility values, minority utilities, majority utilities """ domain = range(len(X_syn)) util_mixed = np.array([wprob_mixed(prob, i) for i in domain]) util_mixed = np.hstack([np.array([0]*len(X)), util_mixed]) util_min = np.array([wprob_min(prob, i) for i in domain]) util_min = np.hstack([np.array([0]*len(X)), util_min]) util_maj = np.array([wprob_maj(prob, i) for i in domain]) util_maj = np.hstack([np.array([0]*len(X)), util_maj]) return util_mixed, util_min, util_maj def evolution(prob, synthetic, alpha=self.alpha): """ Executing one step of the probabilistic evolution Args: prob (np.matrix): strategy probabilities synthetic (np.array): flags of synthetic examples alpha (float): smoothing function Returns: np.matrix: updated probabilities """ util_mixed, util_min, util_maj = utilities(prob) prob_new = prob.copy() synthetic_values = prob[:, 1] * \ (alpha + util_min)/(alpha + util_mixed) prob_new[:, 1] = np.where(synthetic, synthetic_values, prob[:, 1]) synthetic_values = prob[:, 0] * \ (alpha + util_maj)/(alpha + util_mixed) prob_new[:, 0] = np.where(synthetic, synthetic_values, prob[:, 0]) norm_factor = np.sum(prob_new, axis=1) 
prob_new[:, 0] = prob_new[:, 0]/norm_factor prob_new[:, 1] = prob_new[:, 1]/norm_factor return prob_new # executing the evolution for _ in range(self.h): prob = evolution(prob, synthetic) # determining final labels y_all[len(X):] = np.argmax(prob[len(X):], axis=1) return X_all, y_all
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'smote_n_neighbors': self.smote_n_neighbors, 'b': self.b, 'alpha': self.alpha, 'h': self.h, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
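# Illustrative usage sketch (not part of the original module): NEATER first
# oversamples with SMOTE and ADASYN and then keeps or relabels the synthetic
# points via the game-theoretic probability evolution implemented above.
# The dataset parameters and the name _demo_neater are arbitrary choices
# made for illustration.
def _demo_neater():
    """Apply NEATER to a toy imbalanced dataset (illustration only)."""
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=5, n_informative=3,
                               weights=[0.9, 0.1], random_state=5)
    sampler = NEATER(proportion=1.0, smote_n_neighbors=5, b=5,
                     alpha=0.1, h=20, random_state=5)
    X_samp, y_samp = sampler.sample(X, y)
    return X_samp, y_samp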
[docs]class DEAGO(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{deago, author={Bellinger, C. and Japkowicz, N. and Drummond, C.}, booktitle={2015 IEEE 14th International Conference on Machine Learning and Applications (ICMLA)}, title={Synthetic Oversampling for Advanced Radioactive Threat Detection}, year={2015}, volume={}, number={}, pages={948-953}, keywords={radioactive waste;advanced radioactive threat detection;gamma-ray spectral classification;industrial nuclear facilities;Health Canadas national monitoring networks;Vancouver 2010; Isotopes;Training;Monitoring; Gamma-rays;Machine learning algorithms; Security;Neural networks;machine learning;classification;class imbalance;synthetic oversampling; artificial neural networks; autoencoders;gamma-ray spectra}, doi={10.1109/ICMLA.2015.58}, ISSN={}, month={Dec}} Notes: * There is no hint on the activation functions and amounts of noise. """ categories = [OverSampling.cat_extensive, OverSampling.cat_density_estimation, OverSampling.cat_application]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, e=100, h=0.3, sigma=0.1, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors e (int): number of epochs h (float): fraction of number of hidden units sigma (float): training noise n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0.0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater(e, "e", 1) self.check_greater(h, "h", 0) self.check_greater(sigma, "sigma", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.e = e self.h = h self.sigma = sigma self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'e': [40],
                                  'h': [0.1, 0.2, 0.3, 0.4, 0.5],
                                  'sigma': [0.05, 0.1, 0.2]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # ugly hack to get reproducible results from keras with # tensorflow backend if isinstance(self._random_state_init, int): import os os.environ['PYTHONHASHSEED'] = str(self._random_state_init) import keras as K np.random.seed(self._random_state_init) import random random.seed(self._random_state_init) # from tensorflow import set_random_seed import tensorflow try: tensorflow.set_random_seed(self._random_state_init) except Exception as e: tensorflow.random.set_seed(self._random_state_init) else: seed = 127 import os os.environ['PYTHONHASHSEED'] = str(seed) import keras as K np.random.seed(seed) import random random.seed(seed) # from tensorflow import set_random_seed import tensorflow try: tensorflow.compat.v1.set_random_seed(seed) except Exception as e: tensorflow.random.set_seed(self._random_state_init) from keras import backend as K import tensorflow as tf try: session_conf = tf.compat.v1.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) sess = tf.compat.v1.Session( graph=tf.compat.v1.get_default_graph(), config=session_conf) K.set_session(sess) except Exception as e: session_conf = tf.compat.v1.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) sess = tf.compat.v1.Session( graph=tf.compat.v1.get_default_graph(), config=session_conf) tf.compat.v1.keras.backend.set_session(sess) if not hasattr(self, 'Input'): from keras.layers import Input, Dense, GaussianNoise from keras.models import Model from tensorflow.keras.callbacks import EarlyStopping self.Input = Input self.Dense = Dense self.GaussianNoise = GaussianNoise self.Model = Model self.EarlyStopping = EarlyStopping # sampling by smote X_samp, y_samp = SMOTE(proportion=self.proportion, n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y) # samples to map to the manifold extracted by the autoencoder X_init = X_samp[len(X):] if len(X_init) == 0: return X.copy(), y.copy() # normalizing X_min = X[y == self.min_label] ss = StandardScaler() X_min_normalized = ss.fit_transform(X_min) X_init_normalized = ss.transform(X_init) # extracting dimensions d = len(X[0]) encoding_d = max([2, int(np.rint(d*self.h))]) message = "Input dimension: %d, encoding dimension: %d" message = message % (d, encoding_d) _logger.info(self.__class__.__name__ + ": " + message ) # constructing the autoencoder callbacks = [self.EarlyStopping(monitor='val_loss', patience=2)] input_layer = self.Input(shape=(d,)) noise = self.GaussianNoise(self.sigma)(input_layer) encoded = self.Dense(encoding_d, activation='relu')(noise) decoded = self.Dense(d, activation='linear')(encoded) dae = self.Model(input_layer, decoded) dae.compile(optimizer='adadelta', loss='mean_squared_error') actual_epochs = max([self.e, int(5000.0/len(X_min))]) if len(X_min) > 10: val_perc = 0.2 val_num = int(val_perc*len(X_min)) X_min_train = X_min_normalized[:-val_num] X_min_val = X_min_normalized[-val_num:] dae.fit(X_min_train, X_min_train, epochs=actual_epochs, validation_data=(X_min_val, X_min_val), callbacks=callbacks, verbose=0) else: dae.fit(X_min_normalized, 
X_min_normalized, epochs=actual_epochs, verbose=0) # mapping the initial samples to the manifold samples = ss.inverse_transform(dae.predict(X_init_normalized)) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'e': self.e, 'h': self.h, 'sigma': self.sigma, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
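# Illustrative usage sketch (not part of the original module): DEAGO maps
# SMOTE-generated points onto the manifold learned by a denoising
# autoencoder, so running the helper below requires a working
# keras/tensorflow installation. The dataset parameters and the name
# _demo_deago are arbitrary choices made for illustration.
def _demo_deago():
    """Apply DEAGO to a toy imbalanced dataset (requires keras/tensorflow)."""
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=10, n_informative=5,
                               weights=[0.9, 0.1], random_state=5)
    sampler = DEAGO(proportion=1.0, n_neighbors=5, e=40, h=0.3, sigma=0.1,
                    random_state=5)
    X_samp, y_samp = sampler.sample(X, y)
    return X_samp, y_samp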
[docs]class Gazzah(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{gazzah, author={Gazzah, S. and Hechkel, A. and Essoukri Ben Amara, N. }, booktitle={2015 IEEE 12th International Multi-Conference on Systems, Signals Devices (SSD15)}, title={A hybrid sampling method for imbalanced data}, year={2015}, volume={}, number={}, pages={1-6}, keywords={computer vision;image classification; learning (artificial intelligence); sampling methods;hybrid sampling method;imbalanced data; diversification;computer vision domain;classical machine learning systems;intraclass variations; system performances;classification accuracy;imbalanced training data; training data set;over-sampling; minority class;SMOTE star topology; feature vector deletion;intra-class variations;distribution criterion; biometric data;true positive rate; Training data;Principal component analysis;Databases;Support vector machines;Training;Feature extraction; Correlation;Imbalanced data sets; Intra-class variations;Data analysis; Principal component analysis; One-against-all SVM}, doi={10.1109/SSD.2015.7348093}, ISSN={}, month={March}} """ categories = [OverSampling.cat_extensive, OverSampling.cat_dim_reduction, OverSampling.cat_changes_majority]
[docs] def __init__(self, proportion=1.0, n_components=2, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_components (int): number of components in PCA analysis n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_components, "n_components", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_components = n_components self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_components': [2, 3, 4, 5]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) # do the oversampling pf_smote = polynom_fit_SMOTE(proportion=self.proportion, random_state=self.random_state) X_samp, y_samp = pf_smote.sample(X, y) X_min_samp = X_samp[len(X):] if len(X_min_samp) == 0: return X.copy(), y.copy() # do the undersampling X_maj = X[y == self.maj_label] # fitting the PCA model pca = PCA(n_components=min([len(X[0]), self.n_components])) X_maj_trans = pca.fit_transform(X_maj) R = np.sqrt(np.sum(np.var(X_maj_trans, axis=0))) # determining the majority samples to remove to_remove = np.where([np.linalg.norm(x) > R for x in X_maj_trans])[0] _logger.info(self.__class__.__name__ + ": " + "Removing %d majority samples" % len(to_remove)) # removing the majority samples X_maj = np.delete(X_maj, to_remove, axis=0) if len(X_min_samp) == 0: _logger.info("no samples added") return X.copy(), y.copy() return (np.vstack([X_maj, X_min_samp]), np.hstack([np.repeat(self.maj_label, len(X_maj)), np.repeat(self.min_label, len(X_min_samp))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_components': self.n_components, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
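# Illustrative usage sketch (not part of the original module): Gazzah is a
# hybrid sampler that combines polynom_fit_SMOTE oversampling with the
# removal of majority points whose norm in the PCA-transformed space exceeds
# the radius R, so the result may contain fewer majority samples than the
# input. The dataset parameters and the name _demo_gazzah are arbitrary.
def _demo_gazzah():
    """Apply the Gazzah hybrid sampler to a toy imbalanced dataset."""
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=6, n_informative=4,
                               weights=[0.9, 0.1], random_state=5)
    X_samp, y_samp = Gazzah(proportion=1.0, n_components=2,
                            random_state=5).sample(X, y)
    return X_samp, y_samp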
[docs]class MCT(OverSampling): """ References: * BibTex:: @article{mct, author = {Jiang, Liangxiao and Qiu, Chen and Li, Chaoqun}, year = {2015}, month = {03}, pages = {1551004}, title = {A Novel Minority Cloning Technique for Cost-Sensitive Learning}, volume = {29}, booktitle = {International Journal of Pattern Recognition and Artificial Intelligence} } Notes: * Mode is changed to median, distance is changed to Euclidean to support continuous features, and normalized. """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_copy]
[docs] def __init__(self, proportion=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # having continuous variables, the mode is replaced by median x_med = np.median(X_min, axis=0) distances = np.array([np.linalg.norm(x_med - x) for x in X_min]) sums = np.sum(distances) if sums != 0: distances = distances/sums # distribution of copies is determined (Euclidean distance is a # dissimilarity measure which is changed to similarity by subtracting # from 1.0) distribution = (1.0 - distances)/(np.sum(1.0 - distances)) if any(np.isnan(distribution)): _logger.warning(self.__class__.__name__ + ": " + "NaN in the probability distribution") return X.copy(), y.copy() # do the sampling samples = [] while len(samples) < n_to_sample: samples.append(X_min[self.random_state.choice( np.arange(len(X_min)), p=distribution)]) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
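# Standalone sketch (added for illustration, not part of the original
# module) of the cloning distribution computed in MCT.sample() above:
# distances to the componentwise median are normalized, turned into
# similarities, and then normalized into a probability distribution over
# the minority samples. The function name is arbitrary.
def _mct_copy_distribution(X_min):
    """Probability of cloning each minority sample, as in MCT.sample()."""
    x_med = np.median(X_min, axis=0)
    distances = np.array([np.linalg.norm(x_med - x) for x in X_min])
    if np.sum(distances) != 0:
        distances = distances/np.sum(distances)
    # Euclidean distance is a dissimilarity, 1 - distance turns it into a
    # similarity before normalization
    return (1.0 - distances)/np.sum(1.0 - distances)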
class ADG(OverSampling):
    """
    References:
        * BibTex::

            @article{adg,
                    author = {Pourhabib, A. and Mallick, Bani K. and Ding, Yu},
                    year = {2015},
                    pages = {2695--2724},
                    title = {Absent Data Generating Classifier for Imbalanced
                                Class Sizes},
                    volume = {16},
                    journal = {Journal of Machine Learning Research}
                    }

    Notes:
        * This method has a lot of parameters, which makes it fairly hard
            to cross-validate thoroughly.
        * Fails if the matrix becomes singular when computing alpha_star;
            this is fixed by PCA.
        * Singularity might be caused by repeated samples.
        * Maintaining the kernel matrix becomes unfeasible above a couple
            of thousand vectors.
    """

    categories = [OverSampling.cat_extensive,
                  OverSampling.cat_uses_clustering]
[docs] def __init__(self, proportion=1.0, kernel='inner', lam=1.0, mu=1.0, k=12, gamma=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples kernel (str): 'inner'/'rbf_x', where x is a float, the bandwidth lam (float): lambda parameter of the method mu (float): mu parameter of the method k (int): number of samples to generate in each iteration gamma (float): gamma parameter of the method n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) if kernel != 'inner' and not kernel.startswith('rbf'): raise ValueError(self.__class__.__name__ + ": " + 'Kernel function %s not supported' % kernel) elif kernel.startswith('rbf'): par = float(kernel.split('_')[-1]) if par <= 0.0: raise ValueError(self.__class__.__name__ + ": " + 'Kernel parameter %f is not supported' % par) self.check_greater(lam, 'lam', 0) self.check_greater(mu, 'mu', 0) self.check_greater_or_equal(k, 'k', 1) self.check_greater(gamma, 'gamma', 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.kernel = kernel self.lam = lam self.mu = mu self.k = k self.gamma = gamma self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'kernel': ['inner', 'rbf_0.5',
                                             'rbf_1.0', 'rbf_2.0'],
                                  'lam': [1.0, 2.0],
                                  'mu': [1.0, 2.0],
                                  'k': [12],
                                  'gamma': [1.0, 2.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() def bic_score(kmeans, X): """ Compute BIC score for clustering Args: kmeans (sklearn.KMeans): kmeans object X (np.matrix): clustered data Returns: float: bic value Inspired by https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans """ # noqa # extract descriptors of the clustering cluster_centers = kmeans.cluster_centers_ cluster_labels = kmeans.labels_ n_clusters = kmeans.n_clusters n_in_clusters = np.bincount(cluster_labels) N, d = X.shape # compute variance for all clusters beforehand def sum_norm_2(i): return np.sum(np.linalg.norm(X[cluster_labels == i] - cluster_centers[i])**2) cluster_variances = [sum_norm_2(i) for i in range(n_clusters)] term_0 = (1.0)/((N - n_clusters) * d) term_1 = np.sum(cluster_variances) clustering_variance = term_0 * term_1 const_term = 0.5 * n_clusters * np.log(N) * (d+1) def bic_comp(i): term_0 = n_in_clusters[i] * np.log(n_in_clusters[i]) term_1 = n_in_clusters[i] * np.log(N) term_2 = (((n_in_clusters[i] * d) / 2) * np.log(2*np.pi*clustering_variance)) term_3 = ((n_in_clusters[i] - 1) * d / 2) return term_0 - term_1 - term_2 - term_3 bic = np.sum([bic_comp(i) for i in range(n_clusters)]) - const_term return bic def xmeans(X, r=(1, 10)): """ Clustering with BIC based n_cluster selection Args: X (np.matrix): data to cluster r (tuple): lower and upper bound on the number of clusters Returns: sklearn.KMeans: clustering with lowest BIC score """ best_bic = np.inf best_clustering = None # do clustering for all n_clusters in the specified range for k in range(r[0], min([r[1], len(X)])): kmeans = KMeans(n_clusters=k, random_state=self.random_state).fit(X) bic = bic_score(kmeans, X) if bic < best_bic: best_bic = bic best_clustering = kmeans return best_clustering def xgmeans(X, r=(1, 10)): """ Gaussian mixture with BIC to select the optimal number of components Args: X (np.matrix): data to cluster r (tuple): lower and upper bound on the number of components Returns: sklearn.GaussianMixture: Gaussian mixture model with the lowest BIC score """ best_bic = np.inf best_mixture = None # do model fitting for all n_components in the specified range for k in range(r[0], min([r[1], len(X)])): gmm = GaussianMixture( n_components=k, random_state=self.random_state).fit(X) bic = gmm.bic(X) if bic < best_bic: best_bic = bic best_mixture = gmm return best_mixture def evaluate_matrices(X, y, kernel=np.inner): """ The function evaluates the matrices specified in the method. 
Args: X (np.matrix): features y (np.array): target labels kernel (function): the kernel function to be used Returns: np.matrix, np.matrix, int, int, np.matrix, np.array, np.matrix, np.matrix, np.matrix np.array, np.matrix, np.matrix, np.matrix, np.matrix: X_minux, X_plus, l_minus, l_plus, X, y, K, M_plus, M_minus, M, K_plus, K_minus, N_plus, n_minus using the notations of the paper, X and y are ordered by target labels """ X_minus = X[y == self.maj_label] X_plus = X[y == self.min_label] l_minus = len(X_minus) l_plus = len(X_plus) X = np.vstack([X_minus, X_plus]) y = np.hstack([np.array([self.maj_label]*l_minus), np.array([self.min_label]*l_plus)]) K = pairwise_distances(X, X, metric=kernel) M_plus = np.mean(K[:, len(X_minus):], axis=1) M_minus = np.mean(K[:, :len(X_minus)], axis=1) M = np.dot(M_minus - M_plus, M_minus - M_plus) K_minus = K[:, :len(X_minus)] K_plus = K[:, len(X_minus):] return (X_minus, X_plus, l_minus, l_plus, X, y, K, M_plus, M_minus, M, K_plus, K_minus) # Implementation of the technique, following the steps and notations # of the paper q = n_to_sample # instantiating the proper kernel function, the parameter of the RBF # is supposed to be the denominator in the Gaussian if self.kernel == 'inner': kernel_function = np.inner else: kf = self.kernel.split('_') if kf[0] == 'rbf': d = float(kf[1]) def kernel_function( x, y): return np.exp(-np.linalg.norm(x - y)**2/d) # Initial evaluation of the matrices (X_minus, X_plus, l_minus, l_plus, X, y, K, M_plus, M_minus, M, K_plus, K_minus) = evaluate_matrices(X, y, kernel=kernel_function) # The computing of N matrix is factored into two steps, computing # N_plus and N_minus this is used to improve efficiency K_plus2 = np.dot(K_plus, K_plus.T) K_plus_sum = np.sum(K_plus, axis=1) K_plus_diad = np.outer(K_plus_sum, K_plus_sum)/l_plus K_minus2 = np.dot(K_minus, K_minus.T) K_minus_sum = np.sum(K_minus, axis=1) K_minus_diad = np.outer(K_minus_sum, K_minus_sum)/l_minus N = K_plus2 - K_plus_diad + K_minus2 - K_minus_diad X_plus_hat = X_plus.copy() l_minus = len(X_minus) early_stop = False total_added = 0 # executing the sample generation while q > 1: _logger.info(self.__class__.__name__ + ": " + "Starting iteration with q=%d" % q) # step 1 clusters = xmeans(X_plus_hat) l_c = np.array([np.sum(clusters.labels_ == i) for i in range(clusters.n_clusters)]) # step 2 k_c = ((1.0/l_c)/(np.sum(1.0/l_c))*self.k).astype(int) k_c[k_c == 0] = 1 lam_c, mu_c = self.lam/l_c, self.mu/l_c # step 3 omega = - np.sum([k_c[i]*(lam_c[i])**2/(4*mu_c[i]**2) for i in range(len(k_c))]) nu_c = - 0.5*k_c*lam_c M_plus_c = [np.mean(K[:, np.arange(len(X_minus), len(X))[ clusters.labels_ == i]]) for i in range(len(k_c))] # step 4 A = (M - self.gamma*N) - omega*K b = np.sum([(M_minus - M_plus_c[i])*nu_c[i] for i in range(len(k_c))], axis=0) try: alpha_star = np.linalg.solve(A, b) except Exception as e: # handling the issue of singular matrix _logger.warning(self.__class__.__name__ + ": " + "Singular matrix") # deleting huge data structures if q == n_to_sample: if len(X[0]) == 1: return None, None K, K_plus, K_minus = None, None, None n_components = int(np.sqrt(len(X[0]))) pca = PCA(n_components=n_components).fit(X) message = "reducing dimensionality to %d" % n_components _logger.warning(self.__class__.__name__ + ": " + message) X_trans = pca.transform(X) adg = ADG(proportion=self.proportion, kernel=self.kernel, lam=self.lam, mu=self.mu, k=self.k, gamma=self.gamma, random_state=self.random_state) X_samp, y_samp = adg.sample(X_trans, y) if X_samp is not None: return 
pca.inverse_transform(X_samp), y_samp else: return X.copy(), y.copy() else: q = int(q/2) continue # step 5 mixture = xgmeans(X_plus) # step 6 try: Z = mixture.sample(q)[0] except Exception as e: message = "sampling error in sklearn.mixture.GaussianMixture" _logger.warning( self.__class__.__name__ + ": " + message) return X.copy(), y.copy() # step 7 # computing the kernel matrix of generated samples with all samples K_10 = pairwise_distances(Z, X, metric=kernel_function) mask_inner_prod = np.where(np.inner(K_10, alpha_star) > 0)[0] Z_hat = Z[mask_inner_prod] if len(Z_hat) == 0: q = int(q/2) continue _logger.info(self.__class__.__name__ + ": " + "number of vectors added: %d/%d" % (len(Z_hat), q)) # step 8 # this step is not used for anything, the identified clusters are # only used in step 13 of the paper, however, the values set # (M_plus^c) are overwritten in step 3 of the next iteration # step 9 X_plus_hat = np.vstack([X_plus_hat, Z_hat]) l_plus = len(X_plus_hat) # step 11 - 16 # these steps have been reorganized a bit for efficient # calculations pairwd = pairwise_distances(Z_hat, Z_hat, metric=kernel_function) K = np.block([[K, K_10[mask_inner_prod].T], [K_10[mask_inner_prod], pairwd]]) K_minus = K[:, :l_minus] K_plus = K[:, l_minus:] # step 10 X = np.vstack([X_minus, X_plus_hat]) y = np.hstack([y, np.repeat(self.min_label, len(Z_hat))]) if early_stop is True: break M_plus = np.mean(K_plus, axis=1) M_minus = np.mean(K_minus, axis=1) # step 13 is already involved in the core of the loop M = np.dot(M_minus - M_plus, M_minus - M_plus) l_new = len(Z_hat) total_added = total_added + l_new K_minus2_01 = np.dot(K_minus[:-l_new:], K_minus[-l_new:].T) K_minus2 = np.block([[K_minus2, K_minus2_01], [K_minus2_01.T, np.dot(K_minus[-l_new:], K_minus[-l_new:].T)]]) K_minus_sum = M_minus*len(K_minus) K_plus2 = K_plus2 + np.dot(K_plus[:-l_new, l_new:], K_plus[:-l_new, l_new:].T) K_plus2_01 = np.dot(K_plus[:-l_new], K_plus[-l_new:].T) K_plus2 = np.block([[K_plus2, K_plus2_01], [K_plus2_01.T, np.dot(K_plus[-l_new:], K_plus[-l_new:].T)]]) K_plus_sum = M_plus*len(K_plus) N = K_plus2 - np.outer(K_plus_sum/l_plus, K_plus_sum) + \ K_minus2 - np.outer(K_minus_sum/l_minus, K_minus_sum) # step 17 if l_new/total_added < 0.01: early_stop = True else: q = int(q/2) return X.copy(), y.copy()
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'kernel': self.kernel, 'lam': self.lam, 'mu': self.mu, 'k': self.k, 'gamma': self.gamma, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
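# Illustrative usage sketch, not part of the published API: a minimal
# end-to-end application of the ADG sampler defined above on a small
# synthetic imbalanced dataset. The dataset shape, class weights and
# parameter values are arbitrary assumptions chosen for demonstration only;
# ADG may internally skip or restart sampling on degenerate inputs.
def _demo_adg():
    from sklearn.datasets import make_classification

    # a 2-class dataset with roughly a 9:1 imbalance
    X, y = make_classification(n_samples=200, n_features=4,
                               n_informative=3, n_redundant=1,
                               weights=[0.9, 0.1], random_state=5)

    sampler = ADG(proportion=1.0, kernel='inner', lam=1.0, mu=1.0,
                  k=12, gamma=1.0, random_state=5)
    X_samp, y_samp = sampler.sample(X, y)

    # after oversampling the minority count is expected to grow towards the
    # majority count (unless sampling was skipped internally)
    return np.bincount(y), np.bincount(y_samp)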
[docs]class SMOTE_IPF(OverSampling): """ References: * BibTex:: @article{smote_ipf, title = "SMOTE–IPF: Addressing the noisy and borderline examples problem in imbalanced classification by a re-sampling method with filtering", journal = "Information Sciences", volume = "291", pages = "184 - 203", year = "2015", issn = "0020-0255", doi = "https://doi.org/10.1016/j.ins.2014.08.051", author = "José A. Sáez and Julián Luengo and Jerzy Stefanowski and Francisco Herrera", keywords = "Imbalanced classification, Borderline examples, Noisy data, Noise filters, SMOTE" } """ categories = [OverSampling.cat_changes_majority, OverSampling.cat_uses_classifier]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_folds=9, k=3, p=0.01, voting='majority', classifier=DecisionTreeClassifier(random_state=2), n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in SMOTE sampling n_folds (int): the number of partitions k (int): used in stopping condition p (float): percentage value ([0,1]) used in stopping condition voting (str): 'majority'/'consensus' classifier (obj): classifier object n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(n_folds, "n_folds", 2) self.check_greater_or_equal(k, "k", 1) self.check_greater_or_equal(p, "p", 0) self.check_isin(voting, "voting", ['majority', 'consensus']) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_folds = n_folds self.k = k self.p = p self.voting = voting self.classifier = classifier self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        classifiers = [DecisionTreeClassifier(random_state=2)]
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'n_folds': [9],
                                  'k': [3],
                                  'p': [0.01],
                                  'voting': ['majority', 'consensus'],
                                  'classifier': classifiers}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # do SMOTE sampling X_samp, y_samp = SMOTE(self.proportion, self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y) n_folds = min([self.n_folds, np.sum(y == self.min_label)]) condition = 0 while True: # validating the sampled dataset validator = StratifiedKFold(n_folds) predictions = [] for train_index, _ in validator.split(X_samp, y_samp): self.classifier.fit(X_samp[train_index], y_samp[train_index]) predictions.append(self.classifier.predict(X_samp)) # do decision based on one of the voting schemes if self.voting == 'majority': pred_votes = (np.mean(predictions, axis=0) > 0.5).astype(int) to_remove = np.where(np.not_equal(pred_votes, y_samp))[0] elif self.voting == 'consensus': pred_votes = (np.mean(predictions, axis=0) > 0.5).astype(int) sum_votes = np.sum(predictions, axis=0) to_remove = np.where(np.logical_and(np.not_equal( pred_votes, y_samp), np.equal(sum_votes, self.n_folds)))[0] else: message = 'Voting scheme %s is not implemented' % self.voting raise ValueError(self.__class__.__name__ + ": " + message) # delete samples incorrectly classified _logger.info(self.__class__.__name__ + ": " + 'Removing %d elements' % len(to_remove)) X_samp = np.delete(X_samp, to_remove, axis=0) y_samp = np.delete(y_samp, to_remove) # if the number of samples removed becomes small or k iterations # were done quit if len(to_remove) < len(X_samp)*self.p: condition = condition + 1 else: condition = 0 if condition >= self.k: break return X_samp, y_samp
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_folds': self.n_folds, 'k': self.k, 'p': self.p, 'voting': self.voting, 'n_jobs': self.n_jobs, 'classifier': self.classifier, 'random_state': self._random_state_init}
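# Illustrative sketch, not part of the published API: the 'majority' and
# 'consensus' voting rules applied inside SMOTE_IPF.sample, replayed on a
# hypothetical matrix of per-fold predictions. The 0/1 labels, the number of
# folds and all prediction values below are made-up assumptions.
def _demo_ipf_voting():
    n_folds = 3
    predictions = np.array([[0, 1, 1, 0, 1],   # classifier of fold 1
                            [0, 1, 0, 0, 1],   # classifier of fold 2
                            [0, 1, 0, 1, 1]])  # classifier of fold 3
    y_samp = np.array([0, 0, 1, 0, 1])

    pred_votes = (np.mean(predictions, axis=0) > 0.5).astype(int)
    sum_votes = np.sum(predictions, axis=0)

    # 'majority': remove the samples misclassified by the majority vote
    majority_removal = np.where(np.not_equal(pred_votes, y_samp))[0]

    # 'consensus': remove only those samples whose majority vote is wrong
    # and for which all folds voted for the same (positive) label
    consensus_removal = np.where(np.logical_and(
        np.not_equal(pred_votes, y_samp),
        np.equal(sum_votes, n_folds)))[0]

    return majority_removal, consensus_removal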
[docs]class KernelADASYN(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{kernel_adasyn, author={Tang, B. and He, H.}, booktitle={2015 IEEE Congress on Evolutionary Computation (CEC)}, title={KernelADASYN: Kernel based adaptive synthetic data generation for imbalanced learning}, year={2015}, volume={}, number={}, pages={664-671}, keywords={learning (artificial intelligence); pattern classification; sampling methods;KernelADASYN; kernel based adaptive synthetic data generation;imbalanced learning;standard classification algorithms;data distribution; minority class decision rule; expensive minority class data misclassification;kernel based adaptive synthetic over-sampling approach;imbalanced data classification problems;kernel density estimation methods;Kernel; Estimation;Accuracy;Measurement; Standards;Training data;Sampling methods;Imbalanced learning; adaptive over-sampling;kernel density estimation;pattern recognition;medical and healthcare data learning}, doi={10.1109/CEC.2015.7256954}, ISSN={1089-778X}, month={May}} Notes: * The method of sampling was not specified, Markov Chain Monte Carlo has been implemented. * Not prepared for improperly conditioned covariance matrix. """ categories = [OverSampling.cat_density_estimation, OverSampling.cat_extensive, OverSampling.cat_borderline]
[docs] def __init__(self, proportion=1.0, k=5, h=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples k (int): number of neighbors in the nearest neighbors component h (float): kernel bandwidth n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(k, 'k', 1) self.check_greater(h, 'h', 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.k = k self.h = h self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'k': [5, 7, 9],
                                  'h': [0.01, 0.02, 0.05, 0.1, 0.2,
                                        0.5, 1.0, 2.0, 10.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting the nearest neighbors model nn = NearestNeighbors(n_neighbors=min([len(X_min), self.k+1]), n_jobs=self.n_jobs) nn.fit(X) distances, indices = nn.kneighbors(X_min) # computing majority score r = np.array([np.sum(y[indices[i][1:]] == self.maj_label) for i in range(len(X_min))]) if np.sum(r > 0) < 2: message = ("majority score is 0 for all or all but one " "minority samples") _logger.info(self.__class__.__name__ + ": " + message) return X.copy(), y.copy() r = r/np.sum(r) # kernel density function def p_x(x): """ Returns minority density value at x Args: x (np.array): feature vector Returns: float: density value """ result = 1.0/(len(X_min)*self.h) result = result*(1.0/(np.sqrt(2*np.pi)*self.h)**len(X[0])) exp_term = np.exp(-0.5*np.linalg.norm(x - X_min, axis=1)**2/self.h) return result*np.inner(r, exp_term) samples = [] it = 0 # parameters of the Monte Carlo sampling burn_in = 1000 periods = 50 # covariance is used to generate a random sample in the neighborhood covariance = np.cov(X_min[r > 0], rowvar=False) if len(covariance) > 1 and np.linalg.cond(covariance) > 10000: message = ("reducing dimensions due to inproperly conditioned" "covariance matrix") _logger.info(self.__class__.__name__ + ": " + message) if len(X[0]) <= 2: _logger.info(self.__class__.__name__ + ": " + "matrix ill-conditioned") return X.copy(), y.copy() n_components = int(np.rint(len(covariance)/2)) pca = PCA(n_components=n_components) X_trans = pca.fit_transform(X) ka = KernelADASYN(proportion=self.proportion, k=self.k, h=self.h, random_state=self.random_state) X_samp, y_samp = ka.sample(X_trans, y) return pca.inverse_transform(X_samp), y_samp # starting Markov-Chain Monte Carlo for sampling x_old = X_min[self.random_state.choice(np.where(r > 0)[0])] p_old = p_x(x_old) # Cholesky decomposition L = np.linalg.cholesky(covariance) while len(samples) < n_to_sample: x_new = x_old + \ np.dot(self.random_state.normal(size=len(x_old)), L) p_new = p_x(x_new) alpha = p_new/p_old u = self.random_state.random_sample() if u < alpha: x_old = x_new p_old = p_new else: pass it = it + 1 if it % periods == 0 and it > burn_in: samples.append(x_old) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'k': self.k, 'h': self.h, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
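# Illustrative sketch, not part of the published API: the weighted Gaussian
# kernel density estimate and the Metropolis-Hastings acceptance rule used by
# KernelADASYN.sample, written out for a toy 2-dimensional minority set.
# The minority points, the weights r and the proposal scale are arbitrary
# assumptions; the density formula mirrors the p_x definition above.
def _demo_kernel_adasyn_kde(h=1.0, seed=5):
    random_state = np.random.RandomState(seed)

    X_min = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
    # r: normalized majority-neighbour scores acting as KDE weights
    r = np.array([0.2, 0.5, 0.3])
    d = X_min.shape[1]

    def p_x(x):
        norm = 1.0 / (len(X_min) * h) / (np.sqrt(2 * np.pi) * h) ** d
        exp_term = np.exp(-0.5 * np.linalg.norm(x - X_min, axis=1) ** 2 / h)
        return norm * np.inner(r, exp_term)

    # one Metropolis-Hastings step with an isotropic Gaussian proposal
    x_old = X_min[1]
    x_new = x_old + random_state.normal(scale=0.5, size=d)
    alpha = p_x(x_new) / p_x(x_old)
    accepted = random_state.random_sample() < alpha
    return x_new if accepted else x_old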
[docs]class MOT2LD(OverSampling):
    """
    References:
        * BibTex::

            @InProceedings{mot2ld,
                author="Xie, Zhipeng and Jiang, Liyang and Ye, Tengju
                    and Li, Xiaoli",
                editor="Renz, Matthias and Shahabi, Cyrus and Zhou, Xiaofang
                    and Cheema, Muhammad Aamir",
                title="A Synthetic Minority Oversampling Method Based on
                    Local Densities in Low-Dimensional Space for
                    Imbalanced Learning",
                booktitle="Database Systems for Advanced Applications",
                year="2015",
                publisher="Springer International Publishing",
                address="Cham",
                pages="3--18",
                isbn="978-3-319-18123-3"
            }

    Notes:
        * Clusters might contain a single element, and all points can be
            filtered as noise.
        * Clusters might contain no elements at all if all points are
            filtered as noise.
        * The entire clustering can become empty.
        * TSNE is very slow when the number of instances exceeds a couple
            of thousand.
    """

    categories = [OverSampling.cat_uses_clustering,
                  OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_components=2, k=5, d_cut='auto', n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_components (int): number of components for stochastic neighborhood embedding k (int): number of neighbors in the nearest neighbor component d_cut (float/str): distance cut value/'auto' for automated selection n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_components, 'n_component', 1) self.check_greater_or_equal(k, 'k', 1) if isinstance(d_cut, float) or isinstance(d_cut, int): if d_cut <= 0: raise ValueError(self.__class__.__name__ + ": " + 'Non-positive d_cut is not allowed') elif d_cut != 'auto': raise ValueError(self.__class__.__name__ + ": " + 'd_cut value %s not implemented' % d_cut) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_components = n_components self.k = k self.d_cut = d_cut self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_components': [2],
                                  'k': [3, 5, 7],
                                  'd_cut': ['auto']}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() _logger.info(self.__class__.__name__ + ": " + ("starting TSNE n: %d d: %d" % (len(X), len(X[0])))) # do the stochastic embedding X_tsne = TSNE(self.n_components, random_state=self.random_state, perplexity=10, n_iter_without_progress=100, n_iter=500, verbose=3).fit_transform(X) X_min = X_tsne[y == self.min_label] _logger.info(self.__class__.__name__ + ": " + "TSNE finished") # fitting nearest neighbors model for all training data n_neighbors = min([len(X_min), self.k + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_tsne) distances, indices = nn.kneighbors(X_min) if isinstance(self.d_cut, float): d_cut = self.d_cut elif self.d_cut == 'auto': d_cut = np.max(distances[:, 1]) # fitting nearest neighbors model to the minority data nn_min = NearestNeighbors(n_neighbors=len(X_min), n_jobs=self.n_jobs) nn_min.fit(X_min) distances_min, indices_min = nn_min.kneighbors(X_min) def n_rad_neighbors(x): x = x.reshape(1, -1) return len(nn.radius_neighbors(x, d_cut, return_distance=False)[0]) # extracting the number of neighbors in a given radius rho = np.array([n_rad_neighbors(x) for x in X_min]) closest_highest = [] delta = [] # implementation of the density peak clustering algorithm # based on http://science.sciencemag.org/content/344/6191/1492.full for i in range(len(rho)): closest_neighbors = indices_min[i] closest_densities = rho[closest_neighbors] closest_highs = np.where(closest_densities > rho[i])[0] if len(closest_highs) > 0: closest_highest.append(closest_highs[0]) delta.append(distances_min[i][closest_highs[0]]) else: closest_highest.append(-1) delta.append(np.max(distances_min)) to_sort = zip(rho, delta, np.arange(len(rho))) r, d, idx = zip(*sorted(to_sort, key=lambda x: x[0])) r, d, idx = np.array(r), np.array(d), np.array(idx) if len(d) < 3: return X.copy(), y.copy() widths = np.arange(1, int(len(r)/2)) peak_indices = np.array(ssignal.find_peaks_cwt(d, widths=widths)) if len(peak_indices) == 0: _logger.info(self.__class__.__name__ + ": " + "no peaks found") return X.copy(), y.copy() cluster_center_indices = idx[peak_indices] cluster_centers = X_min[cluster_center_indices] # finding closest cluster center to minority points and deriving # cluster labels nn_cluster = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs) nn_cluster.fit(cluster_centers) dist_cluster, ind_cluster = nn_cluster.kneighbors(X_min) cluster_labels = ind_cluster[:, 0] # computing local minority counts and determining noisy samples def n_min_y(i): return np.sum(y[indices[i][1:]] == self.min_label) local_minority_count = np.array( [n_min_y(i) for i in range(len(X_min))]) noise = np.where(np.logical_or(rho == 1, local_minority_count == 0))[0] # determining importance scores importance = local_minority_count/rho prob = importance prob[noise] = 0.0 prob = prob/np.sum(prob) # extracting cluster indices cluster_indices = 
[np.where(cluster_labels == i)[0] for i in range(np.max(cluster_labels) + 1)] # removing noise from clusters cluster_indices = [list(set(c).difference(set(noise))) for c in cluster_indices] # checking if clustering is empty empty_clustering = True for i in range(len(cluster_indices)): if len(cluster_indices[i]) > 0: empty_clustering = False if empty_clustering: _logger.info(self.__class__.__name__ + ": " + "Empty clustering") return X.copy(), y.copy() cluster_sizes = np.array([len(c) for c in cluster_indices]) cluster_indices_size_0 = np.where(cluster_sizes == 0)[0] for i in range(len(prob)): if cluster_labels[i] in cluster_indices_size_0: prob[i] = 0.0 prob = prob/np.sum(prob) # carrying out the sampling X_min = X[y == self.min_label] samples = [] while len(samples) < n_to_sample: # random sample according to the distribution computed random_idx = self.random_state.choice(np.arange(len(X_min)), p=prob) # cluster label of the random minority sample cluster_label = cluster_labels[random_idx] if cluster_label == -1: continue if len(cluster_indices[cluster_label]) == 0: continue elif len(cluster_indices[cluster_label]) == 1: # if the cluster has only 1 elements, it is repeated samples.append(X_min[random_idx]) continue else: # otherwise a random cluster index is selected for sample # generation clus = cluster_indices[cluster_label] random_neigh_in_clus_idx = self.random_state.choice(clus) while random_idx == random_neigh_in_clus_idx: random_neigh_in_clus_idx = self.random_state.choice(clus) X_rand = X_min[random_idx] X_in_clus = X_min[random_neigh_in_clus_idx] samples.append(self.sample_between_points(X_rand, X_in_clus)) return (np.vstack([np.delete(X, noise, axis=0), np.vstack(samples)]), np.hstack([np.delete(y, noise), np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_components': self.n_components, 'k': self.k, 'd_cut': self.d_cut, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
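# Illustrative sketch, not part of the published API: the local density (rho)
# and distance-to-denser-point (delta) quantities of the density peak
# clustering step in MOT2LD.sample, computed directly on a toy embedded
# minority set. The point coordinates and the d_cut radius are made-up
# assumptions; the library computes rho via a radius_neighbors query instead.
def _demo_density_peaks(d_cut=1.5):
    X_min = np.array([[0.0, 0.0], [0.5, 0.0], [0.0, 0.5],
                      [5.0, 5.0], [5.5, 5.0]])
    dist = pairwise_distances(X_min)

    # rho: number of points within radius d_cut (the point itself included)
    rho = np.sum(dist < d_cut, axis=1)

    # delta: distance to the closest point of strictly higher density;
    # for the global density peak the maximum distance is used instead
    delta = np.empty(len(X_min))
    for i in range(len(X_min)):
        higher = np.where(rho > rho[i])[0]
        if len(higher) > 0:
            delta[i] = np.min(dist[i, higher])
        else:
            delta[i] = np.max(dist)
    return rho, delta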
[docs]class V_SYNTH(OverSampling): """ References: * BibTex:: @article{v_synth, author = {Young,Ii, William A. and Nykl, Scott L. and Weckman, Gary R. and Chelberg, David M.}, title = {Using Voronoi Diagrams to Improve Classification Performances when Modeling Imbalanced Datasets}, journal = {Neural Comput. Appl.}, issue_date = {July 2015}, volume = {26}, number = {5}, month = jul, year = {2015}, issn = {0941-0643}, pages = {1041--1054}, numpages = {14}, url = {http://dx.doi.org/10.1007/s00521-014-1780-0}, doi = {10.1007/s00521-014-1780-0}, acmid = {2790665}, publisher = {Springer-Verlag}, address = {London, UK, UK}, keywords = {Data engineering, Data mining, Imbalanced datasets, Knowledge extraction, Numerical algorithms, Synthetic over-sampling}, } Notes: * The proposed encompassing bounding box generation is incorrect. * Voronoi diagram generation in high dimensional spaces is instable """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_components=3, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_components (int): number of components for PCA n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_components, "n_component", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_components = n_components self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_components': [3]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs]    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])

        if n_to_sample == 0:
            _logger.warning(self.__class__.__name__ +
                            ": " + "Sampling is not needed")
            return X.copy(), y.copy()

        # creating the bounding box
        mins = np.min(X, axis=0)
        maxs = np.max(X, axis=0)
        mins = mins - 0.1*np.abs(mins)
        maxs = maxs + 0.1*np.abs(maxs)

        dim = len(X[0])

        def random_min_maxs():
            # each coordinate of a bounding box point is picked randomly
            # from the corresponding coordinate of mins or maxs
            return np.where(self.random_state.randint(0, 2, size=dim) == 0,
                            mins, maxs)

        n_bounding_box = min([100, len(X[0])])
        bounding_box = [random_min_maxs() for i in range(n_bounding_box)]

        X_bb = np.vstack([X, bounding_box])

        # applying PCA to reduce the dimensionality of the data
        n_components = min([len(X[0]), self.n_components])
        pca = PCA(n_components=n_components)
        X_pca = pca.fit_transform(X_bb)
        y_pca = np.hstack([y, np.repeat(-1, len(bounding_box))])

        dm = pairwise_distances(X_pca)
        to_remove = []
        for i in range(len(dm)):
            for j in range(i+1, len(dm)):
                if dm[i, j] < 0.001:
                    to_remove.append(i)
        X_pca = np.delete(X_pca, to_remove, axis=0)
        y_pca = np.delete(y_pca, to_remove)

        # doing the Voronoi tessellation
        voronoi = sspatial.Voronoi(X_pca)

        # extracting those ridge point pairs which are candidates for
        # generating an edge between two cells of different class labels
        candidate_face_generators = []
        for i, r in enumerate(voronoi.ridge_points):
            if r[0] < len(y) and r[1] < len(y) and not y[r[0]] == y[r[1]]:
                candidate_face_generators.append(i)

        if len(candidate_face_generators) == 0:
            return X.copy(), y.copy()

        # generating samples
        samples = []
        for _ in range(n_to_sample):
            # randomly choosing a pair from the ridge point pairs of
            # different labels
            random_face = self.random_state.choice(candidate_face_generators)

            # extracting the vertices of the face between the points
            ridge_vertices = voronoi.ridge_vertices[random_face]
            face_vertices = voronoi.vertices[ridge_vertices]

            # creating a random vector for sampling the face (supposed to
            # be convex)
            w = self.random_state.random_sample(size=len(X_pca[0]))
            w = w/np.sum(w)

            # initiating a sample point on the face
            sample_point_on_face = np.zeros(len(X_pca[0]))
            for i in range(len(X_pca[0])):
                sample_point_on_face += w[i]*face_vertices[i]

            # finding the ridge point with the minority label
            if y[voronoi.ridge_points[random_face][0]] == self.min_label:
                h = voronoi.points[voronoi.ridge_points[random_face][0]]
            else:
                h = voronoi.points[voronoi.ridge_points[random_face][1]]

            # generating a point between the minority ridge point and the
            # random point on the face
            samples.append(self.sample_between_points(sample_point_on_face,
                                                      h))

        return (np.vstack([X, pca.inverse_transform(samples)]),
                np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_components': self.n_components, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
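# Illustrative sketch, not part of the published API: generating a synthetic
# point between a minority generator point and a random convex combination of
# the vertices of a shared Voronoi face, in the spirit of V_SYNTH.sample.
# The 2-d point set is a made-up assumption, and unlike the method above the
# convex weights here run over the face vertices rather than the dimensions.
def _demo_voronoi_face_sample(seed=5):
    random_state = np.random.RandomState(seed)

    points = np.array([[0.0, 0.0], [2.0, 0.0], [0.0, 2.0],
                       [2.0, 2.0], [1.0, 1.0]])
    voronoi = sspatial.Voronoi(points)

    # pick a ridge whose vertices are all finite
    # (index -1 marks a vertex at infinity)
    for ridge_idx, vertices in enumerate(voronoi.ridge_vertices):
        if all(v >= 0 for v in vertices):
            break

    face_vertices = voronoi.vertices[voronoi.ridge_vertices[ridge_idx]]

    # random convex combination of the face vertices
    w = random_state.random_sample(size=len(face_vertices))
    w = w / np.sum(w)
    point_on_face = np.dot(w, face_vertices)

    # interpolate between one of the generator points and the face point
    generator = voronoi.points[voronoi.ridge_points[ridge_idx][0]]
    gamma = random_state.random_sample()
    return point_on_face + gamma * (generator - point_on_face)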
[docs]class OUPS(OverSampling): """ References: * BibTex:: @article{oups, title = "A priori synthetic over-sampling methods for increasing classification sensitivity in imbalanced data sets", journal = "Expert Systems with Applications", volume = "66", pages = "124 - 135", year = "2016", issn = "0957-4174", doi = "https://doi.org/10.1016/j.eswa.2016.09.010", author = "William A. Rivera and Petros Xanthopoulos", keywords = "SMOTE, OUPS, Class imbalance, Classification" } Notes: * In the description of the algorithm a fractional number p (j) is used to index a vector. """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if self.class_stats[self.min_label] < 2: message = ("The number of minority samples (%d) is not enough for" " sampling") message = message % self.class_stats[self.min_label] _logger.warning(self.__class__.__name__ + ": " + message) return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # extracting propensity scores lr = LogisticRegression(solver='lbfgs', n_jobs=self.n_jobs, random_state=self.random_state) lr.fit(X, y) propensity = lr.predict_proba(X) propensity = propensity[:, np.where( lr.classes_ == self.min_label)[0][0]] # sorting indices according to propensity scores prop_sorted = sorted(zip(propensity, np.arange( len(propensity))), key=lambda x: -x[0]) p = np.sum(y == self.maj_label)/np.sum(y == self.min_label) n = 0 samples = [] # implementing Algorithm 1 in the cited paper with some minor changes # to enable the proper sampling of p numbers while n < len(propensity) and len(samples) < n_to_sample: if (y[prop_sorted[n][1]] == self.min_label and n < len(propensity) - 1): num = 1 p_tmp = p while p_tmp > 0 and n + num < len(propensity): if self.random_state.random_sample() < p_tmp: samples.append(self.sample_between_points( X[prop_sorted[n][1]], X[prop_sorted[n+num][1]])) p_tmp = p_tmp - 1 num = num + 1 n = n + 1 return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
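# Illustrative sketch, not part of the published API: the propensity score
# ordering that drives OUPS.sample, shown on a small synthetic dataset.
# The dataset parameters and the choice of minority label (1) are arbitrary
# assumptions for demonstration only.
def _demo_oups_propensity(seed=5):
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=3, n_redundant=1,
                               weights=[0.8, 0.2], random_state=seed)
    min_label = 1

    lr = LogisticRegression(solver='lbfgs', random_state=seed)
    lr.fit(X, y)
    min_column = np.where(lr.classes_ == min_label)[0][0]
    propensity = lr.predict_proba(X)[:, min_column]

    # indices ordered by decreasing propensity; neighbouring entries of this
    # ordering are the pairs OUPS interpolates between
    order = np.argsort(-propensity)
    return order[:10]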
[docs]class SMOTE_D(OverSampling): """ References: * BibTex:: @InProceedings{smote_d, author="Torres, Fredy Rodr{\'i}guez and Carrasco-Ochoa, Jes{\'u}s A. and Mart{\'i}nez-Trinidad, Jos{\'e} Fco.", editor="Mart{\'i}nez-Trinidad, Jos{\'e} Francisco and Carrasco-Ochoa, Jes{\'u}s Ariel and Ayala Ramirez, Victor and Olvera-L{\'o}pez, Jos{\'e} Arturo and Jiang, Xiaoyi", title="SMOTE-D a Deterministic Version of SMOTE", booktitle="Pattern Recognition", year="2016", publisher="Springer International Publishing", address="Cham", pages="177--188", isbn="978-3-319-39393-3" } Notes: * Copying happens if two points are the neighbors of each other. """ categories = [OverSampling.cat_extensive]
[docs] def __init__(self, proportion=1.0, k=3, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples k (int): number of neighbors in nearest neighbors component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(k, "k", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.k = k self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'k': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs]    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])

        if n_to_sample == 0:
            _logger.warning(self.__class__.__name__ +
                            ": " + "Sampling is not needed")
            return X.copy(), y.copy()

        X_min = X[y == self.min_label]

        # fitting nearest neighbors model
        n_neighbors = min([len(X_min), self.k+1])
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs)
        nn.fit(X_min)
        dist, ind = nn.kneighbors(X_min)

        # extracting standard deviations of distances
        stds = np.std(dist[:, 1:], axis=1)

        # estimating sampling density
        if np.sum(stds) > 0:
            p_i = stds/np.sum(stds)
        else:
            _logger.warning(self.__class__.__name__ +
                            ": " + "zero distribution")
            return X.copy(), y.copy()

        # the other component of sampling density
        p_ij = dist[:, 1:]/np.sum(dist[:, 1:], axis=1)[:, None]

        # number of samples to generate between minority points
        counts_ij = n_to_sample*p_i[:, None]*p_ij

        # do the sampling
        samples = []
        for i in range(len(p_i)):
            for j in range(min([len(X_min)-1, self.k])):
                while counts_ij[i][j] > 0:
                    if self.random_state.random_sample() < counts_ij[i][j]:
                        translation = X_min[ind[i][j+1]] - X_min[i]
                        weight = counts_ij[i][j] + 1
                        samples.append(X_min[i] + translation/weight)
                    counts_ij[i][j] = counts_ij[i][j] - 1

        if len(samples) > 0:
            return (np.vstack([X, np.vstack(samples)]),
                    np.hstack([y, np.repeat(self.min_label, len(samples))]))
        else:
            return X.copy(), y.copy()
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'k': self.k, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
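# Illustrative sketch, not part of the published API: the deterministic
# allocation of synthetic samples in SMOTE_D.sample, i.e. distributing
# n_to_sample among minority points (by the spread of their neighbour
# distances) and then among their neighbours (by relative distance).
# The distance matrix below is a made-up assumption.
def _demo_smote_d_allocation(n_to_sample=10):
    # distances of 3 minority points to their k=2 nearest minority neighbours
    # (first column: distance to the point itself)
    dist = np.array([[0.0, 1.0, 3.0],
                     [0.0, 0.5, 0.6],
                     [0.0, 2.0, 2.5]])

    # per-point component of the sampling density
    stds = np.std(dist[:, 1:], axis=1)
    p_i = stds / np.sum(stds)

    # per-neighbour component of the sampling density
    p_ij = dist[:, 1:] / np.sum(dist[:, 1:], axis=1)[:, None]

    # expected number of samples to place between point i and neighbour j
    counts_ij = n_to_sample * p_i[:, None] * p_ij
    return counts_ij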
[docs]class SMOTE_PSO(OverSampling): """ References: * BibTex:: @article{smote_pso, title = "PSO-based method for SVM classification on skewed data sets", journal = "Neurocomputing", volume = "228", pages = "187 - 197", year = "2017", note = "Advanced Intelligent Computing: Theory and Applications", issn = "0925-2312", doi = "https://doi.org/10.1016/j.neucom.2016.10.041", author = "Jair Cervantes and Farid Garcia-Lamont and Lisbeth Rodriguez and Asdrúbal López and José Ruiz Castilla and Adrian Trueba", keywords = "Skew data sets, SVM, Hybrid algorithms" } Notes: * I find the description of the technique a bit confusing, especially on the bounds of the search space of velocities and positions. Equations 15 and 16 specify the lower and upper bounds, the lower bound is in fact a vector while the upper bound is a distance. I tried to implement something meaningful. * I also find the setting of accelerating constant 2.0 strange, most of the time the velocity will be bounded due to this choice. * Also, training and predicting probabilities with a non-linear SVM as the evaluation function becomes fairly expensive when the number of training vectors reaches a couple of thousands. To reduce computational burden, minority and majority vectors far from the other class are removed to reduce the size of both classes to a maximum of 500 samples. Generally, this shouldn't really affect the results as the technique focuses on the samples near the class boundaries. """ categories = [OverSampling.cat_extensive, OverSampling.cat_memetic, OverSampling.cat_uses_classifier]
[docs]    def __init__(self, k=3, eps=0.05, n_pop=10, w=1.0, c1=2.0, c2=2.0,
                 num_it=10, n_jobs=1, random_state=None):
        """
        Constructor of the sampling object

        Args:
            k (int): number of neighbors in nearest neighbors component,
                        this is also the multiplication factor of minority
                        support vectors
            eps (float): used to specify the initially generated support
                            vectors along minority-majority lines
            n_pop (int): size of population
            w (float): inertia constant
            c1 (float): acceleration constant of local optimum
            c2 (float): acceleration constant of population optimum
            num_it (int): number of iterations
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of random_state,
                                                    like in sklearn
        """
        super().__init__()
        self.check_greater_or_equal(k, "k", 1)
        self.check_greater(eps, "eps", 0)
        self.check_greater_or_equal(n_pop, "n_pop", 1)
        self.check_greater_or_equal(w, "w", 0)
        self.check_greater_or_equal(c1, "c1", 0)
        self.check_greater_or_equal(c2, "c2", 0)
        self.check_greater_or_equal(num_it, "num_it", 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.k = k
        self.eps = eps
        self.n_pop = n_pop
        self.w = w
        self.c1 = c1
        self.c2 = c2
        self.num_it = num_it
        self.n_jobs = n_jobs

        self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'k': [3, 5, 7],
                                  'eps': [0.05],
                                  'n_pop': [5],
                                  'w': [0.5, 1.0],
                                  'c1': [1.0, 2.0],
                                  'c2': [1.0, 2.0],
                                  'num_it': [5]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) # saving original dataset X_orig = X y_orig = y # scaling the records mms = MinMaxScaler() X_scaled = mms.fit_transform(X) # removing majority and minority samples far from the training data if # needed to increase performance performance_threshold = 500 n_maj_to_remove = np.sum( y == self.maj_label) - performance_threshold if n_maj_to_remove > 0: # if majority samples are to be removed nn = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs) nn.fit(X_scaled[y == self.min_label]) dist, ind = nn.kneighbors(X_scaled) di = sorted([(dist[i][0], i) for i in range(len(ind))], key=lambda x: x[0]) to_remove = [] # finding the proper number of samples farest from the minority # samples for i in reversed(range(len(di))): if y[di[i][1]] == self.maj_label: to_remove.append(di[i][1]) if len(to_remove) >= n_maj_to_remove: break # removing the samples X_scaled = np.delete(X_scaled, to_remove, axis=0) y = np.delete(y, to_remove) n_min_to_remove = np.sum( y == self.min_label) - performance_threshold if n_min_to_remove > 0: # if majority samples are to be removed nn = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs) nn.fit(X_scaled[y == self.maj_label]) dist, ind = nn.kneighbors(X_scaled) di = sorted([(dist[i][0], i) for i in range(len(ind))], key=lambda x: x[0]) to_remove = [] # finding the proper number of samples farest from the minority # samples for i in reversed(range(len(di))): if y[di[i][1]] == self.min_label: to_remove.append(di[i][1]) if len(to_remove) >= n_min_to_remove: break # removing the samples X_scaled = np.delete(X_scaled, to_remove, axis=0) y = np.delete(y, to_remove) # fitting SVM to extract initial support vectors svc = SVC(kernel='rbf', probability=True, gamma='auto', random_state=self.random_state) svc.fit(X_scaled, y) # extracting the support vectors SV_min = np.array( [i for i in svc.support_ if y[i] == self.min_label]) SV_maj = np.array( [i for i in svc.support_ if y[i] == self.maj_label]) X_SV_min = X_scaled[SV_min] X_SV_maj = X_scaled[SV_maj] # finding nearest majority support vectors n_neighbors = min([len(X_SV_maj), self.k]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_SV_maj) dist, ind = nn.kneighbors(X_SV_min) # finding the initial particle and specifying the search space X_min_gen = [] search_space = [] init_velocity = [] for i in range(len(SV_min)): for j in range(min([len(X_SV_maj), self.k])): min_vector = X_SV_min[i] maj_vector = X_SV_maj[ind[i][j]] # the upper bound of the search space if specified by the # closest majority support vector upper_bound = X_SV_maj[ind[i][0]] # the third element of the search space specification is # the distance of the vector and the closest # majority support vector, which specifies the radius of # the search norms = np.linalg.norm(min_vector - upper_bound) search_space.append([min_vector, maj_vector, norms]) # initial particles X_min_gen.append(min_vector + self.eps * (maj_vector - min_vector)) # initial velocities init_velocity.append(self.eps*(maj_vector - min_vector)) X_min_gen = np.vstack(X_min_gen) init_velocity = np.vstack(init_velocity) # evaluates a specific particle def evaluate(X_train, y_train, X_test, y_test): """ Trains support 
vector classifier and evaluates it Args: X_train (np.matrix): training vectors y_train (np.array): target labels X_test (np.matrix): test vectors y_test (np.array): test labels """ svc.fit(X_train, y_train) y_pred = svc.predict_proba(X_test)[:, np.where( svc.classes_ == self.min_label)[0][0]] return roc_auc_score(y_test, y_pred) # initializing the particle swarm and the particle and population level # memory particle_swarm = [X_min_gen.copy() for _ in range(self.n_pop)] velocities = [init_velocity.copy() for _ in range(self.n_pop)] local_best = [X_min_gen.copy() for _ in range(self.n_pop)] local_best_scores = [0.0]*self.n_pop global_best = X_min_gen.copy() global_best_score = 0.0 def evaluate_particle(X_scaled, p, y): X_extended = np.vstack([X_scaled, p]) y_extended = np.hstack([y, np.repeat(self.min_label, len(p))]) return evaluate(X_extended, y_extended, X_scaled, y) for i in range(self.num_it): _logger.info(self.__class__.__name__ + ": " + "Iteration %d" % i) # evaluate population scores = [evaluate_particle(X_scaled, p, y) for p in particle_swarm] # update best scores for i, s in enumerate(scores): if s > local_best_scores[i]: local_best_scores[i] = s local_best[i] = particle_swarm[i] if s > global_best_score: global_best_score = s global_best = particle_swarm[i] # update velocities for i, p in enumerate(particle_swarm): term_0 = self.w*velocities[i] random_1 = self.random_state.random_sample() random_2 = self.random_state.random_sample() term_1 = self.c1*random_1*(local_best[i] - p) term_2 = self.c2*random_2*(global_best - p) velocities[i] = term_0 + term_1 + term_2 # bound velocities according to search space constraints for v in velocities: for i in range(len(v)): v_i_norm = np.linalg.norm(v[i]) if v_i_norm > search_space[i][2]/2.0: v[i] = v[i]/v_i_norm*search_space[i][2]/2.0 # update positions for i, p in enumerate(particle_swarm): particle_swarm[i] = particle_swarm[i] + velocities[i] # bound positions according to search space constraints for p in particle_swarm: for i in range(len(p)): ss = search_space[i] trans_vector = p[i] - ss[0] trans_norm = np.linalg.norm(trans_vector) normed_trans = trans_vector/trans_norm if trans_norm > ss[2]: p[i] = ss[0] + normed_trans*ss[2] X_ret = np.vstack([X_orig, mms.inverse_transform(global_best)]) y_ret = np.hstack( [y_orig, np.repeat(self.min_label, len(global_best))]) return (X_ret, y_ret)
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'k': self.k, 'eps': self.eps, 'n_pop': self.n_pop, 'w': self.w, 'c1': self.c1, 'c2': self.c2, 'num_it': self.num_it, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
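# Illustrative sketch, not part of the published API: one particle swarm
# update as used in SMOTE_PSO.sample, with the inertia (w) and acceleration
# (c1, c2) terms written out for a single particle of generated minority
# vectors. All arrays and constants below are made-up assumptions.
def _demo_pso_update(w=1.0, c1=2.0, c2=2.0, seed=5):
    random_state = np.random.RandomState(seed)

    particle = np.array([[0.2, 0.3], [0.6, 0.1]])    # current positions
    velocity = np.array([[0.01, 0.02], [0.00, 0.01]])
    local_best = np.array([[0.25, 0.25], [0.55, 0.15]])
    global_best = np.array([[0.30, 0.20], [0.50, 0.20]])

    r1 = random_state.random_sample()
    r2 = random_state.random_sample()

    # velocity update: inertia + attraction to the particle's own best
    # position + attraction to the population's best position
    velocity = (w * velocity
                + c1 * r1 * (local_best - particle)
                + c2 * r2 * (global_best - particle))

    # position update
    particle = particle + velocity
    return particle, velocity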
[docs]class CURE_SMOTE(OverSampling): """ References: * BibTex:: @Article{cure_smote, author="Ma, Li and Fan, Suohai", title="CURE-SMOTE algorithm and hybrid algorithm for feature selection and parameter optimization based on random forests", journal="BMC Bioinformatics", year="2017", month="Mar", day="14", volume="18", number="1", pages="169", issn="1471-2105", doi="10.1186/s12859-017-1578-z", url="https://doi.org/10.1186/s12859-017-1578-z" } Notes: * It is not specified how to determine the cluster with the "slowest growth rate" * All clusters can be removed as noise. """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering]
[docs] def __init__(self, proportion=1.0, n_clusters=5, noise_th=2, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_clusters (int): number of clusters to generate noise_th (int): below this number of elements the cluster is considered as noise n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_clusters, "n_clusters", 1) self.check_greater_or_equal(noise_th, "noise_th", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_clusters = n_clusters self.noise_th = noise_th self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_clusters': [5, 10, 15],
                                  'noise_th': [1, 3]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # standardizing the data mms = MinMaxScaler() X_scaled = mms.fit_transform(X) X_min = X_scaled[y == self.min_label] # initiating clustering clusters = [np.array([i]) for i in range(len(X_min))] dm = pairwise_distances(X_min) # setting the diagonal of the distance matrix to infinity for i in range(len(dm)): dm[i, i] = np.inf # starting the clustering iteration iteration = 0 while len(clusters) > self.n_clusters: iteration = iteration + 1 # delete a cluster with slowest growth rate, determined by # the cluster size if iteration % self.n_clusters == 0: # extracting cluster sizes cluster_sizes = np.array([len(c) for c in clusters]) # removing one of the clusters with the smallest size to_remove = np.where(cluster_sizes == np.min(cluster_sizes))[0] to_remove = self.random_state.choice(to_remove) del clusters[to_remove] # adjusting the distance matrix accordingly dm = np.delete(dm, to_remove, axis=0) dm = np.delete(dm, to_remove, axis=1) # finding the cluster pair with the smallest distance min_coord = np.where(dm == np.min(dm)) merge_a = min_coord[0][0] merge_b = min_coord[1][0] # merging the clusters clusters[merge_a] = np.hstack( [clusters[merge_a], clusters[merge_b]]) # removing one of them del clusters[merge_b] # adjusting the distances in the distance matrix dm[merge_a] = np.min(np.vstack([dm[merge_a], dm[merge_b]]), axis=0) dm[:, merge_a] = dm[merge_a] # removing the row and column corresponding to one of # the merged clusters dm = np.delete(dm, merge_b, axis=0) dm = np.delete(dm, merge_b, axis=1) # updating the diagonal for i in range(len(dm)): dm[i, i] = np.inf # removing clusters declared as noise to_remove = [] for i in range(len(clusters)): if len(clusters[i]) < self.noise_th: to_remove.append(i) clusters = [clusters[i] for i in range(len(clusters)) if i not in to_remove] # all clusters can be noise if len(clusters) == 0: _logger.warning(self.__class__.__name__ + ": " + "all clusters removed as noise") return X.copy(), y.copy() # generating samples samples = [] for _ in range(n_to_sample): cluster_idx = self.random_state.randint(len(clusters)) center = np.mean(X_min[clusters[cluster_idx]], axis=0) representative = X_min[self.random_state.choice( clusters[cluster_idx])] samples.append(self.sample_between_points(center, representative)) return (np.vstack([X, mms.inverse_transform(np.vstack(samples))]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_clusters': self.n_clusters, 'noise_th': self.noise_th, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
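# Illustrative sketch, not part of the published API: a single merge step of
# the agglomerative clustering loop in CURE_SMOTE.sample, operating on a
# small made-up distance matrix with single-linkage style distance updates.
def _demo_cure_merge_step():
    clusters = [np.array([0]), np.array([1]), np.array([2])]
    dm = pairwise_distances(np.array([[0.0], [0.1], [1.0]]))
    np.fill_diagonal(dm, np.inf)

    # the closest pair of clusters is merged
    merge_a, merge_b = np.unravel_index(np.argmin(dm), dm.shape)
    clusters[merge_a] = np.hstack([clusters[merge_a], clusters[merge_b]])
    del clusters[merge_b]

    # distances to the merged cluster become the row-wise minimum of the
    # two merged rows, then the absorbed row and column are dropped
    dm[merge_a] = np.min(np.vstack([dm[merge_a], dm[merge_b]]), axis=0)
    dm[:, merge_a] = dm[merge_a]
    dm = np.delete(np.delete(dm, merge_b, axis=0), merge_b, axis=1)
    np.fill_diagonal(dm, np.inf)

    return clusters, dm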
[docs]class SOMO(OverSampling):
    """
    References:
        * BibTex::

            @article{somo,
                title = "Self-Organizing Map Oversampling (SOMO) for
                    imbalanced data set learning",
                journal = "Expert Systems with Applications",
                volume = "82",
                pages = "40 - 52",
                year = "2017",
                issn = "0957-4174",
                doi = "https://doi.org/10.1016/j.eswa.2017.03.073",
                author = "Georgios Douzas and Fernando Bacao"
            }

    Notes:
        * It is not specified how to handle the case when a cluster contains
            a single minority sample; in these cases the mean of
            within-cluster distances is set to 100.
    """

    categories = [OverSampling.cat_extensive,
                  OverSampling.cat_uses_clustering]
[docs] def __init__(self, proportion=1.0, n_grid=10, sigma=0.2, learning_rate=0.5, n_iter=100, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_grid (int): size of grid sigma (float): sigma of SOM learning_rate (float) learning rate of SOM n_iter (int): number of iterations n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, 'proportion', 0) self.check_greater_or_equal(n_grid, 'n_grid', 2) self.check_greater(sigma, 'sigma', 0) self.check_greater(learning_rate, 'learning_rate', 0) self.check_greater_or_equal(n_iter, 'n_iter', 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_grid = n_grid self.sigma = sigma self.learning_rate = learning_rate self.n_iter = n_iter self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_grid': [5, 9, 13],
                                  'sigma': [0.4],
                                  'learning_rate': [0.3, 0.5],
                                  'n_iter': [100]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() N_inter = n_to_sample/2 N_intra = n_to_sample/2 import minisom # training SOM som = minisom.MiniSom(self.n_grid, self.n_grid, len(X[0]), sigma=self.sigma, learning_rate=self.learning_rate, random_seed=3) som.train_random(X, self.n_iter) # constructing the grid grid_min = {} grid_maj = {} for i in range(len(y)): tmp = som.winner(X[i]) idx = (tmp[0], tmp[1]) if idx not in grid_min: grid_min[idx] = [] if idx not in grid_maj: grid_maj[idx] = [] if y[i] == self.min_label: grid_min[idx].append(i) else: grid_maj[idx].append(i) # converting the grid to arrays for i in grid_min: grid_min[i] = np.array(grid_min[i]) for i in grid_maj: grid_maj[i] = np.array(grid_maj[i]) # filtering filtered = {} for i in grid_min: if i not in grid_maj: filtered[i] = True else: filtered[i] = (len(grid_maj[i]) + 1)/(len(grid_min[i])+1) < 1.0 # computing densities densities = {} for i in filtered: if filtered[i]: if len(grid_min[i]) > 1: paird = pairwise_distances(X[grid_min[i]]) densities[i] = len(grid_min[i])/np.mean(paird)**2 else: densities[i] = 10 # all clusters can be filtered if len(densities) == 0: _logger.warning(self.__class__.__name__ + ": " + "all clusters filtered") return X.copy(), y.copy() # computing neighbour densities, using 4 neighborhood neighbors = [[0, 1], [0, -1], [1, 0], [-1, 0]] pair_densities = {} for i in densities: for n in neighbors: j = (i[0] + n[0], i[1] + n[1]), if j in densities: pair_densities[(i, j)] = densities[i] + densities[j] # computing weights density_keys = list(densities.keys()) density_vals = np.array(list(densities.values())) # determining pair keys and density values pair_keys = list(pair_densities.keys()) pair_vals = np.array(list(pair_densities.values())) # determining densities density_vals = (1.0/density_vals)/np.sum(1.0/density_vals) pair_dens_vals = (1.0/pair_vals)/np.sum(1.0/pair_vals) # computing num of samples to generate if len(pair_vals) > 0: dens_num = N_intra pair_num = N_inter else: dens_num = N_inter + N_intra pair_num = 0 # generating the samples according to the extracted distributions samples = [] while len(samples) < dens_num: cluster_idx = density_keys[self.random_state.choice( np.arange(len(density_keys)), p=density_vals)] cluster = grid_min[cluster_idx] sample_a, sample_b = self.random_state.choice(cluster, 2) samples.append(self.sample_between_points( X[sample_a], X[sample_b])) while len(samples) < pair_num: idx = pair_keys[self.random_state.choice( np.arange(len(pair_keys)), p=pair_dens_vals)] cluster_a = grid_min[idx[0]] cluster_b = grid_min[idx[1]] X_a = X[self.random_state.choice(cluster_a)] X_b = X[self.random_state.choice(cluster_b)] samples.append(self.sample_between_points(X_a, X_b)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_grid': self.n_grid, 'sigma': self.sigma, 'learning_rate': self.learning_rate, 'n_iter': self.n_iter, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
[docs]class ISOMAP_Hybrid(OverSampling): """ References: * BibTex:: @inproceedings{isomap_hybrid, author = {Gu, Qiong and Cai, Zhihua and Zhu, Li}, title = {Classification of Imbalanced Data Sets by Using the Hybrid Re-sampling Algorithm Based on Isomap}, booktitle = {Proceedings of the 4th International Symposium on Advances in Computation and Intelligence}, series = {ISICA '09}, year = {2009}, isbn = {978-3-642-04842-5}, location = {Huangshi, China}, pages = {287--296}, numpages = {10}, doi = {10.1007/978-3-642-04843-2_31}, acmid = {1691478}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, keywords = {Imbalanced data set, Isomap, NCR, Smote, re-sampling}, } """ categories = [OverSampling.cat_extensive, OverSampling.cat_noise_removal, OverSampling.cat_dim_reduction, OverSampling.cat_changes_majority]
    def __init__(self, proportion=1.0, n_neighbors=5, n_components=3,
                 smote_n_neighbors=5, n_jobs=1, random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                                n_min to sample e.g. 1.0 means that after
                                sampling the number of minority samples
                                will be equal to the number of majority
                                samples
            n_neighbors (int): number of neighbors in the Isomap embedding
            n_components (int): number of components of the Isomap embedding
            smote_n_neighbors (int): number of neighbors in SMOTE sampling
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                                                  random_state, like in
                                                  sklearn
        """
        super().__init__()
        self.check_greater_or_equal(proportion, "proportion", 0)
        self.check_greater_or_equal(n_neighbors, "n_neighbors", 1)
        self.check_greater_or_equal(n_components, "n_components", 1)
        self.check_greater_or_equal(smote_n_neighbors, "smote_n_neighbors", 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.smote_n_neighbors = smote_n_neighbors
        self.n_jobs = n_jobs

        self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'n_components': [2, 3, 4],
                                  'smote_n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                                    labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        # embedding the data by Isomap and oversampling in the embedded space
        self.isomap = Isomap(n_neighbors=self.n_neighbors,
                             n_components=self.n_components,
                             n_jobs=self.n_jobs)
        X_trans = self.isomap.fit_transform(X, y)

        X_sm, y_sm = SMOTE(proportion=self.proportion,
                           n_neighbors=self.smote_n_neighbors,
                           n_jobs=self.n_jobs,
                           random_state=self.random_state).sample(X_trans, y)

        # removing noise from the oversampled, embedded dataset
        nc = NeighborhoodCleaningRule(n_jobs=self.n_jobs)
        return nc.remove_noise(X_sm, y_sm)
[docs] def preprocessing_transform(self, X): """ Transforms new data by the trained isomap Args: X (np.matrix): new data Returns: np.matrix: the transformed data """ return self.isomap.transform(X)
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_components': self.n_components, 'smote_n_neighbors': self.smote_n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
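# Illustrative usage sketch (not part of the original module): as sample()
# above applies SMOTE to the Isomap-embedded X_trans, the resampled data is
# returned in the embedded space, so new data should be mapped with
# preprocessing_transform before prediction. The helper name and the
# synthetic dataset below are assumptions for demonstration only.
def _isomap_hybrid_example():
    from sklearn.datasets import make_classification
    from sklearn.neighbors import KNeighborsClassifier

    X, y = make_classification(n_samples=200, n_features=10,
                               weights=[0.9, 0.1], random_state=5)

    oversampler = ISOMAP_Hybrid(proportion=1.0, n_neighbors=5,
                                n_components=3, smote_n_neighbors=5,
                                random_state=5)
    X_samp, y_samp = oversampler.sample(X, y)

    # classifier trained in the embedded space
    classifier = KNeighborsClassifier(n_neighbors=5).fit(X_samp, y_samp)

    # new data is mapped by the fitted isomap before prediction
    predictions = classifier.predict(oversampler.preprocessing_transform(X))
    return predictions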
[docs]class CE_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{ce_smote, author={Chen, S. and Guo, G. and Chen, L.}, booktitle={2010 IEEE 24th International Conference on Advanced Information Networking and Applications Workshops}, title={A New Over-Sampling Method Based on Cluster Ensembles}, year={2010}, volume={}, number={}, pages={599-604}, keywords={data mining;Internet;pattern classification;pattern clustering; over sampling method;cluster ensembles;classification method; imbalanced data handling;CE-SMOTE; clustering consistency index; cluster boundary minority samples; imbalanced public data set; Mathematics;Computer science; Electronic mail;Accuracy;Nearest neighbor searches;Application software;Data mining;Conferences; Web sites;Information retrieval; classification;imbalanced data sets;cluster ensembles; over-sampling}, doi={10.1109/WAINA.2010.40}, ISSN={}, month={April}} """ categories = [OverSampling.cat_extensive, OverSampling.cat_borderline, OverSampling.cat_uses_clustering, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, h=10, k=5, alpha=0.5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples h (int): size of ensemble k (int): number of clusters/neighbors alpha (float): [0,1] threshold to select boundary samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(h, "h", 1) self.check_greater_or_equal(k, "k", 1) self.check_in_range(alpha, "alpha", [0, 1]) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.h = h self.k = k self.alpha = alpha self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'h': [5, 10, 15],
                                  'k': [3, 5, 7],
                                  'alpha': [0.2, 0.5, 0.8]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # do the clustering and labelling d = len(X[0]) labels = [] for _ in range(self.h): f = self.random_state.randint(int(d/2), d) features = self.random_state.choice(np.arange(d), f) n_clusters = min([len(X), self.k]) kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state) kmeans.fit(X[:, features]) labels.append(kmeans.labels_) # do the cluster matching, clustering 0 will be considered the one to # match the others to the problem of finding cluster matching is # basically the "assignment problem" base_label = 0 for i in range(len(labels)): if not i == base_label: cost_matrix = np.zeros(shape=(self.k, self.k)) for j in range(self.k): mask_j = labels[base_label] == j for k in range(self.k): mask_k = labels[i] == k mask_jk = np.logical_and(mask_j, mask_k) cost_matrix[j, k] = np.sum(mask_jk) # solving the assignment problem row_ind, _ = soptimize.linear_sum_assignment(-cost_matrix) # doing the relabeling relabeling = labels[i].copy() for j in range(len(row_ind)): relabeling[labels[i] == k] = j labels[i] = relabeling # compute clustering consistency index labels = np.vstack(labels) cci = np.apply_along_axis(lambda x: max( set(x.tolist()), key=x.tolist().count), 0, labels) cci = np.sum(labels == cci, axis=0) cci = cci/self.h # determining minority boundary samples P_boundary = X[np.logical_and( y == self.min_label, cci < self.alpha)] # there might be no boundary samples if len(P_boundary) <= 1: _logger.warning(self.__class__.__name__ + ": " + "empty boundary") return X.copy(), y.copy() # finding nearest neighbors of boundary samples n_neighbors = min([len(P_boundary), self.k]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(P_boundary) dist, ind = nn.kneighbors(P_boundary) # do the sampling samples = [] for _ in range(n_to_sample): idx = self.random_state.randint(len(ind)) point_a = P_boundary[idx] point_b = P_boundary[self.random_state.choice(ind[idx][1:])] samples.append(self.sample_between_points(point_a, point_b)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'h': self.h, 'k': self.k, 'alpha': self.alpha, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
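# Illustrative usage sketch (not part of the original module): a single
# CE_SMOTE run, followed by instantiating a few of the settings suggested by
# the parameter_combinations classmethod; the helper name and the synthetic
# dataset are assumptions for demonstration only.
def _ce_smote_example():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=6,
                               weights=[0.85, 0.15], random_state=5)

    X_samp, y_samp = CE_SMOTE(proportion=1.0, h=10, k=5, alpha=0.5,
                              random_state=5).sample(X, y)

    # the first few recommended parameterizations
    for params in CE_SMOTE.parameter_combinations()[:3]:
        _ = CE_SMOTE(**params, random_state=5)

    return X_samp, y_samp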
[docs]class Edge_Det_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{Edge_Det_SMOTE, author={Kang, Y. and Won, S.}, booktitle={ICCAS 2010}, title={Weight decision algorithm for oversampling technique on class-imbalanced learning}, year={2010}, volume={}, number={}, pages={182-186}, keywords={edge detection;learning (artificial intelligence);weight decision algorithm;oversampling technique; class-imbalanced learning;class imbalanced data problem;edge detection algorithm;spatial space representation;Classification algorithms;Image edge detection; Training;Noise measurement;Glass; Training data;Machine learning; Imbalanced learning;Classification; Weight decision;Oversampling; Edge detection}, doi={10.1109/ICCAS.2010.5669889}, ISSN={}, month={Oct}} Notes: * This technique is very loosely specified. """ categories = [OverSampling.cat_density_based, OverSampling.cat_borderline, OverSampling.cat_extensive]
[docs] def __init__(self, proportion=1.0, k=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples k (int): number of neighbors n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(k, "k", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.k = k self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'k': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() d = len(X[0]) X_min = X[y == self.min_label] # organizing class labels according to feature ranking magnitudes = np.zeros(len(X)) for i in range(d): to_sort = zip(X[:, i], np.arange(len(X)), y) _, idx, label = zip(*sorted(to_sort, key=lambda x: x[0])) # extracting edge magnitudes in this dimension for j in range(1, len(idx)-1): magnitudes[idx[j]] = magnitudes[idx[j]] + \ (label[j-1] - label[j+1])**2 # density estimation magnitudes = magnitudes[y == self.min_label] magnitudes = np.sqrt(magnitudes) magnitudes = magnitudes/np.sum(magnitudes) # fitting nearest neighbors models to minority samples n_neighbors = min([len(X_min), self.k+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) # do the sampling samples = [] for _ in range(n_to_sample): idx = self.random_state.choice(np.arange(len(X_min)), p=magnitudes) X_a = X_min[idx] X_b = X_min[self.random_state.choice(ind[idx][1:])] samples.append(self.sample_between_points(X_a, X_b)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'k': self.k, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
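# Illustrative usage sketch (not part of the original module): Edge_Det_SMOTE
# samples minority points proportionally to the edge magnitudes computed in
# sample() above; the helper name and the synthetic dataset are assumptions
# for demonstration only.
def _edge_det_smote_example():
    import numpy as np
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5,
                               weights=[0.9, 0.1], random_state=5)

    X_samp, y_samp = Edge_Det_SMOTE(proportion=1.0, k=5,
                                    random_state=5).sample(X, y)

    # with proportion=1.0 the two classes are expected to be balanced
    return np.unique(y_samp, return_counts=True)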
[docs]class CBSO(OverSampling): """ References: * BibTex:: @InProceedings{cbso, author="Barua, Sukarna and Islam, Md. Monirul and Murase, Kazuyuki", editor="Lu, Bao-Liang and Zhang, Liqing and Kwok, James", title="A Novel Synthetic Minority Oversampling Technique for Imbalanced Data Set Learning", booktitle="Neural Information Processing", year="2011", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="735--744", isbn="978-3-642-24958-7" } Notes: * Clusters containing 1 element induce cloning of samples. """ categories = [OverSampling.cat_uses_clustering, OverSampling.cat_density_based, OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, C_p=1.3, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors C_p (float): used to set the threshold of clustering n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater(C_p, "C_p", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.C_p = C_p self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'C_p': [0.8, 1.0, 1.3, 1.6]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting nearest neighbors model to find neighbors of minority points nn = NearestNeighbors(n_neighbors=self.n_neighbors + 1, n_jobs=self.n_jobs).fit(X) dist, ind = nn.kneighbors(X_min) # extracting the number of majority neighbors weights = [np.sum(y[ind[i][1:]] == self.maj_label) for i in range(len(X_min))] # determine distribution of generating data weights = weights/np.sum(weights) # do the clustering nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs).fit(X_min) d_avg = np.mean(nn.kneighbors(X_min)[0][:, 1]) T_h = d_avg*self.C_p # initiating clustering clusters = [np.array([i]) for i in range(len(X_min))] dm = pairwise_distances(X_min) # setting the diagonal of the distance matrix to infinity for i in range(len(dm)): dm[i, i] = np.inf # starting the clustering iteration while True: # finding the cluster pair with the smallest distance min_coord = np.where(dm == np.min(dm)) merge_a = min_coord[0][0] merge_b = min_coord[1][0] # check termination conditions if dm[merge_a, merge_b] > T_h or len(dm) == 1: break # merging the clusters clusters[merge_a] = np.hstack( [clusters[merge_a], clusters[merge_b]]) # removing one of them del clusters[merge_b] # adjusting the distances in the distance matrix dm[merge_a] = np.min(np.vstack([dm[merge_a], dm[merge_b]]), axis=0) dm[:, merge_a] = dm[merge_a] # removing the row and column corresponding to one of the # merged clusters dm = np.delete(dm, merge_b, axis=0) dm = np.delete(dm, merge_b, axis=1) # updating the diagonal for i in range(len(dm)): dm[i, i] = np.inf # extracting cluster labels labels = np.zeros(len(X_min)).astype(int) for i in range(len(clusters)): for j in clusters[i]: labels[j] = i # do the sampling samples = [] while len(samples) < n_to_sample: idx = self.random_state.choice(np.arange(len(X_min)), p=weights) if len(clusters[labels[idx]]) <= 1: samples.append(X_min[idx]) continue else: random_idx = self.random_state.choice(clusters[labels[idx]]) while random_idx == idx: random_idx = self.random_state.choice( clusters[labels[idx]]) samples.append(self.sample_between_points( X_min[idx], X_min[random_idx])) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'C_p': self.C_p, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
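# Illustrative usage sketch (not part of the original module): in CBSO the
# C_p parameter scales the agglomeration threshold T_h = d_avg*C_p used in
# sample() above, so larger values tend to yield fewer, larger clusters; the
# helper name and the synthetic dataset are assumptions for demonstration.
def _cbso_example():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5,
                               weights=[0.9, 0.1], random_state=5)

    results = {}
    for C_p in [0.8, 1.3, 1.6]:
        oversampler = CBSO(proportion=1.0, n_neighbors=5, C_p=C_p,
                           random_state=5)
        results[C_p] = oversampler.sample(X, y)
    return results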
[docs]class E_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{e_smote, author={Deepa, T. and Punithavalli, M.}, booktitle={2011 3rd International Conference on Electronics Computer Technology}, title={An E-SMOTE technique for feature selection in High-Dimensional Imbalanced Dataset}, year={2011}, volume={2}, number={}, pages={322-324}, keywords={bioinformatics;data mining;pattern classification;support vector machines; E-SMOTE technique;feature selection; high-dimensional imbalanced dataset; data mining;bio-informatics;dataset balancing;SVM classification;micro array dataset;Feature extraction; Genetic algorithms;Support vector machines;Data mining;Machine learning; Bioinformatics;Cancer;Imbalanced dataset;Featue Selection;E-SMOTE; Support Vector Machine[SVM]}, doi={10.1109/ICECTECH.2011.5941710}, ISSN={}, month={April}} Notes: * This technique is basically unreproducible. I try to implement something following the idea of applying some simple genetic algorithm for optimization. * In my best understanding, the technique uses evolutionary algorithms for feature selection and then applies vanilla SMOTE on the selected features only. """ categories = [OverSampling.cat_extensive, OverSampling.cat_dim_reduction, OverSampling.cat_memetic, OverSampling.cat_changes_majority]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, min_features=2, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in the nearest neighbors component min_features (int): minimum number of features n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(min_features, "min_features", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.min_features = min_features self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'min_features': [1, 2, 3]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) min_features = min(self.min_features, len(X[0])) if len(X) < 800: classifier = SVC(gamma='auto', random_state=self.random_state) else: classifier = DecisionTreeClassifier( max_depth=4, random_state=self.random_state) # parameters of the evolutionary algorithm n_generations = 50 n_population = 5 # creating initial mask mask = self.random_state.choice([True, False], len(X[0]), replace=True) # fixing if the mask doesn't contain any features if np.sum(mask) == 0: mask[self.random_state.randint(len(mask))] = True def crossover(mask_a, mask_b): """ Crossover operation for two masks Args: mask_a (np.array): binary mask 1 mask_b (np.array): binary mask 2 Returns: np.array: the result of crossover """ mask = mask_a.copy() for i in range(len(mask_b)): if self.random_state.randint(0, 2) == 0: mask[i] = mask_b[i] while np.sum(mask) < min_features: mask[self.random_state.randint(len(mask))] = True return mask def mutate(mask_old): """ Mutation operation for a mask Args: mask_old (np.array): binary mask Returns: np.array: the result of mutation """ mask = mask_old.copy() for i in range(len(mask)): if self.random_state.randint(0, 2) == 0: mask[i] = not mask[i] while np.sum(mask) < min_features: mask[self.random_state.randint(len(mask))] = True return mask # generating initial population population = [[0, mask.copy()] for _ in range(n_population)] for _ in range(n_generations): # in each generation for _ in range(n_population): # for each element of a population if self.random_state.randint(0, 2) == 0: # crossover i_0 = self.random_state.randint(n_population) i_1 = self.random_state.randint(n_population) mask = crossover(population[i_0][1], population[i_1][1]) else: # mutation idx = self.random_state.randint(n_population) mask = mutate(population[idx][1]) # evaluation message = "evaluating mask selection with features %d/%d" message = message % (np.sum(mask), len(mask)) _logger.info(self.__class__.__name__ + ": " + message) classifier.fit(X[:, mask], y) score = np.sum(y == classifier.predict(X[:, mask]))/len(y) # appending the result to the population population.append([score, mask]) # sorting the population in a reversed order and keeping the # elements with the highest scores population = sorted(population, key=lambda x: -x[0])[:n_population] self.mask = population[0][1] # resampling the population in the given dimensions smote = SMOTE(self.proportion, self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state) return smote.sample(X[:, self.mask], y)
[docs] def preprocessing_transform(self, X): """ Transform new data by the learnt transformation Args: X (np.matrix): new data Returns: np.matrix: transformed data """ return X[:, self.mask]
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'min_features': self.min_features, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
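# Illustrative usage sketch (not part of the original module): E_SMOTE learns
# a boolean feature mask and returns the oversampled data restricted to the
# selected features, so new data has to be reduced with
# preprocessing_transform as well; the helper name and the synthetic dataset
# are assumptions for demonstration only.
def _e_smote_example():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=8,
                               weights=[0.9, 0.1], random_state=5)

    oversampler = E_SMOTE(proportion=1.0, n_neighbors=5, min_features=2,
                          random_state=5)
    X_samp, y_samp = oversampler.sample(X, y)

    # X_samp has as many columns as selected features; the same reduction
    # is applied to the original data below
    X_reduced = oversampler.preprocessing_transform(X)
    return X_samp, y_samp, X_reduced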
class DBSMOTE(OverSampling):
    """
    References:
        * BibTex::

            @Article{dbsmote,
                     author="Bunkhumpornpat, Chumphol and
                             Sinapiromsaran, Krung and
                             Lursinsap, Chidchanok",
                     title="DBSMOTE: Density-Based Synthetic Minority
                            Over-sampling TEchnique",
                     journal="Applied Intelligence",
                     year="2012",
                     month="Apr",
                     day="01",
                     volume="36",
                     number="3",
                     pages="664--684",
                     issn="1573-7497",
                     doi="10.1007/s10489-011-0287-y",
                     url="https://doi.org/10.1007/s10489-011-0287-y"
                     }

    Notes:
        * Standardization is needed to use absolute eps values.
        * The clustering is likely to identify all instances as noise,
          fixed by recursive call with increasing eps.
    """

    categories = [OverSampling.cat_extensive,
                  OverSampling.cat_noise_removal,
                  OverSampling.cat_uses_clustering,
                  OverSampling.cat_density_based]
    def __init__(self, proportion=1.0, eps=0.8, min_samples=3, n_jobs=1,
                 random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                                n_min to sample e.g. 1.0 means that after
                                sampling the number of minority samples
                                will be equal to the number of majority
                                samples
            eps (float): eps parameter of DBSCAN
            min_samples (int): min_samples parameter of DBSCAN
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                                                  random_state, like in
                                                  sklearn
        """
        super().__init__()
        self.check_greater_or_equal(proportion, "proportion", 0)
        self.check_greater(eps, "eps", 0)
        self.check_greater_or_equal(min_samples, "min_samples", 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.eps = eps
        self.min_samples = min_samples
        self.n_jobs = n_jobs

        self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'eps': [0.5, 0.8, 1.2],
                                  'min_samples': [1, 3, 5]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() ss = StandardScaler().fit(X) X_ss = ss.transform(X) # doing the clustering using DBSCAN X_min = X_ss[y == self.min_label] db = DBSCAN(self.eps, self.min_samples, n_jobs=self.n_jobs).fit(X_min) labels = db.labels_ num_labels = np.max(labels)+1 if num_labels == 0: # adjusting the parameters if no clusters were identified message = ("Number of clusters is 0, trying to increase eps and " "decrease min_samples") _logger.info(self.__class__.__name__ + ": " + message) if self.eps >= 2 or self.min_samples <= 2: message = ("Number of clusters is 0, can't adjust parameters " "further") _logger.info(self.__class__.__name__ + ": " + message) return X.copy(), y.copy() else: return DBSMOTE(proportion=self.proportion, eps=self.eps*1.5, min_samples=self.min_samples-1, n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y) # determining cluster size distribution clusters = [np.where(labels == i)[0] for i in range(num_labels)] cluster_sizes = np.array([np.sum(labels == i) for i in range(num_labels)]) cluster_dist = cluster_sizes/np.sum(cluster_sizes) # Bellman-Ford algorithm, inspired by # https://gist.github.com/joninvski/701720 def initialize(graph, source): """ Initializes shortest path algorithm. Args: graph (dict): graph in dictionary representation source (key): source node Returns: dict, dict: initialized distance and path dictionaries """ d = {} p = {} for node in graph: d[node] = float('Inf') p[node] = None d[source] = 0 return d, p def relax(u, v, graph, d, p): """ Checks if shorter path exists. 
Args: u (key): key of a node v (key): key of another node graph (dict): the graph object d (dict): the distances dictionary p (dict): the paths dictionary """ if d[v] > d[u] + graph[u][v]: d[v] = d[u] + graph[u][v] p[v] = u def bellman_ford(graph, source): """ Main entry point of the Bellman-Ford algorithm Args: graph (dict): a graph in dictionary representation source (key): the key of the source node """ d, p = initialize(graph, source) for i in range(len(graph)-1): for u in graph: for v in graph[u]: relax(u, v, graph, d, p) for u in graph: for v in graph[u]: assert d[v] <= d[u] + graph[u][v] return d, p # extract graphs and center-like objects graphs = [] centroid_indices = [] shortest_paths = [] for c in range(num_labels): # extracting the cluster elements cluster = X_min[clusters[c]] # initializing the graph object graph = {} for i in range(len(cluster)): graph[i] = {} # fitting nearest neighbors model to the cluster elements nn = NearestNeighbors(n_neighbors=len(cluster), n_jobs=self.n_jobs) nn.fit(cluster) dist, ind = nn.kneighbors(cluster) # extracting graph edges according to directly density reachabality # definition for i in range(len(cluster)): n = min([len(cluster), (self.min_samples + 1)]) index_set = ind[i][1:n] for j in range(len(cluster)): if j in index_set and dist[i][ind[i] == j][0] < self.eps: graph[i][j] = dist[i][ind[i] == j][0] graphs.append(graph) # finding the index of the center like object centroid_ind = nn.kneighbors( np.mean(cluster, axis=0).reshape(1, -1))[1][0][0] centroid_indices.append(centroid_ind) # extracting shortest paths from centroid object shortest_paths.append(bellman_ford(graph, centroid_ind)) # generating samples samples = [] while len(samples) < n_to_sample: cluster_idx = self.random_state.choice( np.arange(len(clusters)), p=cluster_dist) cluster = X_min[clusters[cluster_idx]] idx = self.random_state.choice(range(len(clusters[cluster_idx]))) # executing shortest path algorithm distances, parents = shortest_paths[cluster_idx] # extracting path path = [idx] while not parents[path[-1]] is None: path.append(parents[path[-1]]) if len(path) == 1: # if the center like object is selected samples.append(cluster[path[0]]) elif len(path) == 2: # if the path consists of 1 edge X_a = cluster[path[0]] X_b = cluster[path[1]] sample = self.sample_between_points_componentwise(X_a, X_b) samples.append(sample) else: # if the path consists of at least two edges random_vertex = self.random_state.randint(len(path)-1) X_a = cluster[path[random_vertex]] X_b = cluster[path[random_vertex + 1]] sample = self.sample_between_points_componentwise(X_a, X_b) samples.append(sample) return (np.vstack([X, ss.inverse_transform(np.vstack(samples))]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'eps': self.eps, 'min_samples': self.min_samples, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
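# Illustrative usage sketch (not part of the original module): DBSMOTE
# standardizes the data internally, so eps is interpreted on the standardized
# scale, and if DBSCAN labels everything as noise the call recurses with a
# relaxed parameterization; the helper name and the synthetic dataset are
# assumptions for demonstration only.
def _dbsmote_example():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5,
                               weights=[0.9, 0.1], random_state=5)

    oversampler = DBSMOTE(proportion=1.0, eps=0.8, min_samples=3,
                          random_state=5)
    X_samp, y_samp = oversampler.sample(X, y)
    return X_samp, y_samp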
[docs]class ASMOBD(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{asmobd, author={Senzhang Wang and Zhoujun Li and Wenhan Chao and Qinghua Cao}, booktitle={The 2012 International Joint Conference on Neural Networks (IJCNN)}, title={Applying adaptive over-sampling technique based on data density and cost-sensitive SVM to imbalanced learning}, year={2012}, volume={}, number={}, pages={1-8}, doi={10.1109/IJCNN.2012.6252696}, ISSN={2161-4407}, month={June}} Notes: * In order to use absolute thresholds, the data is standardized. * The technique has many parameters, not easy to find the right combination. """ categories = [OverSampling.cat_extensive, OverSampling.cat_noise_removal, OverSampling.cat_uses_clustering]
    def __init__(self, proportion=1.0, min_samples=3, eps=0.8, eta=0.5,
                 T_1=1.0, T_2=1.0, t_1=4.0, t_2=4.0, a=0.05,
                 smoothing='linear', n_jobs=1, random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                                n_min to sample e.g. 1.0 means that after
                                sampling the number of minority samples
                                will be equal to the number of majority
                                samples
            min_samples (int): parameter of OPTICS
            eps (float): parameter of OPTICS
            eta (float): trade-off parameter
            T_1 (float): noise threshold (see paper)
            T_2 (float): noise threshold (see paper)
            t_1 (float): noise threshold (see paper)
            t_2 (float): noise threshold (see paper)
            a (float): smoothing factor (see paper)
            smoothing (str): 'sigmoid'/'linear'
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of
                                                  random_state, like in
                                                  sklearn
        """
        super().__init__()
        self.check_greater_or_equal(proportion, "proportion", 0)
        self.check_greater_or_equal(min_samples, "min_samples", 1)
        self.check_greater(eps, "eps", 0)
        self.check_in_range(eta, "eta", [0, 1])
        self.check_greater(T_1, "T_1", 0)
        self.check_greater(T_2, "T_2", 0)
        self.check_greater(t_1, "t_1", 0)
        self.check_greater(t_2, "t_2", 0)
        self.check_greater(a, "a", 0)
        self.check_isin(smoothing, "smoothing", ['sigmoid', 'linear'])
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.min_samples = min_samples
        self.eps = eps
        self.eta = eta
        self.T_1 = T_1
        self.T_2 = T_2
        self.t_1 = t_1
        self.t_2 = t_2
        self.a = a
        self.smoothing = smoothing
        self.n_jobs = n_jobs

        self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'min_samples': [3],
                                  'eps': [0.3],
                                  'eta': [0.5],
                                  'T_1': [0.7, 1.0, 1.4],
                                  'T_2': [0.7, 1.0, 1.4],
                                  't_1': [4.0],
                                  't_2': [4.0],
                                  'a': [0.05, 0.1],
                                  'smoothing': ['sigmoid', 'linear']}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # standardizing the data to enable using absolute thresholds ss = StandardScaler().fit(X) X_ss = ss.transform(X) X_min = X_ss[y == self.min_label] # executing the optics algorithm min_samples = min([len(X_min)-1, self.min_samples]) o = OPTICS(min_samples=min_samples, max_eps=self.eps, n_jobs=self.n_jobs) o.fit(X_min) cd = o.core_distances_ r = o.reachability_ # identifying noise noise = np.logical_and(cd > self.T_1, r > self.T_2) # fitting nearest neighbors models to identify the number of majority # samples in local environments nn = NearestNeighbors(n_neighbors=self.min_samples, n_jobs=self.n_jobs) nn.fit(X_ss) n_majs = [] ratio = [] for i in range(len(X_min)): ind = nn.radius_neighbors(X_min[i].reshape( 1, -1), radius=cd[i], return_distance=False)[0] n_maj = np.sum(y[ind] == self.maj_label)/len(ind) n_majs.append(n_maj) n_min = len(ind) - n_maj - 1 if n_min == 0: ratio.append(np.inf) else: ratio.append(n_maj/n_min) n_maj = np.array(n_maj) ratio = np.array(ratio) # second constraint on noise noise_2 = np.logical_and(cd > np.mean( cd)*self.t_1, r > np.mean(r)*self.t_2) # calculating density according to the smoothing function specified if self.smoothing == 'sigmoid': balance_ratio = np.abs(2.0/(1.0 + np.exp(-self.a*ratio[i])) - 1.0) df = self.eta*cd + (1.0 - self.eta)*n_maj - balance_ratio else: df = self.eta*(self.eta*cd + (1.0 - self.eta)*n_maj) + \ (1 - self.eta)*len(X_min)/n_to_sample # unifying the conditions on noise not_noise = np.logical_not(np.logical_or(noise, noise_2)) # checking if there are not noise samples remaining if np.sum(not_noise) == 0: message = ("All minority samples found to be noise, increasing" "noise thresholds") _logger.info(self.__class__.__name__ + ": " + message) return ASMOBD(proportion=self.proportion, min_samples=self.min_samples, eps=self.eps, eta=self.eta, T_1=self.T_1*1.5, T_2=self.T_2*1.5, t_1=self.t_1*1.5, t_2=self.t_2*1.5, a=self.a, smoothing=self.smoothing, n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y) # removing noise and adjusting the density factors accordingly X_min_not_noise = X_min[not_noise] # checking if there are not-noisy samples if len(X_min_not_noise) <= 2: _logger.warning(self.__class__.__name__ + ": " + "no not-noise minority sample remained") return X.copy(), y.copy() df = np.delete(df, np.where(np.logical_not(not_noise))[0]) density = df/np.sum(df) # fitting nearest neighbors model to non-noise minority samples n_neighbors = min([len(X_min_not_noise), self.min_samples + 1]) nn_not_noise = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn_not_noise.fit(X_min_not_noise) dist, ind = nn_not_noise.kneighbors(X_min_not_noise) # do the sampling samples = [] while len(samples) < n_to_sample: idx = self.random_state.choice(np.arange(len(X_min_not_noise)), p=density) random_neighbor_idx = self.random_state.choice(ind[idx][1:]) X_a = 
X_min_not_noise[idx] X_b = X_min_not_noise[random_neighbor_idx] samples.append(self.sample_between_points(X_a, X_b)) return (np.vstack([X, ss.inverse_transform(np.vstack(samples))]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'min_samples': self.min_samples, 'eps': self.eps, 'eta': self.eta, 'T_1': self.T_1, 'T_2': self.T_2, 't_1': self.t_1, 't_2': self.t_2, 'a': self.a, 'smoothing': self.smoothing, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
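# Illustrative usage sketch (not part of the original module): ASMOBD also
# standardizes the data internally, so the noise thresholds T_1, T_2, t_1 and
# t_2 act on the standardized scale, and smoothing switches between the
# sigmoid and linear density formulas in sample(); the helper name and the
# synthetic dataset are assumptions for demonstration only.
def _asmobd_example():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5,
                               weights=[0.9, 0.1], random_state=5)

    X_samp, y_samp = ASMOBD(proportion=1.0, min_samples=3, eps=0.8,
                            eta=0.5, T_1=1.0, T_2=1.0, t_1=4.0, t_2=4.0,
                            a=0.05, smoothing='linear',
                            random_state=5).sample(X, y)
    return X_samp, y_samp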
[docs]class Assembled_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{assembled_smote, author={Zhou, B. and Yang, C. and Guo, H. and Hu, J.}, booktitle={The 2013 International Joint Conference on Neural Networks (IJCNN)}, title={A quasi-linear SVM combined with assembled SMOTE for imbalanced data classification}, year={2013}, volume={}, number={}, pages={1-7}, keywords={approximation theory;interpolation; pattern classification;sampling methods;support vector machines;trees (mathematics);quasilinear SVM; assembled SMOTE;imbalanced dataset classification problem;oversampling method;quasilinear kernel function; approximate nonlinear separation boundary;mulitlocal linear boundaries; interpolation;data distribution information;minimal spanning tree; local linear partitioning method; linear separation boundary;synthetic minority class samples;oversampled dataset classification;standard SVM; composite quasilinear kernel function; artificial data datasets;benchmark datasets;classification performance improvement;synthetic minority over-sampling technique;Support vector machines;Kernel;Merging;Standards; Sociology;Statistics;Interpolation}, doi={10.1109/IJCNN.2013.6707035}, ISSN={2161-4407}, month={Aug}} Notes: * Absolute value of the angles extracted should be taken. (implemented this way) * It is not specified how many samples are generated in the various clusters. """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering, OverSampling.cat_borderline, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, pop=2, thres=0.3, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbors component pop (int): lower threshold on cluster sizes thres (float): threshold on angles n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(pop, "pop", 1) self.check_in_range(thres, "thres", [0, 1]) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.pop = pop self.thres = thres self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'pop': [2, 4, 5],
                                  'thres': [0.1, 0.3, 0.5]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting nearest neighbors model n_neighbors = min([len(X), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X_min) # finding the set of border and non-border minority elements n_min_neighbors = [np.sum(y[ind[i]] == self.min_label) for i in range(len(ind))] border_mask = np.logical_not(np.array(n_min_neighbors) == n_neighbors) X_border = X_min[border_mask] X_non_border = X_min[np.logical_not(border_mask)] if len(X_border) == 0: _logger.warning(self.__class__.__name__ + ": " + "X_border is empty") return X.copy(), y.copy() # initializing clustering clusters = [np.array([i]) for i in range(len(X_border))] dm = pairwise_distances(X_border) for i in range(len(dm)): dm[i, i] = np.inf # do the clustering while len(dm) > 1 and np.min(dm) < np.inf: # extracting coordinates of clusters with the minimum distance min_coord = np.where(dm == np.min(dm)) merge_a = min_coord[0][0] merge_b = min_coord[1][0] # checking the size of clusters to see if they should be merged if (len(clusters[merge_a]) < self.pop or len(clusters[merge_b]) < self.pop): # if both clusters are small, do the merge clusters[merge_a] = np.hstack([clusters[merge_a], clusters[merge_b]]) del clusters[merge_b] # update the distance matrix accordingly dm[merge_a] = np.min(np.vstack([dm[merge_a], dm[merge_b]]), axis=0) dm[:, merge_a] = dm[merge_a] # remove columns dm = np.delete(dm, merge_b, axis=0) dm = np.delete(dm, merge_b, axis=1) # fix the diagonal entries for i in range(len(dm)): dm[i, i] = np.inf else: # otherwise find principal directions pca_a = PCA(n_components=1).fit(X_border[clusters[merge_a]]) pca_b = PCA(n_components=1).fit(X_border[clusters[merge_b]]) # extract the angle of principal directions numerator = np.dot(pca_a.components_[0], pca_b.components_[0]) denominator = np.linalg.norm(pca_a.components_[0]) denominator *= np.linalg.norm(pca_b.components_[0]) angle = abs(numerator/denominator) # check if angle if angle is above a specific threshold if angle > self.thres: # do the merge clusters[merge_a] = np.hstack([clusters[merge_a], clusters[merge_b]]) del clusters[merge_b] # update the distance matrix acoordingly dm[merge_a] = np.min(np.vstack([dm[merge_a], dm[merge_b]]), axis=0) dm[:, merge_a] = dm[merge_a] # remove columns dm = np.delete(dm, merge_b, axis=0) dm = np.delete(dm, merge_b, axis=1) # fixing the digaonal entries for i in range(len(dm)): dm[i, i] = np.inf else: # changing the distance of clusters to fininte dm[merge_a, merge_b] = np.inf dm[merge_b, merge_a] = np.inf # extract vectors belonging to the various clusters vectors = [X_border[c] for c in clusters if len(c) > 0] # adding non-border samples if len(X_non_border) > 0: vectors.append(X_non_border) # extract cluster sizes and calculating point distribution in clusters # the last element of the clusters is the set of non-border xamples cluster_sizes = 
np.array([len(v) for v in vectors]) densities = cluster_sizes/np.sum(cluster_sizes) # extracting nearest neighbors in clusters def fit_knn(vectors): n_neighbors = min([self.n_neighbors + 1, len(vectors)]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) return nn.fit(vectors).kneighbors(vectors) nns = [fit_knn(v) for v in vectors] # do the sampling samples = [] while len(samples) < n_to_sample: cluster_idx = self.random_state.choice(len(vectors), p=densities) len_cluster = len(vectors[cluster_idx]) sample_idx = self.random_state.choice(np.arange(len_cluster)) if len_cluster > 1: choose_from = nns[cluster_idx][1][sample_idx][1:] random_neighbor_idx = self.random_state.choice(choose_from) else: random_neighbor_idx = sample_idx X_a = vectors[cluster_idx][sample_idx] X_b = vectors[cluster_idx][random_neighbor_idx] samples.append(self.sample_between_points(X_a, X_b)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'pop': self.pop, 'thres': self.thres, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
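# Illustrative usage sketch (not part of the original module): in
# Assembled_SMOTE, clusters of border minority samples smaller than pop are
# merged unconditionally, while larger clusters are merged only if the angle
# between their principal directions exceeds thres; the helper name and the
# synthetic dataset are assumptions for demonstration only.
def _assembled_smote_example():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5,
                               weights=[0.9, 0.1], random_state=5)

    X_samp, y_samp = Assembled_SMOTE(proportion=1.0, n_neighbors=5, pop=2,
                                     thres=0.3, random_state=5).sample(X, y)
    return X_samp, y_samp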
[docs]class SDSMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{sdsmote, author={Li, K. and Zhang, W. and Lu, Q. and Fang, X.}, booktitle={2014 International Conference on Identification, Information and Knowledge in the Internet of Things}, title={An Improved SMOTE Imbalanced Data Classification Method Based on Support Degree}, year={2014}, volume={}, number={}, pages={34-38}, keywords={data mining;pattern classification; sampling methods;improved SMOTE imbalanced data classification method;support degree;data mining; class distribution;imbalanced data-set classification;over sampling method;minority class sample generation;minority class sample selection;minority class boundary sample identification;Classification algorithms;Training;Bagging;Computers; Testing;Algorithm design and analysis; Data mining;Imbalanced data-sets; Classification;Boundary sample;Support degree;SMOTE}, doi={10.1109/IIKI.2014.14}, ISSN={}, month={Oct}} """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary, OverSampling.cat_borderline]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbors component n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                                    labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])

        if n_to_sample == 0:
            _logger.warning(self.__class__.__name__ + ": " +
                            "Sampling is not needed")
            return X.copy(), y.copy()

        X_min = X[y == self.min_label]
        X_maj = X[y == self.maj_label]

        # fitting nearest neighbors model to find closest majority points to
        # minority samples
        nn = NearestNeighbors(n_neighbors=len(X_maj), n_jobs=self.n_jobs)
        nn.fit(X_maj)
        dist, ind = nn.kneighbors(X_min)

        # calculating the sum according to S3 in the paper
        S_i = np.sum(dist, axis=1)
        # calculating average distance according to S5
        S = np.sum(S_i)
        S_ave = S/(len(X_min)*len(X_maj))

        # calculate support degree: the number of majority samples within
        # the S_ave radius (radius_neighbors returns one index array per
        # query point, hence the [0] indexing)
        def support_degree(x):
            return len(nn.radius_neighbors(x.reshape(1, -1),
                                           S_ave,
                                           return_distance=False)[0])

        k = np.array([support_degree(X_min[i]) for i in range(len(X_min))])
        density = k/np.sum(k)

        # fitting nearest neighbors model to minority samples to run
        # SMOTE-like sampling
        n_neighbors = min([len(X_min), self.n_neighbors+1])
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs)
        nn.fit(X_min)
        dist, ind = nn.kneighbors(X_min)

        # do the sampling
        samples = []
        while len(samples) < n_to_sample:
            idx = self.random_state.choice(np.arange(len(density)),
                                           p=density)
            random_neighbor_idx = self.random_state.choice(ind[idx][1:])
            X_a = X_min[idx]
            X_b = X_min[random_neighbor_idx]
            samples.append(self.sample_between_points(X_a, X_b))

        return (np.vstack([X, np.vstack(samples)]),
                np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
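# Illustrative usage sketch (not part of the original module): SDSMOTE biases
# the generation towards minority points with high support degree, i.e.
# points with many majority samples within the average-distance radius
# computed in sample(); the helper name and the synthetic dataset are
# assumptions for demonstration only.
def _sdsmote_example():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5,
                               weights=[0.9, 0.1], random_state=5)

    X_samp, y_samp = SDSMOTE(proportion=1.0, n_neighbors=5,
                             random_state=5).sample(X, y)
    return X_samp, y_samp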
[docs]class DSMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{dsmote, author={Mahmoudi, S. and Moradi, P. and Akhlaghian, F. and Moradi, R.}, booktitle={2014 4th International Conference on Computer and Knowledge Engineering (ICCKE)}, title={Diversity and separable metrics in over-sampling technique for imbalanced data classification}, year={2014}, volume={}, number={}, pages={152-158}, keywords={learning (artificial intelligence); pattern classification;sampling methods;diversity metric;separable metric;over-sampling technique; imbalanced data classification; class distribution techniques; under-sampling technique;DSMOTE method; imbalanced learning problem;diversity measure;separable measure;Iran University of Medical Science;UCI dataset;Accuracy;Classification algorithms;Vectors;Educational institutions;Euclidean distance; Data mining;Diversity measure; Separable Measure;Over-Sampling; Imbalanced Data;Classification problems}, doi={10.1109/ICCKE.2014.6993409}, ISSN={}, month={Oct}} Notes: * The method is highly inefficient when the number of minority samples is high, time complexity is O(n^3), with 1000 minority samples it takes about 1e9 objective function evaluations to find 1 new sample points. Adding 1000 samples would take about 1e12 evaluations of the objective function, which is unfeasible. We introduce a new parameter, n_step, and during the search for the new sample at most n_step combinations of minority samples are tried. * Abnormality of minority points is defined in the paper as D_maj/D_min, high abnormality means that the minority point is close to other minority points and very far from majority points. This is definitely not abnormality, I have implemented the opposite. * Nothing ensures that the fisher statistics and the variance from the geometric mean remain comparable, which might skew the optimization towards one of the sub-objectives. * MinMax normalization doesn't work, each attribute will have a 0 value, which will make the geometric mean of all attribute 0. """ categories = [OverSampling.cat_changes_majority]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, rate=0.1, n_step=50, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbors component rate (float): [0,1] rate of minority samples to turn into majority n_step (int): number of random configurations to check for new samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_in_range(rate, "rate", [0, 1]) self.check_greater_or_equal(n_step, "n_step", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.rate = rate self.n_step = n_step self.n_jobs = n_jobs self.set_random_state(random_state)
    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'rate': [0.1, 0.2],
                                  'n_step': [50]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(3): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() mms = MinMaxScaler(feature_range=(1e-6, 1.0 - 1e-6)) X = mms.fit_transform(X) X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] # fitting nearest neighbors model nn = NearestNeighbors(n_neighbors=len(X_maj)) nn.fit(X_maj) dist, ind = nn.kneighbors(X_min) # compute mean distances, the D_min is compenstaed for taking into # consideration self-distances in the mean D_maj = np.mean(dist, axis=1) D_min = np.mean(pairwise_distances(X_min), axis=1) * \ len(X_min)/(len(X_min)-1) # computing degree of abnormality abnormality = D_min/D_maj # sorting minority indices in decreasing order by abnormality to_sort = zip(abnormality, np.arange(len(abnormality))) abnormality, indices = zip(*sorted(to_sort, key=lambda x: -x[0])) rate = int(self.rate*len(abnormality)) if rate > 0: # moving the most abnormal points to the majority class X_maj = np.vstack([X_maj, X_min[np.array(indices[:rate])]]) # removing the most abnormal points form the minority class X_min = np.delete(X_min, indices[:rate], axis=0) # computing the mean and variance of points in the majority class var_maj = np.mean(np.var(X_maj, axis=0)) mean_maj = np.mean(X_maj) # this is the original objective function, however, using this # is very inefficient if the number of records increases above # approximately 1000 # def objective(X): # """ # The objective function to be maximized # # Args: # X (np.matrix): dataset # # Returns: # float: the value of the objective function # """ # gm= gmean(X, axis= 0) # gdiv= np.mean(np.linalg.norm(X - gm, axis= 1)) # fisher= (np.mean(X) - mean_maj)**2/(np.mean(np.var(X, axis= 0)) \ # + var_maj) # return gdiv + fisher # in order to make the code more efficient, we do maintain some # variables containing the main componentes of the objective function # and apply only small corrections based on the new values being added # the effect should be identical # records the sum of logarithms in X_min, used to compute the geometric # mean min_log_sum = np.sum(np.log(X_min), axis=0) # contains the sum of values in X_min, coordinatewise min_sum = np.sum(X_min, axis=0) # contains the squares of sums of values in X_min, coordinatewise min_sum2 = np.sum(X_min**2, axis=0) # contains the sum of all numbers in X_min min_all_sum = np.sum(X_min) min_norm = np.linalg.norm(X_min)**2 # do the sampling n_added = 0 while n_added < n_to_sample: best_candidate = None highest_score = 0.0 # we try n_step combinations of minority samples len_X = len(X_min) n_steps = min([len_X*(len_X-1)*(len_X-2), self.n_step]) for _ in range(n_steps): i, j, k = self.random_state.choice(np.arange(len_X), 3, replace=False) gm = gmean(X_min[np.array([i, j, k])], axis=0) # computing the new objective function for the new point (gm) # added new_X_min = np.vstack([X_min, gm]) # updating the components of the objective function new_min_log_sum = min_log_sum + np.log(gm) 
new_min_sum = min_sum + gm new_min_sum2 = min_sum2 + gm**2 new_min_all_sum = min_all_sum + np.sum(gm) # computing mean, var, gmean and mean of all elements with # the new sample (gm) new_min_mean = new_min_sum/(len(new_X_min)) new_min_var = new_min_sum2/(len(new_X_min)) - new_min_mean**2 new_min_gmean = np.exp(new_min_log_sum/(len(new_X_min))) new_min_all_n = (len(new_X_min))*len(X_min[0]) new_min_all_mean = new_min_all_sum / new_min_all_n new_min_norm = min_norm + np.linalg.norm(gm) # computing the new objective function value inner_prod = np.dot(new_X_min, new_min_gmean) gmean_norm = np.linalg.norm(new_min_gmean)**2 term_sum = new_min_norm - 2*inner_prod + gmean_norm new_gdiv = np.mean(np.sqrt(term_sum)) fisher_numerator = (new_min_all_mean - mean_maj)**2 fisher_denominator = np.mean(new_min_var) + var_maj new_fisher = fisher_numerator / fisher_denominator score = new_gdiv + new_fisher # evaluate the objective function # score= objective(np.vstack([X_min, gm])) # check if the score is better than the best so far if score > highest_score: highest_score = score best_candidate = gm cand_min_log_sum = new_min_log_sum cand_min_sum = new_min_sum cand_min_sum2 = new_min_sum2 cand_min_all_sum = new_min_all_sum cand_min_norm = new_min_norm # add the best candidate to the minority samples X_min = np.vstack([X_min, best_candidate]) n_added = n_added + 1 min_log_sum = cand_min_log_sum min_sum = cand_min_sum min_sum2 = cand_min_sum2 min_all_sum = cand_min_all_sum min_norm = cand_min_norm return (mms.inverse_transform(np.vstack([X_maj, X_min])), np.hstack([np.repeat(self.maj_label, len(X_maj)), np.repeat(self.min_label, len(X_min))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'rate': self.rate, 'n_step': self.n_step, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
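# Illustrative sketch (not part of the library, helper name is ad hoc): the
# sampling loop above avoids re-evaluating the gdiv + fisher objective from
# scratch by carrying running sums; the helper below shows the same trick,
# recovering the columnwise mean, variance and geometric mean after a new row
# is appended, using only the updated sums.
import numpy as np

def _incremental_stats(log_sum, col_sum, sq_sum, n_rows, new_row):
    """Update the running sums with one new row and return the new statistics."""
    log_sum = log_sum + np.log(new_row)
    col_sum = col_sum + new_row
    sq_sum = sq_sum + new_row**2
    n_rows = n_rows + 1
    mean = col_sum/n_rows
    var = sq_sum/n_rows - mean**2
    gmean_vec = np.exp(log_sum/n_rows)
    return (log_sum, col_sum, sq_sum, n_rows), mean, var, gmean_vec

# sanity check against direct recomputation on strictly positive data:
# X = np.random.RandomState(5).random_sample((20, 3)) + 1e-6
# state = (np.sum(np.log(X), axis=0), np.sum(X, axis=0), np.sum(X**2, axis=0), len(X))
# _, mean, var, gm = _incremental_stats(*state, new_row=X.mean(axis=0))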
[docs]class G_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{g_smote, author={Sandhan, T. and Choi, J. Y.}, booktitle={2014 22nd International Conference on Pattern Recognition}, title={Handling Imbalanced Datasets by Partially Guided Hybrid Sampling for Pattern Recognition}, year={2014}, volume={}, number={}, pages={1449-1453}, keywords={Gaussian processes;learning (artificial intelligence);pattern classification; regression analysis;sampling methods; support vector machines;imbalanced datasets;partially guided hybrid sampling;pattern recognition;real-world domains;skewed datasets;dataset rebalancing;learning algorithm; extremely low minority class samples; classification tasks;extracted hidden patterns;support vector machine; logistic regression;nearest neighbor; Gaussian process classifier;Support vector machines;Proteins;Pattern recognition;Kernel;Databases;Gaussian processes;Vectors;Imbalanced dataset; protein classification;ensemble classifier;bootstrapping;Sat-image classification;medical diagnoses}, doi={10.1109/ICPR.2014.258}, ISSN={1051-4651}, month={Aug}} Notes: * the non-linear approach is inefficient """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_componentwise]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, method='linear', n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbors component method (str): 'linear'/'non-linear_2.0' - the float can be any number: standard deviation in the Gaussian-kernel n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) if not method == 'linear' and not method.startswith('non-linear'): raise ValueError(self.__class__.__name__ + ": " + 'Method parameter %s is not supported' % method) elif method.startswith('non-linear'): parameter = float(method.split('_')[-1]) if parameter <= 0: message = ("Non-positive non-linear parameter %f is " "not supported") % parameter raise ValueError(self.__class__.__name__ + ": " + message) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.method = method self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'method': ['linear', 'non-linear_0.1', 'non-linear_1.0', 'non-linear_2.0']} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting nearest neighbors model n_neighbors = min([len(X_min), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) if self.method == 'linear': # finding H_l by linear decomposition cov = np.cov(X_min, rowvar=False) w, v = np.linalg.eig(cov) H_l = v[np.argmax(w)] else: # building a non-linear kernel matrix and finding H_n by its # decomposition self.sigma = float(self.method.split('_')[-1]) kernel_matrix = pairwise_distances(X_min) kernel_matrix = kernel_matrix/(2.0*self.sigma**2) kernel_matrix = np.exp(kernel_matrix) try: w_k, v_k = np.linalg.eig(kernel_matrix) except Exception as e: return X.copy(), y.copy() H_n = v_k[np.argmax(w_k)] def kernel(x, y): return np.linalg.norm(x - y)/(2.0*self.sigma**2) # generating samples samples = [] def angle(P, n, H_l): numerator = np.abs(np.dot(P[n], H_l)) denominator = np.linalg.norm(P[n])*np.linalg.norm(H_l) return np.arccos(numerator/denominator) while len(samples) < n_to_sample: idx = self.random_state.randint(len(X_min)) # calculating difference vectors from all neighbors P = X_min[ind[idx][1:]] - X_min[idx] if self.method == 'linear': # calculating angles with the principal direction thetas = np.array([angle(P, n, H_l) for n in range(len(P))]) else: thetas = [] # calculating angles of the difference vectors and the # principal direction in feature space for n in range(len(P)): # calculating representation in feature space feature_vector = np.array( [kernel(X_min[k], P[n]) for k in range(len(X_min))]) dp = np.dot(H_n, feature_vector) denom = np.linalg.norm(feature_vector)*np.linalg.norm(H_n) thetas.append(np.arccos(np.abs(dp)/denom)) thetas = np.array(thetas) # using the neighbor with the difference along the most similar # direction to the principal direction of the data n = np.argmin(thetas) X_a = X_min[idx] X_b = X_min[ind[idx][1:][n]] samples.append(self.sample_between_points_componentwise(X_a, X_b)) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'method': self.method, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
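# Illustrative sketch (not part of the library, helper names are ad hoc): the
# 'linear' branch of G_SMOTE above steers interpolation towards the dominant
# eigenvector of the minority covariance matrix; the helpers below compute that
# principal direction and pick the neighbour whose difference vector encloses
# the smallest angle with it.
import numpy as np

def _principal_direction(X_min):
    """Eigenvector of the minority covariance matrix with the largest eigenvalue."""
    w, v = np.linalg.eigh(np.cov(X_min, rowvar=False))
    return v[:, np.argmax(w)]

def _most_aligned_neighbor(P, direction):
    """Index of the difference vector in P most aligned with `direction`."""
    cosines = np.abs(P @ direction)/(np.linalg.norm(P, axis=1)
                                     * np.linalg.norm(direction))
    return np.argmin(np.arccos(np.clip(cosines, 0.0, 1.0)))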
[docs]class NT_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{nt_smote, author={Xu, Y. H. and Li, H. and Le, L. P. and Tian, X. Y.}, booktitle={2014 Seventh International Joint Conference on Computational Sciences and Optimization}, title={Neighborhood Triangular Synthetic Minority Over-sampling Technique for Imbalanced Prediction on Small Samples of Chinese Tourism and Hospitality Firms}, year={2014}, volume={}, number={}, pages={534-538}, keywords={financial management;pattern classification;risk management;sampling methods;travel industry;Chinese tourism; hospitality firms;imbalanced risk prediction;minority class samples; up-sampling approach;neighborhood triangular synthetic minority over-sampling technique;NT-SMOTE; nearest neighbor idea;triangular area sampling idea;single classifiers;data excavation principles;hospitality industry;missing financial indicators; financial data filtering;financial risk prediction;MDA;DT;LSVM;logit;probit; firm risk prediction;Joints; Optimization;imbalanced datasets; NT-SMOTE;neighborhood triangular; random sampling}, doi={10.1109/CSO.2014.104}, ISSN={}, month={July}} """ categories = [OverSampling.cat_extensive, OverSampling.cat_application]
[docs] def __init__(self, proportion=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(3): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # find two nearest minority samples nn = NearestNeighbors(n_neighbors=3, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) samples = [] while len(samples) < n_to_sample: # select point randomly idx = self.random_state.randint(len(X_min)) P_1 = X_min[idx] # find two closest neighbors P_2 = X_min[ind[idx][1]] P_3 = X_min[ind[idx][2]] # generate random point by sampling the specified triangle r_1 = self.random_state.random_sample() r_2 = self.random_state.random_sample() samples.append((P_3 + r_1 * ((P_1 + r_2 * (P_2 - P_1)) - P_3))) return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
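# Illustrative sketch (not part of the library, helper name is ad hoc): the
# sampling step of NT_SMOTE above places a synthetic point inside the triangle
# spanned by a seed and its two nearest minority neighbours by interpolating
# twice with independent uniform factors, first along the P_1-P_2 edge and then
# towards P_3.
import numpy as np

def _triangle_point(P_1, P_2, P_3, random_state):
    """One synthetic point inside the triangle (P_1, P_2, P_3)."""
    r_1 = random_state.random_sample()
    r_2 = random_state.random_sample()
    edge_point = P_1 + r_2*(P_2 - P_1)
    return P_3 + r_1*(edge_point - P_3)

# e.g. _triangle_point(np.array([0.0, 0.0]), np.array([1.0, 0.0]),
#                      np.array([0.0, 1.0]), np.random.RandomState(5))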
[docs]class Lee(OverSampling): """ References: * BibTex:: @inproceedings{lee, author = {Lee, Jaedong and Kim, Noo-ri and Lee, Jee-Hyong}, title = {An Over-sampling Technique with Rejection for Imbalanced Class Learning}, booktitle = {Proceedings of the 9th International Conference on Ubiquitous Information Management and Communication}, series = {IMCOM '15}, year = {2015}, isbn = {978-1-4503-3377-1}, location = {Bali, Indonesia}, pages = {102:1--102:6}, articleno = {102}, numpages = {6}, doi = {10.1145/2701126.2701181}, acmid = {2701181}, publisher = {ACM}, address = {New York, NY, USA}, keywords = {data distribution, data preprocessing, imbalanced problem, rejection rule, synthetic minority oversampling technique} } """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, rejection_level=0.5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in nearest neighbor component rejection_level (float): the rejection level of generated samples, if the fraction of majority labels in the local environment is higher than this number, the generated point is rejected n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_in_range(rejection_level, "rejection_level", [0, 1]) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.rejection_level = rejection_level self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'rejection_level': [0.3, 0.5, 0.7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting nearest neighbors models to find neighbors of minority # samples in the total data and in the minority datasets n_neighbors = min([len(X_min), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X_min) n_neighbors = min([len(X_min), self.n_neighbors + 1]) nn_min = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn_min.fit(X_min) dist_min, ind_min = nn_min.kneighbors(X_min) # do the sampling, we impleneted a continouos tweaking of rejection # levels in order to fix situations when no unrejectable data can # be can be generated samples = [] passed = 0 trial = 0 rejection_level = self.rejection_level while len(samples) < n_to_sample: # checking if we managed to generate a single data in 1000 trials if passed == trial and passed > 1000: rejection_level = rejection_level + 0.1 trial = 0 passed = 0 trial = trial + 1 # generating random point idx = self.random_state.randint(len(X_min)) random_neighbor_idx = self.random_state.choice(ind_min[idx][1:]) X_a = X_min[idx] X_b = X_min[random_neighbor_idx] random_point = self.sample_between_points(X_a, X_b) # checking if the local environment is above the rejection level dist_new, ind_new = nn.kneighbors(random_point.reshape(1, -1)) maj_frac = np.sum(y[ind_new][:-1] == self.maj_label)/self.n_neighbors if maj_frac < rejection_level: samples.append(random_point) else: passed = passed + 1 return (np.vstack([X, samples]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'rejection_level': self.rejection_level, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
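# Illustrative sketch (not part of the library, helper name is ad hoc): the
# acceptance rule intended by Lee above keeps a generated point only if the
# fraction of majority labels among its nearest neighbours stays below the
# (possibly relaxed) rejection level; `nn` is assumed to be a NearestNeighbors
# model fitted on the full training set.
import numpy as np

def _accept_candidate(candidate, nn, y, maj_label, rejection_level):
    """True if the candidate's neighbourhood is sufficiently minority-dominated."""
    _, ind = nn.kneighbors(candidate.reshape(1, -1))
    maj_frac = np.mean(y[ind[0]] == maj_label)
    return maj_frac < rejection_level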
[docs]class SPY(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{spy, author={Dang, X. T. and Tran, D. H. and Hirose, O. and Satou, K.}, booktitle={2015 Seventh International Conference on Knowledge and Systems Engineering (KSE)}, title={SPY: A Novel Resampling Method for Improving Classification Performance in Imbalanced Data}, year={2015}, volume={}, number={}, pages={280-285}, keywords={decision making;learning (artificial intelligence);pattern classification; sampling methods;SPY;resampling method;decision-making process; biomedical data classification; class imbalance learning method; SMOTE;oversampling method;UCI machine learning repository;G-mean value;borderline-SMOTE; safe-level-SMOTE;Support vector machines;Training;Bioinformatics; Proteins;Protein engineering;Radio frequency;Sensitivity;Imbalanced dataset;Over-sampling; Under-sampling;SMOTE; borderline-SMOTE}, doi={10.1109/KSE.2015.24}, ISSN={}, month={Oct}} """ categories = [OverSampling.cat_changes_majority]
[docs] def __init__(self, n_neighbors=5, threshold=0.5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: n_neighbors (int): number of neighbors in nearest neighbor component threshold (float): threshold*n_neighbors gives the threshold z described in the paper n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_in_range(threshold, "threshold", [0, 1]) self.check_n_jobs(n_jobs, 'n_jobs') self.n_neighbors = n_neighbors self.threshold = threshold self.n_jobs = n_jobs # random state takes no effect for this technique self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'n_neighbors': [3, 5, 7], 'threshold': [0.3, 0.5, 0.7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) X_min = X[y == self.min_label] # fitting nearest neighbors model n_neighbors = min([len(X), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X_min) y_new = y.copy() z = self.threshold*n_neighbors # checking the neighbors of each minority sample for i in range(len(X_min)): majority_mask = y[ind[i][1:]] == self.maj_label x = np.sum(majority_mask) # if the number of majority samples in the neighborhood is # smaller than a threshold # their labels are changed to minority if x < z: y_new[ind[i][1:][majority_mask]] = self.min_label return X.copy(), y_new
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'n_neighbors': self.n_neighbors, 'threshold': self.threshold, 'n_jobs': self.n_jobs}
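# Illustrative sketch (not part of the library, helper name is ad hoc): the
# relabelling rule of SPY above; for each minority point, if fewer than
# z = threshold*k of its k nearest neighbours belong to the majority class,
# those majority neighbours are flipped to the minority label.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def _spy_relabel(X, y, min_label, maj_label, n_neighbors=5, threshold=0.5):
    """Return a relabelled copy of y; the feature matrix is left unchanged."""
    X_min = X[y == min_label]
    k = min(len(X), n_neighbors + 1)
    ind = NearestNeighbors(n_neighbors=k).fit(X).kneighbors(X_min)[1]
    y_new = y.copy()
    z = threshold*k
    for row in ind:
        maj_mask = y[row[1:]] == maj_label
        if np.sum(maj_mask) < z:
            y_new[row[1:][maj_mask]] = min_label
    return y_new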
[docs]class SMOTE_PSOBAT(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{smote_psobat, author={Li, J. and Fong, S. and Zhuang, Y.}, booktitle={2015 3rd International Symposium on Computational and Business Intelligence (ISCBI)}, title={Optimizing SMOTE by Metaheuristics with Neural Network and Decision Tree}, year={2015}, volume={}, number={}, pages={26-32}, keywords={data mining;particle swarm optimisation;pattern classification; data mining;classifier;metaherustics; SMOTE parameters;performance indicators;selection optimization; PSO;particle swarm optimization algorithm;BAT;bat-inspired algorithm; metaheuristic optimization algorithms; nearest neighbors;imbalanced dataset problem;synthetic minority over-sampling technique;decision tree; neural network;Classification algorithms;Neural networks;Decision trees;Training;Optimization;Particle swarm optimization;Data mining;SMOTE; Swarm Intelligence;parameter selection optimization}, doi={10.1109/ISCBI.2015.12}, ISSN={}, month={Dec}} Notes: * The parameters of the memetic algorithms are not specified. * I have checked multiple paper describing the BAT algorithm, but the meaning of "Generate a new solution by flying randomly" is still unclear. * It is also unclear if best solutions are recorded for each bat, or the entire population. """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering, OverSampling.cat_sample_ordinary, OverSampling.cat_memetic]
[docs] def __init__(self, maxit=50, c1=0.3, c2=0.1, c3=0.1, alpha=0.9, gamma=0.9, method='bat', n_jobs=1, random_state=None): """ Constructor of the sampling object Args: maxit (int): maximum number of iterations c1 (float): inertia weight of PSO c2 (float): attraction of local maximums in PSO c3 (float): attraction of global maximum in PSO alpha (float): alpha parameter of the method gamma (float): gamma parameter of the method method (str): optimization technique to be used n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(maxit, "maxit", 1) self.check_greater_or_equal(c1, "c1", 0) self.check_greater_or_equal(c2, "c2", 0) self.check_greater_or_equal(c3, "c3", 0) self.check_greater_or_equal(alpha, "alpha", 0) self.check_greater_or_equal(gamma, "gamma", 0) self.check_isin(method, "method", ['pso', 'bat']) self.check_n_jobs(n_jobs, 'n_jobs') self.maxit = maxit self.c1 = c1 self.c2 = c2 self.c3 = c3 self.alpha = alpha self.gamma = gamma self.method = method self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ bat_pc = cls.generate_parameter_combinations({'maxit': [50], 'alpha': [0.7, 0.9], 'gamma': [0.7, 0.9], 'method': ['bat']}, raw) pso_pc = cls.generate_parameter_combinations({'maxit': [50], 'c1': [0.2, 0.5], 'c2': [0.1, 0.2], 'c3': [0.1, 0.2], 'method': ['pso']}, raw) if not raw: bat_pc.extend(pso_pc) else: bat_pc = {**bat_pc, **pso_pc} return bat_pc
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() def evaluate(K, proportion): """ Evaluate given configuration Args: K (int): number of neighbors in nearest neighbors component proportion (float): proportion of missing data to generate Returns: float, float: kappa and accuracy scores """ smote = SMOTE(proportion=proportion, n_neighbors=K, n_jobs=self.n_jobs, random_state=self.random_state) X_samp, y_samp = smote.sample(X, y) # doing k-fold cross validation kfold = KFold(5) preds = [] tests = [] for train, test in kfold.split(X_samp): dt = DecisionTreeClassifier(random_state=self.random_state) dt.fit(X_samp[train], y_samp[train]) preds.append(dt.predict(X_samp[test])) tests.append(y_samp[test]) preds = np.hstack(preds) tests = np.hstack(tests) # computing the kappa score tp = np.sum(np.logical_and(preds == tests, tests == self.min_label)) fn = np.sum(np.logical_and(preds != tests, tests == self.min_label)) tn = np.sum(np.logical_and(preds == tests, tests == self.maj_label)) fp = np.sum(np.logical_and(preds != tests, tests == self.maj_label)) p_o = (tp + tn)/(tp + fn + tn + fp) p_e = (tp + fn)*(tp + fp)/(tp + fn + tn + fp)**2 + \ (fp + tn)*(fn + tn)/(tp + fn + tn + fp)**2 kappa = (p_o - p_e)/(1.0 - p_e) return kappa, p_o def PSO(): """ PSO optimization Returns: int, float: the best K and proportion values """ # a reasonable range of nearest neighbors to use with SMOTE k_range = [2, min([np.sum(y == self.min_label), 10])] # a reasonable range of proportions proportion_range = [0.1, 2.0] # population size n_pop = 10 # initial particles def init_particle(): k_rand = self.random_state.randint(k_range[0], k_range[1]) r = self.random_state.random_sample() diff = proportion_range[1] - proportion_range[0] vect = r*diff + proportion_range[0] return np.array([k_rand, vect]) ps = [init_particle() for _ in range(n_pop)] # initial velocities velocities = [np.array([0, 0]) for _ in range(n_pop)] # best configurations of particles local_best = [ps[i].copy() for i in range(n_pop)] # scores of best configurations of particles local_scores = [(0, 0) for _ in range(n_pop)] # global best configuration of particles global_best = ps[0].copy() # global best score global_scores = (0, 0) # executing the particle swarm optimization not_changed = 0 for _ in range(self.maxit): # if the configurations didn't change for 10 iterations, stop if not_changed > len(ps)*10: break # evaluating each of the configurations for i in range(len(ps)): scores = evaluate(np.int(ps[i][0]), ps[i][1]) # recording if the best scores didn't change not_changed = not_changed + 1 # registering locally and globally best scores if (min([local_scores[i][0], scores[0]]) > 0.4 and local_scores[i][1] > scores[1]): local_scores[i] = scores local_best[i] = ps[i].copy() not_changed = 0 elif scores[0] > 0.4 and local_scores[i][0] <= 0.4: local_scores[i] = scores local_best[i] = ps[i].copy() not_changed = 0 if (min([global_scores[0], scores[0]]) > 0.4 and global_scores[1] > scores[1]): global_scores = scores global_best = ps[i].copy() not_changed = 0 elif scores[0] > 0.4 and global_scores[0] <= 0.4: global_scores = scores global_best = ps[i].copy() not_changed = 0 # 
update velocities for i in range(len(ps)): velocities[i] = self.c1*velocities[i] + \ (local_best[i] - ps[i])*self.c2 + \ (global_best - ps[i])*self.c3 # clipping velocities if required while abs(velocities[i][0]) > k_range[1] - k_range[0]: velocities[i][0] = velocities[i][0]/2.0 diff = proportion_range[1] - proportion_range[0] while abs(velocities[i][1]) > diff: velocities[i][1] = velocities[i][1]/2.0 # update positions for i in range(len(ps)): ps[i] = ps[i] + velocities[i] # clipping positions according to the specified ranges ps[i][0] = np.clip(ps[i][0], k_range[0], k_range[1]) ps[i][1] = np.clip(ps[i][1], proportion_range[0], proportion_range[1]) return global_best def BAT(): """ BAT optimization Returns: int, float: the best K and proportion values """ if sum(y == self.min_label) < 2: return X.copy(), y.copy() # a reasonable range of nearest neighbors to use with SMOTE k_range = [1, min([np.sum(y == self.min_label), 10])] # a reasonable range of proportions proportion_range = [0.1, 2.0] # population size n_pop = 10 # maximum frequency f_max = 10 def init_bat(): k_rand = self.random_state.randint(k_range[0], k_range[1]) r = self.random_state.random_sample() diff = proportion_range[1] - proportion_range[0] return np.array([k_rand, r*diff + proportion_range[0]]) # initial bat positions bats = [init_bat() for _ in range(n_pop)] # initial velocities velocities = [np.array([0, 0]) for _ in range(10)] # best configurations of particles local_best = [[[[0.0, 0.0], bats[i].copy()]] for i in range(len(bats))] # scores of best configurations of particles global_best = [[0.0, 0.0], bats[0].copy()] # pulse frequencies f = self.random_state.random_sample(size=n_pop)*f_max # pulse rates r = self.random_state.random_sample(size=n_pop) # loudness A = self.random_state.random_sample(size=n_pop) # gamma parameter according to the BAT paper gamma = self.gamma # alpha parameter according to the BAT paper alpha = self.alpha # initial best solution bat_star = bats[0].copy() not_changed = 0 for t in range(self.maxit): not_changed = not_changed + 1 if not_changed > 10: break # update frequencies f = self.random_state.random_sample(size=n_pop)*f_max # update velocities for i in range(len(velocities)): velocities[i] = velocities[i] + (bats[i] - bat_star)*f[i] # update bats for i in range(len(bats)): bats[i] = bats[i] + velocities[i] bats[i][0] = np.clip(bats[i][0], k_range[0], k_range[1]) bats[i][1] = np.clip( bats[i][1], proportion_range[0], proportion_range[1]) for i in range(n_pop): # generate local solution if self.random_state.random_sample() > r[i]: n_rand = min([len(local_best[i]), 5]) rand_int = self.random_state.randint(n_rand) random_best_sol = local_best[i][rand_int][1] rr = self.random_state.random_sample( size=len(bat_star)) bats[i] = random_best_sol + rr*A[i] # evaluate and do local search for i in range(n_pop): scores = evaluate(int(bats[i][0]), bats[i][1]) # checking if the scores are better than the global score # implementation of the multi-objective criterion in the # SMOTE-PSOBAT paper improved_global = False if (min([global_best[0][0], scores[0]]) > 0.4 and global_best[0][1] > scores[1]): improved_global = True not_changed = 0 elif scores[0] > 0.4 and global_best[0][0] <= 0.4: improved_global = True not_changed = 0 # checking if the scores are better than the local scores # implementation of the multi-objective criterion in the # SMOTE-PSOBAT paper improved_local = False if (min([local_best[i][0][0][0], scores[0]]) > 0.4 and local_best[i][0][0][1] > scores[1]): improved_local = True elif 
scores[0] > 0.4 and local_best[i][0][0][0] <= 0.4: improved_local = True # local search in the bet algorithm if (self.random_state.random_sample() < A[i] and improved_local): local_best[i].append([scores, bats[i].copy()]) A[i] = A[i]*alpha r[i] = r[i]*(1 - np.exp(-gamma*t)) if (self.random_state.random_sample() < A[i] and improved_global): global_best = [scores, bats[i].copy()] # ranking local solutions to keep track of the best 5 local_best[i] = sorted( local_best[i], key=lambda x: -x[0][0]) local_best[i] = local_best[i][:min( [len(local_best[i]), 5])] t = t + 1 return global_best[1] if self.method == 'pso': best_combination = PSO() elif self.method == 'bat': best_combination = BAT() else: message = "Search method %s not supported yet." % self.method raise ValueError(self.__class__.__name__ + ": " + message) return SMOTE(proportion=best_combination[1], n_neighbors=int(best_combination[0]), n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y)
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'maxit': self.maxit, 'c1': self.c1, 'c2': self.c2, 'c3': self.c3, 'alpha': self.alpha, 'gamma': self.gamma, 'method': self.method, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
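# Illustrative sketch (not part of the library, helper name is ad hoc): the
# evaluate() helper above scores a (K, proportion) pair by Cohen's kappa and
# accuracy of a decision tree; kappa follows the closed form
# (p_o - p_e)/(1 - p_e) with observed agreement p_o and chance agreement p_e.
def _cohen_kappa(tp, fn, tn, fp):
    """Cohen's kappa for binary confusion counts."""
    n = tp + fn + tn + fp
    p_o = (tp + tn)/n
    p_e = ((tp + fn)*(tp + fp) + (fp + tn)*(fn + tn))/n**2
    return (p_o - p_e)/(1.0 - p_e)

# e.g. _cohen_kappa(tp=40, fn=10, tn=35, fp=15) == 0.5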
[docs]class MDO(OverSampling): """ References: * BibTex:: @ARTICLE{mdo, author={Abdi, L. and Hashemi, S.}, journal={IEEE Transactions on Knowledge and Data Engineering}, title={To Combat Multi-Class Imbalanced Problems by Means of Over-Sampling Techniques}, year={2016}, volume={28}, number={1}, pages={238-251}, keywords={covariance analysis;learning (artificial intelligence);modelling;pattern classification;sampling methods; statistical distributions;minority class instance modelling;probability contour;covariance structure;MDO; Mahalanobis distance-based oversampling technique;data-oriented technique; model-oriented solution;machine learning algorithm;data skewness;multiclass imbalanced problem;Mathematical model; Training;Accuracy;Eigenvalues and eigenfunctions;Machine learning algorithms;Algorithm design and analysis; Benchmark testing;Multi-class imbalance problems;over-sampling techniques; Mahalanobis distance;Multi-class imbalance problems;over-sampling techniques; Mahalanobis distance}, doi={10.1109/TKDE.2015.2458858}, ISSN={1041-4347}, month={Jan}} """ categories = [OverSampling.cat_extensive, OverSampling.cat_dim_reduction]
[docs] def __init__(self, proportion=1.0, K2=5, K1_frac=0.5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples K2 (int): number of neighbors K1_frac (float): the fraction of K2 to set K1 n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(K2, "K2", 1) self.check_greater_or_equal(K1_frac, "K1_frac", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.K2 = K2 self.K1_frac = K1_frac self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'K2': [3, 5, 7], 'K1_frac': [0.3, 0.5, 0.7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # determining K1 self.K1 = int(self.K2*self.K1_frac) K1 = min([self.K1, len(X)]) K2 = min([self.K2 + 1, len(X)]) # Algorithm 2 - chooseSamples nn = NearestNeighbors(n_neighbors=K2, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X_min) # extracting the number of minority samples in local neighborhoods n_min = np.array([np.sum(y[ind[i][1:]] == self.min_label) for i in range(len(X_min))]) # extracting selected samples from minority ones X_sel = X_min[n_min >= K1] # falling back to returning input data if all the input is considered # noise if len(X_sel) == 0: _logger.info(self.__class__.__name__ + ": " + "No samples selected") return X.copy(), y.copy() # computing distribution weights = n_min[n_min >= K1]/K2 weights = weights/np.sum(weights) # Algorithm 1 - MDO over-sampling mu = np.mean(X_sel, axis=0) Z = X_sel - mu # executing PCA pca = PCA(n_components=min([len(Z[0]), len(Z)])).fit(Z) T = pca.transform(Z) # computing variances (step 13) V = np.var(T, axis=0) V[V < 0.001] = 0.001 # generating samples samples = [] while len(samples) < n_to_sample: # selecting a sample randomly according to the distribution idx = self.random_state.choice(np.arange(len(X_sel)), p=weights) # finding vector in PCA space X_temp = T[idx] X_temp_square = X_temp**2 # computing alphas alpha = np.sum(X_temp_square/V) alpha_V = alpha*V alpha_V[alpha_V < 0.001] = 0.001 # initializing a new vector X_new = np.zeros(len(X_temp)) # sampling components of the new vector s = 0 for j in range(len(X_temp)-1): r = (2*self.random_state.random_sample()-1)*np.sqrt(alpha_V[j]) X_new[j] = r s = s + (r**2/alpha_V[j]) if s > 1: last_fea_val = 0 else: tmp = (1 - s)*alpha*V[-1] if tmp < 0: tmp = 0 last_fea_val = np.sqrt(tmp) # determine last component to fulfill the ellipse equation X_new[-1] = (2*self.random_state.random_sample()-1)*last_fea_val # append to new samples samples.append(X_new) return (np.vstack([X, pca.inverse_transform(samples) + mu]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'K2': self.K2, 'K1_frac': self.K1_frac, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
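# Illustrative sketch (not part of the library, helper name is ad hoc): the
# per-sample step of MDO above draws coordinates, in the PCA space, inside the
# axis-aligned ellipsoid sum_j x_j**2/(alpha*V_j) <= 1 determined by the chosen
# minority point, with the last coordinate closing the inequality.
import numpy as np

def _mdo_candidate(T_row, V, random_state):
    """One synthetic point (in PCA coordinates) for a selected minority sample."""
    alpha = np.sum(T_row**2/V)
    alpha_V = np.maximum(alpha*V, 0.001)
    X_new = np.zeros(len(T_row))
    s = 0.0
    for j in range(len(T_row) - 1):
        r = (2*random_state.random_sample() - 1)*np.sqrt(alpha_V[j])
        X_new[j] = r
        s = s + r**2/alpha_V[j]
    remainder = max(0.0, (1.0 - s)*alpha_V[-1])
    X_new[-1] = (2*random_state.random_sample() - 1)*np.sqrt(remainder)
    return X_new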
[docs]class Random_SMOTE(OverSampling): """ References: * BibTex:: @InProceedings{random_smote, author="Dong, Yanjie and Wang, Xuehua", editor="Xiong, Hui and Lee, W. B.", title="A New Over-Sampling Approach: Random-SMOTE for Learning from Imbalanced Data Sets", booktitle="Knowledge Science, Engineering and Management", year="2011", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="343--352", isbn="978-3-642-25975-3" } """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_componentwise]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting nearest neighbors model to find closest neighbors of minority # points n_neighbors = min([len(X_min), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) # generating samples samples = [] while len(samples) < n_to_sample: idx = self.random_state.choice(np.arange(len(X_min))) y_1_idx, y_2_idx = self.random_state.choice(ind[idx][1:], 2) t = self.sample_between_points_componentwise( X_min[y_1_idx], X_min[y_2_idx]) samples.append( self.sample_between_points_componentwise(X_min[idx], t)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
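# Illustrative sketch (not part of the library, helper name is ad hoc): the
# two-stage interpolation of Random_SMOTE above; a temporary point is drawn
# componentwise between two random neighbours y_1 and y_2 of the seed x, and
# the synthetic sample is then drawn componentwise between x and that point.
import numpy as np

def _random_smote_point(x, y_1, y_2, random_state):
    """One synthetic sample for seed x and two of its minority neighbours."""
    dim = len(x)
    t = y_1 + random_state.random_sample(dim)*(y_2 - y_1)
    return x + random_state.random_sample(dim)*(t - x)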
[docs]class ISMOTE(OverSampling): """ References: * BibTex:: @InProceedings{ismote, author="Li, Hu and Zou, Peng and Wang, Xiang and Xia, Rongze", editor="Sun, Zengqi and Deng, Zhidong", title="A New Combination Sampling Method for Imbalanced Data", booktitle="Proceedings of 2013 Chinese Intelligent Automation Conference", year="2013", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="547--554", isbn="978-3-642-38466-0" } """ categories = [OverSampling.cat_changes_majority]
[docs] def __init__(self, n_neighbors=5, minority_weight=0.5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: n_neighbors (int): number of neighbors minority_weight (float): weight parameter according to the paper n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(minority_weight, "minority_weight", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.n_neighbors = n_neighbors self.minority_weight = minority_weight self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'n_neighbors': [3, 5, 7], 'minority_weight': [0.2, 0.5, 0.8]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] n_to_sample = int((len(X_maj) - len(X_min))/2 + 0.5) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # computing distances of majority samples from minority ones nn = NearestNeighbors(n_neighbors=len(X_min), n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_maj) # sort majority instances in descending order by their mean distance # from minority samples to_sort = zip(np.arange(len(X_maj)), np.mean(dist, axis=1)) ind_sorted, dist_sorted = zip(*sorted(to_sort, key=lambda x: -x[1])) # remove the ones being farthest from the minority samples X_maj = X_maj[list(ind_sorted[n_to_sample:])] # construct new dataset X_new = np.vstack([X_maj, X_min]) y_new = np.hstack([np.repeat(self.maj_label, len(X_maj)), np.repeat(self.min_label, len(X_min))]) X_min = X_new[y_new == self.min_label] # fitting nearest neighbors model n_neighbors = min([len(X_new), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_new) dist, ind = nn.kneighbors(X_min) # do the oversampling samples = [] while len(samples) < n_to_sample: idx = self.random_state.choice(np.arange(len(X_min))) y_idx = self.random_state.choice(ind[idx][1:]) # different generation scheme depending on the class label if y_new[y_idx] == self.min_label: diff = (X_new[y_idx] - X_min[idx]) r = self.random_state.random_sample() samples.append(X_min[idx] + r * diff * self.minority_weight) else: diff = (X_new[y_idx] - X_min[idx]) r = self.random_state.random_sample() sample = X_min[idx] + r * diff * (1.0 - self.minority_weight) samples.append(sample) return (np.vstack([X_new, np.vstack(samples)]), np.hstack([y_new, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'n_neighbors': self.n_neighbors, 'minority_weight': self.minority_weight, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
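# Illustrative sketch (not part of the library, helper name is ad hoc): the
# label-dependent interpolation of ISMOTE above; steps towards minority
# neighbours are scaled by minority_weight, steps towards majority neighbours
# by (1 - minority_weight).
def _ismote_point(x_min, neighbor, neighbor_is_minority, minority_weight,
                  random_state):
    """One synthetic sample between a minority seed and one of its neighbours."""
    r = random_state.random_sample()
    if neighbor_is_minority:
        weight = minority_weight
    else:
        weight = 1.0 - minority_weight
    return x_min + r*(neighbor - x_min)*weight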
[docs]class VIS_RST(OverSampling): """ References: * BibTex:: @InProceedings{vis_rst, author="Borowska, Katarzyna and Stepaniuk, Jaroslaw", editor="Saeed, Khalid and Homenda, Wladyslaw", title="Imbalanced Data Classification: A Novel Re-sampling Approach Combining Versatile Improved SMOTE and Rough Sets", booktitle="Computer Information Systems and Industrial Management", year="2016", publisher="Springer International Publishing", address="Cham", pages="31--42", isbn="978-3-319-45378-1" } Notes: * Replication of DANGER samples will be removed by the last step of noise filtering. """ categories = [OverSampling.cat_changes_majority, OverSampling.cat_noise_removal]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0.0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # standardizing the data ss = StandardScaler() ss.fit(X) X = ss.transform(X) y = y.copy() X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] # fitting nearest neighbors model to determine boundary region n_neighbors = min([len(X), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X_maj) # determining boundary region of majority samples boundary = np.array([np.sum(y[ind[i]] == self.maj_label) != n_neighbors for i in range(len(X_maj))]) y_maj = y[y == self.maj_label] y_maj[boundary] = self.min_label y[y == self.maj_label] = y_maj # extracting new minority and majority set X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] # labeling minority samples nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X_min) # extracting labels labels = [] for i in range(len(ind)): min_class_neighbors = np.sum(y[ind[i][1:]] == self.maj_label) if min_class_neighbors == n_neighbors-1: labels.append('noise') elif min_class_neighbors < n_neighbors/2: labels.append('safe') else: labels.append('danger') # extracting the number of different labels (noise is not used) safe = np.sum([li == 'safe' for li in labels]) danger = np.sum([li == 'danger' for li in labels]) if safe == 0: mode = 'no_safe' elif danger > 0.3*len(X_min): mode = 'high_complexity' else: mode = 'low_complexity' # fitting nearest neighbors to find the neighbors of minority elements # among minority elements n_neighbors_min = min([len(X_min), self.n_neighbors + 1]) nn_min = NearestNeighbors(n_neighbors=n_neighbors_min, n_jobs=self.n_jobs) nn_min.fit(X_min) dist_min, ind_min = nn_min.kneighbors(X_min) # do the sampling samples = [] mask = np.repeat(False, len(X_min)) while len(samples) < n_to_sample: # choosing a random minority sample idx = self.random_state.choice(np.arange(len(X_min))) # implementation of sampling rules depending on the mode if mode == 'high_complexity': if labels[idx] == 'noise': pass elif labels[idx] == 'danger' and not mask[idx]: samples.append(X_min[idx]) mask[idx] = True else: X_b = X_min[self.random_state.choice(ind_min[idx][1:])] samples.append(self.sample_between_points(X_min[idx], X_b)) elif mode == 'low_complexity': if labels[idx] == 'noise': pass elif labels[idx] == 'danger': X_b = X_min[self.random_state.choice(ind_min[idx][1:])] samples.append(self.sample_between_points(X_min[idx], X_b)) elif not mask[idx]: samples.append(X_min[idx]) mask[idx] = True else: X_b = X_min[self.random_state.choice(ind_min[idx][1:])] samples.add(self.sample_between_points(X_min[idx], X_b)) X_samp = np.vstack(samples) # final noise removal by removing those minority samples generated # and not belonging to the lower approximation nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs).fit(X) dist_check, ind_check = nn.kneighbors(X_samp) def maj_zero(i): return np.sum(y[ind_check[i][1:]] == 
self.maj_label) == 0 num_maj_mask = np.array([maj_zero(i) for i in range(len(samples))]) X_samp = X_samp[num_maj_mask] return (ss.inverse_transform(np.vstack([X, X_samp])), np.hstack([y, np.repeat(self.min_label, len(X_samp))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
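# Illustrative sketch (not part of the library, helper name is ad hoc): the
# minority labelling rule of VIS_RST above; with k neighbours (the point itself
# excluded), an all-majority neighbourhood marks noise, a majority count below
# (k + 1)/2 marks a safe point, and anything in between is flagged as danger.
def _vis_rst_label(n_majority_neighbors, k):
    """Label a minority point from the majority count among its k neighbours."""
    if n_majority_neighbors == k:
        return 'noise'
    if n_majority_neighbors < (k + 1)/2:
        return 'safe'
    return 'danger'

# e.g. [_vis_rst_label(m, 5) for m in range(6)]
#      -> ['safe', 'safe', 'safe', 'danger', 'danger', 'noise']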
[docs]class GASMOTE(OverSampling): """ References: * BibTex:: @Article{gasmote, author="Jiang, Kun and Lu, Jing and Xia, Kuiliang", title="A Novel Algorithm for Imbalance Data Classification Based on Genetic Algorithm Improved SMOTE", journal="Arabian Journal for Science and Engineering", year="2016", month="Aug", day="01", volume="41", number="8", pages="3255--3266", issn="2191-4281", doi="10.1007/s13369-016-2179-2", url="https://doi.org/10.1007/s13369-016-2179-2" } """ categories = [OverSampling.cat_extensive, OverSampling.cat_memetic, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, n_neighbors=5, maxn=7, n_pop=10, popl3=5, pm=0.3, pr=0.2, Ge=10, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: n_neighbors (int): number of neighbors maxn (int): maximum number of samples to generate per minority instances n_pop (int): size of population popl3 (int): number of crossovers pm (float): mutation probability pr (float): selection probability Ge (int): number of generations n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(maxn, "maxn", 1) self.check_greater_or_equal(n_pop, "n_pop", 1) self.check_in_range(pm, "pm", [0, 1]) self.check_in_range(pr, "pr", [0, 1]) self.check_greater_or_equal(Ge, "Ge", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.n_neighbors = n_neighbors self.maxn = maxn self.n_pop = n_pop self.popl3 = popl3 self.pm = pm self.pr = pr self.Ge = Ge self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ return cls.generate_parameter_combinations({'n_neighbors': [7], 'maxn': [2, 3, 4], 'n_pop': [10], 'popl3': [4], 'pm': [0.3], 'pr': [0.2], 'Ge': [10]}, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting nearest neighbors model to find minority neighbors of # minority samples n_neighbors = min([self.n_neighbors + 1, len(X_min)]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) kfold = KFold(min([len(X), 5])) def fitness(conf): """ Evluate fitness of configuration Args: conf (list(list)): configuration """ # generate new samples samples = [] for i in range(len(conf)): for _ in range(conf[i]): X_b = X_min[self.random_state.choice(ind[i][1:])] samples.append(self.sample_between_points(X_min[i], X_b)) if len(samples) == 0: # if no samples are generated X_new = X y_new = y else: # construct dataset X_new = np.vstack([X, np.vstack(samples)]) y_new = np.hstack( [y, np.repeat(self.min_label, len(samples))]) # execute kfold cross validation preds, tests = [], [] for train, test in kfold.split(X_new): dt = DecisionTreeClassifier(random_state=self.random_state) dt.fit(X_new[train], y_new[train]) preds.append(dt.predict(X_new[test])) tests.append(y_new[test]) preds = np.hstack(preds) tests = np.hstack(tests) # compute fitness measure tp = np.sum(np.logical_and( tests == self.min_label, tests == preds)) tn = np.sum(np.logical_and( tests == self.maj_label, tests == preds)) fp = np.sum(np.logical_and( tests == self.maj_label, tests != preds)) fn = np.sum(np.logical_and( tests == self.min_label, tests != preds)) sens = tp/(tp + fn) spec = tn/(fp + tn) return np.sqrt(sens*spec) def crossover(conf_a, conf_b): """ Crossover Args: conf_a (list(list)): configuration to crossover conf_b (list(list)): configuration to crossover Returns: list(list), list(list): the configurations after crossover """ for _ in range(self.popl3): k = self.random_state.randint(len(conf_a)) conf_a = np.hstack([conf_a[:k], conf_b[k:]]) conf_b = np.hstack([conf_b[:k], conf_a[k:]]) return conf_a, conf_b def mutation(conf, ge): """ Mutation Args: conf (list(list)): configuration to mutate ge (int): iteration number """ conf = conf.copy() if self.random_state.random_sample() < self.pm: pass else: for i in range(len(conf)): r = self.random_state.random_sample() r = r**((1 - ge/self.Ge)**3) if self.random_state.randint(2) == 0: conf[i] = int(conf[i] + (self.maxn - conf[i])*r) else: conf[i] = int(conf[i] - (conf[i] - 0)*r) return conf # generate initial population def init_pop(): return self.random_state.randint(self.maxn, size=len(X_min)) population = [[init_pop(), 0] for _ in range(self.n_pop)] # calculate fitness values for p in population: p[1] = fitness(p[0]) # start iteration ge = 0 while ge < self.Ge: # sorting population in descending order by fitness scores population = sorted(population, key=lambda x: -x[1]) # selection operation (Step 2) pp = int(self.n_pop*self.pr) population_new = [] for i in range(pp): population_new.append(population[i]) population_new.extend(population[:(self.n_pop - pp)]) population = population_new # crossover for _ in range(int(self.n_pop/2)): pop_0 = population[self.random_state.randint(self.n_pop)][0] pop_1 = population[self.random_state.randint(self.n_pop)][0] 
conf_a, conf_b = crossover(pop_0, pop_1) population.append([conf_a, fitness(conf_a)]) population.append([conf_b, fitness(conf_b)]) # mutation for _ in range(int(self.n_pop/2)): pop_0 = population[self.random_state.randint(self.n_pop)][0] conf = mutation(pop_0, ge) population.append([conf, fitness(conf)]) ge = ge + 1 # sorting final population population = sorted(population, key=lambda x: -x[1]) # get best configuration conf = population[0][0] # generate final samples samples = [] for i in range(len(conf)): for _ in range(conf[i]): samples.append(self.sample_between_points( X_min[i], X_min[self.random_state.choice(ind[i][1:])])) if len(samples) == 0: return X.copy(), y.copy() return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'n_neighbors': self.n_neighbors, 'maxn': self.maxn, 'n_pop': self.n_pop, 'popl3': self.popl3, 'pm': self.pm, 'pr': self.pr, 'Ge': self.Ge, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
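# Illustrative sketch (not part of the library, helper name is ad hoc): the
# fitness measure used by GASMOTE above is the G-mean of the cross-validated
# predictions, i.e. the square root of sensitivity times specificity.
import numpy as np

def _g_mean(y_true, y_pred, min_label, maj_label):
    """Geometric mean of sensitivity and specificity."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    tp = np.sum((y_true == min_label) & (y_pred == min_label))
    fn = np.sum((y_true == min_label) & (y_pred != min_label))
    tn = np.sum((y_true == maj_label) & (y_pred == maj_label))
    fp = np.sum((y_true == maj_label) & (y_pred != maj_label))
    sensitivity = tp/(tp + fn)
    specificity = tn/(tn + fp)
    return np.sqrt(sensitivity*specificity)

# e.g. _g_mean([0, 0, 1, 1], [0, 1, 1, 1], min_label=1, maj_label=0) == np.sqrt(0.5)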
[docs]class A_SUWO(OverSampling): """ References: * BibTex:: @article{a_suwo, title = "Adaptive semi-unsupervised weighted oversampling (A-SUWO) for imbalanced datasets", journal = "Expert Systems with Applications", volume = "46", pages = "405 - 416", year = "2016", issn = "0957-4174", doi = "https://doi.org/10.1016/j.eswa.2015.10.031", author = "Iman Nekooeimehr and Susana K. Lai-Yuen", keywords = "Imbalanced dataset, Classification, Clustering, Oversampling" } Notes: * Equation (7) misses a division by R_j. * It is not specified how to sample from clusters with 1 instances. """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering, OverSampling.cat_density_based, OverSampling.cat_noise_removal]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_clus_maj=7, c_thres=0.8, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors n_clus_maj (int): number of majority clusters c_thres (float): threshold on distances n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(n_clus_maj, "n_clus_maj", 1) self.check_greater_or_equal(c_thres, "c_thres", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_clus_maj = n_clus_maj self.c_thres = c_thres self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'n_clus_maj': [5, 7, 9], 'c_thres': [0.5, 0.8]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_orig, y_orig = X, y # fitting nearest neighbors to find neighbors of all samples n_neighbors = min([len(X), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X) # identifying as noise those samples which do not have neighbors of # the same label def noise_func(i): return np.sum(y[ind[i][1:]] == y[i]) == 0 noise = np.where(np.array([noise_func(i) for i in range(len(X))]))[0] # removing noise X = np.delete(X, noise, axis=0) y = np.delete(y, noise) # extarcting modified minority and majority datasets X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] if len(X_min) == 0: _logger.info("All minority samples removed as noise") return X_orig.copy(), y_orig.copy() n_clus_maj = min([len(X_maj), self.n_clus_maj]) # clustering majority samples ac = AgglomerativeClustering(n_clusters=n_clus_maj) ac.fit(X_maj) maj_clusters = [np.where(ac.labels_ == i)[0] for i in range(n_clus_maj)] if len(maj_clusters) == 0: return X_orig.copy(), y_orig.copy() # initialize minority clusters min_clusters = [np.array([i]) for i in range(len(X_min))] # compute minority distance matrix of cluster dm_min = pairwise_distances(X_min) for i in range(len(dm_min)): dm_min[i, i] = np.inf # compute distance matrix of minority and majority clusters dm_maj = np.zeros(shape=(len(X_min), len(maj_clusters))) for i in range(len(X_min)): for j in range(len(maj_clusters)): pairwd = pairwise_distances(X_min[min_clusters[i]], X_maj[maj_clusters[j]]) dm_maj[i, j] = np.min(pairwd) # compute threshold nn = NearestNeighbors(n_neighbors=len(X_min), n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) d_med = np.median(dist, axis=1) T = np.mean(d_med)*self.c_thres # do the clustering of minority samples while True: # finding minimum distance between minority clusters pi = np.min(dm_min) # if the minimum distance is higher than the threshold, stop if pi > T: break # find cluster pair of minimum distance min_dist_pair = np.where(dm_min == pi) min_i = min_dist_pair[0][0] min_j = min_dist_pair[1][0] # Step 3 - find majority clusters closer than pi A = np.where(np.logical_and(dm_maj[min_i] < pi, dm_maj[min_j] < pi))[0] # Step 4 - checking if there is a majority cluster between the # minority ones if len(A) > 0: dm_min[min_i, min_j] = np.inf dm_min[min_j, min_i] = np.inf else: # Step 5 # unifying minority clusters min_clusters[min_i] = np.hstack([min_clusters[min_i], min_clusters[min_j]]) # removing one of them min_clusters = np.delete(min_clusters, min_j) # updating the minority distance matrix dm_min[min_i] = np.min(np.vstack([dm_min[min_i], dm_min[min_j]]), axis=0) dm_min[:, min_i] = dm_min[min_i] # removing jth row and column (merged in i) dm_min = np.delete(dm_min, min_j, axis=0) dm_min = np.delete(dm_min, min_j, axis=1) # fixing the diagonal elements for i in range(len(dm_min)): dm_min[i, i] = np.inf # updating the minority-majority distance matrix 
dm_maj[min_i] = np.min(np.vstack([dm_maj[min_i], dm_maj[min_j]]), axis=0) dm_maj = np.delete(dm_maj, min_j, axis=0) # adaptive sub-cluster sizing eps = [] # going through all minority clusters for c in min_clusters: # checking if cluster size is higher than 1 if len(c) > 1: k = min([len(c), 5]) kfold = KFold(k, random_state=self.random_state) preds = [] # executing k-fold cross validation with linear discriminant # analysis X_c = X_min[c] for train, test in kfold.split(X_c): X_train = np.vstack([X_maj, X_c[train]]) y_train_maj = np.repeat(self.maj_label, len(X_maj)) y_train_min = np.repeat(self.min_label, len(X_c[train])) y_train = np.hstack([y_train_maj, y_train_min]) ld = LinearDiscriminantAnalysis() ld.fit(X_train, y_train) preds.append(ld.predict(X_c[test])) preds = np.hstack(preds) # extracting error rate eps.append(np.sum(preds == self.maj_label)/len(preds)) else: eps.append(1.0) # sampling distribution over clusters min_cluster_dist = eps/np.sum(eps) # synthetic instance generation - determining within cluster # distribution finding majority neighbor distances of minority # samples nn = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs) nn.fit(X_maj) dist, ind = nn.kneighbors(X_min) dist = dist/len(X[0]) dist = 1.0/dist # computing the THs THs = [] for c in min_clusters: THs.append(np.mean(dist[c, 0])) # determining within cluster distributions within_cluster_dist = [] for i, c in enumerate(min_clusters): Gamma = dist[c, 0] Gamma[Gamma > THs[i]] = THs[i] within_cluster_dist.append(Gamma/np.sum(Gamma)) # extracting within cluster neighbors within_cluster_neighbors = [] for c in min_clusters: n_neighbors = min([len(c), self.n_neighbors]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min[c]) within_cluster_neighbors.append(nn.kneighbors(X_min[c])[1]) # do the sampling samples = [] while len(samples) < n_to_sample: # choose random cluster index cluster_idx = self.random_state.choice( np.arange(len(min_clusters)), p=min_cluster_dist) if len(min_clusters[cluster_idx]) > 1: # if the cluster has at least two elemenets domain = np.arange(len(min_clusters[cluster_idx])) distribution = within_cluster_dist[cluster_idx] sample_idx = self.random_state.choice(domain, p=distribution) domain = within_cluster_neighbors[cluster_idx][sample_idx][1:] neighbor_idx = self.random_state.choice(domain) point = X_min[min_clusters[cluster_idx][sample_idx]] neighbor = X_min[min_clusters[cluster_idx][neighbor_idx]] samples.append(self.sample_between_points(point, neighbor)) else: samples.append(X_min[min_clusters[cluster_idx][0]]) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_clus_maj': self.n_clus_maj, 'c_thres': self.c_thres, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
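# Illustrative usage sketch (not part of the original module): a minimal
# A_SUWO run on a synthetic imbalanced dataset; the toy data below is an
# assumption made purely for demonstration.
def _a_suwo_usage_sketch(random_state=42):
    rs = np.random.RandomState(random_state)
    X = np.vstack([rs.normal(0.0, 1.0, size=(100, 2)),   # majority class
                   rs.normal(2.0, 0.5, size=(15, 2))])   # minority class
    y = np.hstack([np.repeat(0, 100), np.repeat(1, 15)])
    sampler = A_SUWO(proportion=1.0, n_neighbors=5, random_state=random_state)
    X_samp, y_samp = sampler.sample(X, y)
    return X_samp, y_samp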
[docs]class SMOTE_FRST_2T(OverSampling): """ References: * BibTex:: @article{smote_frst_2t, title = "Fuzzy-rough imbalanced learning for the diagnosis of High Voltage Circuit Breaker maintenance: The SMOTE-FRST-2T algorithm", journal = "Engineering Applications of Artificial Intelligence", volume = "48", pages = "134 - 139", year = "2016", issn = "0952-1976", doi = "https://doi.org/10.1016/j.engappai.2015.10.009", author = "Ramentol, E. and Gondres, I. and Lajes, S. and Bello, R. and Caballero, Y. and Cornelis, C. and Herrera, F.", keywords = "High Voltage Circuit Breaker (HVCB), Imbalanced learning, Fuzzy rough set theory, Resampling methods" } Notes: * An unlucky setting of parameters might result in 0 points being added; we have fixed this by increasing the gamma_S threshold if the number of accepted samples is low. * Similarly, an unlucky setting of parameters might result in all majority samples being turned into minority ones. * In my opinion, the relations in the algorithm presented in the paper are incorrect. The authors talk about accepting samples having a POS score below a threshold, yet in the algorithm POS >= gamma is used in both places. """ categories = [OverSampling.cat_changes_majority, OverSampling.cat_noise_removal, OverSampling.cat_sample_ordinary, OverSampling.cat_application]
[docs] def __init__(self, n_neighbors=5, gamma_S=0.7, gamma_M=0.03, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: n_neighbors (int): number of neighbors in the SMOTE sampling gamma_S (float): threshold of synthesized samples gamma_M (float): threshold of majority samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(gamma_S, "gamma_S", 0) self.check_greater_or_equal(gamma_M, "gamma_M", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.gamma_S = gamma_S self.gamma_M = gamma_M self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'n_neighbors': [3, 5, 7], 'gamma_S': [0.8, 1.0], 'gamma_M': [0.03, 0.05, 0.1]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # Turning the ranges to 1 speeds up the positive membership # calculations mmscaler = MinMaxScaler() X = mmscaler.fit_transform(X) X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] # extracting the attribute ranges d = len(X[0]) # after MinMax scaling, the POS value can be calculated as follows pos_cache = pairwise_distances(X_min, X_maj, metric='l1') pos_cache = 1.0 - pos_cache pos_cache = pos_cache.clip(0, d) pos_cache = 1.0 - pos_cache # initializing some lists containing the results result_synth = [] result_maj = [] iteration = 0 gamma_S = self.gamma_S gamma_M = self.gamma_M # iterating until the dataset becomes balanced while (len(X_min) + len(result_synth) + len(result_maj)) < len(X_maj): _logger.info(self.__class__.__name__ + ":" + ("iteration: %d" % iteration)) # checking if the parameters aren't too conservative if len(result_synth) < iteration: gamma_S = gamma_S*1.1 _logger.info(self.__class__.__name__ + ": " + "gamma_S increased to %f" % gamma_S) # determine proportion diff = (sum(y == self.maj_label) - sum(y == self.min_label)) prop = max(1.1/diff, 0.2) # executing SMOTE to generate some minority samples smote = SMOTE(proportion=prop, n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state) X_samp, y_samp = smote.sample(X, y) X_samp = X_samp[len(X):] new_synth = [] # computing POS membership values for the new samples pos_synth = pairwise_distances(X_min, X_samp, metric='l1') pos_synth = 1.0 - pos_synth pos_synth = pos_synth.clip(0, d) pos_synth = 1.0 - pos_synth # adding samples with POS membership smaller than gamma_S to the # minority set min_pos = np.min(pos_synth, axis=0) to_add = np.where(min_pos < gamma_S)[0] result_synth.extend(X_samp[to_add]) new_synth.extend(X_samp[to_add]) # checking the minimum POS values of the majority samples min_pos = np.min(pos_cache, axis=0) to_remove = np.where(min_pos < self.gamma_M)[0] # if the number of majority samples with POS membership smaller # than gamma_M is not extreme, then changing labels, otherwise # decreasing gamma_M if len(to_remove) > (len(X_maj) - len(X_min))/2: to_remove = np.array([]) gamma_M = gamma_M*0.9 _logger.info(self.__class__.__name__ + ": " + "gamma_M decreased to %f" % gamma_M) else: result_maj.extend(X_maj[to_remove]) X_maj = np.delete(X_maj, to_remove, axis=0) pos_cache = np.delete(pos_cache, to_remove, axis=1) # updating pos cache if len(new_synth) > 0: pos_cache_new = pairwise_distances( np.vstack(new_synth), X_maj, metric='l1') pos_cache_new = 1.0 - pos_cache_new pos_cache_new = pos_cache_new.clip(0, d) pos_cache_new = 1.0 - pos_cache_new pos_cache = np.vstack([pos_cache, pos_cache_new]) message = "minority added: %d, majority removed %d" message = message % (len(to_add), len(to_remove)) _logger.info(self.__class__.__name__ + ":" + message) iteration = iteration + 1 # packing the results X_res = np.vstack([X_maj, X_min]) if len(result_synth) > 0: X_res = np.vstack([X_res, np.vstack(result_synth)]) if len(result_maj) > 0: X_res = np.vstack([X_res, np.vstack(result_maj)]) if len(X_maj) == 0: _logger.warning('All majority samples 
removed') return mmscaler.inverse_transform(X), y y_res_maj = np.repeat(self.maj_label, len(X_maj)) n_y_res_min = len(X_min) + len(result_synth) + len(result_maj) y_res_min = np.repeat(self.min_label, n_y_res_min) y_res = np.hstack([y_res_maj, y_res_min]) return mmscaler.inverse_transform(X_res), y_res
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'n_neighbors': self.n_neighbors, 'gamma_S': self.gamma_S, 'gamma_M': self.gamma_M, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
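# Illustrative sketch (not part of the original module): after MinMax scaling,
# the fuzzy-rough positive-region (POS) membership used above reduces to a
# simple transformation of the L1 distances between minority and majority
# samples; the helper restates that computation with hypothetical argument
# names.
def _pos_membership_sketch(X_min_scaled, X_maj_scaled):
    d = len(X_min_scaled[0])
    pos = pairwise_distances(X_min_scaled, X_maj_scaled, metric='l1')
    pos = 1.0 - pos
    pos = pos.clip(0, d)
    return 1.0 - pos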
[docs]class AND_SMOTE(OverSampling): """ References: * BibTex:: @inproceedings{and_smote, author = {Yun, Jaesub and Ha, Jihyun and Lee, Jong-Seok}, title = {Automatic Determination of Neighborhood Size in SMOTE}, booktitle = {Proceedings of the 10th International Conference on Ubiquitous Information Management and Communication}, series = {IMCOM '16}, year = {2016}, isbn = {978-1-4503-4142-4}, location = {Danang, Viet Nam}, pages = {100:1--100:8}, articleno = {100}, numpages = {8}, doi = {10.1145/2857546.2857648}, acmid = {2857648}, publisher = {ACM}, address = {New York, NY, USA}, keywords = {SMOTE, imbalanced learning, synthetic data generation}, } """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, K=15, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples K (int): maximum number of nearest neighbors n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(K, "K", 2) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.K = K self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'K': [9, 15, 21]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] K = min([len(X_min), self.K]) # find K nearest neighbors of all samples nn = NearestNeighbors(n_neighbors=K, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X) min_ind = np.where(y == self.min_label)[0] # Executing the algorithm kappa = [] for i in range(len(min_ind)): regions_min = [] regions_maj = [] for j in range(1, K): # continueing if the label of the neighbors is minority if y[ind[min_ind[i]][j]] != self.min_label: continue # region coordinates reg = np.hstack([min_ind[i], ind[min_ind[i]][j]]) # compute corner points reg_min = np.min(X[reg]) reg_max = np.max(X[reg]) r_min = [] r_maj = [] # all the points in the region must be among the neighbors # what we do is counting how many of them are minority and # majority samples for k in ind[min_ind[i]][:(j+1)]: if np.all(reg_min <= X[k]) and np.all(X[k] <= reg_max): if y[k] == self.min_label: r_min.append(k) else: r_maj.append(k) # appending the coordinates of points to the minority and # majority regions regions_min.append(r_min) regions_maj.append(r_maj) # taking the cumulative unions of minority and majority points for j in range(1, len(regions_min)): regions_min[j] = list( set(regions_min[j]).union(set(regions_min[j-1]))) regions_maj[j] = list( set(regions_maj[j]).union(set(regions_maj[j-1]))) # computing the lengths of the increasing minority and majority # sets regions_min = np.array([len(r) for r in regions_min]) regions_maj = np.array([len(r) for r in regions_maj]) # computing the precision of minority classification (all points # are supposed to be classified as minority) prec = regions_min/(regions_min + regions_maj) # taking the difference d = np.diff(prec, 1) # finding the biggest drop (+1 because diff reduces length, +1 # because of indexing begins with 0) if len(d) == 0: k = 0 else: k = np.argmin(d) + 2 # appending the coordinate of the biggest drop as the ideal # neighborhood size note that k indices the minority neighbors kappa.append(k) # finding nearest minority neighbors of minority samples nn = NearestNeighbors(n_neighbors=max(kappa) + 1, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) if np.sum(kappa) == 0: _logger.warning(self.__class__.__name__ + ": " + "No minority samples in nearest neighbors") return X.copy(), y.copy() # do the sampling samples = [] while len(samples) < n_to_sample: # choose random point idx = self.random_state.randint(len(X_min)) if kappa[idx] > 0: domain = ind[idx][1:(kappa[idx]+1)] X_b = X_min[self.random_state.choice(domain)] samples.append(self.sample_between_points(X_min[idx], X_b)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'K': self.K, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
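# Illustrative sketch (not part of the original module): AND_SMOTE picks the
# neighborhood size of each minority sample at the largest drop of its
# cumulative minority-precision curve; given the precision values for growing
# neighborhoods, the rule used above amounts to the helper below (the function
# name is ours).
def _and_smote_neighborhood_size_sketch(precisions):
    d = np.diff(np.asarray(precisions, dtype=float), 1)
    if len(d) == 0:
        return 0
    # +1 because diff shortens the array, +1 because indexing starts at 0
    return int(np.argmin(d)) + 2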
[docs]class NRAS(OverSampling): """ References: * BibTex:: @article{nras, title = "Noise Reduction A Priori Synthetic Over-Sampling for class imbalanced data sets", journal = "Information Sciences", volume = "408", pages = "146 - 161", year = "2017", issn = "0020-0255", doi = "https://doi.org/10.1016/j.ins.2017.04.046", author = "William A. Rivera", keywords = "NRAS, SMOTE, OUPS, Class imbalance, Classification" } """ categories = [OverSampling.cat_sample_ordinary, OverSampling.cat_noise_removal]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, t=0.5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors t (float): [0,1] fraction of n_neighbors as threshold n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_in_range(t, "t", [0, 1]) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.t = t self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [5, 7, 9], 't': [0.3, 0.5, 0.8]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # standardization is needed to make the range of the propensity scores # similar to that of the features mms = MinMaxScaler() X_trans = mms.fit_transform(X) # determining propensity scores using logistic regression lr = LogisticRegression(solver='lbfgs', n_jobs=self.n_jobs, random_state=self.random_state) lr.fit(X_trans, y) propensity = lr.predict_proba(X_trans)[:, np.where( lr.classes_ == self.min_label)[0][0]] X_min = X_trans[y == self.min_label] # adding propensity scores as a new feature X_new = np.column_stack([X_trans, propensity]) X_min_new = X_new[y == self.min_label] # finding nearest neighbors of minority samples n_neighbors = min([len(X_new), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_new) dist, ind = nn.kneighbors(X_min_new) # do the sampling samples = [] to_remove = [] while len(samples) < n_to_sample: idx = self.random_state.randint(len(X_min)) # finding the number of minority neighbors t_hat = np.sum(y[ind[idx][1:]] == self.min_label) if t_hat < self.t*n_neighbors: # removing the minority point if the number of minority # neighbors is less then the threshold # to_remove indexes X_min if idx not in to_remove: to_remove.append(idx) # compensating the removal of the minority point n_to_sample = n_to_sample + 1 if len(to_remove) == len(X_min): _logger.warning(self.__class__.__name__ + ": " + "all minority samples identified as noise") return X.copy(), y.copy() else: # otherwise do the sampling X_b = X_trans[self.random_state.choice(ind[idx][1:])] samples.append(self.sample_between_points(X_min[idx], X_b)) # remove noisy elements X_maj = X_trans[y == self.maj_label] X_min = np.delete(X_min, to_remove, axis=0) return (mms.inverse_transform(np.vstack([X_maj, X_min, np.vstack(samples)])), np.hstack([np.repeat(self.maj_label, len(X_maj)), np.repeat(self.min_label, len(X_min)), np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 't': self.t, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
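# Illustrative sketch (not part of the original module): NRAS extends the
# MinMax-scaled feature vectors with a logistic-regression propensity score
# before the nearest neighbor search; a minimal restatement of that step
# follows (function name and arguments are ours).
def _propensity_feature_sketch(X, y, min_label, random_state=None):
    mms = MinMaxScaler()
    X_trans = mms.fit_transform(X)
    lr = LogisticRegression(solver='lbfgs', random_state=random_state)
    lr.fit(X_trans, y)
    min_column = np.where(lr.classes_ == min_label)[0][0]
    propensity = lr.predict_proba(X_trans)[:, min_column]
    # the propensity score becomes an additional feature column
    return np.column_stack([X_trans, propensity])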
[docs]class AMSCO(OverSampling): """ References: * BibTex:: @article{amsco, title = "Adaptive multi-objective swarm fusion for imbalanced data classification", journal = "Information Fusion", volume = "39", pages = "1 - 24", year = "2018", issn = "1566-2535", doi = "https://doi.org/10.1016/j.inffus.2017.03.007", author = "Jinyan Li and Simon Fong and Raymond K. Wong and Victor W. Chu", keywords = "Swarm fusion, Swarm intelligence algorithm, Multi-objective, Crossover rebalancing, Imbalanced data classification" } Notes: * It is not clear how the kappa threshold is used; I use the RA score to drive the entire evolution. Particularly: "In the last phase of each iteration, the average Kappa value in current non-inferior set is compare with the latest threshold value, the threshold is then increase further if the average value increases, and vice versa. By doing so, the non-inferior region will be progressively reduced as the Kappa threshold lifts up." I don't see why the Kappa threshold would lift up if it is decreased whenever the average Kappa decreases ("vice versa"). * Due to this interpretation of the kappa threshold and the lack of a detailed description of the SIS process, the implementation is not exactly what is described in the paper, but something very similar. """ categories = [OverSampling.cat_changes_majority, OverSampling.cat_memetic, OverSampling.cat_uses_classifier]
[docs] def __init__(self, n_pop=5, n_iter=15, omega=0.1, r1=0.1, r2=0.1, n_jobs=1, classifier=DecisionTreeClassifier(random_state=2), random_state=None): """ Constructor of the sampling object Args: n_pop (int): size of populations n_iter (int): optimization steps omega (float): inertia of PSO r1 (float): force towards local optimum r2 (float): force towards global optimum n_jobs (int): number of parallel jobs classifier (obj): classifier used in the fitness evaluation random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(n_pop, "n_pop", 1) self.check_greater_or_equal(n_iter, "n_iter", 1) self.check_greater_or_equal(omega, "omega", 0) self.check_greater_or_equal(r1, "r1", 0) self.check_greater_or_equal(r2, "r2", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.n_pop = n_pop self.n_iter = n_iter self.omega = omega self.r1 = r1 self.r2 = r2 self.n_jobs = n_jobs self.classifier = classifier self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ # as the method is an overall optimization, one reasonable setting # should be enough classifiers = [DecisionTreeClassifier(random_state=2)] parameter_combinations = {'n_pop': [5], 'n_iter': [15], 'omega': [0.1], 'r1': [0.1], 'r2': [0.1], 'classifier': classifiers} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() X_min = X[y == self.min_label] X_maj = X[y == self.maj_label] n_cross_val = min([4, len(X_min)]) def fitness(X_min, X_maj): """ Calculating fitness function Args: X_min (np.matrix): minority samples X_maj (np.matrix): majority samples Returns: float, float: kappa, accuracy """ kfold = StratifiedKFold(n_cross_val) # prepare assembled dataset X_ass = np.vstack([X_min, X_maj]) y_ass = np.hstack([np.repeat(self.min_label, len(X_min)), np.repeat(self.maj_label, len(X_maj))]) preds = [] tests = [] for train, test in kfold.split(X_ass, y_ass): self.classifier.fit(X_ass[train], y_ass[train]) preds.append(self.classifier.predict(X)) tests.append(y) preds = np.hstack(preds) tests = np.hstack(tests) # calculate kappa and accuracy scores tp = np.sum(np.logical_and(preds == tests, tests == self.min_label)) fn = np.sum(np.logical_and(preds != tests, tests == self.min_label)) tn = np.sum(np.logical_and(preds == tests, tests == self.maj_label)) fp = np.sum(np.logical_and(preds != tests, tests == self.maj_label)) p_o = (tp + tn)/(tp + fn + tn + fp) p_e = (tp + fn)*(tp + fp)/(tp + fn + tn + fp)**2 + \ (fp + tn)*(fn + tn)/(tp + fn + tn + fp)**2 kappa = (p_o - p_e)/(1.0 - p_e) accuracy = (tp + tn)/(tp + fn + tn + fp) return kappa, accuracy def OSMOTE(X_min, X_maj): """ Executing OSMOTE phase Args: X_min (np.matrix): minority samples X_maj (np.matrix): majority samples Returns: np.matrix, np.matrix: new minority and majority datasets """ # initialize particles, first coordinate represents proportion # parameter of SMOTE # the second coordinate represents the number of neighbors to # take into consideration def init_pop(): proportion = self.random_state.random_sample()/2.0+0.5 n_neighbors = self.random_state.randint(3, 10) return np.array([proportion, n_neighbors]) particles = [init_pop() for _ in range(self.n_pop)] # velocities initialized velocities = [np.array([0.1, 1]) for _ in range(self.n_pop)] # setting the limits of the search space limits = [np.array([0.25, 3]), np.array([4.0, 10])] # local best results local_best = [particles[i].copy() for i in range(self.n_pop)] # local best scores local_score = [(0.0, 0.0)]*self.n_pop # global best result global_best = particles[0].copy() # global best score global_score = (0.0, 0.0) # best dataset best_dataset = None # running the optimization for _ in range(self.n_iter): # update velocities for i in range(len(velocities)): diff1 = (local_best[i] - velocities[i]) diff2 = (global_best - velocities[i]) velocities[i] = (velocities[i]*self.omega + self.r1 * diff1 + self.r2*diff2) # clipping velocities using the upper bounds of the # particle search space velocities[i][0] = np.clip( velocities[i][0], -limits[1][0]/2, limits[1][0]/2) velocities[i][1] = np.clip( velocities[i][1], -limits[1][1]/2, limits[1][1]/2) # update particles for i in range(len(particles)): particles[i] = particles[i] + velocities[i] # clipping the particle positions using the lower and # upper bounds particles[i][0] = np.clip( particles[i][0], limits[0][0], limits[1][0]) particles[i][1] = np.clip( particles[i][1], limits[0][1], limits[1][1]) # evaluate scores = 
[] for i in range(len(particles)): # apply SMOTE smote = SMOTE(particles[i][0], int(np.rint(particles[i][1])), n_jobs=self.n_jobs, random_state=self.random_state) X_to_sample = np.vstack([X_maj, X_min]) y_to_sample_maj = np.repeat( self.maj_label, len(X_maj)) y_to_sample_min = np.repeat( self.min_label, len(X_min)) y_to_sample = np.hstack([y_to_sample_maj, y_to_sample_min]) X_samp, y_samp = smote.sample(X_to_sample, y_to_sample) # evaluate scores.append(fitness(X_samp[len(X_maj):], X_samp[:len(X_maj)])) # update scores according to the multiobjective setting if (scores[i][0]*scores[i][1] > local_score[i][0]*local_score[i][1]): local_best[i] = particles[i].copy() local_score[i] = scores[i] if (scores[i][0]*scores[i][1] > global_score[0]*global_score[1]): global_best = particles[i].copy() global_score = scores[i] best_dataset = (X_samp[len(X_maj):], X_samp[:len(X_maj)]) return best_dataset[0], best_dataset[1] def SIS(X_min, X_maj): """ SIS procedure Args: X_min (np.matrix): minority dataset X_maj (np.matrix): majority dataset Returns: np.matrix, np.matrix: new minority and majority datasets """ min_num = len(X_min) max_num = len(X_maj) if min_num >= max_num: return X_min, X_maj # initiate particles def init_particle(): num = self.random_state.randint(min_num, max_num) maj = self.random_state.choice(np.arange(len(X_maj)), num) return maj particles = [init_particle() for _ in range(self.n_pop)] scores = [fitness(X_min, X_maj[particles[i]]) for i in range(self.n_pop)] best_score = (0.0, 0.0) best_dataset = None for _ in range(self.n_iter): # mutate and evaluate # the way mutation or applying PSO is not described in the # paper in details for i in range(self.n_pop): # removing some random elements domain = np.arange(len(particles[i])) n_max = min([10, len(particles[i])]) n_to_choose = self.random_state.randint(0, n_max) to_remove = self.random_state.choice(domain, n_to_choose) mutant = np.delete(particles[i], to_remove) # adding some random elements maj_set = set(np.arange(len(X_maj))) part_set = set(particles[i]) diff = list(maj_set.difference(part_set)) n_max = min([10, len(diff)]) n_to_choose = self.random_state.randint(0, n_max) diff_elements = self.random_state.choice(diff, n_to_choose) mutant = np.hstack([mutant, np.array(diff_elements)]) # evaluating the variant score = fitness(X_min, X_maj[mutant]) if score[1] > scores[i][1]: particles[i] = mutant.copy() scores[i] = score if score[1] > best_score[1]: best_score = score best_dataset = mutant.copy() return X_min, X_maj[best_dataset] # executing the main optimization procedure current_min = X_min current_maj = X_maj for it in range(self.n_iter): _logger.info(self.__class__.__name__ + ": " + 'staring iteration %d' % it) new_min, _ = OSMOTE(X_min, current_maj) _, new_maj = SIS(current_min, X_maj) # calculating fitness values of the four combinations fitness_0 = np.prod(fitness(new_min, current_maj)) fitness_1 = np.prod(fitness(current_min, current_maj)) fitness_2 = np.prod(fitness(new_min, new_maj)) fitness_3 = np.prod(fitness(current_min, new_maj)) # selecting the new current_maj and current_min datasets message = 'fitness scores: %f %f %f %f' message = message % (fitness_0, fitness_1, fitness_2, fitness_3) _logger.info(self.__class__.__name__ + ": " + message) max_fitness = np.max([fitness_0, fitness_1, fitness_2, fitness_3]) if fitness_1 == max_fitness or fitness_3 == max_fitness: current_maj = new_maj if fitness_0 == max_fitness or fitness_2 == max_fitness: current_min = new_min return (np.vstack([current_maj, current_min]), 
np.hstack([np.repeat(self.maj_label, len(current_maj)), np.repeat(self.min_label, len(current_min))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'n_pop': self.n_pop, 'n_iter': self.n_iter, 'omega': self.omega, 'r1': self.r1, 'r2': self.r2, 'n_jobs': self.n_jobs, 'classifier': self.classifier, 'random_state': self._random_state_init}
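# Illustrative usage sketch (not part of the original module): AMSCO drives
# its swarm optimization with the supplied classifier; the toy dataset and
# the small settings below are assumptions chosen only to keep the example
# quick.
def _amsco_usage_sketch(random_state=5):
    rs = np.random.RandomState(random_state)
    X = np.vstack([rs.normal(0.0, 1.0, size=(80, 3)),    # majority class
                   rs.normal(1.5, 0.5, size=(20, 3))])   # minority class
    y = np.hstack([np.repeat(0, 80), np.repeat(1, 20)])
    sampler = AMSCO(n_pop=5, n_iter=5,
                    classifier=DecisionTreeClassifier(random_state=2),
                    random_state=random_state)
    return sampler.sample(X, y)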
[docs]class SSO(OverSampling): """ References: * BibTex:: @InProceedings{sso, author="Rong, Tongwen and Gong, Huachang and Ng, Wing W. Y.", editor="Wang, Xizhao and Pedrycz, Witold and Chan, Patrick and He, Qiang", title="Stochastic Sensitivity Oversampling Technique for Imbalanced Data", booktitle="Machine Learning and Cybernetics", year="2014", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="161--171", isbn="978-3-662-45652-1" } Notes: * In the algorithm, step 2d adds a constant to a vector. I have changed this to a componentwise adjustment, and also used the normalized STSM, as I don't see any reason why it would otherwise be a reasonable, bounded value. """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_classifier, OverSampling.cat_uses_clustering, OverSampling.cat_density_based]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, h=10, n_iter=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors h (int): number of hidden units n_iter (int): optimization steps n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(h, "h", 1) self.check_greater_or_equal(n_iter, "n_iter", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.h = h self.n_iter = n_iter self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5], 'h': [2, 5, 10, 20], 'n_iter': [5]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # number of samples to generate in each iteration n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) samp_per_iter = max([1, int(n_to_sample/self.n_iter)]) # executing the algorithm for _ in range(self.n_iter): X_min = X[y == self.min_label] # applying kmeans clustering to find the hidden neurons h = min([self.h, len(X_min)]) kmeans = KMeans(n_clusters=h, random_state=self.random_state) kmeans.fit(X) # extracting the hidden center elements u = kmeans.cluster_centers_ # extracting scale parameters as the distances of closest centers nn_cent = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) nn_cent.fit(u) dist_cent, ind_cent = nn_cent.kneighbors(u) v = dist_cent[:, 1] # computing the response of the hidden units phi = pairwise_distances(X, u) phi = phi**2 phi = np.exp(-phi/v**2) # applying linear regression to find the best weights lr = LinearRegression() lr.fit(phi, y) f = lr.predict(phi[np.where(y == self.min_label)[0]]) w = lr.coef_ def eq_6(Q, w, u, v, x): """ Equation 6 in the paper """ tmp_sum = np.zeros(h) for i in range(h): a = (x - u[i] + Q)/np.sqrt(2*v[i]) b = (x - u[i] - Q)/np.sqrt(2*v[i]) tmp_prod = (sspecial.erf(a) - sspecial.erf(b)) tmp_sum[i] = np.sqrt(np.pi/2)*v[i]*np.prod(tmp_prod) return np.dot(tmp_sum, w)/(2*Q)**len(x) def eq_8(Q, w, u, v, x): """ Equation 8 in the paper """ res = 0.0 for i in range(h): vi2 = v[i]**2 for r in range(h): vr2 = v[r]**2 a1 = (np.sqrt(2*vi2*vr2*(vi2 + vr2))) a00_v = (vi2 + vr2)*(x + Q) a01_v = vi2*u[r] + vr2*u[i] a0_v = a00_v - a01_v a_v = a0_v/a1 b_v = ((vi2 + vr2)*(x - Q) - (vi2*u[r] + vr2*u[i]))/a1 tmp_prod = sspecial.erf(a_v) - sspecial.erf(b_v) tmp_a = (np.sqrt(2*vi2*vr2*(vi2 + vr2)) / (vi2 + vr2))**len(x) norm = np.linalg.norm(u[r] - u[i]) tmp_b = np.exp(-0.5 * norm**2/(vi2 + vr2)) res = res + tmp_a*tmp_b*np.prod(tmp_prod)*w[i]*w[r] return (np.sqrt(np.pi)/(4*Q))**len(x)*res # applying nearest neighbors to extract Q values n_neighbors = min([self.n_neighbors + 1, len(X)]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X_min) Q = np.mean(dist[:, n_neighbors-1])/np.sqrt(len(X[0])) # calculating the sensitivity factors I_1 = np.array([eq_6(Q, w, u, v, x) for x in X_min]) I_2 = np.array([eq_8(Q, w, u, v, x) for x in X_min]) stsm = f**2 - 2*f*I_1 + I_2 # calculating the sampling weights weights = np.abs(stsm)/np.sum(np.abs(stsm)) n_neighbors = min([len(X_min), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) samples = [] for _ in range(samp_per_iter): idx = self.random_state.choice( np.arange(len(X_min)), p=weights) X_new = X_min[idx].copy() for s in range(len(X_new)): lam = self.random_state.random_sample( )*(2*(1 - weights[idx])) - (1 - weights[idx]) X_new[s] = X_new[s] + Q*lam samples.append(X_new) samples = np.vstack(samples) X = np.vstack([X, samples]) y = np.hstack([y, np.repeat(self.min_label, len(samples))]) return X.copy(), y.copy()
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'h': self.h, 'n_iter': self.n_iter, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
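# Illustrative sketch (not part of the original module): the hidden-layer
# response used above is a Gaussian RBF over the KMeans centers u with scale
# parameters v (the distance of each center to its closest neighboring
# center); the helper restates that computation with hypothetical argument
# names.
def _rbf_hidden_response_sketch(X, u, v):
    phi = pairwise_distances(X, u)**2
    return np.exp(-phi/v**2)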
[docs]class NDO_sampling(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{ndo_sampling, author={Zhang, L. and Wang, W.}, booktitle={2011 International Conference of Information Technology, Computer Engineering and Management Sciences}, title={A Re-sampling Method for Class Imbalance Learning with Credit Data}, year={2011}, volume={1}, number={}, pages={393-397}, keywords={data handling;sampling methods; resampling method;class imbalance learning;credit rating;imbalance problem;synthetic minority over-sampling technique;sample distribution;synthetic samples; credit data set;Training; Measurement;Support vector machines; Logistics;Testing;Noise;Classification algorithms;class imbalance;credit rating;SMOTE;sample distribution}, doi={10.1109/ICM.2011.34}, ISSN={}, month={Sept}} """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary, OverSampling.cat_application]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, T=0.5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors T (float): threshold parameter n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(T, "T", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.T = T self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'T': [0.5]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # fitting nearest neighbors model to find the neighbors of minority # samples among all elements n_neighbors = min([len(X), self.n_neighbors+1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X) dist, ind = nn.kneighbors(X_min) # calculating the distances between samples in the same and different # classes d_intra = [] d_exter = [] for i in range(len(X_min)): min_mask = np.where(y[ind[i][1:]] == self.min_label)[0] maj_mask = np.where(y[ind[i][1:]] == self.maj_label)[0] if len(min_mask) > 0: d_intra.append(np.mean(dist[i][1:][min_mask])) if len(maj_mask) > 0: d_exter.append(np.mean(dist[i][1:][maj_mask])) d_intra_mean = np.mean(np.array(d_intra)) d_exter_mean = np.mean(np.array(d_exter)) # calculating the alpha value alpha = d_intra_mean/d_exter_mean # deciding if SMOTE is enough if alpha < self.T: smote = SMOTE(self.proportion, random_state=self.random_state) return smote.sample(X, y) # do the sampling samples = [] while len(samples) < n_to_sample: idx = self.random_state.randint(len(X_min)) random_idx = self.random_state.choice(ind[idx][1:]) # create sample close to the initial minority point samples.append(X_min[idx] + (X[random_idx] - X_min[idx]) * self.random_state.random_sample()/2.0) if y[random_idx] == self.min_label: # create another sample close to the neighboring minority point samples.append(X[random_idx] + (X_min[idx] - X[random_idx]) * self.random_state.random_sample()/2.0) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'T': self.T, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
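# Illustrative sketch (not part of the original module): NDO_sampling falls
# back to plain SMOTE when the ratio of mean within-class and between-class
# neighbor distances (alpha) stays below the threshold T; the helper restates
# that ratio given precomputed neighbor distances and indices (argument names
# are ours).
def _ndo_alpha_sketch(dist, ind, y, min_label, maj_label):
    d_intra, d_exter = [], []
    for i in range(len(ind)):
        neighbor_labels = y[ind[i][1:]]
        min_mask = neighbor_labels == min_label
        maj_mask = neighbor_labels == maj_label
        if min_mask.any():
            d_intra.append(np.mean(dist[i][1:][min_mask]))
        if maj_mask.any():
            d_exter.append(np.mean(dist[i][1:][maj_mask]))
    return np.mean(d_intra)/np.mean(d_exter)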
class RBFNeuron(RandomStateMixin): """ This class abstracts a neuron of an RBF network """ def __init__(self, c, Ib, Ob, ranges, range_mins, init_conn_mask, init_conn_weights, random_state=None): """ Constructor of the neuron Args: c (np.array): center of the hidden unit Ib (float): upper bound on the absolute values of input weights Ob (float): upper bound on the absolute values of output weights ranges (np.array): ranges widths of parameters range_min (np.array): lower bounds of parameter ranges init_conn_mask (np.array): initial input connections init_conn_weights (np.array): initial weights of input connections random_state (int/RandomState/None): initializer of random_state, like in sklearn """ self.d = len(c) self.c = c self.Ib = Ib self.Ob = Ob self.init_conn_mask = init_conn_mask self.init_conn_weights = init_conn_weights self.ranges = ranges self.range_mins = range_mins self.set_random_state(random_state) self.beta = (self.random_state.random_sample()-0.5)*Ob self.mask = init_conn_mask self.input_weights = init_conn_weights self.r = self.random_state.random_sample() def clone(self): """ Clones the neuron Returns: RBFNeuron: an identical neuron """ r = RBFNeuron(self.c, self.Ib, self.Ob, self.ranges, self.range_mins, self.init_conn_mask, self.init_conn_weights, random_state=self.random_state) r.beta = self.beta r.mask = self.mask.copy() r.input_weights = self.input_weights.copy() r.r = self.r return r def evaluate(self, X): """ Evaluates the system on dataset X Args: X (np.matrix): dataset to evaluate on Returns: np.array: the output of the network """ wX = X[:, self.mask]*self.input_weights term_exp = -np.linalg.norm(wX - self.c[self.mask], axis=1)**2/self.r**2 return self.beta*np.exp(term_exp) def mutate(self): """ Mutates the neuron """ r = self.random_state.random_sample() if r < 0.2: # centre creep self.c = self.random_state.normal(self.c, self.r) elif r < 0.4: # radius creep tmp = self.random_state.normal(self.r, np.var(self.ranges)) if tmp > 0: self.r = tmp elif r < 0.6: # randomize centers self.c = self.random_state.random_sample( size=len(self.c))*self.ranges + self.range_mins elif r < 0.8: # randomize radii self.r = self.random_state.random_sample()*np.mean(self.ranges) else: # randomize output weight self.beta = self.random_state.normal(self.beta, self.Ob) def add_connection(self): """ Adds a random input connection to the neuron """ if len(self.mask) < self.d: d_set = set(range(self.d)) mask_set = set(self.mask.tolist()) domain = list(d_set.difference(mask_set)) additional_elements = np.array(self.random_state.choice(domain)) self.mask = np.hstack([self.mask, additional_elements]) random_weight = (self.random_state.random_sample()-0.5)*self.Ib self.input_weights = np.hstack([self.input_weights, random_weight]) def delete_connection(self): """ Deletes a random input connection """ if len(self.mask) > 1: idx = self.random_state.randint(len(self.mask)) self.mask = np.delete(self.mask, idx) self.input_weights = np.delete(self.input_weights, idx) class RBF(RandomStateMixin): """ RBF network abstraction """ def __init__(self, X, m_min, m_max, Ib, Ob, init_conn_mask, init_conn_weights, random_state=None): """ Initializes the RBF network Args: X (np.matrix): dataset to work with m_min (int): minimum number of hidden neurons m_max (int): maximum number of hidden neurons Ib (float): maximum absolute value of input weights Ob (float): maximum absolute value of output weights init_conn_mask (np.array): initial input connections init_conn_weights (np.array): initial input weights 
random_state (int/RandomState/None): initializer of random_state, like in sklearn """ self.X = X self.m_min = m_min self.m_max = m_max self.Ib = Ib self.Ob = Ob self.init_conn_mask = init_conn_mask self.init_conn_weights = init_conn_weights self.set_random_state(random_state) self.neurons = [] self.range_mins = np.min(X, axis=0) self.ranges = np.max(X, axis=0) - self.range_mins # adding initial neurons num_neurons = self.random_state.randint(m_min, m_max) for _ in range(num_neurons): self.neurons.append(self.create_new_node()) self.beta_0 = (self.random_state.random_sample()-0.5)*Ob def clone(self): """ Clones the entire network Returns: RBF: the cloned network """ r = RBF(self.X, self.m_min, self.m_max, self.Ib, self.Ob, self.init_conn_mask, self.init_conn_weights, random_state=self.random_state) r.neurons = [n.clone() for n in self.neurons] r.range_mins = self.range_mins.copy() r.ranges = self.ranges.copy() r.beta_0 = self.beta_0 return r def create_new_node(self): """ Creates a new node. Returns: RBFNeuron: a new hidden neuron """ return RBFNeuron(self.X[self.random_state.randint(len(self.X))], self.Ib, self.Ob, self.ranges, self.range_mins, self.init_conn_mask, self.init_conn_weights, random_state=self.random_state) def update_data(self, X): """ Updates the data to work with """ self.X = X for n in self.neurons: n.X = X def improve_centers(self): """ Improves the center locations by kmeans clustering """ if len(np.unique(self.X, axis=0)) > len(self.neurons): cluster_init = np.vstack([n.c for n in self.neurons]) kmeans = KMeans(n_clusters=len(self.neurons), init=cluster_init, n_init=1, max_iter=30, random_state=self.random_state) kmeans.fit(self.X) for i in range(len(self.neurons)): self.neurons[i].c = kmeans.cluster_centers_[i] def evaluate(self, X, y): """ Evaluates the target function Returns: float: the target function value """ evaluation = np.column_stack([n.evaluate(X) for n in self.neurons]) f = self.beta_0 + np.sum(evaluation, axis=1) L_star = np.mean(abs(y[y == 1] - f[y == 1])) L_star += np.mean(abs(y[y == 0] - f[y == 0])) return L_star def mutation(self): """ Mutates the neurons Returns: RBF: a new, mutated RBF network """ rbf = self.clone() for n in rbf.neurons: n.mutate() return rbf def structural_mutation(self): """ Applies structural mutation Returns: RBF: a new, structurally mutated network """ # in the binary case the removal of output connections is the same as # removing hidden nodes rbf = self.clone() r = self.random_state.random_sample() if r < 0.5: if len(rbf.neurons) < rbf.m_max: rbf.neurons.append(rbf.create_new_node()) elif len(rbf.neurons) > rbf.m_min: del rbf.neurons[self.random_state.randint(len(rbf.neurons))] else: rbf.neurons[self.random_state.randint( len(rbf.neurons))].delete_connection() rbf.neurons[self.random_state.randint( len(rbf.neurons))].add_connection() return rbf def recombine(self, rbf): """ Recombines two networks Args: rbf (RBF): another network Returns: RBF: the result of recombination """ # the order of neurons doesn't matter, so the logic can be simplified new = self.clone() if self.random_state.random_sample() < 0.5: n_random = self.random_state.randint(1, len(new.neurons)) new_neurons_0 = self.random_state.choice(new.neurons, n_random) n_random = self.random_state.randint(1, len(rbf.neurons)) new_neurons_1 = self.random_state.choice(rbf.neurons, n_random) new.neurons = [n.clone() for n in new_neurons_0] new.neurons.extend([n.clone() for n in new_neurons_1]) while len(new.neurons) > self.m_max: del 
new.neurons[self.random_state.randint(len(new.neurons))] else: for i in range(len(new.neurons)): if self.random_state.random_sample() < 0.2: n_random = self.random_state.randint(len(rbf.neurons)) new.neurons[i] = rbf.neurons[n_random].clone() return new
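# Illustrative sketch (not part of the original module): a single RBFNeuron
# responds with a weighted Gaussian of the masked, input-weighted coordinates;
# the expression below restates RBFNeuron.evaluate with explicit arguments
# (the function name is ours).
def _rbf_neuron_response_sketch(X, mask, input_weights, c, r, beta):
    wX = X[:, mask]*input_weights
    return beta*np.exp(-np.linalg.norm(wX - c[mask], axis=1)**2/r**2)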
[docs]class DSRBF(OverSampling): """ References: * BibTex:: @article{dsrbf, title = "A dynamic over-sampling procedure based on sensitivity for multi-class problems", journal = "Pattern Recognition", volume = "44", number = "8", pages = "1821 - 1833", year = "2011", issn = "0031-3203", doi = "https://doi.org/10.1016/j.patcog.2011.02.019", author = "Francisco Fernández-Navarro and César Hervás-Martínez and Pedro Antonio Gutiérrez", keywords = "Classification, Multi-class, Sensitivity, Accuracy, Memetic algorithm, Imbalanced datasets, Over-sampling method, SMOTE" } Notes: * It is not entirely clear why J-1 outputs are assumed, where J is the number of classes. * The fitness function is changed to a balanced mean loss, as I found that it just ignores classification on minority samples (class label +1) in the binary case. * The iRprop+ optimization is not implemented. * The original paper proposes using SMOTE incrementally. Instead, this implementation applies SMOTE to generate all samples needed in the sampling epochs, and the evolution of RBF networks is used to select the sampling providing the best results. """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_classifier, OverSampling.cat_sample_ordinary, OverSampling.cat_memetic]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, m_min=4, m_max=10, Ib=2, Ob=2, n_pop=500, n_init_pop=5000, n_iter=40, n_sampling_epoch=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in the SMOTE sampling m_min (int): minimum number of hidden units m_max (int): maximum number of hidden units Ib (float): input weight range Ob (float): output weight range n_pop (int): size of population n_init_pop (int): size of initial population n_iter (int): number of iterations n_sampling_epoch (int): resampling after this many iterations random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(m_min, "m_min", 1) self.check_greater_or_equal(m_max, "m_max", 1) self.check_greater(Ib, "Ib", 0) self.check_greater(Ob, "Ob", 0) self.check_greater_or_equal(n_pop, "n_pop", 2) self.check_greater_or_equal(n_init_pop, "n_pop", 2) self.check_greater_or_equal(n_iter, "n_iter", 0) self.check_greater_or_equal(n_sampling_epoch, "n_sampling_epoch", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.m_min = m_min self.m_max = m_max self.Ib = Ib self.Ob = Ob self.n_pop = n_pop self.n_init_pop = n_init_pop self.n_iter = n_iter self.n_sampling_epoch = n_sampling_epoch self.n_jobs = n_jobs self.set_random_state(random_state)
[docs] @ classmethod def parameter_combinations(cls, raw=False): """ Generates reasonable parameter combinations. Returns: list(dict): a list of meaningful parameter combinations """ # as the technique optimizes, it is unnecessary to check various # combinations except one specifying a decent workspace with a large # number of iterations parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], 'n_neighbors': [3, 5, 7], 'm_min': [4], 'm_max': [10], 'Ib': [2.0], 'Ob': [2.0], 'n_pop': [100], 'n_init_pop': [1000], 'n_iter': [40], 'n_sampling_epoch': [8]} return cls.generate_parameter_combinations(parameter_combinations, raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() # Standardizing the data to let the network work with comparable # attributes ss = StandardScaler() X = ss.fit_transform(X) X_orig = X y_orig = y X, y = SMOTE(proportion=self.proportion, n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state).sample(X, y) # generate initial connections and weights randomly domain = np.arange(len(X[0])) n_random = int(len(X[0])/2) init_conn_mask = self.random_state.choice(domain, n_random) init_conn_weights = self.random_state.random_sample(size=n_random) # setting epoch lengths epoch_len = int(self.n_iter/self.n_sampling_epoch) if len(X_orig) < self.m_min + 1: return X_orig.copy(), y_orig.copy() m_max = min(len(X_orig), self.m_max) # generating initial population def init_pop(): return RBF(X, self.m_min, m_max, self.Ib, self.Ob, init_conn_mask, init_conn_weights, random_state=self.random_state) population = [init_pop() for _ in range(self.n_init_pop)] population = [[p, X, y, np.inf] for p in population] population = sorted([[p[0], p[1], p[2], p[0].evaluate(p[1], p[2])] for p in population], key=lambda x: x[3]) population = population[:self.n_pop] # executing center improval in the hidden units for p in population: p[0].improve_centers() # executing the optimization process for iteration in range(self.n_iter): message = "Iteration %d/%d, loss: %f, data size %d" message = message % (iteration, self.n_iter, population[0][3], len(population[0][1])) _logger.info(self.__class__.__name__ + ": " + message) # evaluating non-evaluated elements for p in population: if p[3] == np.inf: p[3] = p[0].evaluate(p[1], p[2]) # sorting the population by the loss values population = sorted([p for p in population], key=lambda x: x[3]) population = population[:self.n_pop] # determining the number of elements to be changed p_best = population[0] p_parametric_mut = population[:int(0.1*self.n_pop)] p_structural_mut = population[:int(0.9*self.n_pop-1)] p_recombination = population[:int(0.1*self.n_pop)] # executing mutation for p in p_parametric_mut: population.append([p[0].mutation(), p[1], p[2], np.inf]) # executing structural mutation for p in p_structural_mut: population.append( [p[0].structural_mutation(), p[1], p[2], np.inf]) # executing recombination for p in p_recombination: domain = range(len(p_recombination)) p_rec_idx = self.random_state.choice(domain) p_rec = p_recombination[p_rec_idx][0] population.append([p[0].recombine(p_rec), p[1], p[2], np.inf]) # do the sampling if iteration % epoch_len == 0: smote = SMOTE(proportion=self.proportion, n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, random_state=self.random_state) X, y = smote.sample(X_orig, y_orig) for i in range(self.n_pop): tmp = [population[i][0].clone(), X, y, np.inf] tmp[0].update_data(X) tmp[0].improve_centers() population.append(tmp) # evaluate unevaluated elements of the population for p in population: if p[3] == np.inf: p[3] = p[0].evaluate(p[1], p[2]) # sorting the population population = sorted([p for p in population], key=lambda x: x[3])[:self.n_pop] return ss.inverse_transform(p_best[1]), p_best[2]
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'm_min': self.m_min, 'm_max': self.m_max, 'Ib': self.Ib, 'Ob': self.Ob, 'n_pop': self.n_pop, 'n_init_pop': self.n_init_pop, 'n_iter': self.n_iter, 'n_sampling_epoch': self.n_sampling_epoch, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
[docs]class Gaussian_SMOTE(OverSampling): """ References: * BibTex:: @article{gaussian_smote, title={Gaussian-Based SMOTE Algorithm for Solving Skewed Class Distributions}, author={Hansoo Lee and Jonggeun Kim and Sungshin Kim}, journal={Int. J. Fuzzy Logic and Intelligent Systems}, year={2017}, volume={17}, pages={229-234} } """ categories = [OverSampling.cat_extensive]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, sigma=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors sigma (float): variance n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater(sigma, "sigma", 0.0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.sigma = sigma self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'sigma': [0.5, 1.0, 2.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs]    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                                    labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])

        if n_to_sample == 0:
            _logger.warning(self.__class__.__name__ +
                            ": " + "Sampling is not needed")
            return X.copy(), y.copy()

        # standardization applied to make sigma compatible with the data
        ss = StandardScaler()
        X_ss = ss.fit_transform(X)

        # fitting nearest neighbors model to find the minority neighbors of
        # minority samples
        X_min = X_ss[y == self.min_label]
        n_neighbors = min([len(X_min), self.n_neighbors + 1])
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs)
        nn.fit(X_min)
        dist, ind = nn.kneighbors(X_min)

        # do the sampling
        samples = []
        while len(samples) < n_to_sample:
            idx = self.random_state.randint(len(X_min))
            random_neighbor = self.random_state.choice(ind[idx][1:])
            s0 = self.sample_between_points(X_min[idx],
                                            X_min[random_neighbor])
            samples.append(self.random_state.normal(s0, self.sigma))

        return (np.vstack([X, ss.inverse_transform(np.vstack(samples))]),
                np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'sigma': self.sigma, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
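# Usage sketch for Gaussian_SMOTE (illustration only; the breast cancer data
# set and all parameter values below are assumptions chosen for the example).
import numpy as np
import smote_variants as sv
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

oversampler = sv.Gaussian_SMOTE(proportion=1.0,
                                n_neighbors=5,
                                sigma=1.0,
                                random_state=5)
X_samp, y_samp = oversampler.sample(X, y)

# the minority class is extended towards the cardinality of the majority class
print(np.unique(y, return_counts=True))
print(np.unique(y_samp, return_counts=True))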
[docs]class kmeans_SMOTE(OverSampling): """ References: * BibTex:: @article{kmeans_smote, title = "Improving imbalanced learning through a heuristic oversampling method based on k-means and SMOTE", journal = "Information Sciences", volume = "465", pages = "1 - 20", year = "2018", issn = "0020-0255", doi = "https://doi.org/10.1016/j.ins.2018.06.056", author = "Georgios Douzas and Fernando Bacao and Felix Last", keywords = "Class-imbalanced learning, Oversampling, Classification, Clustering, Supervised learning, Within-class imbalance" } """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_clusters=10, irt=2.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors n_clusters (int): number of clusters irt (float): imbalanced ratio threshold n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(n_clusters, "n_clusters", 1) self.check_greater_or_equal(irt, "irt", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_clusters = n_clusters self.irt = irt self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'n_clusters': [2, 5, 10, 20, 50],
                                  'irt': [0.5, 0.8, 1.0, 1.5]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # applying kmeans clustering to all data n_clusters = min([self.n_clusters, len(X)]) kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state) kmeans.fit(X) # extracting clusters labels = kmeans.labels_ clusters = [np.where(labels == li)[0] for li in range(n_clusters)] # cluster filtering def cluster_filter(c): numerator = np.sum(y[c] == self.maj_label) + 1 denominator = np.sum(y[c] == self.min_label) + 1 n_minority = np.sum(y[c] == self.min_label) return numerator/denominator < self.irt and n_minority > 1 filt_clusters = [c for c in clusters if cluster_filter(c)] if len(filt_clusters) == 0: _logger.warning(self.__class__.__name__ + ": " + "number of clusters after filtering is 0") return X.copy(), y.copy() # Step 2 in the paper sparsity = [] nearest_neighbors = [] cluster_minority_ind = [] for c in filt_clusters: # extract minority indices in the cluster minority_ind = c[y[c] == self.min_label] cluster_minority_ind.append(minority_ind) # compute distance matrix of minority samples in the cluster dm = pairwise_distances(X[minority_ind]) min_count = len(minority_ind) # compute the average of distances avg_min_dist = (np.sum(dm) - dm.trace()) / \ (len(minority_ind)**2 - len(minority_ind)) # compute sparsity (Step 4) sparsity.append(avg_min_dist**len(X[0])/min_count) # extract the nearest neighbors graph n_neighbors = min([len(minority_ind), self.n_neighbors + 1]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X[minority_ind]) nearest_neighbors.append(nn.kneighbors(X[minority_ind])) # Step 5 - compute density of sampling weights = sparsity/np.sum(sparsity) # do the sampling samples = [] while len(samples) < n_to_sample: # choose random cluster index and random minority element clust_ind = self.random_state.choice( np.arange(len(weights)), p=weights) idx = self.random_state.randint( len(cluster_minority_ind[clust_ind])) base_idx = cluster_minority_ind[clust_ind][idx] # choose random neighbor neighbor_cluster_indices = nearest_neighbors[clust_ind][1][idx][1:] domain = cluster_minority_ind[clust_ind][neighbor_cluster_indices] neighbor_idx = self.random_state.choice(domain) # sample X_a = X[base_idx] X_b = X[neighbor_idx] samples.append(self.sample_between_points(X_a, X_b)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_clusters': self.n_clusters, 'irt': self.irt, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
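# Usage sketch for kmeans_SMOTE on synthetic data (the make_classification
# settings and the sampler parameters are illustrative assumptions). If no
# cluster passes the imbalance-ratio filter, the method logs a warning and
# returns the data unchanged, so irt and n_clusters may need tuning.
import numpy as np
import smote_variants as sv
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=8, n_informative=4,
                           weights=[0.9, 0.1], random_state=42)

oversampler = sv.kmeans_SMOTE(proportion=1.0,
                              n_neighbors=5,
                              n_clusters=20,
                              irt=2.0,
                              random_state=5)
X_samp, y_samp = oversampler.sample(X, y)
print(np.unique(y_samp, return_counts=True))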
[docs]class Supervised_SMOTE(OverSampling): """ References: * BibTex:: @article{supervised_smote, author = {Hu, Jun AND He, Xue AND Yu, Dong-Jun AND Yang, Xi-Bei AND Yang, Jing-Yu AND Shen, Hong-Bin}, journal = {PLOS ONE}, publisher = {Public Library of Science}, title = {A New Supervised Over-Sampling Algorithm with Application to Protein-Nucleotide Binding Residue Prediction}, year = {2014}, month = {09}, volume = {9}, url = {https://doi.org/10.1371/journal.pone.0107676}, pages = {1-10}, number = {9}, doi = {10.1371/journal.pone.0107676} } """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary, OverSampling.cat_uses_classifier, OverSampling.cat_application]
[docs] def __init__(self, proportion=1.0, th_lower=0.5, th_upper=1.0, classifier=RandomForestClassifier(n_estimators=50, n_jobs=1, random_state=5), n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples th_lower (float): lower bound of the confidence interval th_upper (float): upper bound of the confidence interval classifier (obj): classifier used to estimate class memberships n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_in_range(th_lower, "th_lower", [0, 1]) self.check_in_range(th_upper, "th_upper", [0, 1]) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.th_lower = th_lower self.th_upper = th_upper self.classifier = classifier self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        classifiers = [RandomForestClassifier(n_estimators=50,
                                              n_jobs=1,
                                              random_state=5)]
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'th_lower': [0.3, 0.5, 0.8],
                                  'th_upper': [1.0],
                                  'classifier': classifiers}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() # training the classifier self.classifier.fit(X, y) X_min = X[y == self.min_label] th_lower = self.th_lower # do the sampling samples = [] n_trials = 1 n_success = 1 while len(samples) < n_to_sample: n_trials = n_trials + 1 domain = range(len(X_min)) x0, x1 = self.random_state.choice(domain, 2, replace=False) x0, x1 = X_min[x0], X_min[x1] sample = self.sample_between_points(x0, x1) probs = self.classifier.predict_proba(sample.reshape(1, -1)) # extract probability class_column = np.where(self.classifier.classes_ == self.min_label) class_column = class_column[0][0] prob = probs[0][class_column] if prob >= th_lower and prob <= self.th_upper: samples.append(sample) n_success = n_success + 1 # decreasing lower threshold if needed if n_success/n_trials < 0.02: th_lower = th_lower * 0.9 n_success = 1 n_trials = 1 return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'th_lower': self.th_lower, 'th_upper': self.th_upper, 'classifier': self.classifier, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
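# Usage sketch for Supervised_SMOTE with a user-supplied probability
# estimator; the choice of KNeighborsClassifier, the thresholds and the data
# set are assumptions for illustration. Any classifier exposing fit,
# predict_proba and classes_ can be plugged in.
import numpy as np
import smote_variants as sv
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier

X, y = load_breast_cancer(return_X_y=True)

oversampler = sv.Supervised_SMOTE(proportion=1.0,
                                  th_lower=0.5,
                                  th_upper=1.0,
                                  classifier=KNeighborsClassifier(n_neighbors=5),
                                  random_state=5)
X_samp, y_samp = oversampler.sample(X, y)
print(np.unique(y_samp, return_counts=True))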
[docs]class SN_SMOTE(OverSampling): """ References: * BibTex:: @Article{sn_smote, author="Garc{\'i}a, V. and S{\'a}nchez, J. S. and Mart{\'i}n-F{\'e}lez, R. and Mollineda, R. A.", title="Surrounding neighborhood-based SMOTE for learning from imbalanced data sets", journal="Progress in Artificial Intelligence", year="2012", month="Dec", day="01", volume="1", number="4", pages="347--362", issn="2192-6360", doi="10.1007/s13748-012-0027-5", url="https://doi.org/10.1007/s13748-012-0027-5" } """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary]
[docs] def __init__(self, proportion=1.0, n_neighbors=5, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (float): number of neighbors n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # the search for the k nearest centroid neighbors is limited for the # nearest 10*n_neighbors neighbors n_neighbors = min([self.n_neighbors*10, len(X_min)]) nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) # determining k nearest centroid neighbors ncn = np.zeros(shape=(len(X_min), self.n_neighbors)).astype(int) ncn_nums = np.zeros(len(X_min)).astype(int) # extracting nearest centroid neighbors for i in range(len(X_min)): # the first NCN neighbor is the first neighbor ncn[i, 0] = ind[i][1] # iterating through all neighbors and finding the one with smaller # centroid distance to X_min[i] than the previous set of neighbors n_cent = 1 centroid = X_min[ncn[i, 0]] cent_dist = np.linalg.norm(centroid - X_min[i]) j = 2 while j < len(ind[i]) and n_cent < self.n_neighbors: new_cent_dist = np.linalg.norm( (centroid + X_min[ind[i][j]])/(n_cent + 1) - X_min[i]) # checking if new nearest centroid neighbor found if new_cent_dist < cent_dist: centroid = centroid + X_min[ind[i][j]] ncn[i, n_cent] = ind[i][j] n_cent = n_cent + 1 cent_dist = new_cent_dist j = j + 1 # registering the number of nearest centroid neighbors found ncn_nums[i] = n_cent # generating samples samples = [] while len(samples) < n_to_sample: random_idx = self.random_state.randint(len(X_min)) random_neighbor_idx = self.random_state.choice( ncn[random_idx][:ncn_nums[random_idx]]) samples.append(self.sample_between_points( X_min[random_idx], X_min[random_neighbor_idx])) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
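# Usage sketch: enumerating the suggested SN_SMOTE settings via
# parameter_combinations and sampling with one of them (the data set and the
# choice of the first combination are assumptions for illustration).
import numpy as np
import smote_variants as sv
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

param_combs = sv.SN_SMOTE.parameter_combinations()
print(len(param_combs), param_combs[0])

params = dict(param_combs[0])
params['random_state'] = 5
oversampler = sv.SN_SMOTE(**params)
X_samp, y_samp = oversampler.sample(X, y)
print(np.unique(y_samp, return_counts=True))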
[docs]class CCR(OverSampling): """ References: * BibTex:: @article{ccr, author = {Koziarski, Michał and Wozniak, Michal}, year = {2017}, month = {12}, pages = {727–736}, title = {CCR: A combined cleaning and resampling algorithm for imbalanced data classification}, volume = {27}, journal = {International Journal of Applied Mathematics and Computer Science} } Notes: * Adapted from https://github.com/michalkoziarski/CCR """ categories = [OverSampling.cat_extensive]
[docs] def __init__(self, proportion=1.0, energy=1.0, scaling=0.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples energy (float): energy parameter scaling (float): scaling factor n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(energy, "energy", 0) self.check_greater_or_equal(scaling, "scaling", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.energy = energy self.scaling = scaling self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'energy': [0.001, 0.0025, 0.005,
                                             0.01, 0.025, 0.05,
                                             0.1, 0.25, 0.5,
                                             1.0, 2.5, 5.0,
                                             10.0, 25.0, 50.0, 100.0],
                                  'scaling': [0.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() def taxicab_sample(n, r): sample = [] random_numbers = self.random_state.rand(n) for i in range(n): # spread = r - np.sum(np.abs(sample)) spread = r if len(sample) > 0: spread -= abs(sample[-1]) sample.append(spread * (2 * random_numbers[i] - 1)) return self.random_state.permutation(sample) minority = X[y == self.min_label] majority = X[y == self.maj_label] energy = self.energy * (X.shape[1] ** self.scaling) distances = pairwise_distances(minority, majority, metric='l1') radii = np.zeros(len(minority)) translations = np.zeros(majority.shape) for i in range(len(minority)): minority_point = minority[i] remaining_energy = energy r = 0.0 sorted_distances = np.argsort(distances[i]) current_majority = 0 while True: if current_majority > len(majority): break if current_majority == len(majority): if current_majority == 0: radius_change = remaining_energy / \ (current_majority + 1.0) else: radius_change = remaining_energy / current_majority r += radius_change break radius_change = remaining_energy / (current_majority + 1.0) dist = distances[i, sorted_distances[current_majority]] if dist >= r + radius_change: r += radius_change break else: if current_majority == 0: last_distance = 0.0 else: cm1 = current_majority - 1 last_distance = distances[i, sorted_distances[cm1]] curr_maj_idx = sorted_distances[current_majority] radius_change = distances[i, curr_maj_idx] - last_distance r += radius_change decrease = radius_change * (current_majority + 1.0) remaining_energy -= decrease current_majority += 1 radii[i] = r for j in range(current_majority): majority_point = majority[sorted_distances[j]].astype(float) d = distances[i, sorted_distances[j]] if d < 1e-20: n_maj_point = len(majority_point) r_num = self.random_state.rand(n_maj_point) r_num = 1e-6 * r_num + 1e-6 r_sign = self.random_state.choice([-1.0, 1.0], n_maj_point) majority_point += r_num * r_sign d = np.sum(np.abs(minority_point - majority_point)) translation = (r - d) / d * (majority_point - minority_point) translations[sorted_distances[j]] += translation majority = majority.astype(float) majority += translations appended = [] for i in range(len(minority)): minority_point = minority[i] synthetic_samples = n_to_sample / (radii[i] * np.sum(1.0 / radii)) synthetic_samples = int(np.round(synthetic_samples)) r = radii[i] for _ in range(synthetic_samples): appended.append(minority_point + taxicab_sample(len(minority_point), r)) if len(appended) == 0: _logger.info("No samples were added") return X.copy(), y.copy() return (np.vstack([X, np.vstack(appended)]), np.hstack([y, np.repeat(self.min_label, len(appended))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'energy': self.energy, 'scaling': self.scaling, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
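# Usage sketch for CCR (the data set and the energy value are illustrative
# assumptions); smaller energy values yield smaller spheres around the
# minority points and therefore synthetic samples closer to them.
import numpy as np
import smote_variants as sv
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

oversampler = sv.CCR(proportion=1.0, energy=0.5, scaling=0.0, random_state=5)
X_samp, y_samp = oversampler.sample(X, y)
print(np.unique(y_samp, return_counts=True))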
[docs]class ANS(OverSampling): """ References: * BibTex:: @article{ans, author = {Siriseriwan, W and Sinapiromsaran, Krung}, year = {2017}, month = {09}, pages = {565-576}, title = {Adaptive neighbor synthetic minority oversampling technique under 1NN outcast handling}, volume = {39}, booktitle = {Songklanakarin Journal of Science and Technology} } Notes: * The method is not prepared for the case when there is no c satisfying the condition in line 25 of the algorithm, fixed. * The method is not prepared for empty Pused sets, fixed. """ categories = [OverSampling.cat_extensive, OverSampling.cat_sample_ordinary, OverSampling.cat_density_based]
[docs] def __init__(self, proportion=1.0, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) if not self.check_enough_min_samples_for_sampling(): return X.copy(), y.copy() n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() X_min = X[y == self.min_label] # outcast extraction algorithm # maximum C value C_max = int(0.25*len(X)) # finding the first minority neighbor of minority samples nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) nn.fit(X_min) dist, ind = nn.kneighbors(X_min) # extracting the distances of first minority neighbors from minority # samples first_pos_neighbor_distances = dist[:, 1] # fitting another nearest neighbors model to extract majority # samples in the neighborhoods of minority samples nn = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs) nn.fit(X) # extracting the number of majority samples in the neighborhood of # minority samples out_border = [] for i in range(len(X_min)): x = X_min[i].reshape(1, -1) ind = nn.radius_neighbors(x, first_pos_neighbor_distances[i], return_distance=False) out_border.append(np.sum(y[ind[0]] == self.maj_label)) out_border = np.array(out_border) # finding the optimal C value by comparing the number of outcast # minority samples when traversing the range [1, C_max] n_oc_m1 = -1 C = 0 best_diff = np.inf for c in range(1, C_max): n_oc = np.sum(out_border >= c) if abs(n_oc - n_oc_m1) < best_diff: best_diff = abs(n_oc - n_oc_m1) C = n_oc n_oc_m1 = n_oc # determining the set of minority samples Pused Pused = np.where(out_border < C)[0] # Adaptive neighbor SMOTE algorithm # checking if there are minority samples left if len(Pused) == 0: _logger.info(self.__class__.__name__ + ": " + "Pused is empty") return X.copy(), y.copy() # finding the maximum distances of first positive neighbors eps = np.max(first_pos_neighbor_distances[Pused]) # fitting nearest neighbors model to find nearest minority samples in # the neighborhoods of minority samples nn = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs) nn.fit(X_min[Pused]) ind = nn.radius_neighbors(X_min[Pused], eps, return_distance=False) # extracting the number of positive samples in the neighborhoods Np = np.array([len(i) for i in ind]) if np.all(Np == 1): message = "all samples have only 1 neighbor in the given radius" _logger.warning(self.__class__.__name__ + ": " + message) return X.copy(), y.copy() # determining the distribution used to generate samples distribution = Np/np.sum(Np) # generating samples samples = [] while len(samples) < n_to_sample: random_idx = self.random_state.choice( np.arange(len(Pused)), p=distribution) if len(ind[random_idx]) > 1: random_neig_idx = self.random_state.choice(ind[random_idx]) while random_neig_idx == random_idx: random_neig_idx = self.random_state.choice(ind[random_idx]) X_a = X_min[Pused[random_idx]] X_b = X_min[Pused[random_neig_idx]] samples.append(self.sample_between_points(X_a, X_b)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
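# Usage sketch for ANS, which only exposes the proportion parameter because
# the neighborhood radius is derived from the data itself; the data set below
# is an assumption for illustration.
import numpy as np
import smote_variants as sv
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

oversampler = sv.ANS(proportion=1.0, random_state=5)
X_samp, y_samp = oversampler.sample(X, y)
print(np.unique(y_samp, return_counts=True))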
[docs]class cluster_SMOTE(OverSampling): """ References: * BibTex:: @INPROCEEDINGS{cluster_SMOTE, author={Cieslak, D. A. and Chawla, N. V. and Striegel, A.}, booktitle={2006 IEEE International Conference on Granular Computing}, title={Combating imbalance in network intrusion datasets}, year={2006}, volume={}, number={}, pages={732-737}, keywords={Intelligent networks;Intrusion detection; Telecommunication traffic;Data mining; Computer networks;Data security; Machine learning;Counting circuits; Computer security;Humans}, doi={10.1109/GRC.2006.1635905}, ISSN={}, month={May}} """ categories = [OverSampling.cat_extensive, OverSampling.cat_uses_clustering]
[docs] def __init__(self, proportion=1.0, n_neighbors=3, n_clusters=3, n_jobs=1, random_state=None): """ Constructor of the sampling object Args: proportion (float): proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of minority samples will be equal to the number of majority samples n_neighbors (int): number of neighbors in SMOTE n_clusters (int): number of clusters n_jobs (int): number of parallel jobs random_state (int/RandomState/None): initializer of random_state, like in sklearn """ super().__init__() self.check_greater_or_equal(proportion, "proportion", 0) self.check_greater_or_equal(n_neighbors, "n_neighbors", 1) self.check_greater_or_equal(n_clusters, "n_components", 1) self.check_n_jobs(n_jobs, 'n_jobs') self.proportion = proportion self.n_neighbors = n_neighbors self.n_clusters = n_clusters self.n_jobs = n_jobs self.set_random_state(random_state)
[docs]    @classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5,
                                                 0.75, 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5, 7],
                                  'n_clusters': [3, 5, 7, 9]}
        return cls.generate_parameter_combinations(parameter_combinations,
                                                   raw)
[docs] def sample(self, X, y): """ Does the sample generation according to the class paramters. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ _logger.info(self.__class__.__name__ + ": " + "Running sampling via %s" % self.descriptor()) self.class_label_statistics(X, y) X_min = X[y == self.min_label] # determining the number of samples to generate n_to_sample = self.det_n_to_sample(self.proportion, self.class_stats[self.maj_label], self.class_stats[self.min_label]) if n_to_sample == 0: _logger.warning(self.__class__.__name__ + ": " + "Sampling is not needed") return X.copy(), y.copy() n_clusters = min([len(X_min), self.n_clusters]) kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state) kmeans.fit(X_min) cluster_labels = kmeans.labels_ unique_labels = np.unique(cluster_labels) # creating nearest neighbors objects for each cluster cluster_indices = [np.where(cluster_labels == c)[0] for c in unique_labels] def nneighbors(idx): n_neighbors = min([self.n_neighbors, len(cluster_indices[idx])]) nn = NearestNeighbors(n_neighbors=n_neighbors) return nn.fit(X_min[cluster_indices[idx]]) cluster_nns = [nneighbors(idx) for idx in range(len(cluster_indices))] if max([len(c) for c in cluster_indices]) <= 1: _logger.info(self.__class__.__name__ + ": " + "All clusters contain 1 element") return X.copy(), y.copy() # generating the samples samples = [] while len(samples) < n_to_sample: cluster_idx = self.random_state.randint(len(cluster_indices)) if len(cluster_indices[cluster_idx]) <= 1: continue random_idx = self.random_state.randint( len(cluster_indices[cluster_idx])) sample_a = X_min[cluster_indices[cluster_idx]][random_idx] dist, indices = cluster_nns[cluster_idx].kneighbors( sample_a.reshape(1, -1)) sample_b_idx = self.random_state.choice( cluster_indices[cluster_idx][indices[0][1:]]) sample_b = X_min[sample_b_idx] samples.append(self.sample_between_points(sample_a, sample_b)) return (np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(self.min_label, len(samples))]))
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the current sampling object """ return {'proportion': self.proportion, 'n_neighbors': self.n_neighbors, 'n_clusters': self.n_clusters, 'n_jobs': self.n_jobs, 'random_state': self._random_state_init}
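# Usage sketch for cluster_SMOTE on synthetic data (all settings below are
# illustrative assumptions): the minority class is clustered first and the
# SMOTE-style interpolation happens within the individual clusters.
import numpy as np
import smote_variants as sv
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=400, n_features=6, n_informative=3,
                           weights=[0.85, 0.15], random_state=3)

oversampler = sv.cluster_SMOTE(proportion=1.0,
                               n_neighbors=3,
                               n_clusters=3,
                               random_state=5)
X_samp, y_samp = oversampler.sample(X, y)
print(np.unique(y_samp, return_counts=True))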
[docs]class MulticlassOversampling(StatisticsMixin): """ Carries out multiclass oversampling Example:: import smote_variants as sv import sklearn.datasets as datasets dataset= datasets.load_wine() oversampler= sv.MulticlassOversampling(sv.distance_SMOTE()) X_samp, y_samp= oversampler.sample(dataset['data'], dataset['target']) """
[docs] def __init__(self, oversampler=SMOTE(random_state=2), strategy="eq_1_vs_many_successive"): """ Constructor of the multiclass oversampling object Args: oversampler (obj): an oversampling object strategy (str/obj): a multiclass oversampling strategy, currently 'eq_1_vs_many_successive' or 'equalize_1_vs_many' """ self.oversampler = oversampler self.strategy = strategy
[docs] def sample_equalize_1_vs_many(self, X, y): """ Does the sample generation by oversampling each minority class to the cardinality of the majority class using all original samples in each run. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ message = "Running multiclass oversampling with strategy %s" message = message % str(self.strategy) _logger.info(self.__class__.__name__ + ": " + message) if 'proportion' not in self.oversampler.get_params(): message = ("Multiclass oversampling strategy %s cannot be " "used with oversampling techniques without proportion" " parameter") message = message % str(self.strategy) raise ValueError(message) # extract class label statistics self.class_label_statistics(X, y) # sort labels by number of samples class_labels = self.class_stats.keys() class_labels = sorted(class_labels, key=lambda x: -self.class_stats[x]) majority_class_label = class_labels[0] # determining the majority class data X_maj = X[y == majority_class_label] # dict to store the results results = {} results[majority_class_label] = X_maj.copy() # running oversampling for all minority classes against all oversampled # classes for i in range(1, len(class_labels)): message = "Sampling minority class with label: %d" message = message % class_labels[i] _logger.info(self.__class__.__name__ + ": " + message) # extract current minority class minority_class_label = class_labels[i] X_min = X[y == minority_class_label] X_maj = X[y != minority_class_label] # prepare data to pass to oversampling X_training = np.vstack([X_maj, X_min]) y_training = np.hstack( [np.repeat(0, len(X_maj)), np.repeat(1, len(X_min))]) # prepare parameters by properly setting the proportion value params = self.oversampler.get_params() num_to_generate = self.class_stats[majority_class_label] - \ self.class_stats[class_labels[i]] num_to_gen_to_all = len(X_maj) - self.class_stats[class_labels[i]] params['proportion'] = num_to_generate/num_to_gen_to_all # instantiating new oversampling object with the proper proportion # parameter oversampler = self.oversampler.__class__(**params) # executing the sampling X_samp, y_samp = oversampler.sample(X_training, y_training) # registaring the newly oversampled minority class in the output # set results[class_labels[i]] = X_samp[len( X_training):][y_samp[len(X_training):] == 1] # constructing the output set X_final = results[class_labels[1]] y_final = np.repeat(class_labels[1], len(results[class_labels[1]])) for i in range(2, len(class_labels)): X_final = np.vstack([X_final, results[class_labels[i]]]) y_new = np.repeat(class_labels[i], len(results[class_labels[i]])) y_final = np.hstack([y_final, y_new]) return np.vstack([X, X_final]), np.hstack([y, y_final])
[docs] def sample_equalize_1_vs_many_successive(self, X, y): """ Does the sample generation by oversampling each minority class successively to the cardinality of the majority class, incorporating the results of previous oversamplings. Args: X (np.ndarray): training set y (np.array): target labels Returns: (np.ndarray, np.array): the extended training set and target labels """ message = "Running multiclass oversampling with strategy %s" message = message % str(self.strategy) _logger.info(self.__class__.__name__ + ": " + message) if 'proportion' not in self.oversampler.get_params(): message = ("Multiclass oversampling strategy %s cannot be used" " with oversampling techniques without proportion" " parameter") % str(self.strategy) raise ValueError(message) # extract class label statistics self.class_label_statistics(X, y) # sort labels by number of samples class_labels = self.class_stats.keys() class_labels = sorted(class_labels, key=lambda x: -self.class_stats[x]) majority_class_label = class_labels[0] # determining the majority class data X_maj = X[y == majority_class_label] # dict to store the results results = {} results[majority_class_label] = X_maj.copy() # running oversampling for all minority classes against all # oversampled classes for i in range(1, len(class_labels)): message = "Sampling minority class with label: %d" message = message % class_labels[i] _logger.info(self.__class__.__name__ + ": " + message) # extract current minority class minority_class_label = class_labels[i] X_min = X[y == minority_class_label] # prepare data to pass to oversampling X_training = np.vstack([X_maj, X_min]) y_training = np.hstack( [np.repeat(0, len(X_maj)), np.repeat(1, len(X_min))]) # prepare parameters by properly setting the proportion value params = self.oversampler.get_params() n_majority = self.class_stats[majority_class_label] n_class_i = self.class_stats[class_labels[i]] num_to_generate = n_majority - n_class_i num_to_gen_to_all = i * n_majority - n_class_i params['proportion'] = num_to_generate/num_to_gen_to_all # instantiating new oversampling object with the proper proportion # parameter oversampler = self.oversampler.__class__(**params) # executing the sampling X_samp, y_samp = oversampler.sample(X_training, y_training) # adding the newly oversampled minority class to the majority data X_maj = np.vstack([X_maj, X_samp[y_samp == 1]]) # registaring the newly oversampled minority class in the output # set result_mask = y_samp[len(X_training):] == 1 results[class_labels[i]] = X_samp[len(X_training):][result_mask] # constructing the output set X_final = results[class_labels[1]] y_final = np.repeat(class_labels[1], len(results[class_labels[1]])) for i in range(2, len(class_labels)): X_final = np.vstack([X_final, results[class_labels[i]]]) y_new = np.repeat(class_labels[i], len(results[class_labels[i]])) y_final = np.hstack([y_final, y_new]) return np.vstack([X, X_final]), np.hstack([y, y_final])
[docs]    def sample(self, X, y):
        """
        Does the sample generation according to the oversampling strategy.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                                    labels
        """
        if self.strategy == "eq_1_vs_many_successive":
            return self.sample_equalize_1_vs_many_successive(X, y)
        elif self.strategy == "equalize_1_vs_many":
            return self.sample_equalize_1_vs_many(X, y)
        else:
            message = "Multiclass oversampling strategy %s not implemented."
            message = message % self.strategy
            raise ValueError(message)
[docs] def get_params(self, deep=False): """ Returns: dict: the parameters of the multiclass oversampling object """ return {'oversampler': self.oversampler, 'strategy': self.strategy}
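# A second usage sketch complementing the docstring example: selecting the
# 'equalize_1_vs_many' strategy explicitly, which oversamples every minority
# class against the original data only instead of building on the previously
# oversampled classes (the wine data set is used purely for illustration).
import numpy as np
import smote_variants as sv
import sklearn.datasets as datasets

dataset = datasets.load_wine()

oversampler = sv.MulticlassOversampling(oversampler=sv.distance_SMOTE(),
                                        strategy='equalize_1_vs_many')
X_samp, y_samp = oversampler.sample(dataset['data'], dataset['target'])
print(np.unique(y_samp, return_counts=True))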
class OversamplingClassifier(BaseEstimator, ClassifierMixin): """ This class wraps an oversampler and a classifier, making it compatible with sklearn based pipelines. """ def __init__(self, oversampler, classifier): """ Constructor of the wrapper. Args: oversampler (obj): an oversampler object classifier (obj): an sklearn-compatible classifier """ self.oversampler = oversampler self.classifier = classifier def fit(self, X, y=None): """ Carries out oversampling and fits the classifier. Args: X (np.ndarray): feature vectors y (np.array): target values Returns: obj: the object itself """ X_samp, y_samp = self.oversampler.sample(X, y) self.classifier.fit(X_samp, y_samp) return self def predict(self, X): """ Carries out the predictions. Args: X (np.ndarray): feature vectors """ return self.classifier.predict(X) def predict_proba(self, X): """ Carries out the predictions with probability estimations. Args: X (np.ndarray): feature vectors """ return self.classifier.predict_proba(X) def get_params(self, deep=True): """ Returns the dictionary of parameters. Args: deep (bool): wether to return parameters with deep discovery Returns: dict: the dictionary of parameters """ return {'oversampler': self.oversampler, 'classifier': self.classifier} def set_params(self, **parameters): """ Sets the parameters. Args: parameters (dict): the parameters to set. Returns: obj: the object itself """ for parameter, value in parameters.items(): setattr(self, parameter, value) return self class MLPClassifierWrapper: """ Wrapper over MLPClassifier of sklearn to provide easier parameterization """ def __init__(self, activation='relu', hidden_layer_fraction=0.1, alpha=0.0001, random_state=None): """ Constructor of the MLPClassifier Args: activation (str): name of the activation function hidden_layer_fraction (float): fraction of the hidden neurons of the number of input dimensions alpha (float): alpha parameter of the MLP classifier random_state (int/np.random.RandomState/None): initializer of the random state """ self.activation = activation self.hidden_layer_fraction = hidden_layer_fraction self.alpha = alpha self.random_state = random_state def fit(self, X, y): """ Fit the model to the data Args: X (np.ndarray): features y (np.array): target labels Returns: obj: the MLPClassifierWrapper object """ hidden_layer_size = max([1, int(len(X[0])*self.hidden_layer_fraction)]) self.model = MLPClassifier(activation=self.activation, hidden_layer_sizes=(hidden_layer_size,), alpha=self.alpha, random_state=self.random_state).fit(X, y) return self def predict(self, X): """ Predicts the labels of the unseen data Args: X (np.ndarray): unseen features Returns: np.array: predicted labels """ return self.model.predict(X) def predict_proba(self, X): """ Predicts the class probabilities of the unseen data Args: X (np.ndarray): unseen features Returns: np.matrix: predicted class probabilities """ return self.model.predict_proba(X) def get_params(self, deep=False): """ Returns the parameters of the classifier. Returns: dict: the parameters of the object """ return {'activation': self.activation, 'hidden_layer_fraction': self.hidden_layer_fraction, 'alpha': self.alpha, 'random_state': self.random_state} def copy(self): """ Creates a copy of the classifier. 
Returns: obj: a copy of the classifier """ return MLPClassifierWrapper(**self.get_params()) class Folding(): """ Cache-able folding of dataset for cross-validation """ def __init__(self, dataset, validator, cache_path=None, random_state=None): """ Constructor of Folding object Args: dataset (dict): dataset dictionary with keys 'data', 'target' and 'DESCR' validator (obj): cross-validator object cache_path (str): path to cache directory random_state (int/np.random.RandomState/None): initializer of the random state """ self.dataset = dataset self.db_name = self.dataset['name'] self.validator = validator self.cache_path = cache_path self.filename = 'folding_' + self.db_name + '.pickle' self.db_size = len(dataset['data']) self.db_n_attr = len(dataset['data'][0]) self.imbalanced_ratio = np.sum( self.dataset['target'] == 0)/np.sum(self.dataset['target'] == 1) self.random_state = random_state def do_folding(self): """ Does the folding or reads it from file if already available Returns: list(tuple): list of tuples of X_train, y_train, X_test, y_test objects """ self.validator.random_state = self.random_state if not hasattr(self, 'folding'): cond_cache_none = self.cache_path is None if not cond_cache_none: filename = os.path.join(self.cache_path, self.filename) cond_file_not_exists = not os.path.isfile(filename) else: cond_file_not_exists = False if cond_cache_none or cond_file_not_exists: _logger.info(self.__class__.__name__ + (" doing folding %s" % self.filename)) self.folding = {} self.folding['folding'] = [] self.folding['db_size'] = len(self.dataset['data']) self.folding['db_n_attr'] = len(self.dataset['data'][0]) n_maj = np.sum(self.dataset['target'] == 0) n_min = np.sum(self.dataset['target'] == 1) self.folding['imbalanced_ratio'] = n_maj / n_min X = self.dataset['data'] y = self.dataset['target'] data = self.dataset['data'] target = self.dataset['target'] for train, test in self.validator.split(data, target, target): folding = (X[train], y[train], X[test], y[test]) self.folding['folding'].append(folding) if self.cache_path is not None: _logger.info(self.__class__.__name__ + (" dumping to file %s" % self.filename)) random_filename = np.random.randint(1000000) random_filename = str(random_filename) + '.pickle' random_filename = os.path.join(self.cache_path, random_filename) pickle.dump(self.folding, open(random_filename, "wb")) os.rename(random_filename, os.path.join( self.cache_path, self.filename)) else: _logger.info(self.__class__.__name__ + (" reading from file %s" % self.filename)) self.folding = pickle.load( open(os.path.join(self.cache_path, self.filename), "rb")) return self.folding def get_params(self, deep=False): return {'db_name': self.db_name} def descriptor(self): return str(self.get_params()) class Sampling(): """ Cache-able sampling of dataset folds """ def __init__(self, folding, sampler, sampler_parameters, scaler, random_state=None): """ Constructor of the sampling object Args: folding (obj): Folding object sampler (class): class of a sampler object sampler_parameters (dict): a parameter combination for the sampler object scaler (obj): scaler object random_state (int/np.random.RandomState/None): initializer of the random state """ self.folding = folding self.db_name = folding.db_name self.sampler = sampler self.sampler_parameters = sampler_parameters self.sampler_parameters['random_state'] = random_state self.scaler = scaler self.cache_path = folding.cache_path self.filename = self.standardized_filename('sampling') self.random_state = random_state def 
standardized_filename(self, prefix, db_name=None, sampler=None, sampler_parameters=None): """ standardizes the filename Args: filename (str): filename Returns: str: standardized name """ import hashlib db_name = (db_name or self.db_name) sampler = (sampler or self.sampler) sampler = sampler.__name__ sampler_parameters = sampler_parameters or self.sampler_parameters _logger.info(str(sampler_parameters)) from collections import OrderedDict sampler_parameters_ordered = OrderedDict() for k in sorted(list(sampler_parameters.keys())): sampler_parameters_ordered[k] = sampler_parameters[k] message = " sampler parameter string " message = message + str(sampler_parameters_ordered) _logger.info(self.__class__.__name__ + message) sampler_parameter_str = hashlib.md5( str(sampler_parameters_ordered).encode('utf-8')).hexdigest() filename = '_'.join( [prefix, db_name, sampler, sampler_parameter_str]) + '.pickle' filename = re.sub('["\\,:(){}]', '', filename) filename = filename.replace("'", '') filename = filename.replace(": ", "_") filename = filename.replace(" ", "_") filename = filename.replace("\n", "_") return filename def cache_sampling(self): try: import mkl mkl.set_num_threads(1) _logger.info(self.__class__.__name__ + (" mkl thread number set to 1 successfully")) except Exception as e: _logger.info(self.__class__.__name__ + (" setting mkl thread number didn't succeed")) _logger.info(str(e)) if not os.path.isfile(os.path.join(self.cache_path, self.filename)): # if the sampled dataset does not exist sampler_categories = self.sampler.categories is_extensive = OverSampling.cat_extensive in sampler_categories has_proportion = 'proportion' in self.sampler_parameters higher_prop_sampling_avail = None if is_extensive and has_proportion: proportion = self.sampler_parameters['proportion'] all_pc = self.sampler.parameter_combinations() all_proportions = np.unique([p['proportion'] for p in all_pc]) all_proportions = all_proportions[all_proportions > proportion] for p in all_proportions: tmp_par = self.sampler_parameters.copy() tmp_par['proportion'] = p tmp_filename = self.standardized_filename( 'sampling', self.db_name, self.sampler, tmp_par) filename = os.path.join(self.cache_path, tmp_filename) if os.path.isfile(filename): higher_prop_sampling_avail = (p, tmp_filename) break if (not is_extensive or not has_proportion or (is_extensive and has_proportion and higher_prop_sampling_avail is None)): _logger.info(self.__class__.__name__ + " doing sampling") begin = time.time() sampling = [] folds = self.folding.do_folding() for X_train, y_train, X_test, y_test in folds['folding']: s = self.sampler(**self.sampler_parameters) if self.scaler is not None: print(self.scaler.__class__.__name__) X_train = self.scaler.fit_transform(X_train, y_train) X_samp, y_samp = s.sample_with_timing(X_train, y_train) if hasattr(s, 'transform'): X_test_trans = s.preprocessing_transform(X_test) else: X_test_trans = X_test.copy() if self.scaler is not None: X_samp = self.scaler.inverse_transform(X_samp) sampling.append((X_samp, y_samp, X_test_trans, y_test)) runtime = time.time() - begin else: higher_prop, higher_prop_filename = higher_prop_sampling_avail message = " reading and resampling from file %s to %s" message = message % (higher_prop_filename, self.filename) _logger.info(self.__class__.__name__ + message) filename = os.path.join(self.cache_path, higher_prop_filename) tmp_results = pickle.load(open(filename, 'rb')) tmp_sampling = tmp_results['sampling'] tmp_runtime = tmp_results['runtime'] sampling = [] folds = 
self.folding.do_folding() nums = [len(X_train) for X_train, _, _, _ in folds['folding']] i = 0 for X_train, y_train, X_test, y_test in tmp_sampling: new_num = (len(X_train) - nums[i])/higher_prop*proportion new_num = int(new_num) offset = nums[i] + new_num X_offset = X_train[:offset] y_offset = y_train[:offset] sampling.append((X_offset, y_offset, X_test, y_test)) i = i + 1 runtime = tmp_runtime/p*proportion results = {} results['sampling'] = sampling results['runtime'] = runtime results['db_size'] = folds['db_size'] results['db_n_attr'] = folds['db_n_attr'] results['imbalanced_ratio'] = folds['imbalanced_ratio'] _logger.info(self.__class__.__name__ + (" dumping to file %s" % self.filename)) random_filename = np.random.randint(1000000) random_filename = str(random_filename) + '.pickle' random_filename = os.path.join(self.cache_path, random_filename) pickle.dump(results, open(random_filename, "wb")) os.rename(random_filename, os.path.join( self.cache_path, self.filename)) def do_sampling(self): self.cache_sampling() results = pickle.load( open(os.path.join(self.cache_path, self.filename), 'rb')) return results def get_params(self, deep=False): return {'folding': self.folding.get_params(), 'sampler_name': self.sampler.__name__, 'sampler_parameters': self.sampler_parameters} def descriptor(self): return str(self.get_params()) class Evaluation(): """ Cache-able evaluation of classifier on sampling """ def __init__(self, sampling, classifiers, n_threads=None, random_state=None): """ Constructor of an Evaluation object Args: sampling (obj): Sampling object classifiers (list(obj)): classifier objects n_threads (int/None): number of threads random_state (int/np.random.RandomState/None): random state initializer """ self.sampling = sampling self.classifiers = classifiers self.n_threads = n_threads self.cache_path = sampling.cache_path self.filename = self.sampling.standardized_filename('eval') self.random_state = random_state self.labels = [] for i in range(len(classifiers)): from collections import OrderedDict sampling_parameters = OrderedDict() sp = self.sampling.sampler_parameters for k in sorted(list(sp.keys())): sampling_parameters[k] = sp[k] cp = classifiers[i].get_params() classifier_parameters = OrderedDict() for k in sorted(list(cp.keys())): classifier_parameters[k] = cp[k] label = str((self.sampling.db_name, sampling_parameters, classifiers[i].__class__.__name__, classifier_parameters)) self.labels.append(label) print(self.labels) def calculate_metrics(self, all_pred, all_test, all_folds): """ Calculates metrics of binary classifiction Args: all_pred (np.matrix): predicted probabilities all_test (np.matrix): true labels Returns: dict: all metrics of binary classification """ results = {} if all_pred is not None: all_pred_labels = np.apply_along_axis( lambda x: np.argmax(x), 1, all_pred) results['tp'] = np.sum(np.logical_and( np.equal(all_test, all_pred_labels), (all_test == 1))) results['tn'] = np.sum(np.logical_and( np.equal(all_test, all_pred_labels), (all_test == 0))) results['fp'] = np.sum(np.logical_and(np.logical_not( np.equal(all_test, all_pred_labels)), (all_test == 0))) results['fn'] = np.sum(np.logical_and(np.logical_not( np.equal(all_test, all_pred_labels)), (all_test == 1))) results['p'] = results['tp'] + results['fn'] results['n'] = results['fp'] + results['tn'] results['acc'] = (results['tp'] + results['tn']) / \ (results['p'] + results['n']) results['sens'] = results['tp']/results['p'] results['spec'] = results['tn']/results['n'] results['ppv'] = 
results['tp']/(results['tp'] + results['fp']) results['npv'] = results['tn']/(results['tn'] + results['fn']) results['fpr'] = 1.0 - results['spec'] results['fdr'] = 1.0 - results['ppv'] results['fnr'] = 1.0 - results['sens'] results['bacc'] = (results['tp']/results['p'] + results['tn']/results['n'])/2.0 results['gacc'] = np.sqrt( results['tp']/results['p']*results['tn']/results['n']) results['f1'] = 2*results['tp'] / \ (2*results['tp'] + results['fp'] + results['fn']) mcc_num = results['tp']*results['tn'] - results['fp']*results['fn'] mcc_denom_0 = (results['tp'] + results['fp']) mcc_denom_1 = (results['tp'] + results['fn']) mcc_denom_2 = (results['tn'] + results['fp']) mcc_denom_3 = (results['tn'] + results['fn']) mcc_denom = mcc_denom_0 * mcc_denom_1 * mcc_denom_2*mcc_denom_3 results['mcc'] = mcc_num/np.sqrt(mcc_denom) results['l'] = (results['p'] + results['n']) * \ np.log(results['p'] + results['n']) tp_fp = (results['tp'] + results['fp']) tp_fn = (results['tp'] + results['fn']) tn_fp = (results['fp'] + results['tn']) tn_fn = (results['fn'] + results['tn']) results['ltp'] = results['tp']*np.log(results['tp']/(tp_fp*tp_fn)) results['lfp'] = results['fp']*np.log(results['fp']/(tp_fp*tn_fp)) results['lfn'] = results['fn']*np.log(results['fn']/(tp_fn*tn_fn)) results['ltn'] = results['tn']*np.log(results['tn']/(tn_fp*tn_fn)) results['lp'] = results['p'] * \ np.log(results['p']/(results['p'] + results['n'])) results['ln'] = results['n'] * \ np.log(results['n']/(results['p'] + results['n'])) uc_num = (results['l'] + results['ltp'] + results['lfp'] + results['lfn'] + results['ltn']) uc_denom = (results['l'] + results['lp'] + results['ln']) results['uc'] = uc_num/uc_denom results['informedness'] = results['sens'] + results['spec'] - 1.0 results['markedness'] = results['ppv'] + results['npv'] - 1.0 results['log_loss'] = log_loss(all_test, all_pred) results['auc'] = roc_auc_score(all_test, all_pred[:, 1]) aucs = [roc_auc_score(all_test[all_folds == i], all_pred[all_folds == i, 1]) for i in range(np.max(all_folds)+1)] results['auc_mean'] = np.mean(aucs) results['auc_std'] = np.std(aucs) test_labels, preds = zip( *sorted(zip(all_test, all_pred[:, 1]), key=lambda x: -x[1])) test_labels = np.array(test_labels) th = int(0.2*len(test_labels)) results['p_top20'] = np.sum(test_labels[:th] == 1)/th results['brier'] = np.mean((all_pred[:, 1] - all_test)**2) else: results['tp'] = 0 results['tn'] = 0 results['fp'] = 0 results['fn'] = 0 results['p'] = 0 results['n'] = 0 results['acc'] = 0 results['sens'] = 0 results['spec'] = 0 results['ppv'] = 0 results['npv'] = 0 results['fpr'] = 1 results['fdr'] = 1 results['fnr'] = 1 results['bacc'] = 0 results['gacc'] = 0 results['f1'] = 0 results['mcc'] = np.nan results['l'] = np.nan results['ltp'] = np.nan results['lfp'] = np.nan results['lfn'] = np.nan results['ltn'] = np.nan results['lp'] = np.nan results['ln'] = np.nan results['uc'] = np.nan results['informedness'] = 0 results['markedness'] = 0 results['log_loss'] = np.nan results['auc'] = 0 results['auc_mean'] = 0 results['auc_std'] = 0 results['p_top20'] = 0 results['brier'] = 1 return results def do_evaluation(self): """ Does the evaluation or reads it from file Returns: dict: all metrics """ if self.n_threads is not None: try: import mkl mkl.set_num_threads(self.n_threads) message = " mkl thread number set to %d successfully" message = message % self.n_threads _logger.info(self.__class__.__name__ + message) except Exception as e: message = " setting mkl thread number didn't succeed" 
                _logger.info(self.__class__.__name__ + message)

        evaluations = {}
        if os.path.isfile(os.path.join(self.cache_path, self.filename)):
            evaluations = pickle.load(
                open(os.path.join(self.cache_path, self.filename), 'rb'))
        already_evaluated = np.array([li in evaluations
                                      for li in self.labels])

        if not np.all(already_evaluated):
            samp = self.sampling.do_sampling()
        else:
            return list(evaluations.values())

        # setting random states
        for i in range(len(self.classifiers)):
            clf_params = self.classifiers[i].get_params()
            if 'random_state' in clf_params:
                clf_params['random_state'] = self.random_state
                self.classifiers[i] = self.classifiers[i].__class__(
                    **clf_params)
            if isinstance(self.classifiers[i], CalibratedClassifierCV):
                clf_params = self.classifiers[i].base_estimator.get_params()
                clf_params['random_state'] = self.random_state
                class_inst = self.classifiers[i].base_estimator.__class__
                new_inst = class_inst(**clf_params)
                self.classifiers[i].base_estimator = new_inst

        for i in range(len(self.classifiers)):
            if not already_evaluated[i]:
                message = " do the evaluation %s %s %s"
                message = message % (self.sampling.db_name,
                                     self.sampling.sampler.__name__,
                                     self.classifiers[i].__class__.__name__)
                _logger.info(self.__class__.__name__ + message)
                all_preds, all_tests, all_folds = [], [], []
                minority_class_label = None
                majority_class_label = None
                fold_idx = -1
                for X_train, y_train, X_test, y_test in samp['sampling']:
                    fold_idx += 1

                    # X_train[X_train == np.inf]= 0
                    # X_train[X_train == -np.inf]= 0
                    # X_test[X_test == np.inf]= 0
                    # X_test[X_test == -np.inf]= 0

                    class_labels = np.unique(y_train)
                    min_class_size = np.min(
                        [np.sum(y_train == c) for c in class_labels])

                    ss = StandardScaler()
                    X_train_trans = ss.fit_transform(X_train)
                    nonzero_var_idx = np.where(ss.var_ > 1e-8)[0]
                    X_test_trans = ss.transform(X_test)

                    enough_minority_samples = min_class_size > 4
                    y_train_big_enough = len(y_train) > 4
                    two_classes = len(class_labels) > 1
                    at_least_one_feature = (len(nonzero_var_idx) > 0)

                    if not enough_minority_samples:
                        message = " not enough minority samples: %d"
                        message = message % min_class_size
                        _logger.warning(
                            self.__class__.__name__ + message)
                    elif not y_train_big_enough:
                        message = (" number of training samples is "
                                   "not enough: %d")
                        message = message % len(y_train)
                        _logger.warning(self.__class__.__name__ + message)
                    elif not two_classes:
                        message = " there is only 1 class in training data"
                        _logger.warning(self.__class__.__name__ + message)
                    elif not at_least_one_feature:
                        _logger.warning(self.__class__.__name__ +
                                        (" no information in features"))
                    else:
                        all_tests.append(y_test)
                        if (minority_class_label is None or
                                majority_class_label is None):
                            class_labels = np.unique(y_train)
                            n_0 = sum(class_labels[0] == y_test)
                            n_1 = sum(class_labels[1] == y_test)
                            if n_0 < n_1:
                                minority_class_label = int(class_labels[0])
                                majority_class_label = int(class_labels[1])
                            else:
                                minority_class_label = int(class_labels[1])
                                majority_class_label = int(class_labels[0])

                        X_fit = X_train_trans[:, nonzero_var_idx]
                        self.classifiers[i].fit(X_fit, y_train)
                        clf = self.classifiers[i]
                        X_pred = X_test_trans[:, nonzero_var_idx]
                        pred = clf.predict_proba(X_pred)
                        all_preds.append(pred)
                        all_folds.append(
                            np.repeat(fold_idx, len(all_preds[-1])))

                if len(all_tests) > 0:
                    all_preds = np.vstack(all_preds)
                    all_tests = np.hstack(all_tests)
                    all_folds = np.hstack(all_folds)

                    evaluations[self.labels[i]] = self.calculate_metrics(
                        all_preds, all_tests, all_folds)
                else:
                    evaluations[self.labels[i]] = self.calculate_metrics(
                        None, None, None)

                evaluations[self.labels[i]]['runtime'] = samp['runtime']
                sampler_name = self.sampling.sampler.__name__
                evaluations[self.labels[i]]['sampler'] = sampler_name
                clf_name = self.classifiers[i].__class__.__name__
                evaluations[self.labels[i]]['classifier'] = clf_name
                sampler_parameters = self.sampling.sampler_parameters.copy()
                evaluations[self.labels[i]]['sampler_parameters'] = str(
                    sampler_parameters)
                evaluations[self.labels[i]]['classifier_parameters'] = str(
                    self.classifiers[i].get_params())
                evaluations[self.labels[i]]['sampler_categories'] = str(
                    self.sampling.sampler.categories)
                evaluations[self.labels[i]
                            ]['db_name'] = self.sampling.folding.db_name
                evaluations[self.labels[i]]['db_size'] = samp['db_size']
                evaluations[self.labels[i]]['db_n_attr'] = samp['db_n_attr']
                evaluations[self.labels[i]
                            ]['imbalanced_ratio'] = samp['imbalanced_ratio']

        if not np.all(already_evaluated):
            _logger.info(self.__class__.__name__ +
                         (" dumping to file %s" % self.filename))
            random_filename = os.path.join(self.cache_path, str(
                np.random.randint(1000000)) + '.pickle')
            pickle.dump(evaluations, open(random_filename, "wb"))
            os.rename(random_filename, os.path.join(
                self.cache_path, self.filename))

        return list(evaluations.values())


def trans(X):
    """
    Transformation function used to aggregate the evaluation results.

    Args:
        X (pd.DataFrame): a grouping of a data frame containing evaluation
                            results
    """
    auc_std = X.iloc[np.argmax(X['auc_mean'].values)]['auc_std']

    cp_auc = X.sort_values('auc')['classifier_parameters'].iloc[-1]
    cp_acc = X.sort_values('acc')['classifier_parameters'].iloc[-1]
    cp_gacc = X.sort_values('gacc')['classifier_parameters'].iloc[-1]
    cp_f1 = X.sort_values('f1')['classifier_parameters'].iloc[-1]
    cp_p_top20 = X.sort_values('p_top20')['classifier_parameters'].iloc[-1]
    # the Brier score is minimized, therefore the parameters belonging to
    # the smallest value are selected
    cp_brier = X.sort_values('brier')['classifier_parameters'].iloc[0]
    sp_auc = X.sort_values('auc')['sampler_parameters'].iloc[-1]
    sp_acc = X.sort_values('acc')['sampler_parameters'].iloc[-1]
    sp_gacc = X.sort_values('gacc')['sampler_parameters'].iloc[-1]
    sp_f1 = X.sort_values('f1')['sampler_parameters'].iloc[-1]
    sp_p_top20 = X.sort_values('p_top20')['sampler_parameters'].iloc[-1]
    sp_brier = X.sort_values('brier')['sampler_parameters'].iloc[0]

    return pd.DataFrame({'auc': np.max(X['auc']),
                         'auc_mean': np.max(X['auc_mean']),
                         'auc_std': auc_std,
                         'brier': np.min(X['brier']),
                         'acc': np.max(X['acc']),
                         'f1': np.max(X['f1']),
                         'p_top20': np.max(X['p_top20']),
                         'gacc': np.max(X['gacc']),
                         'runtime': np.mean(X['runtime']),
                         'db_size': X['db_size'].iloc[0],
                         'db_n_attr': X['db_n_attr'].iloc[0],
                         'imbalanced_ratio': X['imbalanced_ratio'].iloc[0],
                         'sampler_categories': X['sampler_categories'].iloc[0],
                         'classifier_parameters_auc': cp_auc,
                         'classifier_parameters_acc': cp_acc,
                         'classifier_parameters_gacc': cp_gacc,
                         'classifier_parameters_f1': cp_f1,
                         'classifier_parameters_p_top20': cp_p_top20,
                         'classifier_parameters_brier': cp_brier,
                         'sampler_parameters_auc': sp_auc,
                         'sampler_parameters_acc': sp_acc,
                         'sampler_parameters_gacc': sp_gacc,
                         'sampler_parameters_f1': sp_f1,
                         'sampler_parameters_p_top20': sp_p_top20,
                         'sampler_parameters_brier': sp_brier,
                         }, index=[0])


def _clone_classifiers(classifiers):
    """
    Clones a set of classifiers

    Args:
        classifiers (list): a list of classifier objects
    """
    results = []
    for c in classifiers:
        if isinstance(c, MLPClassifierWrapper):
            results.append(c.copy())
        else:
            results.append(clone(c))

    return results


def _cache_samplings(folding, samplers, scaler, max_n_sampler_par_comb=35,
                     n_jobs=1, random_state=None):
    """
    Executes all samplings on all folds and caches the results.

    Args:
        folding (obj): Folding object
        samplers (list): list of oversampling classes
        scaler (obj): scaler object
        max_n_sampler_par_comb (int): maximum number of sampler parameter
                                        combinations to be tested
        n_jobs (int): number of parallel jobs
        random_state (int/np.random.RandomState/None): initializer of the
                                                        random state
    """
    _logger.info("create sampling objects, random_state: %s" %
                 str(random_state or ""))
    sampling_objs = []

    random_state_init = random_state
    random_state = np.random.RandomState(random_state_init)

    _logger.info("samplers: %s" % str(samplers))
    for s in samplers:
        sampling_par_comb = s.parameter_combinations()
        _logger.info(sampling_par_comb)
        domain = np.array(list(range(len(sampling_par_comb))))
        n_random = min([len(sampling_par_comb), max_n_sampler_par_comb])
        random_indices = random_state.choice(domain, n_random, replace=False)
        _logger.info("random_indices: %s" % random_indices)
        sampling_par_comb = [sampling_par_comb[i] for i in random_indices]
        _logger.info(sampling_par_comb)

        for spc in sampling_par_comb:
            sampling_objs.append(Sampling(folding, s, spc, scaler,
                                          random_state_init))

    # sorting sampling objects to optimize execution
    def key(x):
        # the samplers are classes, therefore issubclass is used to identify
        # the particularly expensive techniques
        if issubclass(x.sampler, (ADG, AMSCO, DSRBF)):
            if 'proportion' in x.sampler_parameters:
                return 30 + x.sampler_parameters['proportion']
            else:
                return 30
        elif 'proportion' in x.sampler_parameters:
            return x.sampler_parameters['proportion']
        elif OverSampling.cat_memetic in x.sampler.categories:
            return 20
        else:
            return 10

    sampling_objs = list(reversed(sorted(sampling_objs, key=key)))

    # executing sampling in parallel
    _logger.info("executing %d sampling in parallel" % len(sampling_objs))
    Parallel(n_jobs=n_jobs, batch_size=1)(delayed(s.cache_sampling)()
                                          for s in sampling_objs)

    return sampling_objs


def _cache_evaluations(sampling_objs, classifiers, n_jobs=1,
                       random_state=None):
    """
    Executes the evaluation of all classifiers on all cached samplings.
    """
    # create evaluation objects
    _logger.info("create classifier jobs")
    evaluation_objs = []

    num_threads = None if n_jobs is None or n_jobs == 1 else 1

    for s in sampling_objs:
        evaluation_objs.append(Evaluation(s, _clone_classifiers(classifiers),
                                          num_threads, random_state))

    _logger.info("executing %d evaluation jobs in parallel"
                 % (len(evaluation_objs)))
    # execute evaluation in parallel
    evals = Parallel(n_jobs=n_jobs, batch_size=1)(
        delayed(e.do_evaluation)() for e in evaluation_objs)

    return evals


def _read_db_results(cache_path_db):
    """
    Reads the cached evaluation results of a single dataset.
    """
    results = []
    evaluation_files = glob.glob(os.path.join(cache_path_db, 'eval*.pickle'))

    for f in evaluation_files:
        eval_results = pickle.load(open(f, 'rb'))
        results.append(list(eval_results.values()))

    return results


def read_oversampling_results(datasets, cache_path=None, all_results=False):
    """
    Reads the results of the evaluation

    Args:
        datasets (list): list of datasets and/or dataset loaders - a dataset
                            is a dict with 'data', 'target' and 'name' keys
        cache_path (str): path to a cache directory
        all_results (bool): True to return all results, False to return an
                            aggregation

    Returns:
        pd.DataFrame: all results or the aggregated results if all_results
                        is False
    """
    results = []
    for dataset_spec in datasets:

        # loading dataset if needed and determining dataset name
        if not isinstance(dataset_spec, dict):
            dataset = dataset_spec()
        else:
            dataset = dataset_spec

        if 'name' in dataset:
            dataset_name = dataset['name']
        else:
            dataset_name = dataset_spec.__name__

        dataset['name'] = dataset_name

        # determining dataset specific cache path
        cache_path_db = os.path.join(cache_path, dataset_name)

        # reading the results
        res = _read_db_results(cache_path_db)

        # concatenating the results
        _logger.info("concatenating results")
        db_res = [pd.DataFrame(r) for r in res]
        db_res = pd.concat(db_res).reset_index(drop=True)

        _logger.info("aggregating the results")
        if all_results is False:
            db_res = db_res.groupby(by=['db_name', 'classifier', 'sampler'])
            db_res = db_res.apply(trans).reset_index().drop('level_3', axis=1)

        results.append(db_res)
    return pd.concat(results).reset_index(drop=True)
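

# Usage sketch (illustrative, not part of the original interface): the helper
# below shows how read_oversampling_results can be used to reload cached
# scores once a cache directory has been populated by evaluate_oversamplers.
# The imbalanced_datasets loader and the cache path are placeholder
# assumptions taken from the docstring examples of this module.
def _example_read_oversampling_results():
    """
    Illustrative sketch of reading back cached, aggregated evaluation results.
    """
    import imbalanced_datasets as imbd

    cache_path = '/home/<user>/smote_validation/'

    # one row per (db_name, classifier, sampler) combination when
    # all_results is False
    results = read_oversampling_results([imbd.load_glass2],
                                        cache_path=cache_path,
                                        all_results=False)

    return results[['db_name', 'sampler', 'classifier', 'auc', 'f1']]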

def evaluate_oversamplers(datasets,
                          samplers,
                          classifiers,
                          cache_path,
                          validator=RepeatedStratifiedKFold(
                              n_splits=5, n_repeats=3),
                          scaler=None,
                          all_results=False,
                          remove_cache=False,
                          max_samp_par_comb=35,
                          n_jobs=1,
                          random_state=None):
    """
    Evaluates oversampling techniques using various classifiers on various
        datasets

    Args:
        datasets (list): list of datasets and/or dataset loaders - a dataset
                            is a dict with 'data', 'target' and 'name' keys
        samplers (list): list of oversampling classes/objects
        classifiers (list): list of classifier objects
        cache_path (str): path to a cache directory
        validator (obj): validator object
        scaler (obj): scaler object
        all_results (bool): True to return all results, False to return an
                                aggregation
        remove_cache (bool): True to remove sampling objects after
                                evaluation
        max_samp_par_comb (int): maximum number of sampler parameter
                                    combinations to be tested
        n_jobs (int): number of parallel jobs
        random_state (int/np.random.RandomState/None): initializer of the
                                                        random state

    Returns:
        pd.DataFrame: all results or the aggregated results if all_results
                        is False

    Example::

        import smote_variants as sv
        import imbalanced_datasets as imbd

        from sklearn.tree import DecisionTreeClassifier
        from sklearn.neighbors import KNeighborsClassifier

        datasets= [imbd.load_glass2, imbd.load_ecoli4]
        oversamplers= [sv.SMOTE_ENN, sv.NEATER, sv.Lee]
        classifiers= [KNeighborsClassifier(n_neighbors= 3),
                      KNeighborsClassifier(n_neighbors= 5),
                      DecisionTreeClassifier()]

        cache_path= '/home/<user>/smote_validation/'

        results= evaluate_oversamplers(datasets,
                                       oversamplers,
                                       classifiers,
                                       cache_path)
    """

    if cache_path is None:
        raise ValueError('cache_path is not specified')

    results = []
    for dataset_spec in datasets:

        # loading dataset if needed and determining dataset name
        if not isinstance(dataset_spec, dict):
            dataset = dataset_spec()
        else:
            dataset = dataset_spec

        if 'name' in dataset:
            dataset_name = dataset['name']
        else:
            dataset_name = dataset_spec.__name__

        dataset['name'] = dataset_name

        # relabeling the minority class to 1 and the majority class to 0
        dataset_original_target = dataset['target'].copy()
        class_labels = np.unique(dataset['target'])
        n_0 = sum(dataset['target'] == class_labels[0])
        n_1 = sum(dataset['target'] == class_labels[1])
        if n_0 < n_1:
            min_label = class_labels[0]
            maj_label = class_labels[1]
        else:
            min_label = class_labels[1]
            maj_label = class_labels[0]
        min_ind = np.where(dataset['target'] == min_label)[0]
        maj_ind = np.where(dataset['target'] == maj_label)[0]
        np.put(dataset['target'], min_ind, 1)
        np.put(dataset['target'], maj_ind, 0)

        cache_path_db = os.path.join(cache_path, dataset_name)
        if not os.path.isdir(cache_path_db):
            _logger.info("creating cache directory")
            os.makedirs(cache_path_db)

        # checking if samplings and evaluations are available
        samplings_available = False
        evaluations_available = False

        samplings = glob.glob(os.path.join(cache_path_db,
                                           'sampling*.pickle'))
        if len(samplings) > 0:
            samplings_available = True

        evaluations = glob.glob(os.path.join(cache_path_db, 'eval*.pickle'))
        if len(evaluations) > 0:
            evaluations_available = True

        message = ("dataset: %s, samplings_available: %s, "
                   "evaluations_available: %s")
        message = message % (dataset_name, str(samplings_available),
                             str(evaluations_available))
        _logger.info(message)

        if (remove_cache and evaluations_available
                and not samplings_available):
            # remove_cache is enabled and evaluations are available,
            # they are being read
            message = ("reading result from cache, sampling and evaluation "
                       "is not executed")
            _logger.info(message)
            res = _read_db_results(cache_path_db)
        else:
            _logger.info("doing the folding")
            folding = Folding(dataset, validator, cache_path_db, random_state)
            folding.do_folding()

            _logger.info("do the samplings")
            sampling_objs = _cache_samplings(folding,
                                             samplers,
                                             scaler,
                                             max_samp_par_comb,
                                             n_jobs,
                                             random_state)

            _logger.info("do the evaluations")
            res = _cache_evaluations(
                sampling_objs, classifiers, n_jobs, random_state)

        dataset['target'] = dataset_original_target

        # removing samplings once everything is done
        if remove_cache:
            filenames = glob.glob(os.path.join(cache_path_db, 'sampling*'))
            _logger.info("removing unnecessary sampling files")
            if len(filenames) > 0:
                for f in filenames:
                    os.remove(f)

        _logger.info("concatenating the results")
        db_res = [pd.DataFrame(r) for r in res]
        db_res = pd.concat(db_res).reset_index(drop=True)

        random_filename = os.path.join(cache_path_db, str(
            np.random.randint(1000000)) + '.pickle')
        pickle.dump(db_res, open(random_filename, "wb"))
        os.rename(random_filename, os.path.join(
            cache_path_db, 'results.pickle'))

        _logger.info("aggregating the results")
        if all_results is False:
            db_res = db_res.groupby(by=['db_name', 'classifier', 'sampler'])
            db_res = db_res.apply(trans).reset_index().drop('level_3', axis=1)

        results.append(db_res)

    return pd.concat(results).reset_index(drop=True)
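

# Post-processing sketch (illustrative): the aggregated data frame returned
# by evaluate_oversamplers (all_results=False) contains one row per
# (db_name, classifier, sampler) combination; the best combination per
# dataset can be extracted with standard pandas operations. The column names
# used below are the ones produced by trans().
def _example_best_combination_per_dataset(results):
    """
    Illustrative sketch: select the highest-AUC row for each dataset.

    Args:
        results (pd.DataFrame): aggregated output of evaluate_oversamplers

    Returns:
        pd.DataFrame: the best performing sampler/classifier per dataset
    """
    best_idx = results.groupby('db_name')['auc'].idxmax()
    best = results.loc[best_idx]

    return best[['db_name', 'sampler', 'classifier', 'auc',
                 'sampler_parameters_auc', 'classifier_parameters_auc']]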

def model_selection(dataset,
                    samplers,
                    classifiers,
                    cache_path,
                    score='auc',
                    validator=RepeatedStratifiedKFold(n_splits=5,
                                                      n_repeats=3),
                    remove_cache=False,
                    max_samp_par_comb=35,
                    n_jobs=1,
                    random_state=None):
    """
    Evaluates oversampling techniques on various classifiers and a dataset
    and returns the oversampling and classifier objects giving the best
    performance

    Args:
        dataset (dict): a dataset is a dict with 'data', 'target' and 'name'
                        keys
        samplers (list): list of oversampling classes/objects
        classifiers (list): list of classifier objects
        cache_path (str): path to a cache directory
        score (str): 'auc'/'acc'/'gacc'/'f1'/'brier'/'p_top20'
        validator (obj): validator object
        remove_cache (bool): True to remove sampling objects after
                                evaluation
        max_samp_par_comb (int): maximum number of sampler parameter
                                    combinations to be tested
        n_jobs (int): number of parallel jobs
        random_state (int/np.random.RandomState/None): initializer of the
                                                        random state

    Returns:
        obj, obj: the best performing sampler object and the best performing
                    classifier object

    Example::

        import smote_variants as sv
        import imbalanced_datasets as imbd

        from sklearn.tree import DecisionTreeClassifier
        from sklearn.neighbors import KNeighborsClassifier

        dataset= imbd.load_glass2()
        oversamplers= [sv.SMOTE_ENN, sv.NEATER, sv.Lee]
        classifiers= [KNeighborsClassifier(n_neighbors= 3),
                      KNeighborsClassifier(n_neighbors= 5),
                      DecisionTreeClassifier()]

        cache_path= '/home/<user>/smote_validation/'

        sampler, classifier= model_selection(dataset,
                                             oversamplers,
                                             classifiers,
                                             cache_path,
                                             'auc')
    """

    if score not in ['auc', 'acc', 'gacc', 'f1', 'brier', 'p_top20']:
        raise ValueError("score %s not supported" % score)

    results = evaluate_oversamplers(datasets=[dataset],
                                    samplers=samplers,
                                    classifiers=classifiers,
                                    cache_path=cache_path,
                                    validator=validator,
                                    remove_cache=remove_cache,
                                    max_samp_par_comb=max_samp_par_comb,
                                    n_jobs=n_jobs,
                                    random_state=random_state)

    # extracting the best performing classifier and oversampler parameters
    # regarding the selected score; the Brier score is minimized, all other
    # scores are maximized
    if score == 'brier':
        best_index = results[score].idxmin()
    else:
        best_index = results[score].idxmax()

    cl_par_name = 'classifier_parameters_' + score
    samp_par_name = 'sampler_parameters_' + score

    cl, cl_par, samp, samp_par = results.loc[best_index][['classifier',
                                                          cl_par_name,
                                                          'sampler',
                                                          samp_par_name]]

    # instantiating the best performing oversampler and classifier objects
    samp_obj = eval(samp)(**eval(samp_par))
    cl_obj = eval(cl)(**eval(cl_par))

    return samp_obj, cl_obj
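

# Follow-up sketch (illustrative): the objects returned by model_selection
# can be applied directly to the imbalanced dataset; sample() is the
# oversampling interface used by cross_validate below, fit() is the standard
# sklearn interface. The dataset dict follows the conventions of this module.
def _example_apply_selected_model(dataset, samp_obj, cl_obj):
    """
    Illustrative sketch: oversample the data and fit the selected classifier.
    """
    X, y = dataset['data'], dataset['target']

    # generating the oversampled training set with the selected oversampler
    X_samp, y_samp = samp_obj.sample(X, y)

    # fitting the selected classifier on the oversampled data
    return cl_obj.fit(X_samp, y_samp)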

def cross_validate(dataset,
                   sampler,
                   classifier,
                   validator=RepeatedStratifiedKFold(n_splits=5,
                                                     n_repeats=3),
                   scaler=StandardScaler(),
                   random_state=None):
    """
    Evaluates an oversampling technique and a classifier on a dataset by
    cross-validation and returns the cross-validated scores

    Args:
        dataset (dict): a dataset is a dict with 'data', 'target' and 'name'
                        keys
        sampler (obj): an instantiated oversampling object
        classifier (obj): an instantiated classifier object
        validator (obj): validator object
        scaler (obj): scaler object
        random_state (int/np.random.RandomState/None): initializer of the
                                                        random state

    Returns:
        pd.DataFrame: the cross-validation scores

    Example::

        import smote_variants as sv
        import imbalanced_datasets as imbd

        from sklearn.neighbors import KNeighborsClassifier

        dataset= imbd.load_glass2()
        sampler= sv.SMOTE_ENN()
        classifier= KNeighborsClassifier(n_neighbors= 3)

        results= cross_validate(dataset,
                                sampler,
                                classifier)
    """

    class_labels = np.unique(dataset['target'])
    binary_problem = (len(class_labels) == 2)

    dataset_orig_target = dataset['target'].copy()
    if binary_problem:
        _logger.info("The problem is binary")
        n_0 = sum(dataset['target'] == class_labels[0])
        n_1 = sum(dataset['target'] == class_labels[1])
        if n_0 < n_1:
            min_label = class_labels[0]
            maj_label = class_labels[1]
        else:
            min_label = class_labels[1]
            maj_label = class_labels[0]

        min_ind = np.where(dataset['target'] == min_label)[0]
        maj_ind = np.where(dataset['target'] == maj_label)[0]
        np.put(dataset['target'], min_ind, 1)
        np.put(dataset['target'], maj_ind, 0)
    else:
        _logger.info("The problem is not binary")
        label_indices = {}
        for c in class_labels:
            label_indices[c] = np.where(dataset['target'] == c)[0]
        mapping = {}
        for i, c in enumerate(class_labels):
            np.put(dataset['target'], label_indices[c], i)
            mapping[i] = c

    runtimes = []
    all_preds, all_tests = [], []

    for train, test in validator.split(dataset['data'], dataset['target']):
        _logger.info("Executing fold")
        X_train, y_train = dataset['data'][train], dataset['target'][train]
        X_test, y_test = dataset['data'][test], dataset['target'][test]

        begin = time.time()
        X_samp, y_samp = sampler.sample(X_train, y_train)
        runtimes.append(time.time() - begin)

        X_samp_trans = scaler.fit_transform(X_samp)
        nonzero_var_idx = np.where(scaler.var_ > 1e-8)[0]
        X_test_trans = scaler.transform(X_test)

        all_tests.append(y_test)

        classifier.fit(X_samp_trans[:, nonzero_var_idx], y_samp)
        all_preds.append(classifier.predict_proba(
            X_test_trans[:, nonzero_var_idx]))

    if len(all_tests) > 0:
        all_preds = np.vstack(all_preds)
        all_tests = np.hstack(all_tests)

    dataset['target'] = dataset_orig_target

    _logger.info("Computing the results")

    results = {}
    results['runtime'] = np.mean(runtimes)
    results['sampler'] = sampler.__class__.__name__
    results['classifier'] = classifier.__class__.__name__
    results['sampler_parameters'] = str(sampler.get_params())
    results['classifier_parameters'] = str(classifier.get_params())
    results['db_size'] = len(dataset['data'])
    results['db_n_attr'] = len(dataset['data'][0])
    results['db_n_classes'] = len(class_labels)

    if binary_problem:
        results['imbalance_ratio'] = sum(
            dataset['target'] == maj_label)/sum(dataset['target'] == min_label)
        all_pred_labels = np.apply_along_axis(
            lambda x: np.argmax(x), 1, all_preds)

        results['tp'] = np.sum(np.logical_and(
            np.equal(all_tests, all_pred_labels), (all_tests == 1)))
        results['tn'] = np.sum(np.logical_and(
            np.equal(all_tests, all_pred_labels), (all_tests == 0)))
        results['fp'] = np.sum(np.logical_and(np.logical_not(
            np.equal(all_tests, all_pred_labels)), (all_tests == 0)))
        results['fn'] = np.sum(np.logical_and(np.logical_not(
            np.equal(all_tests, all_pred_labels)), (all_tests == 1)))
        results['p'] = results['tp'] + results['fn']
        results['n'] = results['fp'] + results['tn']
        results['acc'] = (results['tp'] + results['tn']) / \
            (results['p'] + results['n'])
        results['sens'] = results['tp']/results['p']
        results['spec'] = results['tn']/results['n']
        results['ppv'] = results['tp']/(results['tp'] + results['fp'])
        results['npv'] = results['tn']/(results['tn'] + results['fn'])
        results['fpr'] = 1.0 - results['spec']
        results['fdr'] = 1.0 - results['ppv']
        results['fnr'] = 1.0 - results['sens']
        results['bacc'] = (results['tp']/results['p'] +
                           results['tn']/results['n'])/2.0
        results['gacc'] = np.sqrt(
            results['tp']/results['p']*results['tn']/results['n'])
        results['f1'] = 2*results['tp'] / \
            (2*results['tp'] + results['fp'] + results['fn'])
        mcc_num = (results['tp']*results['tn'] - results['fp']*results['fn'])
        tp_fp = (results['tp'] + results['fp'])
        tp_fn = (results['tp'] + results['fn'])
        tn_fp = (results['tn'] + results['fp'])
        tn_fn = (results['tn'] + results['fn'])
        mcc_denom = np.sqrt(tp_fp * tp_fn * tn_fp * tn_fn)
        results['mcc'] = mcc_num/mcc_denom
        results['l'] = (results['p'] + results['n']) * \
            np.log(results['p'] + results['n'])
        results['ltp'] = results['tp']*np.log(results['tp']/(
            (results['tp'] + results['fp'])*(results['tp'] + results['fn'])))
        results['lfp'] = results['fp']*np.log(results['fp']/(
            (results['fp'] + results['tp'])*(results['fp'] + results['tn'])))
        results['lfn'] = results['fn']*np.log(results['fn']/(
            (results['fn'] + results['tp'])*(results['fn'] + results['tn'])))
        results['ltn'] = results['tn']*np.log(results['tn']/(
            (results['tn'] + results['fp'])*(results['tn'] + results['fn'])))
        results['lp'] = results['p'] * \
            np.log(results['p']/(results['p'] + results['n']))
        results['ln'] = results['n'] * \
            np.log(results['n']/(results['p'] + results['n']))
        ucc_num = (results['l'] + results['ltp'] + results['lfp'] +
                   results['lfn'] + results['ltn'])
        results['uc'] = ucc_num/(results['l'] + results['lp'] + results['ln'])
        results['informedness'] = results['sens'] + results['spec'] - 1.0
        results['markedness'] = results['ppv'] + results['npv'] - 1.0
        results['log_loss'] = log_loss(all_tests, all_preds)
        results['auc'] = roc_auc_score(all_tests, all_preds[:, 1])
        test_labels, preds = zip(
            *sorted(zip(all_tests, all_preds[:, 1]), key=lambda x: -x[1]))
        test_labels = np.array(test_labels)
        th = int(0.2*len(test_labels))
        results['p_top20'] = np.sum(test_labels[:th] == 1)/th
        results['brier'] = np.mean((all_preds[:, 1] - all_tests)**2)
    else:
        all_pred_labels = np.apply_along_axis(
            lambda x: np.argmax(x), 1, all_preds)

        results['acc'] = accuracy_score(all_tests, all_pred_labels)
        results['confusion_matrix'] = confusion_matrix(
            all_tests, all_pred_labels)
        # class-wise recalls: the diagonal of the confusion matrix divided
        # by the number of true samples in each class (row sums)
        sum_confusion = np.sum(results['confusion_matrix'], axis=1)
        results['gacc'] = gmean(np.diagonal(
            results['confusion_matrix'])/sum_confusion)
        results['class_label_mapping'] = mapping

    return pd.DataFrame({'value': list(results.values())},
                        index=results.keys())
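

# Usage sketch (illustrative, not part of the original interface):
# cross_validate expects an instantiated oversampler and classifier; the
# returned data frame is indexed by the metric names and has a single
# 'value' column. The imbalanced_datasets loader is a placeholder assumption
# following the docstring examples above.
def _example_cross_validate():
    """
    Illustrative sketch of running cross_validate and reading a single score.
    """
    import imbalanced_datasets as imbd
    from sklearn.neighbors import KNeighborsClassifier

    dataset = imbd.load_glass2()
    sampler = SMOTE_ENN()
    classifier = KNeighborsClassifier(n_neighbors=3)

    results = cross_validate(dataset, sampler, classifier)

    # e.g. the area under the ROC curve of the cross-validated predictions
    return results.loc['auc', 'value']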