# Source code for smote_variants.noise_removal._neighborhoodcleaningrule

"""
This module implements the neighborhood cleaning rule.
"""

import numpy as np

from ..base import mode, coalesce
from ._noisefilter import NoiseFilter
from ..base import NearestNeighborsWithMetricTensor

from .._logger import logger

_logger = logger

__all__ = ["NeighborhoodCleaningRule"]



# [docs] (documentation link marker, apparently left over from HTML extraction)
class NeighborhoodCleaningRule(NoiseFilter):
    """
    Noise filter implementing the neighborhood cleaning rule.

    Every sample is compared with the vote (``mode``) of the labels of its
    3 nearest neighbors:

    * a majority sample whose neighbors vote minority is removed;
    * for a minority sample whose neighbors vote majority, the majority
      samples among those neighbors are removed.

    References:
        * BibTex::

            @article{smoteNoise0,
                     author = {Batista, Gustavo E. A. P. A. and Prati,
                                Ronaldo C. and Monard, Maria Carolina},
                     title = {A Study of the Behavior of Several Methods for
                                Balancing Machine Learning Training Data},
                     journal = {SIGKDD Explor. Newsl.},
                     issue_date = {June 2004},
                     volume = {6},
                     number = {1},
                     month = jun,
                     year = {2004},
                     issn = {1931-0145},
                     pages = {20--29},
                     numpages = {10},
                     url = {http://doi.acm.org/10.1145/1007730.1007735},
                     doi = {10.1145/1007730.1007735},
                     acmid = {1007735},
                     publisher = {ACM},
                     address = {New York, NY, USA}
                    }
    """

    def __init__(self, nn_params=None, n_jobs=1, **_kwargs):
        """
        Constructor of the noise removal object

        Args:
            nn_params (dict): additional parameters for nearest neighbor calculations, any
                                parameter NearestNeighbors accepts, and additionally use
                                {'metric': 'precomputed', 'metric_learning': '<method>', ...}
                                with <method> in 'ITML', 'LSML' to enable the learning of
                                the metric to be used for neighborhood calculations
            n_jobs (int): number of parallel jobs
            **_kwargs: accepted for interface compatibility and ignored
        """
        super().__init__()

        self.check_n_jobs(n_jobs, "n_jobs")

        # a missing nn_params is replaced by an empty dict
        self.nn_params = coalesce(nn_params, {})
        self.n_jobs = n_jobs

    def get_params(self, deep=False):
        """
        Returns the parameters of the object

        Args:
            deep (bool): forwarded unchanged to ``NoiseFilter.get_params``

        Returns:
            dict: ``nn_params`` and ``n_jobs`` merged with the parameters
                    reported by the base class (base class entries win on
                    key collisions because they are unpacked last)
        """
        return {
            "nn_params": self.nn_params,
            "n_jobs": self.n_jobs,
            **NoiseFilter.get_params(self, deep),
        }

    def remove_noise(self, X, y):
        """
        Removes noise

        Args:
            X (np.array): features
            y (np.array): target labels

        Returns:
            np.array, np.array: cleaned features and target labels; when
                                fewer than 4 samples are supplied the data
                                is returned unfiltered (as copies)
        """
        _logger.info("%s: Running noise removal", self.__class__.__name__)
        self.class_label_statistics(y)

        # using 4 neighbors because the first neighbor is expected to be
        # the point itself, leaving 3 real neighbors for the vote
        n_neighbors = 4

        # the neighbor query cannot return more neighbors than there are
        # fitted samples, so the rule is not applicable to tiny datasets;
        # hand back copies, mirroring np.delete which also returns new arrays
        if len(X) < n_neighbors:
            _logger.info(
                "%s: Not enough samples for noise removal",
                self.__class__.__name__,
            )
            return np.array(X), np.array(y)

        # fitting nearest neighbors with proposed parameter
        nn_params = {**self.nn_params}
        nn_params["metric_tensor"] = self.metric_tensor_from_nn_params(nn_params, X, y)

        nnmt = NearestNeighborsWithMetricTensor(
            n_neighbors=n_neighbors, n_jobs=self.n_jobs, **nn_params
        )
        nnmt.fit(X)
        indices = nnmt.kneighbors(X, return_distance=False)

        # identifying the samples to be removed; a set is used because the
        # same majority sample can be flagged through several neighborhoods
        to_remove = set()
        for idx, neighborhood in enumerate(indices):
            neighbors = neighborhood[1:]
            vote = mode(y[neighbors])

            if y[idx] == self.maj_label and vote == self.min_label:
                # sample is majority and the decision based on
                # neighbors is minority: drop the sample itself
                to_remove.add(idx)
            elif y[idx] == self.min_label and vote == self.maj_label:
                # sample is minority and the decision based on
                # neighbors is majority: drop the majority neighbors
                to_remove.update(
                    jdx for jdx in neighbors if y[jdx] == self.maj_label
                )

        # removing the noisy samples and returning the results
        drop = sorted(to_remove)
        return np.delete(X, drop, axis=0), np.delete(y, drop)