# Source code for skmine.base

"""Base classes for all miners."""
# pylint: disable= unused-argument

import inspect
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin

"""
Note: the LCM, SLIM, SlimClassifier and PeriodicPatternMiner algorithms use the sklearn base classes.
TODO: the other skmine algorithms should follow the same approach.
"""


class BaseMiner(ABC):
    """Base class for all miners in scikit-mine.

    Concrete miners must implement :meth:`fit` and :meth:`discover`.
    The class also provides a small parameter-introspection API
    (:meth:`get_params` / :meth:`set_params`) driven by the signature of the
    subclass ``__init__``.
    """

    @abstractmethod
    def fit(self, D, y=None):
        """Fit method to be implemented.

        Parameters
        ----------
        D : object
            Input data to fit the miner on.
        y : object, default=None
            Optional targets. Defaults to ``None`` so that the signature
            agrees with callers that invoke ``fit(D)`` without targets.

        Returns
        -------
        self : object
            The miner itself.
        """
        return self

    @abstractmethod
    def discover(self, *args, **kwargs):
        """discover method to be implemented.

        Returns
        -------
        pd.Series
            An empty series in this base implementation.
        """
        # Explicit dtype: a bare ``pd.Series()`` emits a deprecation warning on
        # pandas 1.x and its default dtype differs between pandas versions.
        return pd.Series(dtype="object")

    def _get_tags(self):
        """Return the estimator tags of this miner as a plain dict.

        NOTE(review): the key names look like scikit-learn estimator tags;
        confirm against the scikit-learn version in use.
        """
        return {
            "non_deterministic": False,
            "requires_positive_X": True,
            "requires_positive_y": False,
            "X_types": ['2darray'],  # ["categorical"],
            "poor_score": False,
            "no_validation": False,
            "multioutput": False,
            "allow_nan": False,
            "stateless": True,
            "multilabel": False,
            "_skip_test": False,
            "_xfail_checks": False,
            "multioutput_only": False,
            "binary_only": False,
            "requires_fit": True,
            "preserves_dtype": [np.float64],
            "requires_y": False,
            "pairwise": False,
        }

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator.

        Returns
        -------
        list of str
            Sorted names of the ``__init__`` parameters, excluding ``self``
            and any ``**kwargs`` catch-all. Empty when the class defines no
            constructor of its own.
        """
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, "deprecated_original", cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent, skipping 'self' and the ``**kwargs`` catch-all
        init_signature = inspect.signature(init)
        return sorted(
            p.name
            for p in init_signature.parameters.values()
            if p.name != "self" and p.kind != p.VAR_KEYWORD
        )

    def get_params(self, deep=False):
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=False
            Accepted for API compatibility; it is not used by this
            implementation.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values, read from the attributes
            of the same name on the instance.
        """
        return {key: getattr(self, key) for key in self._get_param_names()}

    def set_params(self, **params):
        """
        Set the parameters of this estimator.

        Only the constructor parameters reported by :meth:`get_params` can be
        set. Nested parameters of the form ``<component>__<parameter>`` are
        not handled by this implementation.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : object
            Estimator instance.

        Raises
        ------
        ValueError
            If a key is not a valid parameter name. Parameters listed before
            the invalid key have already been set when the error is raised.
        """
        # Simple optimization to gain speed (inspect is slow)
        if not params:
            return self

        valid_params = self.get_params()

        for key, value in params.items():
            if key not in valid_params:
                raise ValueError("Invalid parameter %s for estimator %s. Check the list of available parameters "
                                 "with `estimator.get_params().keys()`." % (key, self))
            setattr(self, key, value)

        return self


class DiscovererMixin:
    """Mixin for all pattern discovery models in scikit-mine"""

    def fit_discover(self, D, y=None, **kwargs):
        """
        Fit to data, then extract patterns

        Parameters
        ----------
        D: {array-like, sparse matrix, dataframe} of shape (n_samples, n_features)
            Data passed to ``fit``.

        y: object, default=None
            Optional targets. When ``None``, ``fit`` is called with ``D`` only.

        **kwargs
            Keyword arguments forwarded to ``discover``.

        Returns
        -------
        pd.Series
            patterns discovered by a mining algorithm, i.e. whatever
            ``discover`` returns
        """
        # Only forward ``y`` when given, so that miners whose ``fit`` takes
        # no target still work.
        fitted = self.fit(D) if y is None else self.fit(D, y=y)
        return fitted.discover(**kwargs)


class TransformerMixin(TransformerMixin):
    """Base Mixin for transformers in scikit-mine"""

    def fit_transform(self, X, y=None, **tsf_params):
        """
        Fit on `X` (and `y` when given), then transform `X`.

        Overrides the inherited method so that the optional `tsf_params` are
        passed to `transform` only, never to `fit`.

        Parameters
        ----------
        X : object
            Data to fit on and then transform.
        y : object, default=None
            Optional targets; forwarded to `fit` only when not None.
        **tsf_params
            Keyword arguments forwarded to `transform`.

        Returns
        -------
        df_new : pandas.Dataframe
            The result of `transform(X, **tsf_params)`, described by the
            original author as the codetable fitted from X data.
        """
        fitted = self.fit(X) if y is None else self.fit(X, y)
        return fitted.transform(X, **tsf_params)


class MDLOptimizer(ABC):
    """
    Base interface for all models applying the `Minimum Description Length principle
    <https://en.wikipedia.org/wiki/Minimum_description_length>`_.
    """

    @abstractmethod
    def generate_candidates(self, *args, **kwargs):
        """
        Generate new candidates, to be sent for later evaluation.

        Calling this function is equivalent to sending a new message given an encoding scheme,
        while calling ``.evaluate`` is equivalent to receiving this message, and evaluating the gain
        of information it provides.

        Returns
        -------
        object or Iterable[object]
            A set of new candidates. This base implementation returns an
            empty list.
        """
        return []

    @abstractmethod
    def evaluate(self, candidate, *args, **kwargs):
        """
        Evaluate the gain, i.e the gain of information when accepting the candidate.

        Parameters
        ----------
        candidate: object
            A candidate to evaluate

        Returns
        -------
        tuple (data_size, model_size, ...)
            Should return a tuple, with first two values corresponding to new data size
            and model size in the case of accepting the candidate. This base
            implementation returns ``(0, 0)``.

            Data size and model size should be returned separately as we encourage
            usage of `(two-part) crude MDL
            <https://en.wikipedia.org/wiki/Minimum_description_length#Two-Part_Codes>`_.
        """
        return (0, 0)

    def _repr_html_(self):
        """Return an HTML rendering of the discovered patterns.

        ``discover`` is not defined by this class; the concrete class is
        expected to provide it and to return an object with a ``to_frame``
        method (presumably a ``pd.Series`` - confirm against implementers).

        Returns
        -------
        str
            HTML of the patterns shown in a one-column ``usage`` frame, or
            ``repr(self)`` when no pattern is available.
        """
        patterns = self.discover()  # call discover with default parameters
        frame = patterns.to_frame(name="usage")
        if frame.empty:
            # Nothing to tabulate: fall back to the plain representation
            return repr(self)
        return frame._repr_html_()  # pylint: disable=protected-access


class InteractiveMiner(ABC):
    """Abstract base for miners driven interactively.

    An interactive miner is expected to support the following workflow:

    1. ingest some input data through `prefit`
    2. generate candidates
    3. iterate over the generated candidates and call `update` with a
       candidate as argument, depending on some external input (for instance
       a positive answer from a user in CLI mode)
    """

    @abstractmethod
    def prefit(self, D):
        """Ingest data `D` and record basic information for later use.

        Returns
        -------
        self : object
            The miner itself.
        """
        return self

    @abstractmethod
    def update(self, *args, **kwargs):
        """Edit the underlying data structures in place.

        Returns
        -------
        None
        """
        return