Source code for skmine.emerging.mbdldorber

"""
MBD-LLBorder
"""
from functools import partial
from itertools import combinations

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from ..base import BaseMiner, DiscovererMixin
from ..itemsets.lcm import LCMMax
from ..utils import _check_growth_rate, _check_min_supp, filter_maximal, filter_minimal


def border_diff(U, S):
    """
    Given a pair of borders <{∅}, {U}> and <{∅}, {S}>,
    ``border_diff`` derives another border <L2, {U}>
    such that [L2, {U}] = [{∅}, {U}] - [{∅}, {S}]

    Parameters
    ----------
    U : set
        non empty part from the border to differentiate

    S : list of set
        non-empty part of a border.
        Noted as ``R1`` in the original paper

    References
    ----------
    .. [1]
        Dong, Li
        Efficient Mining of Emerging Patterns Discovering

    Notes
    -----
    See ``BORDER-DIFF`` in section 4.1
    """
    # assert len(R1) < len(U)  # assert we iterate on the smallest ensemble
    L2 = [{x} for x in U - S[0]]
    for i in range(1, len(S)):
        _L2 = [X | {x} for x in U - S[i] for X in L2]
        L2 = list(filter_minimal(_L2))

    return L2, U


def mbdllborder(isets1, isets2):
    """
    References
    ----------
    .. [1]
        Dong, Li
        Efficient Mining of Emerging Patterns Discovering

    Notes
    -----
    main algorithm, as explained in section 4.2
    """
    borders = list()

    for iset in isets2:
        if any((e > iset for e in isets1)):
            continue
        inter = (iset & e for e in isets1)
        R = filter_maximal(inter)

        diff = border_diff(iset, R)
        borders.append(diff)

    return borders


def borders_to_patterns(left, right, min_size=None):
    """
    Operates in a bread-first manner, outputting all
    valid patterns of a given size for each level.
    Bigger patterns first.

    Parameters
    ----------
    left: list of set
    right: set
    min_size: int
        only accepts patterns with greater or equal size

    Returns
    -------
    pd.Series
    """
    min_size = min_size or min(map(len, left))
    patterns = list()
    for size in range(len(right) - 1, min_size - 1, -1):  # bigger patterns first
        combs = combinations(right, size)
        for pat in combs:
            if any((e.issuperset(set(pat)) for e in left)):
                continue
            patterns.append(pat)

    return pd.Series(patterns)


[docs]class MBDLLBorder(BaseMiner, DiscovererMixin):
    """
    MBD-LLBorder aims at discovering patterns characterizing the difference
    between two collections of data.

    It first discovers ``two sets of maximal itemsets``, one for each collection.
    It then looks for borders of these sets, and characterizes the difference
    by only manipulating theses borders.

    This results in the algorithm only keeping a ``concise description (borders)``
    as an internal state.

    Last but not least, it enumerates the set of emerging patterns from the
    borders.

    Parameters
    ----------
    min_growth_rate: int or float, default=2
        A pattern is considered as emerging iff its support in the first collection
        is at least min_growth_rate times its support in the second collection.

    min_supp: int or float, default=.1
        Minimum support in each of the collection
        Must be a relative support between 0 and 1
        Default to 0.1 (10%)

    n_jobs : int, default=1
        The number of jobs to use for the computation.


    Attributes
    ----------
    borders_: list of tuple[list[set], set]
        List of pairs representing left and right borders
        For every pair, emerging patterns will be uncovered by enumerating itemsets
        from the right side, while checking for non membership in the left side


    References
    ----------
    .. [1]
        Guozhu Dong, Jinyan Li
        "Efficient Mining of Emerging Patterns : Discovering Trends and Differences", 1999
    """

    def __init__(self, min_growth_rate=2, min_supp=0.1, n_jobs=1):
        _check_min_supp(min_supp, accept_absolute=False)
        self.min_supp = min_supp
        self.min_growth_rate = _check_growth_rate(min_growth_rate)
        self.n_jobs = n_jobs

[docs]    def fit(self, D, y):
        """
        fit MBD-LLBorder on D, splitted on y

        This is done in two steps
            1. Discover maximal itemsets for the two disinct collections contained in D
            2. Mine borders from these maximal itemsets

        Parameters
        ----------
        D : pd.Series
            The input transactional database
            Where every entry contains singular items
            Items must be both hashable and comparable

        y : array-like of shape (n_samples,)
            Targets on which to split D
            Must contain only two disctinct values, i.e len(np.unique(y)) == 2
        """

        labels = np.unique(y)
        assert len(labels) == 2
        assert isinstance(D, pd.Series)  # TODO : accept tabular data

        D1, D2 = D[y == labels[0]], D[y == labels[1]]

        # TODO : replace LCMMax by some more efficient method
        right_border_d1 = LCMMax(min_supp=self.min_supp).fit_transform(D1)
        right_border_d2 = LCMMax(min_supp=self.min_growth_rate * self.min_supp).fit_transform(D2)

        right_border_d1 = right_border_d1.itemset.map(set).tolist()
        right_border_d2 = right_border_d2.itemset.map(set).tolist()

        self.borders_ = mbdllborder(right_border_d1, right_border_d2)

        return self

[docs]    def discover(self, min_size=3):
        """
        Enumerate emerging patterns from borders
        Subsets are drawn from the right borders, and are accepted iff
        they do not belong to the corresponding left border.

        This implementation is parallel, we consider every couple
        of right/left border in a separate worker and run the computation

        Parameters
        ----------
        min_size: int
            minimum size for an itemset to be valid
        """
        btp = delayed(partial(borders_to_patterns, min_size=min_size))
        series = Parallel(n_jobs=self.n_jobs, prefer="processes")(
            btp(L, R) for L, R in self.borders_
        )
        return pd.concat(series) if series else pd.Series(dtype="object")