"""
MBD-LLBorder
"""
from functools import partial
from itertools import combinations
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from ..base import BaseMiner, DiscovererMixin
from ..itemsets.lcm import LCMMax
from ..utils import _check_growth_rate, _check_min_supp, filter_maximal, filter_minimal
def border_diff(U, S):
"""
Given a pair of borders <{∅}, {U}> and <{∅}, {S}>,
``border_diff`` derives another border <L2, {U}>
such that [L2, {U}] = [{∅}, {U}] - [{∅}, {S}]
Parameters
----------
U : set
non empty part from the border to differentiate
S : list of set
non-empty part of a border.
Noted as ``R1`` in the original paper
References
----------
.. [1]
Dong, Li
Efficient Mining of Emerging Patterns Discovering
Notes
-----
See ``BORDER-DIFF`` in section 4.1
"""
# assert len(R1) < len(U) # assert we iterate on the smallest ensemble
L2 = [{x} for x in U - S[0]]
for i in range(1, len(S)):
_L2 = [X | {x} for x in U - S[i] for X in L2]
L2 = list(filter_minimal(_L2))
return L2, U
def mbdllborder(isets1, isets2):
"""
References
----------
.. [1]
Dong, Li
Efficient Mining of Emerging Patterns Discovering
Notes
-----
main algorithm, as explained in section 4.2
"""
borders = list()
for iset in isets2:
if any((e > iset for e in isets1)):
continue
inter = (iset & e for e in isets1)
R = filter_maximal(inter)
diff = border_diff(iset, R)
borders.append(diff)
return borders
def borders_to_patterns(left, right, min_size=None):
"""
Operates in a bread-first manner, outputting all
valid patterns of a given size for each level.
Bigger patterns first.
Parameters
----------
left: list of set
right: set
min_size: int
only accepts patterns with greater or equal size
Returns
-------
pd.Series
"""
min_size = min_size or min(map(len, left))
patterns = list()
for size in range(len(right) - 1, min_size - 1, -1): # bigger patterns first
combs = combinations(right, size)
for pat in combs:
if any((e.issuperset(set(pat)) for e in left)):
continue
patterns.append(pat)
return pd.Series(patterns)
[docs]class MBDLLBorder(BaseMiner, DiscovererMixin):
"""
MBD-LLBorder aims at discovering patterns characterizing the difference
between two collections of data.
It first discovers ``two sets of maximal itemsets``, one for each collection.
It then looks for borders of these sets, and characterizes the difference
by only manipulating theses borders.
This results in the algorithm only keeping a ``concise description (borders)``
as an internal state.
Last but not least, it enumerates the set of emerging patterns from the
borders.
Parameters
----------
min_growth_rate: int or float, default=2
A pattern is considered as emerging iff its support in the first collection
is at least min_growth_rate times its support in the second collection.
min_supp: int or float, default=.1
Minimum support in each of the collection
Must be a relative support between 0 and 1
Default to 0.1 (10%)
n_jobs : int, default=1
The number of jobs to use for the computation.
Attributes
----------
borders_: list of tuple[list[set], set]
List of pairs representing left and right borders
For every pair, emerging patterns will be uncovered by enumerating itemsets
from the right side, while checking for non membership in the left side
References
----------
.. [1]
Guozhu Dong, Jinyan Li
"Efficient Mining of Emerging Patterns : Discovering Trends and Differences", 1999
"""
def __init__(self, min_growth_rate=2, min_supp=0.1, n_jobs=1):
_check_min_supp(min_supp, accept_absolute=False)
self.min_supp = min_supp
self.min_growth_rate = _check_growth_rate(min_growth_rate)
self.n_jobs = n_jobs
[docs] def fit(self, D, y):
"""
fit MBD-LLBorder on D, splitted on y
This is done in two steps
1. Discover maximal itemsets for the two disinct collections contained in D
2. Mine borders from these maximal itemsets
Parameters
----------
D : pd.Series
The input transactional database
Where every entry contains singular items
Items must be both hashable and comparable
y : array-like of shape (n_samples,)
Targets on which to split D
Must contain only two disctinct values, i.e len(np.unique(y)) == 2
"""
labels = np.unique(y)
assert len(labels) == 2
assert isinstance(D, pd.Series) # TODO : accept tabular data
D1, D2 = D[y == labels[0]], D[y == labels[1]]
# TODO : replace LCMMax by some more efficient method
right_border_d1 = LCMMax(min_supp=self.min_supp).fit_transform(D1)
right_border_d2 = LCMMax(min_supp=self.min_growth_rate * self.min_supp).fit_transform(D2)
right_border_d1 = right_border_d1.itemset.map(set).tolist()
right_border_d2 = right_border_d2.itemset.map(set).tolist()
self.borders_ = mbdllborder(right_border_d1, right_border_d2)
return self
[docs] def discover(self, min_size=3):
"""
Enumerate emerging patterns from borders
Subsets are drawn from the right borders, and are accepted iff
they do not belong to the corresponding left border.
This implementation is parallel, we consider every couple
of right/left border in a separate worker and run the computation
Parameters
----------
min_size: int
minimum size for an itemset to be valid
"""
btp = delayed(partial(borders_to_patterns, min_size=min_size))
series = Parallel(n_jobs=self.n_jobs, prefer="processes")(
btp(L, R) for L, R in self.borders_
)
return pd.concat(series) if series else pd.Series(dtype="object")