"""
Generate samples of synthetic datasets.
Mainly for benchmarks and experiments
"""
import numpy as np
import pandas as pd
def make_transactions(
    n_transactions=1000, n_items=100, density=0.5, random_state=None, item_start=0
):
    r"""
    Generate a transactional dataset with predefined properties

    see: https://liris.cnrs.fr/Documents/Liris-3716.pdf

    Transaction sizes are drawn from a binomial distribution
    ``B(2 * density * n_items, 0.5)``, hence centered around ``density * n_items``.
    Individual items are integer values between ``item_start`` (included)
    and ``item_start + n_items`` (excluded). Items are unique inside a transaction.

    Parameters
    ----------
    n_transactions: int, default=1000
        The number of transactions to generate
    n_items: int, default=100
        The number of individual items, i.e the size of the set of symbols
    density: float, default=0.5
        Density of the resulting dataset, strictly between 0 and 1
    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        A ``RandomState`` instance is used as is (and therefore consumed).
    item_start: int, default=0
        Value of the smallest item of the alphabet

    Returns
    -------
    pd.Series: a Series of shape (``n_transactions``,)
        Each entry is an array of integer values

    Raises
    ------
    ValueError
        If ``density`` is not strictly between 0 and 1

    References
    ----------
    .. F. Flouvat, F. De Marchi, JM. Petit
        "A new classification of datasets for frequent itemsets", 2009

    Example
    -------
    >>> from skmine.datasets import make_transactions
    >>> make_transactions(n_transactions=5, n_items=20, density=.25)  # doctest: +SKIP
    0    [0, 6, 18, 10, 1, 12]
    1          [2, 18, 10, 14]
    2                [4, 5, 1]
    3         [10, 11, 16, 19]
    4      [9, 4, 19, 8, 12, 5]
    dtype: object

    Notes
    -----
    With a binary matrix representation of the resulting dataset, we have the following equality

    .. math:: density = { Number\ of\ ones \over Number\ of\ cells }

    This is equivalent to

    .. math:: density = { Average\ transaction\ size \over number\ of\ items }
    """
    if not 0.0 < density < 1.0:
        raise ValueError("density should be a float value between 0 and 1")

    # an existing RandomState cannot be used as a seed, reuse it directly
    if isinstance(random_state, np.random.RandomState):
        generator = random_state
    else:
        generator = np.random.RandomState(random_state)  # pylint: disable= no-member

    if n_transactions == 0:
        # nothing to draw, and ``max`` below is undefined on an empty array
        return pd.Series([], dtype=object)

    avg_transaction_size = density * n_items
    choices = np.arange(start=item_start, stop=item_start + n_items)

    t_sizes = generator.binomial(
        n=int(avg_transaction_size * 2),  # explicit truncation of the float value
        p=0.5,  # centered around avg_transaction_size
        size=n_transactions,
    )

    max_size = t_sizes.max()
    if max_size > n_items:
        # items are sampled without replacement, so a transaction cannot hold
        # more than ``n_items`` items. The lower bound is raised by the same
        # amount, presumably to keep the average size close to the target.
        delta = max_size - n_items
        t_sizes = np.clip(t_sizes, a_min=t_sizes.min() + delta, a_max=n_items)

    D = [generator.choice(choices, size, replace=False) for size in t_sizes]
    return pd.Series(D)
def _split_samples(n_samples, weights):
    """Split ``n_samples`` between classes proportionally to ``weights``."""
    exact = [weight * n_samples for weight in weights]
    counts = [int(value) for value in exact]
    missing = int(n_samples) - sum(counts)
    if missing > 0:
        # truncation lost some samples: give them back to the classes with the
        # largest fractional part first, so zero-weight classes are served last
        order = sorted(range(len(counts)), key=lambda pos: counts[pos] - exact[pos])
        for rank in range(missing):
            counts[order[rank % len(order)]] += 1
    return counts


def make_classification(
    n_samples=100,
    n_items_per_class=100,
    *,  # pylint: disable= too-many-locals
    n_classes=2,
    weights=None,
    class_sep=0.2,
    shuffle=True,
    random_state=None,
    densities=None
):
    """
    Generate a random n-class classification problem

    Acts like sklearn version of make_classification, but produces
    transactional data instead. Transactions are drawn from a ``n_items_per_class``
    number of items, respecting the ``class_sep`` parameter to ensure transactions
    are drawn from different alphabets for different classes.

    A ``class_sep`` value of 0.0 will result in transactions being drawn from the
    same set of symbols.

    Densities can be defined for each class given the ``densities``
    parameter.

    Parameters
    ----------
    n_samples: int, default=100
        The number of samples
    n_items_per_class: int, default=100
        The number of items per class. This is similar to the ``n_features``
        parameters in scikit-learn, but operates at a class level.
    n_classes: int, default=2
        The number of classes (or labels) of the classification problem
    weights: array-like of shape (n_classes,), default=None
        The proportions of samples assigned to each class. If None, then classes are balanced
    class_sep: float, default=0.2
        The fraction of ``n_items_per_class`` by which the alphabet of a class is
        shifted compared to the alphabet of the previous class.
        Setting this to 1.0 will make classification dummy.
    shuffle: boolean, default=True
        Shuffle the samples and the labels
    random_state: int, RandomState instance, default=None
        Determines random number generation for dataset creation and shuffling.
        Pass an int for reproducible output across multiple function calls.
    densities: array-like of shape (n_classes,), default=None
        The density of the transactions of each class. If None, every class
        has a density of 0.5

    Returns
    -------
    D: pd.Series of shape [n_samples, ]
        The generated samples
    y: pd.Series of shape [n_samples]
        Labels associated to D

    Notes
    -----
    Samples of every class keep the index they were generated with, so index
    values are not unique in the returned Series.

    See also
    --------
    make_transactions : which is used internally to generate samples
    """
    # kept as assertions so that callers still get an ``AssertionError``
    assert n_classes > 0, "n_classes should be strictly positive"
    if densities is None:
        densities = [0.5] * n_classes
    if weights is None:
        weights = [1 / n_classes] * n_classes  # balanced by default
    assert len(weights) == len(densities) == n_classes
    assert 0 <= class_sep <= 1.0
    np.testing.assert_almost_equal(np.sum(weights), 1.0, decimal=2)

    # a single generator drives both the per-class seeds and the shuffling
    if isinstance(random_state, np.random.RandomState):
        master = random_state
    else:
        master = np.random.RandomState(random_state)  # pylint: disable= no-member
    max_seed = np.iinfo(np.int32).max

    frames = []
    padding = 0
    for label, n_class_samples in enumerate(_split_samples(n_samples, weights)):
        if n_class_samples > 0:
            transactions = make_transactions(
                n_transactions=n_class_samples,
                n_items=n_items_per_class,
                # one seed per class: reusing the same seed would make every
                # class a shifted copy of the first one
                random_state=int(master.randint(0, max_seed)),
                item_start=padding,
                density=densities[label],
            )
            frame = transactions.to_frame(name="transaction")
            frame["class"] = label
            frames.append(frame)
        # if class_sep == 1.0, then separation is strict
        padding += int(n_items_per_class - (n_items_per_class * (1 - class_sep)))

    if frames:
        df = pd.concat(frames, axis=0)
    else:
        # no sample requested at all
        df = pd.DataFrame(
            {"transaction": pd.Series(dtype=object), "class": pd.Series(dtype=int)}
        )

    if shuffle:
        # seeded, so that ``random_state`` also makes the ordering reproducible
        df = df.sample(frac=1, random_state=master)

    return df["transaction"], df["class"]