Source code for skmine.datasets.utils

"""
Utility functions for skmine.datasets
"""

from itertools import chain


def describe(D):
    r"""Give some high level properties on transactions

    ========================    ===============
    Number of items             int
    Number of transactions      int
    Average transaction size    float
    Density                     float in [0, 1]
    ========================    ===============

    Parameters
    ----------
    D: pd.Series
        A transactional dataset, i.e. a Series in which every entry is
        a sized iterable of items

    Returns
    -------
    dict
        A dictionary with the keys ``n_items``, ``avg_transaction_size``,
        ``n_transactions`` and ``density``. ``density`` is NaN when the
        dataset contains no item at all.

    Notes
    -----
    .. math::
        density = { avg\_transaction\_size \over n\_items }

    Example
    -------
    >>> from skmine.datasets.fimi import fetch_connect
    >>> from skmine.datasets.utils import describe
    >>> describe(fetch_connect())  # doctest: +SKIP
    {'n_items': 75, 'avg_transaction_size': 37.0, 'n_transactions': 3196, 'density': 0.4933}
    """
    avg_transaction_size = D.map(len).mean()
    n_transactions = D.shape[0]
    # from_iterable consumes the transactions lazily instead of unpacking
    # the whole dataset into positional arguments
    n_items = len(set(chain.from_iterable(D)))
    # density is undefined without items; return NaN explicitly rather
    # than dividing by zero
    density = avg_transaction_size / n_items if n_items else float("nan")
    return {
        "n_items": n_items,
        "avg_transaction_size": avg_transaction_size,
        "n_transactions": n_transactions,
        "density": density,
    }
def describe_logs(D):
    """Give some high level properties on logs

    ==============================    =====
    Number of events                  int
    Average delta per event           float
    Average nb of points per event    float
    ==============================    =====

    Parameters
    ----------
    D: pd.Series
        A dataset containing logs. Entries are grouped by their value
        (the event), and the index of each entry is used to measure the
        span covered by an event.

    Returns
    -------
    dict
        A dictionary with the keys ``n_events``, ``avg_delta_per_event``
        and ``avg_nb_points_per_event``.

    Example
    -------
    >>> from skmine.datasets.periodic import fetch_health_app
    >>> from skmine.datasets.utils import describe_logs
    >>> describe_logs(fetch_health_app())  # doctest: +SKIP
    {'n_events': 20, 'avg_delta_per_event': Timedelta('0 days 00:53:24.984000'), 'avg_nb_points_per_event': 100.0}
    """
    per_event = D.groupby(D.values)
    # span between the first and last occurrence of every event; computed
    # on its own so the result keeps a proper (e.g. timedelta) dtype
    spans = per_event.apply(lambda occ: occ.index.max() - occ.index.min())
    # number of occurrences recorded for every event
    counts = per_event.size()
    return {
        "n_events": len(per_event),
        "avg_delta_per_event": spans.mean(),
        "avg_nb_points_per_event": counts.mean(),
    }