Source code for skmine.datasets.fimi

"""
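Base IO for the FIMI and LUCS-KDD datasets
FIMI datasets are available at `http://fimi.uantwerpen.be/data/`; the discretized LUCS-KDD
datasets at `https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/DataSets/`.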
"""
import gzip
import os

import pandas as pd
import wget

from ._base import get_data_home

# Root URL of the FIMI repository; default ``base_url`` of ``fetch_any``.
BASE_URL_FIMI = "http://fimi.uantwerpen.be/data/"
# Root URL of the LUCS-KDD discretized datasets, passed as ``base_url`` by ``fetch_iris``,
# ``fetch_breast`` and ``fetch_tictactoe``.
BASE_URL_CGI = "https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/DataSets/"


def _read_dat(filepath, int_values=True, separator=' ', zip=False):
    """Read a local dataset file whose separator can be customized and whose values are either integers or strings.

    Parameters
    ----------
    filepath : str
        Indicate the path of the file to be read

    int_values : bool
        Specify if the items in the file are all integers. If not, then the items are considered as strings.

    separator : str
        Specify a separator between items other than the default space

    Returns
    -------
    list
        Return a transaction list composed for each transaction of a list of items
    """
    try:
        if zip:
            with gzip.open(filepath, 'rt') as f:
                lines = f.read().splitlines()
        else:
            with open(filepath, 'r') as f:
                lines = f.read().splitlines()
    except UnicodeDecodeError:
        print(f"The file {filepath} is already present in your data_home but it is a binary file, it must be "
              f"deleted.")
        raise

    transactions = [[int(item) if int_values else item for item in line.rstrip().split(separator)] for line in lines]

    return transactions


def fetch_file(filepath, separator=' ', int_values=False):
    """Load a local file in FIMI format (one transaction per line).

    Parameters
    ----------
    filepath : str
        Path of the file to load

    separator : str, default=' '
        Custom separator between the items of a transaction.

    int_values : bool, default=False
        Specify if the items in the file are all integers. If not, then the items are considered as strings.
        With integers, the algorithms are more efficient.

    Returns
    -------
    pd.Series
        Transactions of the file as an in-memory pandas Series, one list of items per entry.
        The Series is named after the file's base name without its last extension.
    """
    rows = _read_dat(filepath, int_values=int_values, separator=separator)
    stem, _ext = os.path.splitext(os.path.basename(filepath))
    return pd.Series(rows, name=stem)


def _is_gzipped(filepath):
    """Tell whether the file at ``filepath`` starts with the two-byte gzip magic number."""
    with open(filepath, 'rb') as f:
        return f.read(2) == b'\x1f\x8b'


def fetch_any(filename, base_url=BASE_URL_FIMI, data_home=None):
    """Base loader for all datasets from the FIMI and CGI repository
    Each transaction will be represented as a Python list in the resulting pandas Series

    see: http://fimi.uantwerpen.be/data/
    https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/DataSets/dataSets.html

    The file is downloaded into ``data_home`` on first use and read from there afterwards.
    Whether it must be read through ``gzip`` is decided from the file content itself, not from
    the repository it came from.

    Parameters
    ----------
    filename : str
        Name of the file to fetch

    base_url : str
        URL indicating where to fetch the dataset; ``filename`` is appended to it.

    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        When None, the folder returned by ``get_data_home()`` is used.
        The folder is created if it does not exist yet.

    Returns
    -------
    pd.Series
        Transactions from the requested dataset,
        as an in-memory pandas Series named after ``filename`` without its last extension
    """
    data_home = data_home or get_data_home()
    filepath = os.path.join(data_home, filename)
    name, _ = os.path.splitext(filename)

    if not os.path.isfile(filepath):  # not fetched yet
        # A user-supplied data_home may not exist; create it rather than fail on the download.
        os.makedirs(data_home, exist_ok=True)
        wget.download(base_url + filename, filepath)

    it = _read_dat(filepath, zip=_is_gzipped(filepath))
    return pd.Series(it, name=name)


def fetch_chess(data_home=None):
    """Fetch and return the chess dataset (Frequent Itemset Mining)

    ====================   ==============
    Nb of items                        75
    Nb of transactions               3196
    Avg transaction size             37.0
    Density                         0.493
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        Forwarded to ``fetch_any``, which falls back to ``get_data_home()`` when it is None.

    Returns
    -------
    pd.Series
        Transactions from the chess dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    return fetch_any("chess.dat", base_url=BASE_URL_FIMI, data_home=data_home)


def fetch_connect(data_home=None):
    """Fetch and return the connect dataset (Frequent Itemset Mining).

    ====================   ==============
    Nb of items                       129
    Nb of transactions              67557
    Avg transaction size             43.0
    Density                         0.333
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        Forwarded to ``fetch_any``, which falls back to ``get_data_home()`` when it is None.

    Returns
    -------
    pd.Series
        Transactions from the connect dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    return fetch_any("connect.dat", base_url=BASE_URL_FIMI, data_home=data_home)


def fetch_mushroom(data_home=None, return_y=False):
    """Fetch and return the mushroom dataset (Frequent Itemset Mining)

    The Mushroom data set includes descriptions of hypothetical samples corresponding
    to 23 species of gilled mushrooms in the Agaricus and Lepiota Family.

    It contains information about 8124 mushrooms (transactions).
    4208 (51.8%) are edible and 3916 (48.2%) are poisonous.

    The data contains 22 nominal features plus the class attribute (edible or not).
    These features were translated into 117 items.

    ====================   ==============
    Nb of items                       117
    Nb of transactions               8124
    Avg transaction size             22.0
    Density                         0.188
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        Forwarded to ``fetch_any``, which falls back to ``get_data_home()`` when it is None.

    return_y: bool, default=False.
        If True, returns a tuple for both the data and the associated labels
        (0 for edible, 1 for poisonous)

    Returns
    -------
    mush : pd.Series
        Transactions from the mushroom dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.

    (mush, y) : tuple
        if ``return_y`` is True. ``y`` is built from the first item of every transaction and
        ``mush`` then holds the remaining items only.

    Examples
    --------
    >>> from skmine.datasets.fimi import fetch_mushroom
    >>> from skmine.datasets.utils import describe
    >>> X, y = fetch_mushroom(return_y=True)
    >>> describe(X)['n_items']
    117
    >>> y.value_counts()
    0    4208
    1    3916
    Name: mushroom, dtype: int64
    """
    mush = fetch_any("mushroom.dat", base_url=BASE_URL_FIMI, data_home=data_home)
    if not return_y:
        return mush

    # The class item comes first in every transaction: 2 is edible, 1 is poisonous.
    # Recode edible as 0 so the labels are 0/1.
    y = mush.str[0].replace(2, 0)
    X = mush.str[1:]
    return X, y


def fetch_pumsb(data_home=None):
    """Fetch and return the pumsb dataset (Frequent Itemset Mining)

    The Pumsb dataset contains census data for population and housing.

    ====================   ==============
    Nb of items                      2113
    Nb of transactions              49046
    Avg transaction size             74.0
    Density                         0.035
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        Forwarded to ``fetch_any``, which falls back to ``get_data_home()`` when it is None.

    Returns
    -------
    pd.Series
        Transactions from the pumsb dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    return fetch_any("pumsb.dat", base_url=BASE_URL_FIMI, data_home=data_home)


def fetch_pumsb_star(data_home=None):
    """Fetch and return the pumsb_star dataset (Frequent Itemset Mining)

    ====================   ==============
    Nb of items                      2088
    Nb of transactions              49046
    Avg transaction size            50.48
    Density                         0.024
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        Forwarded to ``fetch_any``, which falls back to ``get_data_home()`` when it is None.

    Returns
    -------
    pd.Series
        Transactions from the pumsb_star dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    return fetch_any("pumsb_star.dat", base_url=BASE_URL_FIMI, data_home=data_home)


def fetch_kosarak(data_home=None):
    """Fetch and return the kosarak dataset (Frequent Itemset Mining)

    Click-stream data from a Hungarian on-line news portal.

    ====================   ==============
    Nb of items                     36855
    Nb of transactions             990002
    Avg transaction size              8.1
    Density                      0.000220
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        Forwarded to ``fetch_any``, which falls back to ``get_data_home()`` when it is None.

    Returns
    -------
    pd.Series
        Transactions from the kosarak dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    return fetch_any("kosarak.dat", base_url=BASE_URL_FIMI, data_home=data_home)


def fetch_retail(data_home=None):
    """Fetch and return the retail dataset (Frequent Itemset Mining)

    Contains market basket data from a Belgian retail store, anonymized.

    see: http://fimi.uantwerpen.be/data/retail.pdf

    ====================   ==============
    Nb of items                     16470
    Nb of transactions              88162
    Avg transaction size             10.3
    Density                      0.000626
    ====================   ==============

    Retail market basket data set supplied by an anonymous Belgian retail supermarket store.

    Results in approximately 5 months of data.
    The total amount of receipts being collected equals 88,163.

    In total, 5,133 customers have purchased at least one product during the data collection period

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        Forwarded to ``fetch_any``, which falls back to ``get_data_home()`` when it is None.

    Returns
    -------
    pd.Series
        Transactions from the retail dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    return fetch_any("retail.dat", base_url=BASE_URL_FIMI, data_home=data_home)


def fetch_accidents(data_home=None):
    """Fetch and return the accidents dataset (Frequent Itemset Mining)

    Traffic accident data, anonymized.

    see: http://fimi.uantwerpen.be/data/accidents.pdf

    ====================   ==============
    Nb of items                       468
    Nb of transactions             340183
    Avg transaction size           33.807
    Density                         0.072
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        Forwarded to ``fetch_any``, which falls back to ``get_data_home()`` when it is None.

    Returns
    -------
    pd.Series
        Transactions from the accidents dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.

    """
    return fetch_any("accidents.dat", base_url=BASE_URL_FIMI, data_home=data_home)


def fetch_iris(data_home=None, return_y=False):
    """Download (or load from cache) the discretized iris dataset (Frequent Itemset Mining)

    The iris dataset has been discretized into 19 items.
    The last column (items: 17, 18, 19) holds the targets and can be useful for classification.

    see: https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/exmpleDNnotes.html#iris

    ====================   ==============
    Nb of items                        19
    Nb of transactions                150
    Avg transaction size                5
    Density                         26.32
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.
        By default, all scikit-mine data is stored in `~/scikit_mine_data`.

    return_y : bool, default: False
        When True, the last item of every transaction is split off and returned as labels.

    Returns
    -------
    pd.Series
        Transactions from the iris dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.

    (X, y) : tuple
        if ``return_y`` is True: the transactions without their last item, and the last items.

    """
    transactions = fetch_any("iris.D19.N150.C3.num.gz", base_url=BASE_URL_CGI, data_home=data_home)
    if not return_y:
        return transactions

    # The target item is stored last in every transaction.
    labels = transactions.str[-1]
    features = transactions.str[:-1]
    return features, labels


def fetch_breast(data_home=None, return_y=False):
    """Download (or load from cache) the discretized breast dataset (Frequent Itemset Mining)

    The breast dataset has been discretized into 20 items.
    The last column (items: 19, 20) holds the targets and can be useful for classification.

    see: https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/exmpleDNnotes.html#breast

    ====================   ==============
    Nb of items                        20
    Nb of transactions                699
    Avg transaction size             9.98
    Density                            50
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.
        By default, all scikit-mine data is stored in `~/scikit_mine_data`.

    return_y : bool, default: False
        When True, the last item of every transaction is split off and returned as labels.

    Returns
    -------
    pd.Series
        Transactions from the breast dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.

    (X, y) : tuple
        if ``return_y`` is True: the transactions without their last item, and the last items.

    """
    transactions = fetch_any("breast.D20.N699.C2.num", base_url=BASE_URL_CGI, data_home=data_home)
    if not return_y:
        return transactions

    # The target item is stored last in every transaction.
    labels = transactions.str[-1]
    features = transactions.str[:-1]
    return features, labels


def fetch_tictactoe(data_home=None, return_y=False):
    """Fetch and return the discretized tictactoe dataset (Frequent Itemset Mining)

    This dataset corresponds to the tictactoe dataset which has been discretized into 29 items.
    The last column (items: 28, 29) corresponds to the targets and can be useful for classification.

    see: https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/exmpleDNnotes.html#tictactoe

    ====================   ==============
    Nb of items                        29
    Nb of transactions                958
    Avg transaction size               10
    Density                         34.48
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        By default, all scikit-mine data is stored in `~/scikit_mine_data`.

    return_y : bool, default: False
        If True, returns a tuple for both the data and the associated labels.

    Returns
    -------
    pd.Series
        Transactions from the tictactoe dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.

    (X, y) : tuple
        if ``return_y`` is True: the transactions without their last item, and the last items.

    """
    tictactoe = fetch_any("ticTacToe.D29.N958.C2.num", base_url=BASE_URL_CGI, data_home=data_home)
    if not return_y:
        return tictactoe

    # The target item is stored last in every transaction.
    y = tictactoe.str[-1]
    X = tictactoe.str[:-1]
    return X, y