Source code for skmine.datasets.fimi

"""
Base IO for all FIMI datasets
All datasets are available here : `http://fimi.uantwerpen.be/data/`
"""
import gzip
import os

import pandas as pd
import wget

from ._base import get_data_home

# Root URL of the FIMI repository; used as the default ``base_url`` of ``fetch_any``.
BASE_URL_FIMI = "http://fimi.uantwerpen.be/data/"
# Root URL of the LUCS-KDD discretised datasets; ``fetch_any`` reads files
# requested with this base URL through ``gzip``.
BASE_URL_CGI = "https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/DataSets/"


def _read_dat(filepath, int_values=True, separator=' ', zip=False):
    """Read a local dataset file whose separator can be customized and whose values are either integers or strings.

    Parameters
    ----------
    filepath : str
        Indicate the path of the file to be read

    int_values : bool
        Specify if the items in the file are all integers. If not, then the items are considered as strings.

    separator : str
        Specify a separator between items other than the default space

    Returns
    -------
    list
        Return a transaction list composed for each transaction of a list of items
    """
    try:
        if zip:
            with gzip.open(filepath, 'rt') as f:
                lines = f.read().splitlines()
        else:
            with open(filepath, 'r') as f:
                lines = f.read().splitlines()
    except UnicodeDecodeError:
        print(f"The file {filepath} is already present in your data_home but it is a binary file, it must be "
              f"deleted.")
        raise

    transactions = [[int(item) if int_values else item for item in line.rstrip().split(separator)] for line in lines]

    return transactions


def fetch_file(filepath, separator=' ', int_values=False):
    """Load a local file written in the FIMI format.

    Parameters
    ----------
    filepath : str
        Path of the file to load.

    separator : str, default=' '
        Custom separator between the items of a transaction.

    int_values : bool, default=False
        Specify if the items in the file are all integers. If not, the items
        are considered as strings. With integers, the algorithms are more
        efficient.

    Returns
    -------
    pd.Series
        Transactions of the file, as an in-memory pandas Series named after
        the file's base name without its last extension.
    """
    rows = _read_dat(filepath, int_values=int_values, separator=separator)
    base_name = os.path.basename(filepath)
    series_name, _ = os.path.splitext(base_name)
    return pd.Series(rows, name=series_name)
def fetch_any(filename, base_url=BASE_URL_FIMI, data_home=None):
    """Base loader for all datasets from the FIMI and CGI repositories.

    The file is downloaded into ``data_home`` the first time it is requested
    and read from that local copy afterwards. Each transaction is represented
    as a Python list in the resulting pandas Series.

    see: http://fimi.uantwerpen.be/data/
         https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/DataSets/dataSets.html

    Parameters
    ----------
    filename : str
        Name of the file to fetch.

    base_url : str
        URL indicating where to fetch the dataset. Files requested with
        ``BASE_URL_CGI`` are read as gzip archives.

    data_home : optional, default: None
        Specify another download and cache folder for the datasets.
        When falsy, the folder returned by ``get_data_home()`` is used.

    Returns
    -------
    pd.Series
        Transactions from the requested dataset, as an in-memory pandas
        Series named after ``filename`` without its last extension.
    """
    data_home = data_home or get_data_home()
    filepath = os.path.join(data_home, filename)
    name, _ = os.path.splitext(filename)

    # Test the target path directly rather than scanning os.listdir(data_home):
    # it does not raise when a custom data_home does not exist yet.
    if not os.path.isfile(filepath):  # not fetched yet
        os.makedirs(data_home, exist_ok=True)
        wget.download(base_url + filename, filepath)

    it = _read_dat(filepath, zip=(base_url == BASE_URL_CGI))
    return pd.Series(it, name=name)
def fetch_chess(data_home=None):
    """Fetch and return the chess dataset (Frequent Itemset Mining).

    ====================   ==============
    Nb of items                        75
    Nb of transactions               3196
    Avg transaction size             37.0
    Density                         0.493
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    Returns
    -------
    pd.Series
        Transactions from the chess dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    chess = fetch_any("chess.dat", data_home=data_home, base_url=BASE_URL_FIMI)
    return chess
def fetch_connect(data_home=None):
    """Fetch and return the connect dataset (Frequent Itemset Mining).

    ====================   ==============
    Nb of items                       129
    Nb of transactions              67557
    Avg transaction size             43.0
    Density                         0.333
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    Returns
    -------
    pd.Series
        Transactions from the connect dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    connect = fetch_any("connect.dat", data_home=data_home, base_url=BASE_URL_FIMI)
    return connect
def fetch_mushroom(data_home=None, return_y=False):
    """Fetch and return the mushroom dataset (Frequent Itemset Mining).

    The Mushroom data set includes descriptions of hypothetical samples
    corresponding to 23 species of gilled mushrooms in the Agaricus and
    Lepiota Family. It contains information about 8124 mushrooms
    (transactions). 4208 (51.8%) are edible and 3916 (48.2%) are poisonous.
    The data contains 22 nominal features plus the class attribute (edible
    or not). These features were translated into 117 items.

    ====================   ==============
    Nb of items                       117
    Nb of transactions               8124
    Avg transaction size             22.0
    Density                         0.188
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    return_y : bool, default=False
        If True, returns a tuple for both the data and the associated labels
        (0 for edible, 1 for poisonous).

    Returns
    -------
    mush : pd.Series
        Transactions from the mushroom dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    (mush, y) : tuple
        Returned instead if ``return_y`` is True. The labels are taken from
        the first item of every transaction, which is removed from the data.

    Examples
    --------
    >>> from skmine.datasets.fimi import fetch_mushroom
    >>> from skmine.datasets.utils import describe
    >>> X, y = fetch_mushroom(return_y=True)
    >>> describe(X)['n_items']
    117
    >>> y.value_counts()
    0    4208
    1    3916
    Name: mushroom, dtype: int64
    """
    mush = fetch_any("mushroom.dat", data_home=data_home, base_url=BASE_URL_FIMI)
    if not return_y:
        return mush

    # The first item of a transaction is its class: 2 is edible, 1 is poisonous.
    first_items = mush.str[0]
    labels = first_items.replace(2, 0)
    features = mush.str[1:]
    return features, labels
def fetch_pumsb(data_home=None):
    """Fetch and return the pumsb dataset (Frequent Itemset Mining).

    The Pumsb dataset contains census data for population and housing.

    ====================   ==============
    Nb of items                      2113
    Nb of transactions              49046
    Avg transaction size             74.0
    Density                         0.035
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    Returns
    -------
    pd.Series
        Transactions from the pumsb dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    pumsb = fetch_any("pumsb.dat", data_home=data_home, base_url=BASE_URL_FIMI)
    return pumsb
def fetch_pumsb_star(data_home=None):
    """Fetch and return the pumsb_star dataset (Frequent Itemset Mining).

    ====================   ==============
    Nb of items                      2088
    Nb of transactions              49046
    Avg transaction size            50.48
    Density                         0.024
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    Returns
    -------
    pd.Series
        Transactions from the pumsb_star dataset, as an in-memory pandas
        Series. Each transaction is represented as a Python list.
    """
    pumsb_star = fetch_any("pumsb_star.dat", data_home=data_home, base_url=BASE_URL_FIMI)
    return pumsb_star
def fetch_kosarak(data_home=None):
    """Fetch and return the kosarak dataset (Frequent Itemset Mining).

    Click-stream data from a Hungarian on-line news portal.

    ====================   ==============
    Nb of items                     36855
    Nb of transactions             990002
    Avg transaction size              8.1
    Density                      0.000220
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    Returns
    -------
    pd.Series
        Transactions from the kosarak dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    kosarak = fetch_any("kosarak.dat", data_home=data_home, base_url=BASE_URL_FIMI)
    return kosarak
def fetch_retail(data_home=None):
    """Fetch and return the retail dataset (Frequent Itemset Mining).

    Contains market basket data from a Belgian retail store, anonymized.

    see: http://fimi.uantwerpen.be/data/retail.pdf

    ====================   ==============
    Nb of items                     16470
    Nb of transactions              88162
    Avg transaction size             10.3
    Density                      0.000626
    ====================   ==============

    Retail market basket data set supplied by an anonymous Belgian retail
    supermarket store. Results in approximately 5 months of data. The total
    amount of receipts being collected equals 88,163. In total, 5,133
    customers have purchased at least one product during the data collection
    period.

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    Returns
    -------
    pd.Series
        Transactions from the retail dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    """
    retail = fetch_any("retail.dat", data_home=data_home, base_url=BASE_URL_FIMI)
    return retail
def fetch_accidents(data_home=None):
    """Fetch and return the accidents dataset (Frequent Itemset Mining).

    Traffic accident data, anonymized.

    see: http://fimi.uantwerpen.be/data/accidents.pdf

    ====================   ==============
    Nb of items                       468
    Nb of transactions             340183
    Avg transaction size           33.807
    Density                         0.072
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    Returns
    -------
    pd.Series
        Transactions from the accidents dataset, as an in-memory pandas
        Series. Each transaction is represented as a Python list.
    """
    accidents = fetch_any("accidents.dat", data_home=data_home, base_url=BASE_URL_FIMI)
    return accidents
def fetch_iris(data_home=None, return_y=False):
    """Fetch and return the discretized iris dataset (Frequent Itemset Mining).

    This dataset corresponds to the iris dataset which has been discretized
    into 19 items. The last column (items: 17, 18, 19) corresponds to the
    targets and can be useful for classification.

    see: https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/exmpleDNnotes.html#iris

    ====================   ==============
    Nb of items                        19
    Nb of transactions                150
    Avg transaction size                5
    Density                         26.32
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    return_y : bool, default: False
        If True, returns a tuple for both the data and the associated labels.

    Returns
    -------
    pd.Series
        Transactions from the iris dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    (X, y) : tuple
        Returned instead if ``return_y`` is True: ``y`` holds the last item
        of every transaction and ``X`` the remaining items.
    """
    iris = fetch_any("iris.D19.N150.C3.num.gz", data_home=data_home, base_url=BASE_URL_CGI)
    if not return_y:
        return iris

    targets = iris.str[-1]
    features = iris.str[:-1]
    return features, targets


def fetch_breast(data_home=None, return_y=False):
    """Fetch and return the discretized breast dataset (Frequent Itemset Mining).

    This dataset corresponds to the breast dataset which has been discretized
    into 20 items. The last column (items: 19, 20) corresponds to the targets
    and can be useful for classification.

    see: https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/exmpleDNnotes.html#breast

    ====================   ==============
    Nb of items                        20
    Nb of transactions                699
    Avg transaction size             9.98
    Density                            50
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    return_y : bool, default: False
        If True, returns a tuple for both the data and the associated labels.

    Returns
    -------
    pd.Series
        Transactions from the breast dataset, as an in-memory pandas Series.
        Each transaction is represented as a Python list.
    (X, y) : tuple
        Returned instead if ``return_y`` is True: ``y`` holds the last item
        of every transaction and ``X`` the remaining items.
    """
    breast = fetch_any("breast.D20.N699.C2.num", data_home=data_home, base_url=BASE_URL_CGI)
    if not return_y:
        return breast

    targets = breast.str[-1]
    features = breast.str[:-1]
    return features, targets


def fetch_tictactoe(data_home=None, return_y=False):
    """Fetch and return the discretized tictactoe dataset (Frequent Itemset Mining).

    This dataset corresponds to the tictactoe dataset which has been
    discretized into 29 items. The last column (items: 28, 29) corresponds to
    the targets and can be useful for classification.

    see: https://cgi.csc.liv.ac.uk/~frans/KDD/Software/LUCS-KDD-DN/exmpleDNnotes.html#tictactoe

    ====================   ==============
    Nb of items                        29
    Nb of transactions                958
    Avg transaction size               10
    Density                         34.48
    ====================   ==============

    Parameters
    ----------
    data_home : optional, default: None
        Alternative download and cache folder, forwarded to ``fetch_any``.

    return_y : bool, default: False
        If True, returns a tuple for both the data and the associated labels.

    Returns
    -------
    pd.Series
        Transactions from the tictactoe dataset, as an in-memory pandas
        Series. Each transaction is represented as a Python list.
    (X, y) : tuple
        Returned instead if ``return_y`` is True: ``y`` holds the last item
        of every transaction and ``X`` the remaining items.
    """
    tictactoe = fetch_any("ticTacToe.D29.N958.C2.num", data_home=data_home, base_url=BASE_URL_CGI)
    if not return_y:
        return tictactoe

    targets = tictactoe.str[-1]
    features = tictactoe.str[:-1]
    return features, targets