"""
Base IO for all periodic datasets
"""
import os
import re
from datetime import datetime, timedelta
import pandas as pd
from skmine.datasets._base import get_data_home
UBIQ_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00369/UbiqLog4UCI.zip"
health_app_url = "https://raw.githubusercontent.com/logpai/loghub/master/HealthApp/HealthApp_2k.log"
canadianTV_url = "https://zenodo.org/record/4671512/files/canadian_tv.txt"
def fetch_file(filepath, separator=',', format=None):
"""Loader for files in periodic format (timestamp,event\n). The first element can be a datetime or an integer and
the second is a string.
This file reader can also work for files with only one value per line (the event).
The indexes then correspond to the line numbers.
Parameters
----------
filepath : str
Path of the file to load
separator : str
        Custom separator between timestamps and events. Defaults to a comma.
        Ignored if the file contains a single column.
format : str
format for datetime, like "%d/%m/%Y %H:%M:%S" for day/month/year hour:min:sec
see https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior for all possibilities
Returns
-------
pd.Series
Logs from the custom dataset, as an in-memory pandas Series.
Events are indexed by timestamps.
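    Examples
    --------
    A minimal usage sketch; my_logs.csv is a hypothetical file with lines such as
    "2020-04-16 05:57:32,wake up":

    >>> logs = fetch_file("my_logs.csv", format="%Y-%m-%d %H:%M:%S")  # doctest: +SKIP
    >>> logs.head()  # doctest: +SKIP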
"""
s = pd.read_csv(filepath, sep=separator, header=None, dtype="string", skipinitialspace=True).squeeze(axis="columns")
    if isinstance(s, pd.DataFrame):
s = pd.Series(s[1].values, index=s[0])
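    # try to parse the index as datetimes; fall back to integer indexes otherwise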
try:
s.index = pd.to_datetime(s.index, format=format)
except ValueError:
s.index = s.index.astype("int64")
s.index.name = "timestamp"
s.name = filepath
return s
def fetch_health_app(data_home=None, filename="health_app.csv"):
"""Fetch and return the health app log dataset
see: https://github.com/logpai/loghub
HealthApp is a mobile application for Android devices.
Logs were collected from an Android smartphone after 10+ days of use.
    Logs have been grouped by type, resulting
    in only 20 distinct events.
============================== ===================================
Number of occurrences 2000
Number of events 20
Average delta per event Timedelta('0 days 00:53:24.984000')
Average nb of points per event 100.0
============================== ===================================
Parameters
----------
filename : str, default: health_app.csv
        Name of the file (relative to the data_home directory) under which the dataset is cached once downloaded.
data_home : optional, default: None
Specify another download and cache folder for the datasets.
By default, all scikit-mine data is stored in `scikit-mine_data`.
Returns
-------
pd.Series
System logs from the health app dataset, as an in-memory pandas Series.
Events are indexed by timestamps.
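    Examples
    --------
    A minimal sketch; the import path is assumed and the first call downloads the file into data_home:

    >>> from skmine.datasets.periodic import fetch_health_app  # doctest: +SKIP
    >>> logs = fetch_health_app()  # doctest: +SKIP
    >>> logs.head()  # doctest: +SKIP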
"""
data_home = data_home or get_data_home()
p = os.path.join(data_home, filename)
kwargs = dict(header=None, index_col=0, dtype="string")
if filename in os.listdir(data_home):
s = pd.read_csv(p, **kwargs).squeeze(axis="columns")
else:
s = pd.read_csv(health_app_url, sep="|", on_bad_lines='skip', usecols=[0, 1], **kwargs).squeeze(axis="columns")
s.to_csv(p, header=False)
s.index.name = "timestamp"
s.index = pd.to_datetime(s.index, format="%Y%m%d-%H:%M:%S:%f")
return s
def fetch_canadian_tv(data_home=None, filename="canadian_tv.txt"):
"""
    Fetch and return Canadian TV logs from August 2020
see: https://zenodo.org/record/4671512
If the dataset has never been downloaded before, it will be downloaded and stored.
The returned dataset contains only TV series programs indexed by their associated timestamps.
Adverts are ignored when loading the dataset.
============================== =======================================
Number of occurrences 2093
Number of events 98
Average delta per event Timedelta('19 days 02:13:36.122448979')
Average nb of points per event 21.35714285714285
============================== =======================================
Parameters
----------
filename : str, default: canadian_tv.txt
        Name of the file (relative to the data_home directory) under which the dataset is cached once downloaded.
data_home : optional, default: None
Specify another download and cache folder for the datasets.
By default, all scikit-mine data is stored in `scikit-mine_data`.
Returns
-------
pd.Series
        TV series events from Canadian TV, as an in-memory pandas Series.
Events are indexed by timestamps.
Notes
-----
    The dataset is downloaded as a plain-text file from zenodo.org and cached
    in data_home for subsequent calls.
See Also
    --------
skmine.datasets.get_data_home
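    Examples
    --------
    A minimal sketch; the import path is assumed and the first call fetches the file from zenodo.org:

    >>> from skmine.datasets.periodic import fetch_canadian_tv  # doctest: +SKIP
    >>> tv_logs = fetch_canadian_tv()  # doctest: +SKIP
    >>> tv_logs.head()  # doctest: +SKIP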
"""
data_home = data_home or get_data_home()
p = os.path.join(data_home, filename)
kwargs = dict(header=None, dtype="string", index_col=0)
if filename not in os.listdir(data_home):
s = pd.read_csv(canadianTV_url, **kwargs).squeeze(axis="columns")
s.to_csv(p, index=True, header=False)
else:
s = pd.read_csv(p, **kwargs).squeeze(axis="columns")
s.index = pd.to_datetime(s.index)
s.index.name = "timestamp"
s.name = "canadian_tv"
return s
def fetch_ubiq(user_filename="25_F_ISE_data.dat", data_home=None):  # pragma: no cover
"""
    Fetch and return smartphone lifelogging events from different users
    see: https://archive.ics.uci.edu/ml/datasets/UbiqLog+%28smartphone+lifelogging%29
If the dataset has never been downloaded before, it will be downloaded and stored.
Parameters
----------
    user_filename : str, default: 25_F_ISE_data.dat
        Name of the user file to load; the prefix (25_F) identifies the user.
        IS files drop the timestamps and replace them with 1, 2, 3, 4, ...,
        while ISE files keep real timestamps, with events annotated as
        Instantaneous _I, Start _S or End _E (like file 2_F_ISE_data.dat).
data_home : optional, default: None
Specify another download and cache folder for the datasets.
By default, all scikit-mine data is stored in `scikit-mine_data`.
Returns
-------
pd.Series
        Smartphone lifelogging events for the specified user.
Events are indexed by timestamps.
Notes
-----
    For now the entire .zip archive (~64 MB) is downloaded and unpacked on disk.
See Also
    --------
skmine.datasets.get_data_home
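    Examples
    --------
    A minimal sketch; the import path is assumed and the first call downloads and parses the full UbiqLog archive:

    >>> from skmine.datasets.periodic import fetch_ubiq  # doctest: +SKIP
    >>> s = fetch_ubiq(user_filename="25_F_ISE_data.dat")  # doctest: +SKIP
    >>> s.head()  # doctest: +SKIP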
"""
data_home = data_home or get_data_home()
ubiq_dir = os.path.join(data_home, 'UbiqLog')
user_ubiq_dir = os.path.join(ubiq_dir, 'users_ubiq')
if not os.path.exists(ubiq_dir):
os.makedirs(ubiq_dir, exist_ok=True)
os.chdir(ubiq_dir)
infile = "all_log_applications_nonbin.txt"
os.system("wget " + UBIQ_url)
os.system("unzip UbiqLog4UCI.zip")
os.system("rm __MACOSX -rf")
os.system('grep "\\"Application\\":" UbiqLog4UCI/*/log_*.txt > ' + infile)
os.makedirs(user_ubiq_dir)
parse_all_user(infile, user_ubiq_dir, min_occ=10)
filename = os.path.join(user_ubiq_dir, user_filename)
if not os.path.exists(filename):
        raise FileNotFoundError("Searching for: " + filename)
s, user, start_time = read_ubiq_user(filename)
s.index.name = "timestamp"
s.name = "Ubiq" + user
print(f"Series loaded from {user_filename} : user {user}, start time {start_time}, nb_event {len(s)}")
typ = "absolute time" if "ISE" in user_filename else "relative time"
print("timestamps are in ", typ)
return s
def read_ubiq_user(filename: str) -> tuple:
""" Read user-event file (csv format with tabulation) and process it to return a pd.Series with event and
timestamps as index
Parameters
----------
    filename : str
        Path of the user file to load, e.g. 1_M_IS_data.dat, where 1_M identifies the user.
        IS files use event ranks as timestamps; ISE files use real timestamps, with events
        annotated as Instantaneous, Start or End.
Returns
-------
    tuple
        (serie, user, start_time): serie is a pd.Series of smartphone lifelogging events for the
        specified user, indexed by timestamps; user is the user identifier (1_M for example);
        start_time is the first timestamp read from the file header.
        For ISE files, time offsets are interpreted as seconds from start_time.
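    Examples
    --------
    A minimal sketch; the path is hypothetical and the file is assumed to have been written by
    parse_all_user (a "### user=... start_time=..." header followed by tab-separated diff_time
    and event columns):

    >>> serie, user, start_time = read_ubiq_user("users_ubiq/25_F_ISE_data.dat")  # doctest: +SKIP
    >>> user, start_time, len(serie)  # doctest: +SKIP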
"""
sep = "\t"
df = pd.read_csv(filename, sep=sep, header=None, dtype="string")
user_info, start_time_str = df.loc[0]
datetime_str = start_time_str.split('=')[1]
user = user_info.split('=')[1]
start_time = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
df.drop(index=df.index[0], axis=0, inplace=True)
df.rename(columns={0: 'diff_time', 1: 'event'}, inplace=True)
df = df.astype({"diff_time": int, "event": str})
if filename.endswith('_IS_data.dat'):
df['time'] = df.index # succession of events , index = 1 2 3 4 5 ....
elif filename.endswith('_ISE_data.dat'):
df['time'] = df['diff_time'].apply(lambda x: start_time + timedelta(seconds=x))
else:
raise ValueError("cant parse such files")
df = df[['time', 'event']]
df.set_index('time', inplace=True)
serie = pd.Series(df['event'], index=df.index).astype('string') # cast from object to string
return serie, user, start_time
def parse_all_user(infile: str, out_dir: str, min_occ=10) -> None:  # pragma: no cover
""" Parse global file with multiple user and construct csv files , one per user
Parameters
----------
infile : str
        global event file for all users, e.g. all_log_applications_nonbin.txt
    out_dir: str
        directory where each user's CSV file is written
    min_occ: int
        minimum number of occurrences: events seen no more than min_occ times for a user are
        dropped, and a user is skipped entirely if no more than min_occ events remain
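    Examples
    --------
    A minimal sketch mirroring how fetch_ubiq calls this helper; both the grep'd input file and
    the output directory are assumed to exist:

    >>> parse_all_user("all_log_applications_nonbin.txt", "users_ubiq", min_occ=10)  # doctest: +SKIP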
"""
    # example of a line to parse:
# UbiqLog4UCI/10_M/log_11-29-2013.txt:{"Application":{"ProcessName":"com.broadcom.bt.app.system",
# "Start":"11-29-2013 08:15:57","End":"11-29-2013 08:18:18"}}
users = {}
users_drop = set()
with open(infile) as fp:
for li, line in enumerate(fp):
# print(line)
line = '/'.join(line.strip().split('/')[1:]) # drop UbiqLog4UCI/
# print(line)
            tmp = re.match(r'(?P<user>[0-9]*_[FM])/(?P<file>log_[0-9\-]+\.txt):.*"ProcessName":"(?P<process>[^"]*)",'
                           r'.*"Start":"(?P<start_time>[^"]*)",.*"End":"(?P<end_time>[^"]*)"', line)
if tmp is not None:
user = tmp.group("user")
d = None
if user not in users_drop:
try:
d = (datetime.strptime(tmp.group("start_time"), '%m-%d-%Y %H:%M:%S'),
datetime.strptime(tmp.group("end_time"), '%m-%d-%Y %H:%M:%S'))
except ValueError:
users_drop.add(user)
d = None
if user not in users_drop and d is not None:
if user not in users:
users[user] = {"ev": [], "counts": {}}
delta = (d[1] - d[0]).total_seconds()
                    if delta < 60:  # lasts less than a minute
evs = [(d[0], "%s_I" % tmp.group("process"))]
else:
evs = [(d[0], "%s_S" % tmp.group("process")), (d[1], "%s_E" % tmp.group("process"))]
for (tt, ev) in evs:
users[user]["ev"].append((tt, ev))
users[user]["counts"][ev] = users[user]["counts"].get(ev, 0) + 1
print("DROP", users_drop)
for user, dt in users.items():
if user not in users_drop:
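            # keep only events occurring more than min_occ times and convert timestamps to minute offsets from the first kept event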
evs_tmp = [d for d in dt["ev"] if dt["counts"].get(d[1], 0) > min_occ]
if len(evs_tmp) > min_occ:
evs_tmp = sorted(evs_tmp)
evs = sorted([(int((d[0] - evs_tmp[0][0]).total_seconds() / 60), d[-1]) for d in evs_tmp])
with open("%s/%s_ISE_data.dat" % (out_dir, user), "w") as fo:
fo.write("### user=%s\tstart_time=%s\n" % (user, evs_tmp[0][0]))
prev = None
for pair in evs:
if pair != prev:
fo.write("%d\t%s\n" % pair)
prev = pair
with open("%s/%s_IS_data.dat" % (out_dir, user), "w") as fo:
fo.write("### user=%s\tstart_time=%s\n" % (user, evs_tmp[0][0]))
prev = None
for tt in evs:
db = tt[-1].split("_")
if db[-1] in ["I", "S"]:
pair = (tt[0], "_".join(db[:-1]))
if pair != prev:
fo.write("%d\t%s\n" % pair)
prev = pair