Skip to content

Miscellaneous stats across the paper

import json
import pathlib

import numpy as np
import pandas as pd
import xgboost as xgb
from tcbench.modeling import backbone

Section 3

total number of campaigns

def find_artifacts_folder(folder):
    """Recursively collect every directory named "artifacts" under *folder*.

    Returns a list of pathlib.Path objects. If *folder* itself is named
    "artifacts" it is returned alone and the recursion stops there.
    """
    if folder.name == "artifacts":
        return [folder]

    found = []
    for child in folder.iterdir():
        if not child.is_dir():
            continue
        found.extend(find_artifacts_folder(child))
    return found

# "campaigns/mirage19/augmentation-at-loading-no-dropout/minpkts10/
folders = find_artifacts_folder(pathlib.Path("./campaigns/"))
len(folders)
13

total number of runs

sum([len(list(path.iterdir())) for path in folders])
2760

Section 4

Average depth of xgboost models

class Node:
    """A minimal binary-tree node used to reconstruct xgboost tree shapes."""

    def __init__(self, node_id, left=None, right=None):
        # node_id matches the "ID" column of xgboost's trees_to_dataframe()
        self.node_id = node_id
        self.left = left
        self.right = right

    def is_leaf(self):
        """Return True when the node has no children at all."""
        return self.left is self.right is None
def build_graph(tree_data):
    """Rebuild the tree encoded by one xgboost ``trees_to_dataframe`` table.

    *tree_data* is expected to carry "ID", "Yes" (left-child id) and "No"
    (right-child id) columns, with NaN children on leaf rows.  Returns the
    Node built from the first row, i.e. the tree root.
    """
    df_tmp = tree_data.fillna(-1)
    nodes = {node_id: Node(node_id) for node_id in df_tmp["ID"]}
    # sentinel entry: NaN children became -1 above and must resolve to None
    nodes[-1] = None

    for row_idx, node in enumerate(nodes.values()):
        if node is None:  # reached the sentinel appended after the real nodes
            break
        left_id, right_id = df_tmp.iloc[row_idx][["Yes", "No"]]
        node.left = nodes.get(left_id)
        node.right = nodes.get(right_id)

    return next(iter(nodes.values()))
def _graph_max_depth(node, depth=0):
    if node.is_leaf():
        return depth
    return max(
        _graph_max_depth(node.left, depth + 1), _graph_max_depth(node.right, depth + 1)
    )
def _tree_max_depth(df_tree):
    """Max leaf depth of the single xgboost tree described by *df_tree*."""
    return _graph_max_depth(build_graph(df_tree))
def trees_avg_depth(fname):
    """Average max-depth over all boosted trees serialized in *fname*.

    The model is loaded through the tcbench backbone factory, its booster
    is dumped as a dataframe, and the per-tree max depth is averaged.
    """
    model = backbone.xgboost_factory().xgb_model
    model.load_model(fname)
    df_trees = model.get_booster().trees_to_dataframe()
    per_tree_depth = df_trees.groupby("Tree").apply(_tree_max_depth)
    return per_tree_depth.mean()
# average tree depth for the xgboost models trained on time-series input
folder = pathlib.Path(
    "campaigns/ucdavis-icdm19/xgboost/noaugmentation-timeseries/artifacts/"
)

# grand mean across every serialized model (one json per run subfolder)
np.array([trees_avg_depth(fname) for fname in folder.glob("*/*.json")]).mean()
1.6982666666666666
# same computation for the xgboost models trained on flowpic input
folder = pathlib.Path(
    "campaigns/ucdavis-icdm19/xgboost/noaugmentation-flowpic/artifacts/"
)

np.array([trees_avg_depth(fname) for fname in folder.glob("*/*.json")]).mean()
1.3896

Section 4

average experiment duration

# campaign summary of the augmentation-at-loading (with dropout) runs
folder = pathlib.Path(
    "campaigns/ucdavis-icdm19/augmentation-at-loading-with-dropout/campaign_summary/augment-at-loading-with-dropout/"
)
# all test splits are evaluated at the same time
# so it's enough to check one of them
runs_1500 = pd.read_parquet(folder / "runsinfo_flowpic_dim_1500.parquet")
# mean wall-clock duration (presumably seconds — confirm against tcbench)
# of the 1500x1500 flowpic runs
runs_1500[runs_1500["test_split_name"] == "test-script"]["run_duration"].mean()
1512.8632845379057
# same average duration for the 32x32 flowpic runs
runs_32 = pd.read_parquet(folder / "runsinfo_flowpic_dim_32.parquet")
runs_32[runs_32["test_split_name"] == "test-script"]["run_duration"].mean()
55.191846643175396
# same average duration for the 64x64 flowpic runs
runs_64 = pd.read_parquet(folder / "runsinfo_flowpic_dim_64.parquet")
runs_64[runs_64["test_split_name"] == "test-script"]["run_duration"].mean()
70.5957797731672

number of samples when doing a 80/20 train/val split based on all samples available

folder = pathlib.Path(
    "campaigns/ucdavis-icdm19/larger-trainset/augmentation-at-loading"
)
# this is reported in the logs so we can simply check one run
# that does not have any augmentation

runs = pd.read_parquet(
    folder
    / "campaign_summary/augment-at-loading-larger-trainset/runsinfo_flowpic_dim_32.parquet"
)
# pick the hash of one run without augmentation...
run_hash = runs[runs["aug_name"] == "noaug"]["hash"].values[0]
# ...and show the head of its log file, which reports the per-class
# train/val sample counts of the 80/20 split
fname_log = folder / "artifacts" / run_hash / "log.txt"
fname_log.read_text().splitlines()[:32]
['',
'connecting to AIM repo at: /mnt/storage/finamore/imc23-submission/camera-ready/campaigns/ucdavis-icdm19/augment-at-loading_larger-trainset/__staging__/netml05_gpu0',
'created aim run hash=d0af742e1b0846169452b04a',
'artifacts folder at: /mnt/storage/finamore/imc23-submission/camera-ready/campaigns/ucdavis-icdm19/augment-at-loading_larger-trainset/__staging__/netml05_gpu0/artifacts/d0af742e1b0846169452b04a',
'WARNING: the artifact folder is not a subfolder of the AIM repo',
'--- run hparams ---',
'flowpic_dim: 32',
'flowpic_block_duration: 15',
'split_index: -1',
'max_samples_per_class: -1',
'aug_name: noaug',
'patience_steps: 5',
'suppress_val_augmentation: False',
'dataset: ucdavis-icdm19',
'dataset_minpkts: -1',
'seed: 25',
'with_dropout: False',
'campaign_id: augment-at-loading-larger-trainset',
'campaign_exp_idx: 20',
'-------------------',
'loaded: /opt/anaconda/anaconda3/envs/tcbench/lib/python3.10/site-packages/tcbench/libtcdatasets/datasets/ucdavis-icdm19/preprocessed/ucdavis-icdm19.parquet',
'no augmentation',
'no augmentation',
'dataset samples count',
'               train  val',
'app                      ',
'google-search   1532  383',
'google-drive    1307  327',
'google-doc       976  245',
'youtube          861  216',
'google-music     473  119',
'']

comparing SimCLR results between 100 samples and larger training

# SimCLR 100-samples campaign summary: 2-level column header,
# 3-level row index
df_100_samples = pd.read_csv(
    "campaigns/ucdavis-icdm19/simclr-dropout-and-projection/campaign_summary/simclr-dropout-and-projection/summary_flowpic_dim_32.csv",
    header = [0, 1],
    index_col = [0, 1, 2]
)
# mean accuracy per test split; the two .xs() calls pin the remaining
# index levels to 30 and False (presumably projection-layer size and
# dropout flag — verify against the campaign config)
ser_100samples = df_100_samples["acc"].xs(30, level=1, axis=0).xs(False, level=1, axis=0)["mean"]
ser_100samples
test-human     74.690909
test-script    92.184000
Name: mean, dtype: float64
# SimCLR larger-trainset campaign summary: 2-level header, 2-level index
df_largerdataset = pd.read_csv(
    "campaigns/ucdavis-icdm19/larger-trainset/simclr/campaign_summary/simclr-larger-trainset/summary_flowpic_dim_32.csv",
    header = [0, 1],
    index_col = [0, 1]
)
# mean accuracy per test split; drop the second (unused here) index level
ser_largerdataset = df_largerdataset["acc"]["mean"].droplevel(1, axis=0)
ser_largerdataset
test-human     80.454545
test-script    93.900000
Name: mean, dtype: float64
(ser_largerdataset - ser_100samples).round(2)
test-human     5.76
test-script    1.72
Name: mean, dtype: float64

min and max from Table 3

# Table 3 CSVs: 3-level column header, run index in the first column
df_script = pd.read_csv(
    "table3_ucdavis-icdm19_comparing_data_augmentations_functions_test_on_script.csv",
    header=[0, 1, 2],
    index_col=[0],
)

df_human = pd.read_csv(
    "table3_ucdavis-icdm19_comparing_data_augmentations_functions_test_on_human.csv",
    header=[0, 1, 2],
    index_col=[0],
)
# mean accuracy of "ours" at flowpic dim 32, without the aggregate row
ser_script = df_script["ours"]["32"]["mean"].drop("mean_diff", axis=0)
ser_script.name = "script"

ser_human = df_human["ours"]["32"]["mean"].drop("mean_diff", axis=0)
ser_human.name = "human"

# spread (max - min) across augmentations, separately per test split
df_tmp = pd.concat((ser_script, ser_human), axis=1)
df_tmp.max() - df_tmp.min()
script    2.09
human     3.22
dtype: float64

min and max from Table 8

# Table 8 CSV: 2-level column header (dataset, stat)
df_others = pd.read_csv(
    "table8_augmentation-at-loading_on_other_datasets.csv", header=[0, 1], index_col=[0]
)
# keep only the "mean" sub-column of each dataset,
# then report the max - min spread per dataset
df_tmp = df_others.xs("mean", level=1, axis=1)
df_tmp.max() - df_tmp.min()
mirage22 - minpkts10          5.50
mirage22 - minpkts1000       10.08
utmobilenet21 - minpkts10     9.84
mirage19 - minpkts10         13.93
dtype: float64