Skip to content

Miscellaneous stats across the paper

import json
import pathlib

import numpy as np
import pandas as pd
import xgboost as xgb
from tcbench.modeling import backbone

Section 3

total number of campaigns

def find_artifacts_folder(folder):
    """Recursively collect every directory named "artifacts" under *folder*.

    Returns a list of pathlib.Path objects. If *folder* itself is named
    "artifacts" it is returned alone and the recursion stops there.
    """
    if folder.name == "artifacts":
        return [folder]

    found = []
    for child in folder.iterdir():
        if not child.is_dir():
            continue
        found.extend(find_artifacts_folder(child))
    return found

# "campaigns/mirage19/augmentation-at-loading-no-dropout/minpkts10/
folders = find_artifacts_folder(pathlib.Path("./campaigns/"))
len(folders)
13

total number of runs

sum([len(list(path.iterdir())) for path in folders])
2760

Section 4

Average depth of xgboost models

class Node:
    """A minimal binary-tree node used to reconstruct xgboost tree shapes."""

    def __init__(self, node_id, left=None, right=None):
        # node_id matches the "ID" column of xgboost's trees_to_dataframe()
        self.node_id = node_id
        self.left = left
        self.right = right

    def is_leaf(self):
        """Return True when the node has no children at all."""
        return self.left is self.right is None
def build_graph(tree_data):
    """Rebuild the tree encoded by one xgboost ``trees_to_dataframe`` table.

    *tree_data* is expected to carry "ID", "Yes" (left-child id) and "No"
    (right-child id) columns, with NaN children on leaf rows.  Returns the
    Node built from the first row, i.e. the tree root.
    """
    df_tmp = tree_data.fillna(-1)
    nodes = {node_id: Node(node_id) for node_id in df_tmp["ID"]}
    # sentinel entry: NaN children became -1 above and must resolve to None
    nodes[-1] = None

    for row_idx, node in enumerate(nodes.values()):
        if node is None:  # reached the sentinel appended after the real nodes
            break
        left_id, right_id = df_tmp.iloc[row_idx][["Yes", "No"]]
        node.left = nodes.get(left_id)
        node.right = nodes.get(right_id)

    return next(iter(nodes.values()))
def _graph_max_depth(node, depth=0):
    if node.is_leaf():
        return depth
    return max(
        _graph_max_depth(node.left, depth + 1), _graph_max_depth(node.right, depth + 1)
    )
def _tree_max_depth(df_tree):
    """Max leaf depth of the single xgboost tree described by *df_tree*."""
    return _graph_max_depth(build_graph(df_tree))
def trees_avg_depth(fname):
    """Average max-depth over all boosted trees serialized in *fname*.

    The model is loaded through the tcbench backbone factory, its booster
    is dumped as a dataframe, and the per-tree max depth is averaged.
    """
    model = backbone.xgboost_factory().xgb_model
    model.load_model(fname)
    df_trees = model.get_booster().trees_to_dataframe()
    per_tree_depth = df_trees.groupby("Tree").apply(_tree_max_depth)
    return per_tree_depth.mean()
# average tree depth for the xgboost models trained on time-series input
folder = pathlib.Path(
    "campaigns/ucdavis-icdm19/xgboost/noaugmentation-timeseries/artifacts/"
)

# grand mean across every serialized model (one json per run subfolder)
np.array([trees_avg_depth(fname) for fname in folder.glob("*/*.json")]).mean()
1.6982666666666666
# same computation for the xgboost models trained on flowpic input
folder = pathlib.Path(
    "campaigns/ucdavis-icdm19/xgboost/noaugmentation-flowpic/artifacts/"
)

np.array([trees_avg_depth(fname) for fname in folder.glob("*/*.json")]).mean()
1.3896

Section 4

average experiment duration

# campaign summary of the augmentation-at-loading (with dropout) runs
folder = pathlib.Path(
    "campaigns/ucdavis-icdm19/augmentation-at-loading-with-dropout/campaign_summary/augment-at-loading-with-dropout/"
)
# all test splits are evaluated at the same time
# so it's enough to check one of them
runs_1500 = pd.read_parquet(folder / "runsinfo_flowpic_dim_1500.parquet")
# mean wall-clock duration (presumably seconds — confirm against tcbench)
# of the 1500x1500 flowpic runs
runs_1500[runs_1500["test_split_name"] == "test-script"]["run_duration"].mean()
1512.8632845379057
# same average duration for the 32x32 flowpic runs
runs_32 = pd.read_parquet(folder / "runsinfo_flowpic_dim_32.parquet")
runs_32[runs_32["test_split_name"] == "test-script"]["run_duration"].mean()
55.191846643175396
# same average duration for the 64x64 flowpic runs
runs_64 = pd.read_parquet(folder / "runsinfo_flowpic_dim_64.parquet")
runs_64[runs_64["test_split_name"] == "test-script"]["run_duration"].mean()
70.5957797731672

number of samples when doing a 80/20 train/val split based on all samples available

folder = pathlib.Path(
    "campaigns/ucdavis-icdm19/larger-trainset/augmentation-at-loading"
)
# this is reported in the logs so we can simply check one run
# that does not have any augmentation

runs = pd.read_parquet(
    folder
    / "campaign_summary/augment-at-loading-larger-trainset/runsinfo_flowpic_dim_32.parquet"
)
# pick the hash of one run without augmentation...
run_hash = runs[runs["aug_name"] == "noaug"]["hash"].values[0]
# ...and show the head of its log file, which reports the per-class
# train/val sample counts of the 80/20 split
fname_log = folder / "artifacts" / run_hash / "log.txt"
fname_log.read_text().splitlines()[:32]
['',
'connecting to AIM repo at: /mnt/storage/finamore/imc23-submission/camera-ready/campaigns/ucdavis-icdm19/augment-at-loading_larger-trainset/__staging__/netml05_gpu0',
'created aim run hash=d0af742e1b0846169452b04a',
'artifacts folder at: /mnt/storage/finamore/imc23-submission/camera-ready/campaigns/ucdavis-icdm19/augment-at-loading_larger-trainset/__staging__/netml05_gpu0/artifacts/d0af742e1b0846169452b04a',
'WARNING: the artifact folder is not a subfolder of the AIM repo',
'--- run hparams ---',
'flowpic_dim: 32',
'flowpic_block_duration: 15',
'split_index: -1',
'max_samples_per_class: -1',
'aug_name: noaug',
'patience_steps: 5',
'suppress_val_augmentation: False',
'dataset: ucdavis-icdm19',
'dataset_minpkts: -1',
'seed: 25',
'with_dropout: False',
'campaign_id: augment-at-loading-larger-trainset',
'campaign_exp_idx: 20',
'-------------------',
'loaded: /opt/anaconda/anaconda3/envs/tcbench/lib/python3.10/site-packages/tcbench/libtcdatasets/datasets/ucdavis-icdm19/preprocessed/ucdavis-icdm19.parquet',
'no augmentation',
'no augmentation',
'dataset samples count',
'               train  val',
'app                      ',
'google-search   1532  383',
'google-drive    1307  327',
'google-doc       976  245',
'youtube          861  216',
'google-music     473  119',
'']

comparing SimCLR results between 100 samples and larger training

# SimCLR 100-samples campaign summary: 2-level column header,
# 3-level row index
df_100_samples = pd.read_csv(
    "campaigns/ucdavis-icdm19/simclr-dropout-and-projection/campaign_summary/simclr-dropout-and-projection/summary_flowpic_dim_32.csv",
    header = [0, 1],
    index_col = [0, 1, 2]
)
# mean accuracy per test split; the two .xs() calls pin the remaining
# index levels to 30 and False (presumably projection-layer size and
# dropout flag — verify against the campaign config)
ser_100samples = df_100_samples["acc"].xs(30, level=1, axis=0).xs(False, level=1, axis=0)["mean"]
ser_100samples
test-human     74.690909
test-script    92.184000
Name: mean, dtype: float64
# SimCLR larger-trainset campaign summary: 2-level header, 2-level index
df_largerdataset = pd.read_csv(
    "campaigns/ucdavis-icdm19/larger-trainset/simclr/campaign_summary/simclr-larger-trainset/summary_flowpic_dim_32.csv",
    header = [0, 1],
    index_col = [0, 1]
)
# mean accuracy per test split; drop the second (unused here) index level
ser_largerdataset = df_largerdataset["acc"]["mean"].droplevel(1, axis=0)
ser_largerdataset
test-human     80.454545
test-script    93.900000
Name: mean, dtype: float64
(ser_largerdataset - ser_100samples).round(2)
test-human     5.76
test-script    1.72
Name: mean, dtype: float64

min and max from Table 3

# Table 3 CSVs: 3-level column header, run index in the first column
df_script = pd.read_csv(
    "table3_ucdavis-icdm19_comparing_data_augmentations_functions_test_on_script.csv",
    header=[0, 1, 2],
    index_col=[0],
)

df_human = pd.read_csv(
    "table3_ucdavis-icdm19_comparing_data_augmentations_functions_test_on_human.csv",
    header=[0, 1, 2],
    index_col=[0],
)
# mean accuracy of "ours" at flowpic dim 32, without the aggregate row
ser_script = df_script["ours"]["32"]["mean"].drop("mean_diff", axis=0)
ser_script.name = "script"

ser_human = df_human["ours"]["32"]["mean"].drop("mean_diff", axis=0)
ser_human.name = "human"

# spread (max - min) across augmentations, separately per test split
df_tmp = pd.concat((ser_script, ser_human), axis=1)
df_tmp.max() - df_tmp.min()
script    2.09
human     3.22
dtype: float64

min and max from Table 8

# Table 8 CSV: 2-level column header (dataset, stat)
df_others = pd.read_csv(
    "table8_augmentation-at-loading_on_other_datasets.csv", header=[0, 1], index_col=[0]
)
# keep only the "mean" sub-column of each dataset,
# then report the max - min spread per dataset
df_tmp = df_others.xs("mean", level=1, axis=1)
df_tmp.max() - df_tmp.min()
mirage22 - minpkts10          5.50
mirage22 - minpkts1000       10.08
utmobilenet21 - minpkts10     9.84
mirage19 - minpkts10         13.93
dtype: float64