Skip to content

Table 2: Dataset properties

import pandas as pd
import tcbench as tcb

ucdavis-icdm19

# Load ucdavis-icdm19 and attach the number of packets of each row, taken as
# the length of its "pkts_size" entry.
df = tcb.load_parquet(tcb.DATASETS.UCDAVISICDM19).assign(
    packets=lambda flows: flows["pkts_size"].apply(len)
)

# Number of samples per (partition, app), reshaped so that each partition is a
# row and each app is a column.
samples = (
    df.groupby(["partition", "app"])["app"].value_counts().to_frame().reset_index()
)
samples_per_app = samples.pivot(index="partition", columns="app", values="count")

# Per-partition summary: size of the smallest / largest class, their ratio
# (rho) and the number of app categories.
smallest_class = samples_per_app.min(axis=1)
largest_class = samples_per_app.max(axis=1)
df_tmp = samples_per_app.assign(
    count=samples_per_app.sum(axis=1),
    flows_min=smallest_class,
    flows_max=largest_class,
    rho=(largest_class / smallest_class).round(1),
    classes=len(df["app"].cat.categories),
)

# Mean packets per row and total number of rows, both per partition.
mean_pkts = df.groupby("partition")["packets"].mean().round(0).rename("mean_pkts")
flows_all = df.groupby("partition")["partition"].count().rename("flows_all")

# Join everything on the partition index and keep only the table columns,
# in the order they are reported.
table_columns = ["classes", "flows_all", "flows_min", "flows_max", "rho", "mean_pkts"]
df_tmp = pd.concat([df_tmp, mean_pkts, flows_all], axis=1)[table_columns]
display(df_tmp)

stats_ucdavis19 = df_tmp
classes flows_all flows_min flows_max rho mean_pkts
partition
pretraining 5 6439 592 1915 3.2 6653.0
retraining-human-triggered 5 83 15 20 1.3 7666.0
retraining-script-triggered 5 150 30 30 1.0 7131.0

mirage19

The unfiltered version of the dataset has an extra class, which corresponds to "background" traffic

# Summarize mirage19 once per filtering variant instead of repeating the same
# cell for each one. Keys are the row labels of the resulting table; values
# are the extra keyword arguments handed to tcb.load_parquet (an empty dict
# loads the unfiltered dataset).
variants = {
    "unfiltered": {},
    "min_pkts=10": dict(min_pkts=10),
}

records = []
for load_kwargs in variants.values():
    df = tcb.load_parquet(tcb.DATASETS.MIRAGE19, **load_kwargs)

    # number of rows for each app label
    ser = df["app"].value_counts()
    records.append(
        dict(
            classes=len(ser),
            flows_all=ser.sum(),
            flows_min=ser.min(),
            flows_max=ser.max(),
            # imbalance: largest class size over smallest class size
            rho=(ser.max() / ser.min()).round(1),
            mean_pkts=df["packets"].mean().round(0),
        )
    )

# one row per variant, in the order declared above
df_tmp = pd.DataFrame(records, index=list(variants))
display(df_tmp)
stats_mirage19 = df_tmp
classes flows_all flows_min flows_max rho mean_pkts
unfiltered 21 122007 1986 11737 5.9 23.0
min_pkts=10 20 64172 1013 7505 7.4 17.0

mirage22

The unfiltered version of the dataset has an extra class, which corresponds to "background" traffic

# Summarize mirage22 once per filtering variant instead of repeating the same
# cell for each one. Keys are the row labels of the resulting table; values
# are the extra keyword arguments handed to tcb.load_parquet (an empty dict
# loads the unfiltered dataset).
variants = {
    "unfiltered": {},
    "min_pkts=10": dict(min_pkts=10),
    "min_pkts=1000": dict(min_pkts=1000),
}

records = []
for load_kwargs in variants.values():
    df = tcb.load_parquet(tcb.DATASETS.MIRAGE22, **load_kwargs)

    # number of rows for each app label
    ser = df["app"].value_counts()
    records.append(
        dict(
            classes=len(ser),
            flows_all=ser.sum(),
            flows_min=ser.min(),
            flows_max=ser.max(),
            # imbalance: largest class size over smallest class size
            rho=(ser.max() / ser.min()).round(1),
            mean_pkts=df["packets"].mean().round(0),
        )
    )

# one row per variant, in the order declared above
df_tmp = pd.DataFrame(records, index=list(variants))
display(df_tmp)
stats_mirage22 = df_tmp
classes flows_all flows_min flows_max rho mean_pkts
unfiltered 10 59071 2252 18882 8.4 3068.0
min_pkts=10 9 26773 970 4437 4.6 6598.0
min_pkts=1000 9 4569 190 2220 11.7 38321.0

utmobilenet21

# Summarize utmobilenet21 once per filtering variant instead of repeating the
# same cell for each one. Keys are the row labels of the resulting table;
# values are the extra keyword arguments handed to tcb.load_parquet (an empty
# dict loads the unfiltered dataset).
# NOTE: the "minpkts=10" label (no underscore after "min") is kept as-is so
# the displayed table and the exported CSV keep the same row label.
variants = {
    "unfiltered": {},
    "minpkts=10": dict(min_pkts=10),
}

records = []
for load_kwargs in variants.values():
    df = tcb.load_parquet(tcb.DATASETS.UTMOBILENET21, **load_kwargs)

    # number of rows for each app label
    ser = df["app"].value_counts()
    records.append(
        dict(
            classes=len(ser),
            flows_all=ser.sum(),
            flows_min=ser.min(),
            flows_max=ser.max(),
            # imbalance: largest class size over smallest class size
            rho=(ser.max() / ser.min()).round(1),
            mean_pkts=df["packets"].mean().round(0),
        )
    )

# one row per variant, in the order declared above
df_tmp = pd.DataFrame(records, index=list(variants))
display(df_tmp)
stats_utmobilenet21 = df_tmp
classes flows_all flows_min flows_max rho mean_pkts
unfiltered 17 34378 159 5591 35.2 664.0
minpkts=10 14 9460 130 2496 19.2 2366.0

alltogether

# Per-dataset summary tables, in the order they are stacked in the final table.
stats_by_dataset = {
    "ucdavis-icdm19": stats_ucdavis19,
    "mirage19": stats_mirage19,
    "mirage22": stats_mirage22,
    "utmobilenet21": stats_utmobilenet21,
}

# Shorter row labels for the two ucdavis-icdm19 retraining partitions.
short_labels = {
    "retraining-human-triggered": "human",
    "retraining-script-triggered": "script",
}

# Prefix each table's index with the dataset name (two-level index), stack the
# tables vertically, then shorten the long partition labels.
df_tmp = pd.concat(
    [
        stats.assign(dataset=dataset_name).set_index(["dataset", stats.index])
        for dataset_name, stats in stats_by_dataset.items()
    ]
).rename(short_labels, axis=0)
display(df_tmp)
df_tmp.to_csv("table2_datasets_properties.csv")
classes flows_all flows_min flows_max rho mean_pkts
dataset
ucdavis-icdm19 pretraining 5 6439 592 1915 3.2 6653.0
human 5 83 15 20 1.3 7666.0
script 5 150 30 30 1.0 7131.0
mirage19 unfiltered 21 122007 1986 11737 5.9 23.0
min_pkts=10 20 64172 1013 7505 7.4 17.0
mirage22 unfiltered 10 59071 2252 18882 8.4 3068.0
min_pkts=10 9 26773 970 4437 4.6 6598.0
min_pkts=1000 9 4569 190 2220 11.7 38321.0
utmobilenet21 unfiltered 17 34378 159 5591 35.2 664.0
minpkts=10 14 9460 130 2496 19.2 2366.0