Table 2: Dataset properties

import pandas as pd
import tcbench as tcb

ucdavis-icdm19

# Load ucdavis-icdm19 and attach the number of packets of each flow, taken as
# len() of the flow's "pkts_size" entry.
df = tcb.load_parquet(tcb.DATASETS.UCDAVISICDM19)
df = df.assign(packets=df["pkts_size"].apply(len))

# Number of flows per (partition, app), reshaped to one row per partition and
# one column per app.
# NOTE: pivot() reads the "count" column, i.e. it relies on value_counts()
# naming its result "count" (pandas >= 2.0 behaviour).
per_app_counts = df.groupby(["partition", "app"])["app"].value_counts()
df_tmp = pd.DataFrame(per_app_counts).reset_index()
df_tmp = df_tmp.pivot(index="partition", columns="app", values="count")

# Class-balance figures per partition. They are all evaluated on the per-app
# columns only, i.e. before assign() appends any of the new columns.
fewest_flows = df_tmp.min(axis=1)
most_flows = df_tmp.max(axis=1)
df_tmp = df_tmp.assign(
    count=df_tmp.sum(axis=1),
    flows_min=fewest_flows,
    flows_max=most_flows,
    # rho = largest class / smallest class, rounded to one decimal
    rho=(most_flows / fewest_flows).round(1),
    classes=len(df["app"].cat.categories),
)

# Mean packets per flow and total number of flows, per partition.
mean_pkts = (
    df.groupby("partition")["packets"].mean().round(0).rename("mean_pkts")
)
flows_all = df.groupby("partition")["partition"].count().rename("flows_all")

# Put everything side by side and keep only the columns reported in the table.
df_tmp = pd.concat([df_tmp, mean_pkts, flows_all], axis=1)
df_tmp = df_tmp[
    ["classes", "flows_all", "flows_min", "flows_max", "rho", "mean_pkts"]
]
display(df_tmp)

stats_ucdavis19 = df_tmp

	classes	flows_all	flows_min	flows_max	rho	mean_pkts
partition
pretraining	5	6439	592	1915	3.2	6653.0
retraining-human-triggered	5	83	15	20	1.3	7666.0
retraining-script-triggered	5	150	30	30	1.0	7131.0

mirage19

The unfiltered version of the dataset has one extra class, which corresponds to "background" traffic (21 classes unfiltered vs. 20 with min_pkts=10 in the table below).

# One summary row per filtering level of mirage19.
# Each entry is (row label, extra keyword arguments for tcb.load_parquet).
variants = [
    ("unfiltered", {}),
    ("min_pkts=10", {"min_pkts": 10}),
]

rows = []
for label, load_kwargs in variants:
    df = tcb.load_parquet(tcb.DATASETS.MIRAGE19, **load_kwargs)

    # number of flows per app
    ser = df["app"].value_counts()
    smallest, largest = ser.min(), ser.max()
    rows.append(
        pd.DataFrame(
            [
                {
                    "classes": len(ser),
                    "flows_all": ser.sum(),
                    "flows_min": smallest,
                    "flows_max": largest,
                    # largest class / smallest class, one decimal
                    "rho": (largest / smallest).round(1),
                    "mean_pkts": df["packets"].mean().round(0),
                }
            ],
            index=[label],
        )
    )

# Keep the per-variant frames available under their usual names.
df_unfiltered, df_minpkts10 = rows

df_tmp = pd.concat(rows, axis=0)
display(df_tmp)
stats_mirage19 = df_tmp

	classes	flows_all	flows_min	flows_max	rho	mean_pkts
unfiltered	21	122007	1986	11737	5.9	23.0
min_pkts=10	20	64172	1013	7505	7.4	17.0

mirage22

The unfiltered version of the dataset has one extra class, which corresponds to "background" traffic (10 classes unfiltered vs. 9 with min_pkts=10 or min_pkts=1000 in the table below).

# One summary row per filtering level of mirage22.
# Each entry is (row label, extra keyword arguments for tcb.load_parquet).
variants = [
    ("unfiltered", {}),
    ("min_pkts=10", {"min_pkts": 10}),
    ("min_pkts=1000", {"min_pkts": 1000}),
]

rows = []
for label, load_kwargs in variants:
    df = tcb.load_parquet(tcb.DATASETS.MIRAGE22, **load_kwargs)

    # number of flows per app
    ser = df["app"].value_counts()
    smallest, largest = ser.min(), ser.max()
    rows.append(
        pd.DataFrame(
            [
                {
                    "classes": len(ser),
                    "flows_all": ser.sum(),
                    "flows_min": smallest,
                    "flows_max": largest,
                    # largest class / smallest class, one decimal
                    "rho": (largest / smallest).round(1),
                    "mean_pkts": df["packets"].mean().round(0),
                }
            ],
            index=[label],
        )
    )

# Keep the per-variant frames available under their usual names.
df_unfiltered, df_minpkts10, df_minpkts1000 = rows

df_tmp = pd.concat(rows, axis=0)
display(df_tmp)
stats_mirage22 = df_tmp

	classes	flows_all	flows_min	flows_max	rho	mean_pkts
unfiltered	10	59071	2252	18882	8.4	3068.0
min_pkts=10	9	26773	970	4437	4.6	6598.0
min_pkts=1000	9	4569	190	2220	11.7	38321.0

utmobilenet21

# One summary row per filtering level of utmobilenet21: first the unfiltered
# dataset, then the dataset loaded with min_pkts=10.
# Each entry is (row label, extra keyword arguments for tcb.load_parquet).
# NOTE(review): the second label is spelled "minpkts=10" (no underscore),
# unlike the "min_pkts=10" label used for the mirage datasets. It is kept
# as-is because it ends up in the displayed table and in the exported CSV.
variants = [
    ("unfiltered", {}),
    ("minpkts=10", {"min_pkts": 10}),
]

rows = []
for label, load_kwargs in variants:
    df = tcb.load_parquet(tcb.DATASETS.UTMOBILENET21, **load_kwargs)

    # number of flows per app
    ser = df["app"].value_counts()
    smallest, largest = ser.min(), ser.max()
    rows.append(
        pd.DataFrame(
            [
                {
                    "classes": len(ser),
                    "flows_all": ser.sum(),
                    "flows_min": smallest,
                    "flows_max": largest,
                    # largest class / smallest class, one decimal
                    "rho": (largest / smallest).round(1),
                    "mean_pkts": df["packets"].mean().round(0),
                }
            ],
            index=[label],
        )
    )

# Keep the per-variant frames available under their usual names.
df_unfiltered, df_minpkts10 = rows

df_tmp = pd.concat(rows, axis=0)
display(df_tmp)
stats_utmobilenet21 = df_tmp

	classes	flows_all	flows_min	flows_max	rho	mean_pkts
unfiltered	17	34378	159	5591	35.2	664.0
minpkts=10	14	9460	130	2496	19.2	2366.0

alltogether

# Per-dataset summary tables, in the order they appear in the final table.
per_dataset = {
    "ucdavis-icdm19": stats_ucdavis19,
    "mirage19": stats_mirage19,
    "mirage22": stats_mirage22,
    "utmobilenet21": stats_utmobilenet21,
}

# Tag each table with its dataset name and promote that name to the outer
# level of a two-level row index (dataset, original row label).
indexed = [
    stats.assign(dataset=name).set_index(["dataset", stats.index])
    for name, stats in per_dataset.items()
]

# Stack the tables and shorten the two ucdavis-icdm19 retraining labels.
short_labels = {
    "retraining-human-triggered": "human",
    "retraining-script-triggered": "script",
}
df_tmp = pd.concat(indexed).rename(index=short_labels)

display(df_tmp)
df_tmp.to_csv("table2_datasets_properties.csv")

		classes	flows_all	flows_min	flows_max	rho	mean_pkts
dataset
ucdavis-icdm19	pretraining	5	6439	592	1915	3.2	6653.0
	human	5	83	15	20	1.3	7666.0
	script	5	150	30	30	1.0	7131.0
mirage19	unfiltered	21	122007	1986	11737	5.9	23.0
mirage19	min_pkts=10	20	64172	1013	7505	7.4	17.0
mirage22	unfiltered	10	59071	2252	18882	8.4	3068.0
	min_pkts=10	9	26773	970	4437	4.6	6598.0
	min_pkts=1000	9	4569	190	2220	11.7	38321.0
utmobilenet21	unfiltered	17	34378	159	5591	35.2	664.0
utmobilenet21	minpkts=10	14	9460	130	2496	19.2	2366.0