{ "cells": [ { "cell_type": "markdown", "id": "305e1928-81a8-4a17-9166-339e25835705", "metadata": {}, "source": [ "# Table 2 : Datasets properties" ] }, { "cell_type": "markdown", "id": "acb408e0-74e5-49d7-9640-24aa83a99018", "metadata": {}, "source": [ "[:simple-jupyter: :material-download:](/papers/imc23/notebooks/table2_datasets_properties.ipynb)" ] }, { "cell_type": "code", "execution_count": 55, "id": "f6a0b7d4-7575-44e6-8777-0d3c4edb5e99", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:52:27.576339Z", "iopub.status.busy": "2023-09-08T10:52:27.575973Z", "iopub.status.idle": "2023-09-08T10:52:27.580116Z", "shell.execute_reply": "2023-09-08T10:52:27.579360Z", "shell.execute_reply.started": "2023-09-08T10:52:27.576310Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import tcbench as tcb" ] }, { "cell_type": "markdown", "id": "0e4a9e81-2453-421f-bae2-440b788a201a", "metadata": {}, "source": [ "## ucdavis-icdm19" ] }, { "cell_type": "code", "execution_count": 56, "id": "5150b4c8-c0db-46a2-81ae-7b5622d68009", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:52:28.164815Z", "iopub.status.busy": "2023-09-08T10:52:28.164480Z", "iopub.status.idle": "2023-09-08T10:52:32.027431Z", "shell.execute_reply": "2023-09-08T10:52:32.026564Z", "shell.execute_reply.started": "2023-09-08T10:52:28.164788Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classesflows_allflows_minflows_maxrhomean_pkts
partition
pretraining5643959219153.26653.0
retraining-human-triggered58315201.37666.0
retraining-script-triggered515030301.07131.0
\n", "
" ], "text/plain": [ " classes flows_all flows_min flows_max rho \\\n", "partition \n", "pretraining 5 6439 592 1915 3.2 \n", "retraining-human-triggered 5 83 15 20 1.3 \n", "retraining-script-triggered 5 150 30 30 1.0 \n", "\n", " mean_pkts \n", "partition \n", "pretraining 6653.0 \n", "retraining-human-triggered 7666.0 \n", "retraining-script-triggered 7131.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df = tcb.load_parquet(tcb.DATASETS.UCDAVISICDM19)\n", "\n", "# add number of packets\n", "df = df.assign(packets=df[\"pkts_size\"].apply(len))\n", "\n", "# number of samples\n", "df_tmp = pd.DataFrame(\n", " df.groupby([\"partition\", \"app\"])[\"app\"].value_counts()\n", ").reset_index()\n", "df_tmp = df_tmp.pivot(index=\"partition\", columns=\"app\", values=\"count\")\n", "df_tmp = df_tmp.assign(\n", " count=df_tmp.sum(axis=1),\n", " flows_min=df_tmp.min(axis=1),\n", " flows_max=df_tmp.max(axis=1),\n", " rho=(df_tmp.max(axis=1) / df_tmp.min(axis=1)).round(1),\n", " classes=len(df[\"app\"].cat.categories),\n", ")\n", "\n", "# mean pkts per flow\n", "mean_pkts = df.groupby(\"partition\")[\"packets\"].mean().round(0)\n", "mean_pkts.name = \"mean_pkts\"\n", "flows_all = df.groupby(\"partition\")[\"partition\"].count()\n", "flows_all.name = \"flows_all\"\n", "\n", "# combining everything together\n", "df_tmp = pd.concat((df_tmp, mean_pkts, flows_all), axis=1)\n", "df_tmp = df_tmp[[\"classes\", \"flows_all\", \"flows_min\", \"flows_max\", \"rho\", \"mean_pkts\"]]\n", "display(df_tmp)\n", "\n", "stats_ucdavis19 = df_tmp" ] }, { "cell_type": "markdown", "id": "ed063f85-a3be-42f1-bde2-7a360b6e40a0", "metadata": {}, "source": [ "## mirage19" ] }, { "cell_type": "markdown", "id": "586e176e-5e19-44c7-9038-a09a825512b6", "metadata": {}, "source": [ "The unfiltered version of the dataset has an extra class, which corresponds to `\"background\"` traffic" ] }, { "cell_type": "code", "execution_count": 57, "id": "f84d72c1-9d8d-46cc-97c5-82d7d9ee1e45", 
"metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:52:32.029597Z", "iopub.status.busy": "2023-09-08T10:52:32.029209Z", "iopub.status.idle": "2023-09-08T10:52:59.954118Z", "shell.execute_reply": "2023-09-08T10:52:59.952713Z", "shell.execute_reply.started": "2023-09-08T10:52:32.029564Z" } }, "outputs": [], "source": [ "# unfiltered\n", "df = tcb.load_parquet(tcb.DATASETS.MIRAGE19)\n", "\n", "ser = df[\"app\"].value_counts()\n", "df_unfiltered = pd.DataFrame(\n", " [\n", " dict(\n", " classes=len(ser),\n", " flows_all=ser.sum(),\n", " flows_min=ser.min(),\n", " flows_max=ser.max(),\n", " rho=(ser.max() / ser.min()).round(1),\n", " mean_pkts=df[\"packets\"].mean().round(0),\n", " )\n", " ],\n", " index=[\"unfiltered\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 58, "id": "2ec589ec-1c24-43ff-bc82-d5515d38b7e3", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:52:59.956364Z", "iopub.status.busy": "2023-09-08T10:52:59.955911Z", "iopub.status.idle": "2023-09-08T10:53:14.587000Z", "shell.execute_reply": "2023-09-08T10:53:14.586068Z", "shell.execute_reply.started": "2023-09-08T10:52:59.956325Z" } }, "outputs": [], "source": [ "# min_pkts = 10\n", "df = tcb.load_parquet(tcb.DATASETS.MIRAGE19, min_pkts=10)\n", "\n", "ser = df[\"app\"].value_counts()\n", "df_minpkts10 = pd.DataFrame(\n", " [\n", " dict(\n", " classes=len(ser),\n", " flows_all=ser.sum(),\n", " flows_min=ser.min(),\n", " flows_max=ser.max(),\n", " rho=(ser.max() / ser.min()).round(1),\n", " mean_pkts=df[\"packets\"].mean().round(0),\n", " )\n", " ],\n", " index=[\"min_pkts=10\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 59, "id": "dfa955c6-f808-467d-94ce-8ac8f855d999", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:53:14.590118Z", "iopub.status.busy": "2023-09-08T10:53:14.589708Z", "iopub.status.idle": "2023-09-08T10:53:14.603879Z", "shell.execute_reply": "2023-09-08T10:53:14.603101Z", "shell.execute_reply.started": 
"2023-09-08T10:53:14.590085Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classesflows_allflows_minflows_maxrhomean_pkts
unfiltered211220071986117375.923.0
min_pkts=102064172101375057.417.0
\n", "
" ], "text/plain": [ " classes flows_all flows_min flows_max rho mean_pkts\n", "unfiltered 21 122007 1986 11737 5.9 23.0\n", "min_pkts=10 20 64172 1013 7505 7.4 17.0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_tmp = pd.concat((df_unfiltered, df_minpkts10), axis=0)\n", "display(df_tmp)\n", "stats_mirage19 = df_tmp" ] }, { "cell_type": "markdown", "id": "0d91627a-3eaf-4f86-9cb8-37ec1f28e35d", "metadata": {}, "source": [ "## mirage22" ] }, { "cell_type": "markdown", "id": "93eff8ad-39df-40f1-b9e2-009902ce6f5b", "metadata": {}, "source": [ "The unfiltered version of the dataset has an extra class, which corresponds to `\"background\"` traffic" ] }, { "cell_type": "code", "execution_count": 60, "id": "ddc67fea-a5e5-4e8f-a35e-7bf2810728a1", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:53:14.605364Z", "iopub.status.busy": "2023-09-08T10:53:14.604988Z", "iopub.status.idle": "2023-09-08T10:53:50.787877Z", "shell.execute_reply": "2023-09-08T10:53:50.786963Z", "shell.execute_reply.started": "2023-09-08T10:53:14.605332Z" } }, "outputs": [], "source": [ "# unfiltered\n", "df = tcb.load_parquet(tcb.DATASETS.MIRAGE22)\n", "\n", "ser = df[\"app\"].value_counts()\n", "df_unfiltered = pd.DataFrame(\n", " [\n", " dict(\n", " classes=len(ser),\n", " flows_all=ser.sum(),\n", " flows_min=ser.min(),\n", " flows_max=ser.max(),\n", " rho=(ser.max() / ser.min()).round(1),\n", " mean_pkts=df[\"packets\"].mean().round(0),\n", " )\n", " ],\n", " index=[\"unfiltered\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 61, "id": "ccb1e098-424e-449a-be52-bc6d5361088b", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:53:50.789592Z", "iopub.status.busy": "2023-09-08T10:53:50.789182Z", "iopub.status.idle": "2023-09-08T10:54:00.911523Z", "shell.execute_reply": "2023-09-08T10:54:00.910618Z", "shell.execute_reply.started": "2023-09-08T10:53:50.789560Z" } }, "outputs": [], "source": [ "# min_pkts = 10\n", "df = 
tcb.load_parquet(tcb.DATASETS.MIRAGE22, min_pkts=10)\n", "\n", "ser = df[\"app\"].value_counts()\n", "df_minpkts10 = pd.DataFrame(\n", " [\n", " dict(\n", " classes=len(ser),\n", " flows_all=ser.sum(),\n", " flows_min=ser.min(),\n", " flows_max=ser.max(),\n", " rho=(ser.max() / ser.min()).round(1),\n", " mean_pkts=df[\"packets\"].mean().round(0),\n", " )\n", " ],\n", " index=[\"min_pkts=10\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 62, "id": "5109cc5e-c900-4f16-be15-fea9151df7c0", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:54:00.913235Z", "iopub.status.busy": "2023-09-08T10:54:00.912829Z", "iopub.status.idle": "2023-09-08T10:54:08.278615Z", "shell.execute_reply": "2023-09-08T10:54:08.277233Z", "shell.execute_reply.started": "2023-09-08T10:54:00.913200Z" } }, "outputs": [], "source": [ "# min_pkts = 1000\n", "df = tcb.load_parquet(tcb.DATASETS.MIRAGE22, min_pkts=1000)\n", "\n", "ser = df[\"app\"].value_counts()\n", "df_minpkts1000 = pd.DataFrame(\n", " [\n", " dict(\n", " classes=len(ser),\n", " flows_all=ser.sum(),\n", " flows_min=ser.min(),\n", " flows_max=ser.max(),\n", " rho=(ser.max() / ser.min()).round(1),\n", " mean_pkts=df[\"packets\"].mean().round(0),\n", " )\n", " ],\n", " index=[\"min_pkts=1000\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 63, "id": "99cc5482-de63-45c2-822b-4a75010174ad", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:54:08.280934Z", "iopub.status.busy": "2023-09-08T10:54:08.280378Z", "iopub.status.idle": "2023-09-08T10:54:08.296172Z", "shell.execute_reply": "2023-09-08T10:54:08.295396Z", "shell.execute_reply.started": "2023-09-08T10:54:08.280893Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classesflows_allflows_minflows_maxrhomean_pkts
unfiltered10590712252188828.43068.0
min_pkts=1092677397044374.66598.0
min_pkts=100094569190222011.738321.0
\n", "
" ], "text/plain": [ " classes flows_all flows_min flows_max rho mean_pkts\n", "unfiltered 10 59071 2252 18882 8.4 3068.0\n", "min_pkts=10 9 26773 970 4437 4.6 6598.0\n", "min_pkts=1000 9 4569 190 2220 11.7 38321.0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_tmp = pd.concat((df_unfiltered, df_minpkts10, df_minpkts1000), axis=0)\n", "display(df_tmp)\n", "stats_mirage22 = df_tmp" ] }, { "cell_type": "markdown", "id": "cd153f2b-1173-44e9-b178-ea2357ca5221", "metadata": {}, "source": [ "## utmobilenet21" ] }, { "cell_type": "code", "execution_count": 64, "id": "c24356c3-5af3-45eb-8d17-77a0ddd7e3da", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:54:08.298048Z", "iopub.status.busy": "2023-09-08T10:54:08.297349Z", "iopub.status.idle": "2023-09-08T10:54:09.717282Z", "shell.execute_reply": "2023-09-08T10:54:09.716359Z", "shell.execute_reply.started": "2023-09-08T10:54:08.298016Z" } }, "outputs": [], "source": [ "# unfiltered\n", "df = tcb.load_parquet(tcb.DATASETS.UTMOBILENET21)\n", "\n", "ser = df[\"app\"].value_counts()\n", "df_unfiltered = pd.DataFrame(\n", " [\n", " dict(\n", " classes=len(ser),\n", " flows_all=ser.sum(),\n", " flows_min=ser.min(),\n", " flows_max=ser.max(),\n", " rho=(ser.max() / ser.min()).round(1),\n", " mean_pkts=df[\"packets\"].mean().round(0),\n", " )\n", " ],\n", " index=[\"unfiltered\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 65, "id": "293e5eae-b66f-4c1b-bb90-551f588c02b2", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:54:09.721136Z", "iopub.status.busy": "2023-09-08T10:54:09.720658Z", "iopub.status.idle": "2023-09-08T10:54:10.966771Z", "shell.execute_reply": "2023-09-08T10:54:10.965872Z", "shell.execute_reply.started": "2023-09-08T10:54:09.721103Z" } }, "outputs": [], "source": [ "# min_pkts = 10\n", "df = tcb.load_parquet(tcb.DATASETS.UTMOBILENET21, min_pkts=10)\n", "\n", "ser = df[\"app\"].value_counts()\n", "df_minpkts10 = pd.DataFrame(\n", " [\n", " 
dict(\n", " classes=len(ser),\n", " flows_all=ser.sum(),\n", " flows_min=ser.min(),\n", " flows_max=ser.max(),\n", " rho=(ser.max() / ser.min()).round(1),\n", " mean_pkts=df[\"packets\"].mean().round(0),\n", " )\n", " ],\n", " index=[\"minpkts=10\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 66, "id": "18ea2b9d-7233-4e00-911e-fcf734cde1fa", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:54:10.968434Z", "iopub.status.busy": "2023-09-08T10:54:10.968032Z", "iopub.status.idle": "2023-09-08T10:54:10.981438Z", "shell.execute_reply": "2023-09-08T10:54:10.980665Z", "shell.execute_reply.started": "2023-09-08T10:54:10.968402Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classesflows_allflows_minflows_maxrhomean_pkts
unfiltered1734378159559135.2664.0
minpkts=10149460130249619.22366.0
\n", "
" ], "text/plain": [ " classes flows_all flows_min flows_max rho mean_pkts\n", "unfiltered 17 34378 159 5591 35.2 664.0\n", "minpkts=10 14 9460 130 2496 19.2 2366.0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_tmp = pd.concat((df_unfiltered, df_minpkts10), axis=0)\n", "display(df_tmp)\n", "stats_utmobilenet21 = df_tmp" ] }, { "cell_type": "markdown", "id": "a3948ee4-15e5-4b4e-b051-bd68dc33bd7c", "metadata": {}, "source": [ "# All together" ] }, { "cell_type": "code", "execution_count": 67, "id": "c39fba26-1ef1-455a-a31b-7ba45e4cf636", "metadata": { "execution": { "iopub.execute_input": "2023-09-08T10:54:10.982937Z", "iopub.status.busy": "2023-09-08T10:54:10.982569Z", "iopub.status.idle": "2023-09-08T10:54:11.017303Z", "shell.execute_reply": "2023-09-08T10:54:11.016525Z", "shell.execute_reply.started": "2023-09-08T10:54:10.982906Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classesflows_allflows_minflows_maxrhomean_pkts
dataset
ucdavis-icdm19pretraining5643959219153.26653.0
human58315201.37666.0
script515030301.07131.0
mirage19unfiltered211220071986117375.923.0
min_pkts=102064172101375057.417.0
mirage22unfiltered10590712252188828.43068.0
min_pkts=1092677397044374.66598.0
min_pkts=100094569190222011.738321.0
utmobilenet21unfiltered1734378159559135.2664.0
minpkts=10149460130249619.22366.0
\n", "
" ], "text/plain": [ " classes flows_all flows_min flows_max rho \\\n", "dataset \n", "ucdavis-icdm19 pretraining 5 6439 592 1915 3.2 \n", " human 5 83 15 20 1.3 \n", " script 5 150 30 30 1.0 \n", "mirage19 unfiltered 21 122007 1986 11737 5.9 \n", " min_pkts=10 20 64172 1013 7505 7.4 \n", "mirage22 unfiltered 10 59071 2252 18882 8.4 \n", " min_pkts=10 9 26773 970 4437 4.6 \n", " min_pkts=1000 9 4569 190 2220 11.7 \n", "utmobilenet21 unfiltered 17 34378 159 5591 35.2 \n", " minpkts=10 14 9460 130 2496 19.2 \n", "\n", " mean_pkts \n", "dataset \n", "ucdavis-icdm19 pretraining 6653.0 \n", " human 7666.0 \n", " script 7131.0 \n", "mirage19 unfiltered 23.0 \n", " min_pkts=10 17.0 \n", "mirage22 unfiltered 3068.0 \n", " min_pkts=10 6598.0 \n", " min_pkts=1000 38321.0 \n", "utmobilenet21 unfiltered 664.0 \n", " minpkts=10 2366.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_tmp = pd.concat(\n", " (\n", " (stats_ucdavis19.assign(dataset=\"ucdavis-icdm19\")).set_index(\n", " [\"dataset\", stats_ucdavis19.index]\n", " ),\n", " (stats_mirage19.assign(dataset=\"mirage19\")).set_index(\n", " [\"dataset\", stats_mirage19.index]\n", " ),\n", " (stats_mirage22.assign(dataset=\"mirage22\")).set_index(\n", " [\"dataset\", stats_mirage22.index]\n", " ),\n", " (stats_utmobilenet21.assign(dataset=\"utmobilenet21\")).set_index(\n", " [\"dataset\", stats_utmobilenet21.index]\n", " ),\n", " )\n", ").rename(\n", " {\n", " \"retraining-human-triggered\": \"human\",\n", " \"retraining-script-triggered\": \"script\",\n", " },\n", " axis=0,\n", ")\n", "display(df_tmp)\n", "df_tmp.to_csv(\"table2_datasets_properties.csv\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" 
} }, "nbformat": 4, "nbformat_minor": 5 }