{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "305e1928-81a8-4a17-9166-339e25835705",
   "metadata": {},
   "source": [
    "# Table 2 : Datasets properties"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "acb408e0-74e5-49d7-9640-24aa83a99018",
   "metadata": {},
   "source": [
    "[:simple-jupyter: :material-download:](/papers/imc23/notebooks/table2_datasets_properties.ipynb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "f6a0b7d4-7575-44e6-8777-0d3c4edb5e99",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:52:27.576339Z",
     "iopub.status.busy": "2023-09-08T10:52:27.575973Z",
     "iopub.status.idle": "2023-09-08T10:52:27.580116Z",
     "shell.execute_reply": "2023-09-08T10:52:27.579360Z",
     "shell.execute_reply.started": "2023-09-08T10:52:27.576310Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import tcbench as tcb"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0e4a9e81-2453-421f-bae2-440b788a201a",
   "metadata": {},
   "source": [
    "## ucdavis-icdm19"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "5150b4c8-c0db-46a2-81ae-7b5622d68009",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:52:28.164815Z",
     "iopub.status.busy": "2023-09-08T10:52:28.164480Z",
     "iopub.status.idle": "2023-09-08T10:52:32.027431Z",
     "shell.execute_reply": "2023-09-08T10:52:32.026564Z",
     "shell.execute_reply.started": "2023-09-08T10:52:28.164788Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classes</th>\n",
       "      <th>flows_all</th>\n",
       "      <th>flows_min</th>\n",
       "      <th>flows_max</th>\n",
       "      <th>rho</th>\n",
       "      <th>mean_pkts</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>partition</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>pretraining</th>\n",
       "      <td>5</td>\n",
       "      <td>6439</td>\n",
       "      <td>592</td>\n",
       "      <td>1915</td>\n",
       "      <td>3.2</td>\n",
       "      <td>6653.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>retraining-human-triggered</th>\n",
       "      <td>5</td>\n",
       "      <td>83</td>\n",
       "      <td>15</td>\n",
       "      <td>20</td>\n",
       "      <td>1.3</td>\n",
       "      <td>7666.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>retraining-script-triggered</th>\n",
       "      <td>5</td>\n",
       "      <td>150</td>\n",
       "      <td>30</td>\n",
       "      <td>30</td>\n",
       "      <td>1.0</td>\n",
       "      <td>7131.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             classes  flows_all  flows_min  flows_max  rho  \\\n",
       "partition                                                                    \n",
       "pretraining                        5       6439        592       1915  3.2   \n",
       "retraining-human-triggered         5         83         15         20  1.3   \n",
       "retraining-script-triggered        5        150         30         30  1.0   \n",
       "\n",
       "                             mean_pkts  \n",
       "partition                               \n",
       "pretraining                     6653.0  \n",
       "retraining-human-triggered      7666.0  \n",
       "retraining-script-triggered     7131.0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df = tcb.load_parquet(tcb.DATASETS.UCDAVISICDM19)\n",
    "\n",
    "# add number of packets\n",
    "df = df.assign(packets=df[\"pkts_size\"].apply(len))\n",
    "\n",
    "# number of samples\n",
    "df_tmp = pd.DataFrame(\n",
    "    df.groupby([\"partition\", \"app\"])[\"app\"].value_counts()\n",
    ").reset_index()\n",
    "df_tmp = df_tmp.pivot(index=\"partition\", columns=\"app\", values=\"count\")\n",
    "df_tmp = df_tmp.assign(\n",
    "    count=df_tmp.sum(axis=1),\n",
    "    flows_min=df_tmp.min(axis=1),\n",
    "    flows_max=df_tmp.max(axis=1),\n",
    "    rho=(df_tmp.max(axis=1) / df_tmp.min(axis=1)).round(1),\n",
    "    classes=len(df[\"app\"].cat.categories),\n",
    ")\n",
    "\n",
    "# mean pkts per flow\n",
    "mean_pkts = df.groupby(\"partition\")[\"packets\"].mean().round(0)\n",
    "mean_pkts.name = \"mean_pkts\"\n",
    "flows_all = df.groupby(\"partition\")[\"partition\"].count()\n",
    "flows_all.name = \"flows_all\"\n",
    "\n",
    "# combining everything together\n",
    "df_tmp = pd.concat((df_tmp, mean_pkts, flows_all), axis=1)\n",
    "df_tmp = df_tmp[[\"classes\", \"flows_all\", \"flows_min\", \"flows_max\", \"rho\", \"mean_pkts\"]]\n",
    "display(df_tmp)\n",
    "\n",
    "stats_ucdavis19 = df_tmp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed063f85-a3be-42f1-bde2-7a360b6e40a0",
   "metadata": {},
   "source": [
    "## mirage19"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "586e176e-5e19-44c7-9038-a09a825512b6",
   "metadata": {},
   "source": [
    "The unfiltered version of the dataset has an extra class, which corresponds to `\"background\"` traffic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "f84d72c1-9d8d-46cc-97c5-82d7d9ee1e45",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:52:32.029597Z",
     "iopub.status.busy": "2023-09-08T10:52:32.029209Z",
     "iopub.status.idle": "2023-09-08T10:52:59.954118Z",
     "shell.execute_reply": "2023-09-08T10:52:59.952713Z",
     "shell.execute_reply.started": "2023-09-08T10:52:32.029564Z"
    }
   },
   "outputs": [],
   "source": [
    "# unfiltered\n",
    "df = tcb.load_parquet(tcb.DATASETS.MIRAGE19)\n",
    "\n",
    "ser = df[\"app\"].value_counts()\n",
    "df_unfiltered = pd.DataFrame(\n",
    "    [\n",
    "        dict(\n",
    "            classes=len(ser),\n",
    "            flows_all=ser.sum(),\n",
    "            flows_min=ser.min(),\n",
    "            flows_max=ser.max(),\n",
    "            rho=(ser.max() / ser.min()).round(1),\n",
    "            mean_pkts=df[\"packets\"].mean().round(0),\n",
    "        )\n",
    "    ],\n",
    "    index=[\"unfiltered\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "2ec589ec-1c24-43ff-bc82-d5515d38b7e3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:52:59.956364Z",
     "iopub.status.busy": "2023-09-08T10:52:59.955911Z",
     "iopub.status.idle": "2023-09-08T10:53:14.587000Z",
     "shell.execute_reply": "2023-09-08T10:53:14.586068Z",
     "shell.execute_reply.started": "2023-09-08T10:52:59.956325Z"
    }
   },
   "outputs": [],
   "source": [
    "# min_pkts = 10\n",
    "df = tcb.load_parquet(tcb.DATASETS.MIRAGE19, min_pkts=10)\n",
    "\n",
    "ser = df[\"app\"].value_counts()\n",
    "df_minpkts10 = pd.DataFrame(\n",
    "    [\n",
    "        dict(\n",
    "            classes=len(ser),\n",
    "            flows_all=ser.sum(),\n",
    "            flows_min=ser.min(),\n",
    "            flows_max=ser.max(),\n",
    "            rho=(ser.max() / ser.min()).round(1),\n",
    "            mean_pkts=df[\"packets\"].mean().round(0),\n",
    "        )\n",
    "    ],\n",
    "    index=[\"min_pkts=10\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "dfa955c6-f808-467d-94ce-8ac8f855d999",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:53:14.590118Z",
     "iopub.status.busy": "2023-09-08T10:53:14.589708Z",
     "iopub.status.idle": "2023-09-08T10:53:14.603879Z",
     "shell.execute_reply": "2023-09-08T10:53:14.603101Z",
     "shell.execute_reply.started": "2023-09-08T10:53:14.590085Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classes</th>\n",
       "      <th>flows_all</th>\n",
       "      <th>flows_min</th>\n",
       "      <th>flows_max</th>\n",
       "      <th>rho</th>\n",
       "      <th>mean_pkts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>unfiltered</th>\n",
       "      <td>21</td>\n",
       "      <td>122007</td>\n",
       "      <td>1986</td>\n",
       "      <td>11737</td>\n",
       "      <td>5.9</td>\n",
       "      <td>23.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_pkts=10</th>\n",
       "      <td>20</td>\n",
       "      <td>64172</td>\n",
       "      <td>1013</td>\n",
       "      <td>7505</td>\n",
       "      <td>7.4</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             classes  flows_all  flows_min  flows_max  rho  mean_pkts\n",
       "unfiltered        21     122007       1986      11737  5.9       23.0\n",
       "min_pkts=10       20      64172       1013       7505  7.4       17.0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_tmp = pd.concat((df_unfiltered, df_minpkts10), axis=0)\n",
    "display(df_tmp)\n",
    "stats_mirage19 = df_tmp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0d91627a-3eaf-4f86-9cb8-37ec1f28e35d",
   "metadata": {},
   "source": [
    "## mirage22"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "93eff8ad-39df-40f1-b9e2-009902ce6f5b",
   "metadata": {},
   "source": [
    "The unfiltered version of the dataset has an extra class, which corresponds to `\"background\"` traffic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "ddc67fea-a5e5-4e8f-a35e-7bf2810728a1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:53:14.605364Z",
     "iopub.status.busy": "2023-09-08T10:53:14.604988Z",
     "iopub.status.idle": "2023-09-08T10:53:50.787877Z",
     "shell.execute_reply": "2023-09-08T10:53:50.786963Z",
     "shell.execute_reply.started": "2023-09-08T10:53:14.605332Z"
    }
   },
   "outputs": [],
   "source": [
    "# unfiltered\n",
    "df = tcb.load_parquet(tcb.DATASETS.MIRAGE22)\n",
    "\n",
    "ser = df[\"app\"].value_counts()\n",
    "df_unfiltered = pd.DataFrame(\n",
    "    [\n",
    "        dict(\n",
    "            classes=len(ser),\n",
    "            flows_all=ser.sum(),\n",
    "            flows_min=ser.min(),\n",
    "            flows_max=ser.max(),\n",
    "            rho=(ser.max() / ser.min()).round(1),\n",
    "            mean_pkts=df[\"packets\"].mean().round(0),\n",
    "        )\n",
    "    ],\n",
    "    index=[\"unfiltered\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "ccb1e098-424e-449a-be52-bc6d5361088b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:53:50.789592Z",
     "iopub.status.busy": "2023-09-08T10:53:50.789182Z",
     "iopub.status.idle": "2023-09-08T10:54:00.911523Z",
     "shell.execute_reply": "2023-09-08T10:54:00.910618Z",
     "shell.execute_reply.started": "2023-09-08T10:53:50.789560Z"
    }
   },
   "outputs": [],
   "source": [
    "# min_pkts = 10\n",
    "df = tcb.load_parquet(tcb.DATASETS.MIRAGE22, min_pkts=10)\n",
    "\n",
    "ser = df[\"app\"].value_counts()\n",
    "df_minpkts10 = pd.DataFrame(\n",
    "    [\n",
    "        dict(\n",
    "            classes=len(ser),\n",
    "            flows_all=ser.sum(),\n",
    "            flows_min=ser.min(),\n",
    "            flows_max=ser.max(),\n",
    "            rho=(ser.max() / ser.min()).round(1),\n",
    "            mean_pkts=df[\"packets\"].mean().round(0),\n",
    "        )\n",
    "    ],\n",
    "    index=[\"min_pkts=10\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "5109cc5e-c900-4f16-be15-fea9151df7c0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:54:00.913235Z",
     "iopub.status.busy": "2023-09-08T10:54:00.912829Z",
     "iopub.status.idle": "2023-09-08T10:54:08.278615Z",
     "shell.execute_reply": "2023-09-08T10:54:08.277233Z",
     "shell.execute_reply.started": "2023-09-08T10:54:00.913200Z"
    }
   },
   "outputs": [],
   "source": [
    "# min_pkts = 1000\n",
    "df = tcb.load_parquet(tcb.DATASETS.MIRAGE22, min_pkts=1000)\n",
    "\n",
    "ser = df[\"app\"].value_counts()\n",
    "df_minpkts1000 = pd.DataFrame(\n",
    "    [\n",
    "        dict(\n",
    "            classes=len(ser),\n",
    "            flows_all=ser.sum(),\n",
    "            flows_min=ser.min(),\n",
    "            flows_max=ser.max(),\n",
    "            rho=(ser.max() / ser.min()).round(1),\n",
    "            mean_pkts=df[\"packets\"].mean().round(0),\n",
    "        )\n",
    "    ],\n",
    "    index=[\"min_pkts=1000\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "99cc5482-de63-45c2-822b-4a75010174ad",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:54:08.280934Z",
     "iopub.status.busy": "2023-09-08T10:54:08.280378Z",
     "iopub.status.idle": "2023-09-08T10:54:08.296172Z",
     "shell.execute_reply": "2023-09-08T10:54:08.295396Z",
     "shell.execute_reply.started": "2023-09-08T10:54:08.280893Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classes</th>\n",
       "      <th>flows_all</th>\n",
       "      <th>flows_min</th>\n",
       "      <th>flows_max</th>\n",
       "      <th>rho</th>\n",
       "      <th>mean_pkts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>unfiltered</th>\n",
       "      <td>10</td>\n",
       "      <td>59071</td>\n",
       "      <td>2252</td>\n",
       "      <td>18882</td>\n",
       "      <td>8.4</td>\n",
       "      <td>3068.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_pkts=10</th>\n",
       "      <td>9</td>\n",
       "      <td>26773</td>\n",
       "      <td>970</td>\n",
       "      <td>4437</td>\n",
       "      <td>4.6</td>\n",
       "      <td>6598.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_pkts=1000</th>\n",
       "      <td>9</td>\n",
       "      <td>4569</td>\n",
       "      <td>190</td>\n",
       "      <td>2220</td>\n",
       "      <td>11.7</td>\n",
       "      <td>38321.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               classes  flows_all  flows_min  flows_max   rho  mean_pkts\n",
       "unfiltered          10      59071       2252      18882   8.4     3068.0\n",
       "min_pkts=10          9      26773        970       4437   4.6     6598.0\n",
       "min_pkts=1000        9       4569        190       2220  11.7    38321.0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_tmp = pd.concat((df_unfiltered, df_minpkts10, df_minpkts1000), axis=0)\n",
    "display(df_tmp)\n",
    "stats_mirage22 = df_tmp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd153f2b-1173-44e9-b178-ea2357ca5221",
   "metadata": {},
   "source": [
    "## utmobilenet21"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "c24356c3-5af3-45eb-8d17-77a0ddd7e3da",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:54:08.298048Z",
     "iopub.status.busy": "2023-09-08T10:54:08.297349Z",
     "iopub.status.idle": "2023-09-08T10:54:09.717282Z",
     "shell.execute_reply": "2023-09-08T10:54:09.716359Z",
     "shell.execute_reply.started": "2023-09-08T10:54:08.298016Z"
    }
   },
   "outputs": [],
   "source": [
    "# unfiltered\n",
    "df = tcb.load_parquet(tcb.DATASETS.UTMOBILENET21)\n",
    "\n",
    "ser = df[\"app\"].value_counts()\n",
    "df_unfiltered = pd.DataFrame(\n",
    "    [\n",
    "        dict(\n",
    "            classes=len(ser),\n",
    "            flows_all=ser.sum(),\n",
    "            flows_min=ser.min(),\n",
    "            flows_max=ser.max(),\n",
    "            rho=(ser.max() / ser.min()).round(1),\n",
    "            mean_pkts=df[\"packets\"].mean().round(0),\n",
    "        )\n",
    "    ],\n",
    "    index=[\"unfiltered\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "293e5eae-b66f-4c1b-bb90-551f588c02b2",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:54:09.721136Z",
     "iopub.status.busy": "2023-09-08T10:54:09.720658Z",
     "iopub.status.idle": "2023-09-08T10:54:10.966771Z",
     "shell.execute_reply": "2023-09-08T10:54:10.965872Z",
     "shell.execute_reply.started": "2023-09-08T10:54:09.721103Z"
    }
   },
   "outputs": [],
   "source": [
    "# unfiltered\n",
    "df = tcb.load_parquet(tcb.DATASETS.UTMOBILENET21, min_pkts=10)\n",
    "\n",
    "ser = df[\"app\"].value_counts()\n",
    "df_minpkts10 = pd.DataFrame(\n",
    "    [\n",
    "        dict(\n",
    "            classes=len(ser),\n",
    "            flows_all=ser.sum(),\n",
    "            flows_min=ser.min(),\n",
    "            flows_max=ser.max(),\n",
    "            rho=(ser.max() / ser.min()).round(1),\n",
    "            mean_pkts=df[\"packets\"].mean().round(0),\n",
    "        )\n",
    "    ],\n",
    "    index=[\"minpkts=10\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "18ea2b9d-7233-4e00-911e-fcf734cde1fa",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:54:10.968434Z",
     "iopub.status.busy": "2023-09-08T10:54:10.968032Z",
     "iopub.status.idle": "2023-09-08T10:54:10.981438Z",
     "shell.execute_reply": "2023-09-08T10:54:10.980665Z",
     "shell.execute_reply.started": "2023-09-08T10:54:10.968402Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classes</th>\n",
       "      <th>flows_all</th>\n",
       "      <th>flows_min</th>\n",
       "      <th>flows_max</th>\n",
       "      <th>rho</th>\n",
       "      <th>mean_pkts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>unfiltered</th>\n",
       "      <td>17</td>\n",
       "      <td>34378</td>\n",
       "      <td>159</td>\n",
       "      <td>5591</td>\n",
       "      <td>35.2</td>\n",
       "      <td>664.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>minpkts=10</th>\n",
       "      <td>14</td>\n",
       "      <td>9460</td>\n",
       "      <td>130</td>\n",
       "      <td>2496</td>\n",
       "      <td>19.2</td>\n",
       "      <td>2366.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            classes  flows_all  flows_min  flows_max   rho  mean_pkts\n",
       "unfiltered       17      34378        159       5591  35.2      664.0\n",
       "minpkts=10       14       9460        130       2496  19.2     2366.0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_tmp = pd.concat((df_unfiltered, df_minpkts10), axis=0)\n",
    "display(df_tmp)\n",
    "stats_utmobilenet21 = df_tmp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3948ee4-15e5-4b4e-b051-bd68dc33bd7c",
   "metadata": {},
   "source": [
    "# alltogether"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "c39fba26-1ef1-455a-a31b-7ba45e4cf636",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T10:54:10.982937Z",
     "iopub.status.busy": "2023-09-08T10:54:10.982569Z",
     "iopub.status.idle": "2023-09-08T10:54:11.017303Z",
     "shell.execute_reply": "2023-09-08T10:54:11.016525Z",
     "shell.execute_reply.started": "2023-09-08T10:54:10.982906Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>classes</th>\n",
       "      <th>flows_all</th>\n",
       "      <th>flows_min</th>\n",
       "      <th>flows_max</th>\n",
       "      <th>rho</th>\n",
       "      <th>mean_pkts</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dataset</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">ucdavis-icdm19</th>\n",
       "      <th>pretraining</th>\n",
       "      <td>5</td>\n",
       "      <td>6439</td>\n",
       "      <td>592</td>\n",
       "      <td>1915</td>\n",
       "      <td>3.2</td>\n",
       "      <td>6653.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>human</th>\n",
       "      <td>5</td>\n",
       "      <td>83</td>\n",
       "      <td>15</td>\n",
       "      <td>20</td>\n",
       "      <td>1.3</td>\n",
       "      <td>7666.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>script</th>\n",
       "      <td>5</td>\n",
       "      <td>150</td>\n",
       "      <td>30</td>\n",
       "      <td>30</td>\n",
       "      <td>1.0</td>\n",
       "      <td>7131.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">mirage19</th>\n",
       "      <th>unfiltered</th>\n",
       "      <td>21</td>\n",
       "      <td>122007</td>\n",
       "      <td>1986</td>\n",
       "      <td>11737</td>\n",
       "      <td>5.9</td>\n",
       "      <td>23.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_pkts=10</th>\n",
       "      <td>20</td>\n",
       "      <td>64172</td>\n",
       "      <td>1013</td>\n",
       "      <td>7505</td>\n",
       "      <td>7.4</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">mirage22</th>\n",
       "      <th>unfiltered</th>\n",
       "      <td>10</td>\n",
       "      <td>59071</td>\n",
       "      <td>2252</td>\n",
       "      <td>18882</td>\n",
       "      <td>8.4</td>\n",
       "      <td>3068.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_pkts=10</th>\n",
       "      <td>9</td>\n",
       "      <td>26773</td>\n",
       "      <td>970</td>\n",
       "      <td>4437</td>\n",
       "      <td>4.6</td>\n",
       "      <td>6598.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_pkts=1000</th>\n",
       "      <td>9</td>\n",
       "      <td>4569</td>\n",
       "      <td>190</td>\n",
       "      <td>2220</td>\n",
       "      <td>11.7</td>\n",
       "      <td>38321.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">utmobilenet21</th>\n",
       "      <th>unfiltered</th>\n",
       "      <td>17</td>\n",
       "      <td>34378</td>\n",
       "      <td>159</td>\n",
       "      <td>5591</td>\n",
       "      <td>35.2</td>\n",
       "      <td>664.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>minpkts=10</th>\n",
       "      <td>14</td>\n",
       "      <td>9460</td>\n",
       "      <td>130</td>\n",
       "      <td>2496</td>\n",
       "      <td>19.2</td>\n",
       "      <td>2366.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              classes  flows_all  flows_min  flows_max   rho  \\\n",
       "dataset                                                                        \n",
       "ucdavis-icdm19 pretraining          5       6439        592       1915   3.2   \n",
       "               human                5         83         15         20   1.3   \n",
       "               script               5        150         30         30   1.0   \n",
       "mirage19       unfiltered          21     122007       1986      11737   5.9   \n",
       "               min_pkts=10         20      64172       1013       7505   7.4   \n",
       "mirage22       unfiltered          10      59071       2252      18882   8.4   \n",
       "               min_pkts=10          9      26773        970       4437   4.6   \n",
       "               min_pkts=1000        9       4569        190       2220  11.7   \n",
       "utmobilenet21  unfiltered          17      34378        159       5591  35.2   \n",
       "               minpkts=10          14       9460        130       2496  19.2   \n",
       "\n",
       "                              mean_pkts  \n",
       "dataset                                  \n",
       "ucdavis-icdm19 pretraining       6653.0  \n",
       "               human             7666.0  \n",
       "               script            7131.0  \n",
       "mirage19       unfiltered          23.0  \n",
       "               min_pkts=10         17.0  \n",
       "mirage22       unfiltered        3068.0  \n",
       "               min_pkts=10       6598.0  \n",
       "               min_pkts=1000    38321.0  \n",
       "utmobilenet21  unfiltered         664.0  \n",
       "               minpkts=10        2366.0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_tmp = pd.concat(\n",
    "    (\n",
    "        (stats_ucdavis19.assign(dataset=\"ucdavis-icdm19\")).set_index(\n",
    "            [\"dataset\", stats_ucdavis19.index]\n",
    "        ),\n",
    "        (stats_mirage19.assign(dataset=\"mirage19\")).set_index(\n",
    "            [\"dataset\", stats_mirage19.index]\n",
    "        ),\n",
    "        (stats_mirage22.assign(dataset=\"mirage22\")).set_index(\n",
    "            [\"dataset\", stats_mirage22.index]\n",
    "        ),\n",
    "        (stats_utmobilenet21.assign(dataset=\"utmobilenet21\")).set_index(\n",
    "            [\"dataset\", stats_utmobilenet21.index]\n",
    "        ),\n",
    "    )\n",
    ").rename(\n",
    "    {\n",
    "        \"retraining-human-triggered\": \"human\",\n",
    "        \"retraining-script-triggered\": \"script\",\n",
    "    },\n",
    "    axis=0,\n",
    ")\n",
    "display(df_tmp)\n",
    "df_tmp.to_csv(\"table2_datasets_properties.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}