Table 2: Dataset properties
import pandas as pd
import tcbench as tcb
ucdavis-icdm19
# Load ucdavis-icdm19 and derive each flow's packet count from the length of
# its "pkts_size" entry.
df = tcb.load_parquet(tcb.DATASETS.UCDAVISICDM19)
df = df.assign(packets=df["pkts_size"].apply(len))

# Number of flows per (partition, app), pivoted to one row per partition and
# one column per app. `size()` counts the rows of every group directly, so the
# "count" column name is set explicitly here instead of depending on the name
# pandas gives to a `value_counts()` result.
flows_per_app = (
    df.groupby(["partition", "app"])
    .size()
    .reset_index(name="count")
    .pivot(index="partition", columns="app", values="count")
)

# Per-partition class statistics. Every keyword below is evaluated against
# `flows_per_app` (app columns only) before any new column is attached.
df_tmp = flows_per_app.assign(
    flows_min=flows_per_app.min(axis=1),
    flows_max=flows_per_app.max(axis=1),
    # rho: ratio between the most and the least populated class
    rho=(flows_per_app.max(axis=1) / flows_per_app.min(axis=1)).round(1),
    # NOTE: requires the "app" column to have a categorical dtype
    classes=len(df["app"].cat.categories),
)

# Mean packets per flow and total number of flows, per partition.
mean_pkts = df.groupby("partition")["packets"].mean().round(0).rename("mean_pkts")
flows_all = df.groupby("partition")["partition"].count().rename("flows_all")

# Combine everything and keep only the columns reported in the table.
df_tmp = pd.concat((df_tmp, mean_pkts, flows_all), axis=1)
df_tmp = df_tmp[["classes", "flows_all", "flows_min", "flows_max", "rho", "mean_pkts"]]
display(df_tmp)
stats_ucdavis19 = df_tmp
mirage19
The unfiltered version of the dataset has an extra class, which corresponds to "background"
traffic
# Summarise mirage19 once per filtering configuration. Each entry maps the row
# label shown in the table to the extra keyword arguments passed to
# `tcb.load_parquet`; an empty mapping loads the dataset exactly as
# `tcb.load_parquet(tcb.DATASETS.MIRAGE19)` does (the "unfiltered" row).
mirage19_variants = {
    "unfiltered": {},
    "min_pkts=10": dict(min_pkts=10),
}

# One record per variant; the table is built in a single DataFrame call instead
# of concatenating one-row frames produced by copy-pasted code.
mirage19_rows = []
for load_kwargs in mirage19_variants.values():
    df = tcb.load_parquet(tcb.DATASETS.MIRAGE19, **load_kwargs)
    # number of rows (flows) per app
    ser = df["app"].value_counts()
    mirage19_rows.append(
        dict(
            classes=len(ser),
            flows_all=ser.sum(),
            flows_min=ser.min(),
            flows_max=ser.max(),
            # rho: ratio between the most and the least populated class
            rho=(ser.max() / ser.min()).round(1),
            mean_pkts=df["packets"].mean().round(0),
        )
    )

df_tmp = pd.DataFrame(mirage19_rows, index=list(mirage19_variants))
display(df_tmp)
stats_mirage19 = df_tmp
mirage22
The unfiltered version of the dataset has an extra class, which corresponds to "background"
traffic
# Summarise mirage22 once per filtering configuration. Each entry maps the row
# label shown in the table to the extra keyword arguments passed to
# `tcb.load_parquet`; an empty mapping loads the dataset exactly as
# `tcb.load_parquet(tcb.DATASETS.MIRAGE22)` does (the "unfiltered" row).
mirage22_variants = {
    "unfiltered": {},
    "min_pkts=10": dict(min_pkts=10),
    "min_pkts=1000": dict(min_pkts=1000),
}

# One record per variant; the table is built in a single DataFrame call instead
# of concatenating one-row frames produced by copy-pasted code.
mirage22_rows = []
for load_kwargs in mirage22_variants.values():
    df = tcb.load_parquet(tcb.DATASETS.MIRAGE22, **load_kwargs)
    # number of rows (flows) per app
    ser = df["app"].value_counts()
    mirage22_rows.append(
        dict(
            classes=len(ser),
            flows_all=ser.sum(),
            flows_min=ser.min(),
            flows_max=ser.max(),
            # rho: ratio between the most and the least populated class
            rho=(ser.max() / ser.min()).round(1),
            mean_pkts=df["packets"].mean().round(0),
        )
    )

df_tmp = pd.DataFrame(mirage22_rows, index=list(mirage22_variants))
display(df_tmp)
stats_mirage22 = df_tmp
utmobilenet21
# Summarise utmobilenet21 once per filtering configuration. Each entry maps the
# row label shown in the table to the extra keyword arguments passed to
# `tcb.load_parquet`; an empty mapping loads the dataset exactly as
# `tcb.load_parquet(tcb.DATASETS.UTMOBILENET21)` does (the "unfiltered" row).
# The filtered row is labelled "min_pkts=10" (previously "minpkts=10") so that
# it matches the label used for the same filter on the other datasets.
utmobilenet21_variants = {
    "unfiltered": {},
    "min_pkts=10": dict(min_pkts=10),
}

# One record per variant; the table is built in a single DataFrame call instead
# of concatenating one-row frames produced by copy-pasted code.
utmobilenet21_rows = []
for load_kwargs in utmobilenet21_variants.values():
    df = tcb.load_parquet(tcb.DATASETS.UTMOBILENET21, **load_kwargs)
    # number of rows (flows) per app
    ser = df["app"].value_counts()
    utmobilenet21_rows.append(
        dict(
            classes=len(ser),
            flows_all=ser.sum(),
            flows_min=ser.min(),
            flows_max=ser.max(),
            # rho: ratio between the most and the least populated class
            rho=(ser.max() / ser.min()).round(1),
            mean_pkts=df["packets"].mean().round(0),
        )
    )

df_tmp = pd.DataFrame(utmobilenet21_rows, index=list(utmobilenet21_variants))
display(df_tmp)
stats_utmobilenet21 = df_tmp
alltogether
# Per-dataset summary tables, keyed by the dataset name used in the final table.
stats_by_dataset = {
    "ucdavis-icdm19": stats_ucdavis19,
    "mirage19": stats_mirage19,
    "mirage22": stats_mirage22,
    "utmobilenet21": stats_utmobilenet21,
}

# Tag every table with its dataset name and promote that name to the outer
# index level, keeping the table's original index as the inner level.
labelled_stats = [
    stats.assign(dataset=dataset_name).set_index(["dataset", stats.index])
    for dataset_name, stats in stats_by_dataset.items()
]

# Shorter index labels for the two retraining partitions.
partition_aliases = {
    "retraining-human-triggered": "human",
    "retraining-script-triggered": "script",
}

# Stack the tables, apply the shorter labels, show the result and export it.
df_tmp = pd.concat(labelled_stats).rename(partition_aliases, axis=0)
display(df_tmp)
df_tmp.to_csv("table2_datasets_properties.csv")