CIRA-CIC-DoHBrw-2020 Dataset (Exploratory Data Analysis)


  • Canadian Institute for Cybersecurity (CIC) project funded by Canadian Internet Registration Authority (CIRA)
  • Credit:

    Mohammadreza MontazeriShatoori, Logan Davidson, Gurdip Kaur, and Arash Habibi Lashkari, “Detection of DoH Tunnels using Time-series Classification of Encrypted Traffic”, The 5th IEEE Cyber Science and Technology Congress, Calgary, Canada, August 2020

  • Dataset link

In [110]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
sns.set(font_scale=1.5)

import category_encoders as ce
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

BENIGH_PATH = 'CSVs/'
MALICIOUS_PATH = 'CSVs 2/'
In [35]:
df_chrome = pd.read_csv(BENIGH_PATH + 'Chrome/all.csv')
df_firefox = pd.read_csv(BENIGH_PATH + 'Firefox/all.csv')
df_tcp = pd.read_csv(MALICIOUS_PATH + 'dns2tcp/all.csv')
df_cat = pd.read_csv(MALICIOUS_PATH + 'dnscat2/all.csv')
df_iodine = pd.read_csv(MALICIOUS_PATH + 'iodine/all.csv')

df_b = pd.concat([df_chrome, df_firefox])
df_m = pd.concat([df_tcp, df_cat, df_iodine])
df_b['label'] = 0.
df_m['label'] = 1.
df = pd.concat([df_b, df_m])

print(f'Chrome  === Num rows: {df_chrome.shape[0]}   Num features: {df_chrome.shape[1]}')
print(f'FireFox === Num rows: {df_firefox.shape[0]}   Num features: {df_firefox.shape[1]}')
print(f'dns2tcp === Num rows: {df_tcp.shape[0]}   Num features: {df_tcp.shape[1]}')
print(f'dnscat2 === Num rows: {df_cat.shape[0]}   Num features: {df_cat.shape[1]}')
print(f'iodine  === Num rows: {df_iodine.shape[0]}   Num features: {df_iodine.shape[1]}')

df.describe()
Chrome  === Num rows: 545464   Num features: 35
FireFox === Num rows: 371836   Num features: 35
dns2tcp === Num rows: 167517   Num features: 35
dnscat2 === Num rows: 35854   Num features: 35
iodine  === Num rows: 46598   Num features: 35
Out[35]:
SourcePort DestinationPort Duration FlowBytesSent FlowSentRate FlowBytesReceived FlowReceivedRate PacketLengthVariance PacketLengthStandardDeviation PacketLengthMean ... PacketTimeCoefficientofVariation ResponseTimeTimeVariance ResponseTimeTimeStandardDeviation ResponseTimeTimeMean ResponseTimeTimeMedian ResponseTimeTimeMode ResponseTimeTimeSkewFromMedian ResponseTimeTimeSkewFromMode ResponseTimeTimeCoefficientofVariation label
count 1.167269e+06 1.167269e+06 1.167269e+06 1.167269e+06 1.167269e+06 1.167269e+06 1.167269e+06 1.167269e+06 1.167269e+06 1.167269e+06 ... 1.167269e+06 1.167269e+06 1.167269e+06 1.167269e+06 1.159241e+06 1.167269e+06 1.159241e+06 1.167269e+06 1.167269e+06 1.167269e+06
mean 4.873587e+04 3.326805e+03 2.030161e+01 1.725992e+04 9.787933e+03 5.066542e+04 2.353974e+04 3.447198e+05 3.732243e+02 2.821058e+02 ... 1.012266e+00 1.490005e+00 2.356735e-01 2.478310e-01 2.120437e-01 1.031969e-01 -2.670836e+00 -1.720749e+00 8.181588e-01 2.141486e-01
std 1.463053e+04 1.165381e+04 3.438528e+01 1.002612e+05 2.512222e+05 3.558865e+05 1.845950e+05 1.090804e+06 4.532368e+02 3.018135e+02 ... 5.984030e-01 1.220040e+01 1.203420e+00 1.488746e+00 1.635177e+00 1.200889e+00 4.442455e+00 4.838195e+00 1.548699e+00 4.102306e-01
min 4.430000e+02 4.430000e+02 0.000000e+00 0.000000e+00 -1.000000e+00 0.000000e+00 -1.000000e+00 0.000000e+00 0.000000e+00 5.400000e+01 ... -1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00 2.000000e-06 -1.000000e+00 -1.000000e+01 -1.000000e+01 0.000000e+00 0.000000e+00
25% 4.386200e+04 4.430000e+02 9.125600e-02 1.780000e+02 1.115361e+02 1.400000e+02 2.447933e+02 1.950612e+02 1.396643e+01 6.733333e+01 ... 5.773510e-01 0.000000e+00 0.000000e+00 1.354475e-02 1.515600e-02 2.400000e-05 -1.000000e+01 -1.000000e+01 0.000000e+00 0.000000e+00
50% 5.243000e+04 4.430000e+02 1.363401e+00 1.524000e+03 1.339862e+03 4.129000e+03 2.337359e+03 1.246238e+05 3.530210e+02 2.234000e+02 ... 9.638028e-01 5.649099e-05 7.516049e-03 2.125350e-02 1.994500e-02 3.670000e-04 -1.160292e+00 8.277568e-01 5.762267e-01 0.000000e+00
75% 5.806000e+04 4.430000e+02 3.326348e+01 2.458000e+03 3.848085e+03 7.467000e+03 7.225861e+03 3.224161e+05 5.678170e+02 3.474667e+02 ... 1.285558e+00 3.026556e-04 1.739700e-02 4.503100e-02 2.860000e-02 2.424700e-02 6.158118e-01 1.332289e+00 9.820539e-01 0.000000e+00
max 6.553400e+04 6.553400e+04 1.790211e+02 1.597561e+07 9.436036e+07 5.268011e+07 3.044444e+07 8.109299e+07 9.005165e+03 5.146276e+03 ... 8.777577e+00 1.002920e+03 3.166891e+01 4.501364e+01 4.501364e+01 4.501364e+01 2.970716e+00 7.096569e+00 7.366547e+01 1.000000e+00

8 rows × 32 columns

In [36]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1167269 entries, 0 to 46597
Data columns (total 36 columns):
 #   Column                                  Non-Null Count    Dtype  
---  ------                                  --------------    -----  
 0   SourceIP                                1167269 non-null  object 
 1   DestinationIP                           1167269 non-null  object 
 2   SourcePort                              1167269 non-null  int64  
 3   DestinationPort                         1167269 non-null  int64  
 4   TimeStamp                               1167269 non-null  object 
 5   Duration                                1167269 non-null  float64
 6   FlowBytesSent                           1167269 non-null  int64  
 7   FlowSentRate                            1167269 non-null  float64
 8   FlowBytesReceived                       1167269 non-null  int64  
 9   FlowReceivedRate                        1167269 non-null  float64
 10  PacketLengthVariance                    1167269 non-null  float64
 11  PacketLengthStandardDeviation           1167269 non-null  float64
 12  PacketLengthMean                        1167269 non-null  float64
 13  PacketLengthMedian                      1167269 non-null  float64
 14  PacketLengthMode                        1167269 non-null  int64  
 15  PacketLengthSkewFromMedian              1167269 non-null  float64
 16  PacketLengthSkewFromMode                1167269 non-null  float64
 17  PacketLengthCoefficientofVariation      1167269 non-null  float64
 18  PacketTimeVariance                      1167269 non-null  float64
 19  PacketTimeStandardDeviation             1167269 non-null  float64
 20  PacketTimeMean                          1167269 non-null  float64
 21  PacketTimeMedian                        1167269 non-null  float64
 22  PacketTimeMode                          1167269 non-null  float64
 23  PacketTimeSkewFromMedian                1167269 non-null  float64
 24  PacketTimeSkewFromMode                  1167269 non-null  float64
 25  PacketTimeCoefficientofVariation        1167269 non-null  float64
 26  ResponseTimeTimeVariance                1167269 non-null  float64
 27  ResponseTimeTimeStandardDeviation       1167269 non-null  float64
 28  ResponseTimeTimeMean                    1167269 non-null  float64
 29  ResponseTimeTimeMedian                  1159241 non-null  float64
 30  ResponseTimeTimeMode                    1167269 non-null  float64
 31  ResponseTimeTimeSkewFromMedian          1159241 non-null  float64
 32  ResponseTimeTimeSkewFromMode            1167269 non-null  float64
 33  ResponseTimeTimeCoefficientofVariation  1167269 non-null  float64
 34  DoH                                     1167269 non-null  bool   
 35  label                                   1167269 non-null  float64
dtypes: bool(1), float64(27), int64(5), object(3)
memory usage: 321.7+ MB
In [48]:
columns = ['FlowBytesSent', 'FlowSentRate', 'FlowBytesReceived', 'FlowReceivedRate']
color = ['skyblue', 'pink', 'gold', 'lightgreen']


f, axes = plt.subplots(2, 2, figsize=(15, 10))
for (i, c) in enumerate(zip(columns, color)):
    colm, colr = c
    sns.distplot( df[colm] , color=colr, ax=axes[i // 2, i % 2])

Data Flow w/ Outliers Removed

In [51]:
f, axes = plt.subplots(2, 2, figsize=(20, 10))
for (i, c) in enumerate(zip(columns, color)):
    colm, colr = c
    x = df[colm]
    no_outlier = x[x.between(x.quantile(.15), x.quantile(.85))] # without outliers
    sns.distplot( no_outlier, color=colr, ax=axes[i // 2, i % 2])
In [52]:
f, axes = plt.subplots(1, 3, figsize=(20, 5))
sns.distplot( df["PacketLengthMean"] , color="skyblue", ax=axes[0])
sns.distplot( df["PacketTimeMean"] , color="olive", ax=axes[1])
sns.distplot( df["ResponseTimeTimeMean"] , color="gold", ax=axes[2])
Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff881867650>
In [39]:
f, ax = plt.subplots(figsize=(20, 20))
ax = sns.heatmap(df.iloc[:, 5:34].corr(), square=True)

3. PCA

In [349]:
sub_df = df.sample(n=10000, replace=False, random_state=1)

features = sub_df.iloc[:,5:-1]
label = sub_df.iloc[:,-1]
label[label == 1] = 'malicious'
label[label == 0] = 'benign'

# scale the data
scaler = StandardScaler()
scaler.fit(features)
scaled = scaler.transform(features)
scaled = np.nan_to_num(scaled)

# fit
pca = PCA(n_components=2) 
pca.fit(scaled)
comp = pca.transform(scaled)

pca_data = pd.DataFrame({'comp1': comp[:,0], 'comp2': comp[:,1], 'label': label})

# plot
fig = plt.figure(figsize=(14, 7))
sns.scatterplot(data=pca_data, x="comp1", y="comp2", hue="label")
plt.title("Initial PCA on DoHBrw")
plt.show()
In [312]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# shuffle data
dataset = df.sample(frac=1)
X, y = dataset.iloc[:,:-1], dataset.iloc[:,-1].astype(np.int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)



# Initialize CatBoostRegressor
cat_features = list(range(5)) # categorical features
model = CatBoostClassifier(iterations=5,
                          learning_rate=0.1,
                          depth=1)

# Fit model
model.fit(
    X_train,
    y_train, 
    cat_features,
    eval_set=(X_val, y_val),
    plot=True,
    verbose=False
)
Out[312]:
<catboost.core.CatBoostClassifier at 0x7ff875293850>
In [313]:
from catboost import Pool
from catboost.utils import get_roc_curve, get_confusion_matrix
from sklearn.metrics import f1_score

catboost_pool = Pool(X_test, y_test, cat_features)
cm = get_confusion_matrix(model, catboost_pool)
preds = model.predict(X_test)
f_score = f1_score(y_test, preds)


x_axis_labels = ['Pred-Benign', 'Pred-Malicious']
y_axis_labels = ['True-Benign', 'True-Malicious']

f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm.astype(np.int), annot=True, fmt='', xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.title('Confusion Matrix - F Score: %.5f' % f_score)
Out[313]:
Text(0.5, 1.0, 'Confusion Matrix - F Score: 0.99789')
In [324]:
# save model
model.save_model('baseline',
                 format="cbm",
                 export_parameters=None,
                 pool=None)