# Reference: Mohammadreza MontazeriShatoori, Logan Davidson, Gurdip Kaur, and Arash Habibi Lashkari, “Detection of DoH Tunnels using Time-series Classification of Encrypted Traffic”, The 5th IEEE Cyber Science and Technology Congress, Calgary, Canada, August 2020.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
sns.set(font_scale=1.5)
import category_encoders as ce
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")
# Root folders of the per-source CSV exports. The first holds the browser
# captures that are labelled 0 below, the second the tool captures labelled 1.
BENIGH_PATH = 'CSVs/'
MALICIOUS_PATH = 'CSVs 2/'

# One frame per traffic source.
df_chrome = pd.read_csv(BENIGH_PATH + 'Chrome/all.csv')
df_firefox = pd.read_csv(BENIGH_PATH + 'Firefox/all.csv')
df_tcp = pd.read_csv(MALICIOUS_PATH + 'dns2tcp/all.csv')
df_cat = pd.read_csv(MALICIOUS_PATH + 'dnscat2/all.csv')
df_iodine = pd.read_csv(MALICIOUS_PATH + 'iodine/all.csv')

# ignore_index=True renumbers the rows 0..n-1. Without it every source file
# contributes its own 0-based labels, so the combined frames would carry
# duplicate row labels and any label-based lookup or alignment would be
# ambiguous.
df_b = pd.concat([df_chrome, df_firefox], ignore_index=True)
df_m = pd.concat([df_tcp, df_cat, df_iodine], ignore_index=True)

# The target is appended as the LAST column; later positional slicing
# (iloc[:, -1]) relies on that.
df_b['label'] = 0.
df_m['label'] = 1.
df = pd.concat([df_b, df_m], ignore_index=True)
# Print the shape of every per-source frame, one line per source.
source_frames = (
    ('Chrome', df_chrome),
    ('FireFox', df_firefox),
    ('dns2tcp', df_tcp),
    ('dnscat2', df_cat),
    ('iodine', df_iodine),
)
for source_name, frame in source_frames:
    n_rows, n_cols = frame.shape
    print(f'{source_name} === Num rows: {n_rows} Num features: {n_cols}')

# Summary statistics of the combined frame; the returned table is not
# captured here.
df.describe()
# Column dtypes / non-null counts of the combined frame (printed by pandas).
df.info()
# Flow-volume columns to inspect and the colour used for each of them.
columns = ['FlowBytesSent', 'FlowSentRate', 'FlowBytesReceived', 'FlowReceivedRate']
color = ['skyblue', 'pink', 'gold', 'lightgreen']

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the rendered figures stay the same.

# 2x2 grid: raw distribution of each flow-volume column.
f, axes = plt.subplots(2, 2, figsize=(15, 10))
for idx, (col_name, shade) in enumerate(zip(columns, color)):
    grid_row, grid_col = divmod(idx, 2)
    sns.distplot(df[col_name], color=shade, ax=axes[grid_row, grid_col])

# 2x2 grid: same columns, restricted to values between the 15th and the
# 85th percentile so the tails do not dominate the plot.
f, axes = plt.subplots(2, 2, figsize=(20, 10))
for idx, (col_name, shade) in enumerate(zip(columns, color)):
    series = df[col_name]
    lower = series.quantile(.15)
    upper = series.quantile(.85)
    trimmed = series[series.between(lower, upper)]
    grid_row, grid_col = divmod(idx, 2)
    sns.distplot(trimmed, color=shade, ax=axes[grid_row, grid_col])

# 1x3 row: distributions of three per-flow mean statistics.
f, axes = plt.subplots(1, 3, figsize=(20, 5))
mean_columns = (
    ("PacketLengthMean", "skyblue"),
    ("PacketTimeMean", "olive"),
    ("ResponseTimeTimeMean", "gold"),
)
for panel, (col_name, shade) in zip(axes, mean_columns):
    sns.distplot(df[col_name], color=shade, ax=panel)
# Pairwise correlation of the columns at positions 5..33, drawn as a
# square-celled heatmap on a large canvas.
f, ax = plt.subplots(figsize=(20, 20))
feature_corr = df.iloc[:, 5:34].corr()
ax = sns.heatmap(feature_corr, square=True)
# 2-component PCA of a fixed random sample, coloured by class, as a quick
# visual check of how separable the two classes are.
sub_df = df.sample(n=10000, replace=False, random_state=1)
features = sub_df.iloc[:, 5:-1]  # skips the first five columns and the label

# Build the textual labels as a NEW Series. Writing strings into the float
# slice of sub_df in place is chained assignment with an incompatible dtype,
# which pandas warns about and newer releases reject.
label = sub_df.iloc[:, -1].map({0: 'benign', 1: 'malicious'})

# scale the data
scaler = StandardScaler()
scaled = scaler.fit_transform(features)
# Replace any NaN/inf left in the scaled matrix with finite numbers, since
# PCA cannot handle them.
scaled = np.nan_to_num(scaled)

# fit
pca = PCA(n_components=2)
comp = pca.fit(scaled).transform(scaled)

# The components are plain positional arrays, so the labels are passed
# positionally too; no index alignment is involved.
pca_data = pd.DataFrame({
    'comp1': comp[:, 0],
    'comp2': comp[:, 1],
    'label': label.to_numpy(),
})

# plot
fig = plt.figure(figsize=(14, 7))
sns.scatterplot(data=pca_data, x="comp1", y="comp2", hue="label")
plt.title("Initial PCA on DoHBrw")
plt.show()
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
# Train a deliberately tiny CatBoost classifier as a baseline.

# Shuffle the rows. Seeded so the run is reproducible, like the splits below.
dataset = df.sample(frac=1, random_state=42)

# Everything except the last column is input; the last column is the target.
# The builtin int is used because the np.int alias was removed from NumPy.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1].astype(int)

# 80/20 train/test, then 25% of the training part becomes validation,
# i.e. 60/20/20 overall.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

# Initialize CatBoostClassifier
cat_features = list(range(5))  # the first five columns are treated as categorical
model = CatBoostClassifier(
    iterations=5,
    learning_rate=0.1,
    depth=1,
)

# Fit model, tracking the validation split during training.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    plot=True,
    verbose=False,
)
from catboost import Pool
from catboost.utils import get_roc_curve, get_confusion_matrix
from sklearn.metrics import f1_score
# Evaluate the fitted model on the held-out test split and persist it.

# Confusion matrix computed by CatBoost from a Pool wrapping the test data.
catboost_pool = Pool(X_test, label=y_test, cat_features=cat_features)
cm = get_confusion_matrix(model, catboost_pool)

# F1 score from the hard class predictions.
preds = model.predict(X_test)
f_score = f1_score(y_test, preds)

x_axis_labels = ['Pred-Benign', 'Pred-Malicious']
y_axis_labels = ['True-Benign', 'True-Malicious']

f, ax = plt.subplots(figsize=(10, 7))
# Cast the counts to the builtin int so the annotations show whole numbers;
# the np.int alias originally used here was removed from NumPy.
sns.heatmap(
    cm.astype(int),
    annot=True,
    fmt='',
    xticklabels=x_axis_labels,
    yticklabels=y_axis_labels,
)
plt.title(f'Confusion Matrix - F Score: {f_score:.5f}')

# save model in CatBoost's binary format
model.save_model(
    'baseline',
    format="cbm",
    export_parameters=None,
    pool=None,
)