加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
TrafficClassificationMachineLearning.py 4.87 KB
一键复制 编辑 原始数据 按行查看 历史
import tensorflow.compat.v1 as tf
import numpy as np
import os
from sklearn import metrics
from tensorflow import keras
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import pandas as pd
# Traffic class names; a sample's integer target is the position of its class
# name in this list.
list_y = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P',
          'DATABASE', 'FTP-DATA', 'MULTIMEDIA', 'SERVICES', 'INTERACTIVE', 'GAMES']


# Data preprocessing
def data_preprocess(filename, skip_lines=253, max_missing=8, pad_width=8):
    """Parse comma-separated records into noisy feature vectors and labels.

    Each record is ``feature,feature,...,label``. Feature fields may be
    numbers, ``Y``/``N`` flags (mapped to 1/0) or ``?`` for a missing value.

    Args:
        filename: Iterable of file names. Each one is joined onto the current
            working directory (an absolute path is therefore used as-is).
        skip_lines: Number of leading lines to ignore in every file.
        max_missing: Records with more than this many ``?`` fields are dropped.
        pad_width: How many copies of the record's mean are appended.

    Returns:
        ``(X, Y)`` where ``X`` is a list of 1-D float arrays and ``Y`` is a
        list of indices into ``list_y``, in file order.

    Raises:
        ValueError: If a field is not numeric or a label is not in ``list_y``.
    """
    flag_values = {'Y': 1.0, 'N': 0.0}
    X, Y = [], []
    base_dir = os.getcwd()
    for name in filename:
        print(name)
        with open(os.path.join(base_dir, name), 'r') as file:
            # Stream the file instead of materialising every line up front.
            for line_no, raw in enumerate(file, start=1):
                if line_no <= skip_lines:
                    continue
                record = raw.strip()
                # Blank lines and ARFF '%' comments carry no sample.
                if not record or record.startswith('%'):
                    continue
                *fields, label = (token.strip() for token in record.split(','))
                if fields.count('?') > max_missing:
                    continue
                # Convert Y/N per field so the class label is never rewritten.
                values = [None if token == '?' else float(flag_values.get(token, token))
                          for token in fields]
                known = [value for value in values if value is not None]
                mean_value = sum(known) / len(known) if known else 0.0
                # Missing values become 0, the mean is appended pad_width
                # times, then unit-variance Gaussian noise is added so the
                # vector has a fixed width for a deep-learning model input.
                padded = [0.0 if value is None else value for value in values]
                padded += [mean_value] * pad_width
                x = np.asarray(padded, dtype=float) + np.random.normal(0, 1, len(padded))
                try:
                    y = list_y.index(label)
                except ValueError:
                    raise ValueError(
                        'unknown class label %r at %s:%d' % (label, name, line_no)
                    ) from None
                X.append(x)
                Y.append(y)
    return X, Y
# Data loading and normalization
# Preprocess every capture file; yields the feature vectors and their labels.
total_x, total_y = data_preprocess([
    'entry01.weka.allclass.arff',
    'entry02.weka.allclass.arff',
    'entry03.weka.allclass.arff',
    'entry04.weka.allclass.arff',
    'entry05.weka.allclass.arff',
    'entry09.weka.allclass.arff',
    'entry10.weka.allclass.arff',
    'entry07.weka.allclass.arff',
    'entry08.weka.allclass.arff',
    'entry06.weka.allclass.arff',
])

# Hold out 25% of the samples for testing (train:test = 3:1), fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    total_x, total_y, random_state=0, test_size=0.25)

# Convert to tensors: float64 for the features, int64 for the labels.
train_x, test_x = (
    tf.convert_to_tensor(features, dtype=tf.float64) for features in (train_x, test_x))
train_y, test_y = (
    tf.convert_to_tensor(targets, dtype=tf.int64) for targets in (train_y, test_y))

# Normalize the training and test samples along axis 1 with
# tf.keras.utils.normalize.
train_x, test_x = (
    tf.keras.utils.normalize(features, axis=1) for features in (train_x, test_x))
def plot_confusion_matrix(title, pred_y, true_y=None):
    """Show a row-normalized confusion matrix over the classes in ``list_y``.

    Args:
        title: Title drawn above the figure.
        pred_y: Predicted class indices, aligned with the true labels.
        true_y: True class indices. Defaults to the module-level ``test_y``,
            which is what this function always compared against before.
    """
    if true_y is None:
        true_y = test_y
    labels_name = list_y
    num_local = np.arange(len(labels_name))
    # Pin the label set: without it the matrix only covers classes that occur
    # in the data, and the tick labels below would no longer line up.
    cm = confusion_matrix(true_y, pred_y, labels=num_local)
    # Normalize each row by its number of true samples; rows for classes with
    # no true samples stay at zero instead of becoming NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm = np.divide(cm, row_totals,
                   out=np.zeros(cm.shape, dtype=float), where=row_totals > 0)
    plt.imshow(cm, interpolation='nearest')  # draw the matrix as an image
    plt.title(title)  # figure title
    plt.colorbar()
    plt.xticks(num_local, labels_name, rotation=90)  # class names on the x axis
    plt.yticks(num_local, labels_name)  # class names on the y axis
    plt.ylabel('True')
    plt.xlabel('Predicted')
    plt.show()
# Random forest model
def RandomForest(trainData, trainLabel, testData, testLabel):
    """Fit a default random forest, report its test accuracy and return it.

    Args:
        trainData: Training samples passed to ``model.fit``.
        trainLabel: Training targets passed to ``model.fit``.
        testData: Samples to predict on.
        testLabel: True targets used for the accuracy figures.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    started = time.perf_counter()
    model = RandomForestClassifier(random_state=0)
    model.fit(trainData, trainLabel)
    predicted = model.predict(testData)
    score = metrics.accuracy_score(testLabel, predicted)
    elapsed = time.perf_counter() - started
    print("time: ", elapsed)
    print("Accuracy: ", score)
    # NOTE: plot_confusion_matrix compares against the module-level test_y,
    # not testLabel; callers are expected to pass that same test split.
    plot_confusion_matrix("RandomForest Confusion Matrix", predicted)
    # model.score() would predict on the whole test set a second time only to
    # produce the same accuracy, so the value computed above is reused.
    print('The Accuracy of RF Classifier is:', score)
    return model
# Train the first model, rf1, on the full feature set with default
# hyper-parameters; its feature importances drive the reduction below.
rf1 = RandomForest(train_x, train_y, test_x, test_y)

# Feature reduction: put the importances in a DataFrame and sort descending.
df = pd.DataFrame({'importance': rf1.feature_importances_})
sorted_df = df.sort_values('importance', ascending=False)

# Column indices of the 20 most important features.
index = sorted_df.head(20).index

# Reduced training and test sets holding only those 20 feature columns.
train_x_reduced, test_x_reduced = (
    tf.gather(samples, index, axis=1) for samples in (train_x, test_x))

# Train the second model, rf2, on the reduced data.
rf2 = RandomForest(train_x_reduced, train_y, test_x_reduced, test_y)
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化