diff --git a/config.py b/config.py
index 3d5e2c4..28ebe4a 100644
--- a/config.py
+++ b/config.py
@@ -4,7 +4,7 @@
 CSV_PATH = './_dataset/TrafficLabelling_/Friday-WorkingHours-DDoS.csv'
 BYPASS_COLUMNS = ('Destination Port', 'Label')
 UNIQUE_COLUMNS = [' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
                   ' CWE Flag Count', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate',
-                  ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']
+                  ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate', 'Label']
 
 IMG_SAVE_PATH = f'./saves/{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}'
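
Appending 'Label' to UNIQUE_COLUMNS means the drop_columns(df, UNIQUE_COLUMNS) call in clean_data (see main.py below) now strips the class label along with the constant CICFlowMeter columns, so label text cannot leak into the generated images. A toy sketch of the effect, with plain pandas drop standing in for the repo's drop_columns helper and made-up column values:

    import pandas as pd

    df = pd.DataFrame({' Bwd PSH Flags': [0, 0],      # constant column, dropped
                       ' Flow Duration': [3, 5],      # real feature, kept
                       'Label': ['DDoS', 'BENIGN']})  # now dropped as well
    kept = df.drop(columns=[' Bwd PSH Flags', 'Label'])
    print(kept.columns.tolist())  # [' Flow Duration']
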
diff --git a/main.py b/main.py
index 5e48081..dab8fe5 100644
--- a/main.py
+++ b/main.py
@@ -3,16 +3,10 @@
 from utils.files import create_dir
 import pandas as pd
 import numpy as np
 from config import *
-import matplotlib.pyplot as plt
 from utils.dataframe import *
 from sklearn.preprocessing import QuantileTransformer
 from PIL import Image
-
-def is_in_bypass_list(column_name: str, bypass_list: tuple) -> bool:
-    for bypass in bypass_list:
-        if bypass in column_name:
-            return True
-    return False
+from loguru import logger
 
 
 def input_csv_to_df(file_path: str) -> pd.DataFrame:
@@ -21,138 +15,57 @@ def input_csv_to_df(file_path: str) -> pd.DataFrame:
     return df
 
 
-def averaging_df(df: pd.DataFrame, column_num: int = None):
-    numeric_columns = df.select_dtypes(include=[np.number]).columns
-    max_values = df.max()
-    if column_num is None:
-        column_num = 0
-        for numeric_column in numeric_columns:
-            if is_in_bypass_list(numeric_column, BYPASS_COLUMNS):
-                continue
-            column_num = column_num + 1
-    for numeric_column in numeric_columns:
-        if is_in_bypass_list(numeric_column, BYPASS_COLUMNS):
-            continue
-        df[numeric_column] = df[numeric_column] / max_values[numeric_column] * column_num
-        # fix nan
-        df[numeric_column] = df[numeric_column].fillna(0)
-    return df, column_num
-
-
-def iter_df_to_point(df: pd.DataFrame, column_num: int = None):
-    size = 0
-    points = []
-    for index, row in df.iterrows():
-        x_values = row.values[2:]
-        y_values = np.linspace(0, len(x_values) - 1, len(x_values))
-        size = size + 1
-        points.append({index: (x_values, y_values)})
-    return points
-
-
-def generate_one_plot(x_values, y_values, x_y_size: int) -> plt:
-    yedges = xedges = np.linspace(0, x_y_size, x_y_size)
-    H = np.zeros((x_y_size, x_y_size))
-    plt.pcolormesh(xedges, yedges, H)  # pcolormesh() draws a pseudocolor plot over a non-regular rectangular grid
-    plt.scatter(x_values, y_values, marker=',', s=1)
-    plt.xlim(0, x_y_size)
-    plt.ylim(0, x_y_size)
-    # 326
-    plt.ylabel('Attributes')
-    plt.xlabel('Attribute values')
-    # plt.set_cmap('gnuplot')
-    plt.set_cmap('BuPu')
-    # plt.set_cmap('Greys')
-    plt.axis('on')
-    return plt
-    # plt.savefig(os.path.join(figure_save_path, qwe + ".png"), bbox_inches='tight', pad_inches=0)  # name each image separately
-
-
-def save_plt(plt: plt, base_path: str, num: int):
-    plt.savefig(f"{base_path}/{num}.png", bbox_inches='tight', pad_inches=0)
-
-
-from multiprocessing import Pool, cpu_count
-
-
-def process(df: pd.DataFrame):
-    df, size = averaging_df(df)
-    points = iter_df_to_point(df, size)
-    base_path = f'./saves/{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}'
-    create_dir(base_path)
-
-    pool = Pool(cpu_count())
-    results = []
-
-    for point_dict in points:
-        num = list(point_dict.keys())[0]
-        point = point_dict[num]
-        result = pool.apply_async(generate_and_save, args=(base_path, point, size, num))
-        results.append(result)
-
-    pool.close()
-    pool.join()
-
-
-def generate_and_save(base_path: str, point: tuple, size: int, calculate):
-    plt = generate_one_plot(point[0], point[1], size)
-    save_plt(plt, base_path, calculate)
-
-
-def process_single_threaded(df: pd.DataFrame):
-    df, size = averaging_df(df)
-    points = iter_df_to_point(df, size)
-    base_path = IMG_SAVE_PATH
-    create_dir(base_path)
-    for point_dict in points:
-        num = list(point_dict.keys())[0]
-        point = point_dict[num]
-        size = len(point[0])
-        generate_and_save(base_path, point, size, num)
-    # plt.show()
-    # return df
+def averaging_df(df: pd.DataFrame):
+    numeric_features = df.dtypes[df.dtypes != 'object'].index
+    scaler = QuantileTransformer()
+    df[numeric_features] = scaler.fit_transform(df[numeric_features])
+    df[numeric_features] = df[numeric_features].apply(
+        lambda x: (x * 255))
+    return df
 
 
-if __name__ == '__main__':
-    df = input_csv_to_df(CSV_PATH)
-    # process(df)
-    # process_single_threaded(df)
+def clean_data(df: pd.DataFrame) -> pd.DataFrame:
     df = df.replace([np.inf, -np.inf], np.nan)
     df = df.dropna(axis=0)  # drop rows that contain NaN values
-    df = get_ddos_df(df)
     df = drop_columns(df, UNIQUE_COLUMNS)
     # df = drop_unique_columns(df)
     df = df.iloc[:, 7:]
-    numeric_features = df.dtypes[df.dtypes != 'object'].index
-    scaler = QuantileTransformer()
-    df[numeric_features] = scaler.fit_transform(df[numeric_features])
+    return df
 
-    # In[19]:
-    # Multiply the feature values by 255 to transform them into the scale of [0,255]
-    df[numeric_features] = df[numeric_features].apply(
-        lambda x: (x * 255))
-    df_clean_data = df
 
+def process(df: pd.DataFrame, label: str = None):
+    df = clean_data(df)
+    df_clean_data = averaging_df(df)
+    create_dir(IMG_SAVE_PATH)
+    generate_and_save(df_clean_data)
+
+
+def generate_and_save(df_clean_data: pd.DataFrame):
     row_length = len(df_clean_data.columns)
     col_length = len(df_clean_data)
-    # Transform all features into the scale of [0,1]
-
     count = 0
     ims = []
+    saves_count = 0
 
     for i in range(0, col_length):
         count = count + 1
-        if count <= (row_length*3):
+        if count <= (row_length * 3):
            im = df_clean_data.iloc[i].values
            ims = np.append(ims, im)
         else:
+            saves_count = saves_count + 1
            ims = np.array(ims).reshape(row_length, row_length, 3)
+            if saves_count % 100 == 0:
+                logger.info(f"Saving {saves_count} images")
+            if saves_count == 1:
+                logger.info(f"Shape: {ims.shape}")
            array = np.array(ims, dtype=np.uint8)
            new_image = Image.fromarray(array)
-            new_image.save(IMG_SAVE_PATH + str(i) + '.png')
+            new_image.save(f"{IMG_SAVE_PATH}/{saves_count}.png")
            count = 0
            ims = []
-    print(df)
\ No newline at end of file
+
+
+if __name__ == '__main__':
+    df = input_csv_to_df(CSV_PATH)
+    process(df)
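
The rewritten main.py pipeline is: clean_data drops bad rows and the bypassed columns, averaging_df quantile-scales every numeric feature into [0, 1] and stretches it to [0, 255], and generate_and_save packs each run of 3 * row_length records into one square RGB image. A minimal, self-contained sketch of that scale-and-pack step on synthetic data (the column count n and the output file name are made up for illustration, not taken from the repo):

    import numpy as np
    import pandas as pd
    from PIL import Image
    from sklearn.preprocessing import QuantileTransformer

    n = 70                                       # pretend the cleaned frame has n feature columns
    df = pd.DataFrame(np.random.rand(n * 3, n))  # 3*n records yield exactly one image

    # Rank-based scaling into [0, 1], then stretched to the 8-bit range, as in averaging_df.
    scaled = QuantileTransformer(n_quantiles=n * 3).fit_transform(df) * 255

    # 3*n rows of n values = 3*n*n values, reshaped into one (n, n, 3) image as in generate_and_save.
    pixels = scaled.reshape(n, n, 3).astype(np.uint8)
    Image.fromarray(pixels).save('example.png')

Note that in generate_and_save the else branch consumes its triggering row without appending it, so one record out of every 3 * row_length + 1 is skipped, and a trailing partial batch is never flushed to disk.
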
diff --git a/test.py b/test.py
deleted file mode 100644
index e69de29..0000000
diff --git a/traffic_csv_converter.py b/traffic_csv_converter.py
deleted file mode 100644
index b1705bb..0000000
--- a/traffic_csv_converter.py
+++ /dev/null
@@ -1,267 +0,0 @@
-#!/usr/bin/env python
-"""
-Read traffic_csv
-"""
-
-import os
-import argparse
-import csv
-import glob
-import re
-
-FLAGS = None
-INPUT = "../raw_csvs/classes/browsing/reg/CICNTTor_browsing.raw.csv"  # "../dataset/iscxNTVPN2016/CompletePCAPs" # ""
-INPUT_DIR = "../raw_csvs/classes/chat/vpn/"
-CLASSES_DIR = "../raw_csvs/classes/**/**/"
-
-# LABEL_IND = 1
-TPS = 60  # TimePerSession in secs
-DELTA_T = 60  # Delta T between split sessions
-MIN_TPS = 50
-
-# def insert_dataset(dataset, labels, session, label_ind=LABEL_IND):
-#     dataset.append(session)
-#     labels.append(label_ind)
-
-# def export_dataset(dataset, labels):
-#     print "Start export dataset"
-#     np.savez(INPUT.split(".")[0] + ".npz", X=dataset, Y=labels)
-#     print dataset.shape, labels.shape
-
-#
-# def import_dataset():
-#     print "Import dataset"
-#     dataset = np.load(INPUT.split(".")[0] + ".npz")
-#     print dataset["X"].shape, dataset["Y"].shape
-
-import matplotlib.pyplot as plt
-import numpy as np
-
-MTU = 1500
-
-def session_spectogram(ts, sizes, name=None):
-    plt.scatter(ts, sizes, marker='.')
-    plt.ylim(0, MTU)
-    plt.xlim(ts[0], ts[-1])
-    # plt.yticks(np.arange(0, MTU, 10))
-    # plt.xticks(np.arange(int(ts[0]), int(ts[-1]), 10))
-    plt.title(name + " Session Spectogram")
-    plt.ylabel('Size [B]')
-    plt.xlabel('Time [sec]')
-
-    plt.grid(True)
-    plt.show()
-
-
-def session_atricle_spectogram(ts, sizes, fpath=None, show=True, tps=None):
-    if tps is None:
-        max_delta_time = ts[-1] - ts[0]
-    else:
-        max_delta_time = tps
-
-    ts_norm = ((np.array(ts) - ts[0]) / max_delta_time) * MTU
-    plt.figure()
-    plt.scatter(ts_norm, sizes, marker=',', c='k', s=5)
-    plt.ylim(0, MTU)
-    plt.xlim(0, MTU)
-    plt.ylabel('Packet Size [B]')
-    plt.xlabel('Normalized Arrival Time')
-    plt.set_cmap('binary')
-    plt.axes().set_aspect('equal')
-    plt.grid(False)
-    if fpath is not None:
-        # plt.savefig(OUTPUT_DIR + fname, bbox_inches='tight', pad_inches=1)
-        plt.savefig(fpath, bbox_inches='tight')
-    if show:
-        plt.show()
-    plt.close()
-
-
-def session_histogram(sizes, plot=False):
-    hist, bin_edges = np.histogram(sizes, bins=range(0, MTU + 1, 1))
-    if plot:
-        plt.bar(bin_edges[:-1], hist, width=1)
-        plt.xlim(min(bin_edges), max(bin_edges)+100)
-        plt.show()
-    return hist.astype(np.uint16)
-
-
-def session_2d_histogram(ts, sizes, plot=False, tps=None):
-    if tps is None:
-        max_delta_time = ts[-1] - ts[0]
-    else:
-        max_delta_time = tps
-
-    # ts_norm = map(int, ((np.array(ts) - ts[0]) / max_delta_time) * MTU)
-    ts_norm = ((np.array(ts) - ts[0]) / max_delta_time) * MTU
-    H, xedges, yedges = np.histogram2d(sizes, ts_norm, bins=(range(0, MTU + 1, 1), range(0, MTU + 1, 1)))
-
-    if plot:
-        plt.pcolormesh(xedges, yedges, H)
-        plt.colorbar()
-        plt.xlim(0, MTU)
-        plt.ylim(0, MTU)
-        plt.set_cmap('binary')
-        plt.show()
-    return H.astype(np.uint16)
-
-
-def export_dataset(dataset):
-    print("Start export dataset")
-    np.save(os.path.splitext(INPUT)[0], dataset)
-    print(dataset.shape)
-
-
-def export_class_dataset(dataset, class_dir):
-    print("Start export dataset")
-    np.save(class_dir + "/" + "_".join(re.findall(r"[\w']+", class_dir)[-2:]), dataset)
-    print(dataset.shape)
-
-
-def import_dataset():
-    print("Import dataset")
-    dataset = np.load(os.path.splitext(INPUT)[0] + ".npy")
-    print(dataset.shape)
-    return dataset
-
-
-def traffic_csv_converter(file_path):
-    print("Running on " + file_path)
-    dataset = []
-    # labels = []
-    counter = 0
-    with open(file_path, 'r') as csv_file:
-        reader = csv.reader(csv_file)
-        for i, row in enumerate(reader):
-            # print row[0], row[7]
-            session_tuple_key = tuple(row[:8])
-            length = int(row[7])
-            ts = np.array(row[8:8+length], dtype=float)
-            sizes = np.array(row[9+length:], dtype=int)
-
-            # if (sizes > MTU).any():
-            #     a = [(sizes[i], i) for i in range(len(sizes)) if (np.array(sizes) > MTU)[i]]
-            #     print len(a), session_tuple_key
-
-            if length > 10:
-                # print ts[0], ts[-1]
-                # h = session_2d_histogram(ts, sizes)
-                # session_spectogram(ts, sizes, session_tuple_key[0])
-                # dataset.append([h])
-                # counter += 1
-                # if counter % 100 == 0:
-                #     print counter
-
-                for t in range(int(ts[-1]/DELTA_T - TPS/DELTA_T) + 1):
-                    mask = ((ts >= t * DELTA_T) & (ts <= (t * DELTA_T + TPS)))
-                    # print t * DELTA_T, t * DELTA_T + TPS, ts[-1]
-                    ts_mask = ts[mask]
-                    sizes_mask = sizes[mask]
-                    if len(ts_mask) > 10 and ts_mask[-1] - ts_mask[0] > MIN_TPS:
-                        # if "facebook" in session_tuple_key[0]:
-                        #     session_spectogram(ts[mask], sizes[mask], session_tuple_key[0])
-                        #     # session_2d_histogram(ts[mask], sizes[mask], True)
-                        #     session_histogram(sizes[mask], True)
-                        #     exit()
-                        # else:
-                        #     continue
-
-                        h = session_2d_histogram(ts_mask, sizes_mask)
-                        # session_spectogram(ts_mask, sizes_mask, session_tuple_key[0])
-                        dataset.append([h])
-                        counter += 1
-                        if counter % 100 == 0:
-                            print(counter)
-
-    return np.asarray(dataset)  # , np.asarray(labels)
-
-
-def traffic_csv_converter_splitted(file_path):
-    def split_converter(ts, sizes, dataset, counter):
-        if ts[-1] - ts[0] > MIN_TPS and len(ts) > 20:
-            # print ts[0], ts[-1]
-            h = session_2d_histogram(ts-ts[0], sizes)
-            # session_spectogram(ts, sizes, session_tuple_key[0])
-            dataset.append([h])
-            counter += 1
-            # if counter % 100 == 0:
-            #     print counter
-
-        total_time = ts[-1] - ts[0]
-        if total_time > TPS:
-            for ts_split, sizes_split in zip(np.split(ts, [len(ts)/2]), np.split(sizes, [len(sizes)/2])):
-                split_converter(ts_split, sizes_split, dataset, counter)
-
-    print("Running on " + file_path)
-    dataset = []
-    # labels = []
-    counter = 0
-    with open(file_path, 'r') as csv_file:
-        reader = csv.reader(csv_file)
-        for i, row in enumerate(reader):
-            # print row[0], row[7]
-            session_tuple_key = tuple(row[:8])
-            length = int(row[7])
-            ts = np.array(row[8:8+length], dtype=float)
-            sizes = np.array(row[9+length:], dtype=int)
-
-            # if (sizes > MTU).any():
-            #     a = [(sizes[i], i) for i in range(len(sizes)) if (np.array(sizes) > MTU)[i]]
-            #     print len(a), session_tuple_key
-
-            if length > 10:
-                split_converter(ts, sizes, dataset, counter)
-
-    return np.asarray(dataset)
-
-
-def traffic_class_converter(dir_path):
-    dataset_tuple = ()
-    for file_path in [os.path.join(dir_path, fn) for fn in next(os.walk(dir_path))[2] if (".csv" in os.path.splitext(fn)[-1])]:
-        dataset_tuple += (traffic_csv_converter(file_path),) ################
-
-    return np.concatenate(dataset_tuple, axis=0)
-
-
-def iterate_all_classes():
-    for class_dir in glob.glob(CLASSES_DIR):
-        if "other" not in class_dir:  # "browsing" not in class_dir and
-            print("working on " + class_dir)
-            dataset = traffic_class_converter(class_dir)
-            print(dataset.shape)
-            export_class_dataset(dataset, class_dir)
-
-
-def random_sampling_dataset(input_array, size=2000):
-    print("Import dataset " + input_array)
-    dataset = np.load(input_array)
-    print(dataset.shape)
-    p = size*1.0/len(dataset)
-    print(p)
-    if p >= 1:
-        raise Exception
-
-    mask = np.random.choice([True, False], len(dataset), p=[p, 1-p])
-    dataset = dataset[mask]
-    print("Start export dataset")
-
-    np.save(os.path.splitext(input_array)[0] + "_samp", dataset)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', type=str, default=INPUT, help='Path to csv file')
-
-    FLAGS = parser.parse_args()
-    ##
-    # iterate_all_classes()
-
-    # dataset = traffic_class_converter(INPUT_DIR)
-    # dataset = traffic_csv_converter(INPUT)
-
-    input_array = "./_dataset/FlowPic/browsing_reg.npy"
-    random_sampling_dataset(input_array)
-
-
-    # export_class_dataset(dataset)
-    # import_dataset()
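
The deleted converter built FlowPic-style session images: packet sizes against arrival times normalized onto the MTU range, binned with np.histogram2d. For reference, its core reduces to the sketch below, where the synthetic ts/sizes arrays stand in for the packets parsed from a session CSV row:

    import numpy as np

    MTU = 1500
    ts = np.sort(np.random.uniform(0, 60, 500))   # arrival times within one session [s]
    sizes = np.random.randint(40, MTU, size=500)  # packet sizes [B]

    # Normalize arrival times onto [0, MTU] so the histogram is square, then bin
    # (size, time) pairs into one session image, as session_2d_histogram did.
    ts_norm = (ts - ts[0]) / (ts[-1] - ts[0]) * MTU
    H, _, _ = np.histogram2d(sizes, ts_norm, bins=(range(0, MTU + 1), range(0, MTU + 1)))
    print(H.astype(np.uint16).shape)              # (1500, 1500)
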
" + w for w in columns] -# df = drop_columns(df, columns) -# return df +def select_label_rows(df: pd.DataFrame, label: str): + return df[df.iloc[:, -1] == label] + + +def drop_columns_with_fix_up(df: pd.DataFrame, columns: list): + columns = [w.lstrip() for w in columns] + df = drop_columns(df, columns) + columns = [" " + w for w in columns] + df = drop_columns(df, columns) + return df + + def drop_columns(df: pd.DataFrame, columns: list): columns = [w.lstrip() for w in columns] for column_name in columns: @@ -25,3 +31,10 @@ def drop_columns(df: pd.DataFrame, columns: list): def get_ddos_df(df: pd.DataFrame): return df[df.iloc[:, -1] == 'DDoS'] + + +def is_in_bypass_list(column_name: str, bypass_list: tuple) -> bool: + for bypass in bypass_list: + if bypass in column_name: + return True + return False