#!/usr/bin/env python
"""
Read traffic CSVs and convert session data into FlowPic 2D-histogram datasets.
"""

import argparse
import csv
import glob
import os
import re

import numpy as np  # np is used throughout; don't rely on the star import below
from loguru import logger

from sessions_plotter import *

FLAGS = None

INPUT = "../raw_csvs/classes/browsing/reg/CICNTTor_browsing.raw.csv"  # e.g. "../dataset/iscxNTVPN2016/CompletePCAPs"
INPUT_DIR = "../raw_csvs/classes/chat/vpn/"
CLASSES_DIR = "../raw_csvs/classes/**/**/"

# LABEL_IND = 1
TPS = 60      # TimePerSession: length of each session window, in seconds
DELTA_T = 60  # stride between consecutive session windows, in seconds
MIN_TPS = 50  # minimum active span (seconds) a window must cover to be kept
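
# With TPS == DELTA_T the windows tile a session back to back, so a session
# whose last packet arrives at time t yields int(t / DELTA_T - TPS / DELTA_T) + 1
# candidate windows (the loop bound in traffic_csv_converter below).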


# def insert_dataset(dataset, labels, session, label_ind=LABEL_IND):
#     dataset.append(session)
#     labels.append(label_ind)


# def export_dataset(dataset, labels):
#     print("Start export dataset")
#     np.savez(INPUT.split(".")[0] + ".npz", X=dataset, Y=labels)
#     print(dataset.shape, labels.shape)


# def import_dataset():
#     print("Import dataset")
#     dataset = np.load(INPUT.split(".")[0] + ".npz")
#     print(dataset["X"].shape, dataset["Y"].shape)


def export_dataset(dataset):
    """Save dataset as <INPUT without extension>.npy."""
    logger.info("Start export dataset")
    np.save(os.path.splitext(INPUT)[0], dataset)
    logger.info(dataset.shape)


def export_class_dataset(dataset, class_dir):
    """Save dataset inside class_dir, named after its last two path components."""
    logger.info("Start export dataset")
    np.save(class_dir + "/" + "_".join(re.findall(r"[\w']+", class_dir)[-2:]), dataset)
    logger.info(dataset.shape)
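
# For example, class_dir "../raw_csvs/classes/chat/vpn/" is exported to
# "../raw_csvs/classes/chat/vpn/chat_vpn.npy".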


def import_dataset():
    logger.info("Import dataset")
    dataset = np.load(os.path.splitext(INPUT)[0] + ".npy")
    logger.info(dataset.shape)
    return dataset


def traffic_csv_converter(file_path):
    """Slide a TPS-second window over each session and build a 2D histogram per window."""
    logger.info("Running on " + file_path)
    dataset = []
    # labels = []
    counter = 0
    with open(file_path, 'r') as csv_file:
        reader = csv.reader(csv_file)
        for i, row in enumerate(reader):
            # print(row[0], row[7])
            # Row layout as parsed here: row[:8] is the session key, with
            # row[7] = packet count n; row[8:8+n] holds arrival times and
            # row[9+n:] holds packet sizes.
            session_tuple_key = tuple(row[:8])
            length = int(row[7])
            ts = np.array(row[8:8 + length], dtype=float)
            sizes = np.array(row[9 + length:], dtype=int)

            # if (sizes > MTU).any():
            #     a = [(sizes[i], i) for i in range(len(sizes)) if (np.array(sizes) > MTU)[i]]
            #     print(len(a), session_tuple_key)

            if length > 10:
                # print(ts[0], ts[-1])
                # h = session_2d_histogram(ts, sizes)
                # session_spectogram(ts, sizes, session_tuple_key[0])
                # dataset.append([h])
                # counter += 1
                # if counter % 100 == 0:
                #     print(counter)

                for t in range(int(ts[-1] / DELTA_T - TPS / DELTA_T) + 1):
                    # Select the packets that fall inside the current window.
                    mask = ((ts >= t * DELTA_T) & (ts <= (t * DELTA_T + TPS)))
                    # print(t * DELTA_T, t * DELTA_T + TPS, ts[-1])
                    ts_mask = ts[mask]
                    sizes_mask = sizes[mask]
                    # Keep windows with enough packets and a long enough active span.
                    if len(ts_mask) > 10 and ts_mask[-1] - ts_mask[0] > MIN_TPS:
                        # if "facebook" in session_tuple_key[0]:
                        #     session_spectogram(ts[mask], sizes[mask], session_tuple_key[0])
                        #     # session_2d_histogram(ts[mask], sizes[mask], True)
                        #     session_histogram(sizes[mask], True)
                        #     exit()
                        # else:
                        #     continue

                        h = session_2d_histogram(ts_mask, sizes_mask)
                        # session_spectogram(ts_mask, sizes_mask, session_tuple_key[0])
                        dataset.append([h])
                        counter += 1
                        if counter % 100 == 0:
                            logger.info(counter)

    return np.asarray(dataset)  # , np.asarray(labels)
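
# The returned array (inferred from dataset.append([h])) has shape
# (num_windows, 1, H, W), where H x W is the histogram size produced by
# session_2d_histogram. Example with a hypothetical path:
#   arr = traffic_csv_converter("../raw_csvs/classes/chat/vpn/some_class.csv")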


def traffic_csv_converter_splitted(file_path):
    def split_converter(ts, sizes, dataset, counter):
        # Note: counter is a plain int, so increments here never reach the caller.
        if ts[-1] - ts[0] > MIN_TPS and len(ts) > 20:
            # print(ts[0], ts[-1])
            h = session_2d_histogram(ts - ts[0], sizes)
            # session_spectogram(ts, sizes, session_tuple_key[0])
            dataset.append([h])
            counter += 1
            # if counter % 100 == 0:
            #     print(counter)

        total_time = ts[-1] - ts[0]
        if total_time > TPS:
            # Recursively halve the chunk (integer split points) until each
            # piece spans at most TPS seconds.
            for ts_split, sizes_split in zip(np.split(ts, [len(ts) // 2]),
                                             np.split(sizes, [len(sizes) // 2])):
                split_converter(ts_split, sizes_split, dataset, counter)

    logger.info("Running on " + file_path)
    dataset = []
    # labels = []
    counter = 0
    with open(file_path, 'r') as csv_file:
        reader = csv.reader(csv_file)
        for i, row in enumerate(reader):
            # print(row[0], row[7])
            session_tuple_key = tuple(row[:8])
            length = int(row[7])
            ts = np.array(row[8:8 + length], dtype=float)
            sizes = np.array(row[9 + length:], dtype=int)

            # if (sizes > MTU).any():
            #     a = [(sizes[i], i) for i in range(len(sizes)) if (np.array(sizes) > MTU)[i]]
            #     print(len(a), session_tuple_key)

            if length > 10:
                split_converter(ts, sizes, dataset, counter)

    return np.asarray(dataset)
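
# Unlike the sliding-window converter above, this variant keeps a qualifying
# chunk and then recurses into its halves, so one long session can contribute
# histograms at several time scales.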


def traffic_class_converter(dir_path):
    """Convert every .csv file directly under dir_path and stack the results."""
    dataset_tuple = ()
    for file_path in [os.path.join(dir_path, fn) for fn in next(os.walk(dir_path))[2]
                      if ".csv" in os.path.splitext(fn)[-1]]:
        dataset_tuple += (traffic_csv_converter(file_path),)

    return np.concatenate(dataset_tuple, axis=0)
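
# Each per-file array is (n_i, 1, H, W); concatenating on axis 0 pools all
# windows from all files of the class into a single array.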


def iterate_all_classes():
    for class_dir in glob.glob(CLASSES_DIR):
        if "other" not in class_dir:  # and "browsing" not in class_dir
            logger.info("working on " + class_dir)
            dataset = traffic_class_converter(class_dir)
            logger.info(dataset.shape)
            export_class_dataset(dataset, class_dir)


def random_sampling_dataset(input_array, size=2000):
    """Randomly keep ~size examples from a saved .npy dataset."""
    logger.info("Import dataset " + input_array)
    dataset = np.load(input_array)
    logger.info(dataset.shape)

    p = size * 1.0 / len(dataset)
    logger.info(p)
    if p >= 1:
        raise ValueError("size must be smaller than the dataset length")

    # Bernoulli mask: each example is kept with probability p, so the result
    # has approximately `size` rows (not exactly size).
    mask = np.random.choice([True, False], len(dataset), p=[p, 1 - p])
    dataset = dataset[mask]
    logger.info("Start export dataset")

    np.save(os.path.splitext(input_array)[0] + "_samp", dataset)
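
# Example with a hypothetical path:
#   random_sampling_dataset("../raw_csvs/classes/chat/vpn/chat_vpn.npy", size=2000)
# writes ../raw_csvs/classes/chat/vpn/chat_vpn_samp.npy.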


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, default=INPUT, help='Path to csv file')
    FLAGS = parser.parse_args()

    iterate_all_classes()

    # NOTE: this hard-coded path overrides the --input value parsed above.
    INPUT = '../../_dataset/FlowPic/classes_csvs/browsing/reg/CICNTTor_browsing_raw.csv'
    # dataset = traffic_class_converter(INPUT_DIR)
    dataset = traffic_csv_converter(INPUT)

    input_array = "../raw_csvs/classes/browsing/reg/browsing_reg.npy"
    random_sampling_dataset(input_array)

    # export_class_dataset(dataset)
    # import_dataset()
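
# Typical invocation (script name assumed, relative dataset paths must exist):
#   python traffic_csv_converter.py
# The --input flag is parsed but currently overridden by the hard-coded INPUT
# assignment in the __main__ block.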