You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1 line
9.7 KiB
Plaintext
1 line
9.7 KiB
Plaintext
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"4.2 Preprocess.ipynb","provenance":[{"file_id":"1CXwHdwsQd_d5mwwI4WM8bxUnUnCFGWhu","timestamp":1595996348614}],"collapsed_sections":[],"authorship_tag":"ABX9TyOJJMMhEvyOTmePXKVtTT2n"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[
{"cell_type":"code","metadata":{"id":"IzBPeM9P6mJx","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1597105296081,"user_tz":-540,"elapsed":772,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}}},"source":[
"#!/usr/bin/env python3\n",
"# --------------------------------------------------------------\n",
"# Author: Mahendra Data - mahendra.data@dbms.cs.kumamoto-u.ac.jp\n",
"# License: BSD 3 clause\n",
"# --------------------------------------------------------------"
],"execution_count":1,"outputs":[]},
{"cell_type":"markdown","metadata":{},"source":[
"# 4.2 Preprocess\n",
"\n",
"Preprocess `MachineLearningCVE.csv` (CICIDS2017) into train/test CSV files:\n",
"\n",
"1. Fit a `LabelEncoder` on the `Label` column and save its classes.\n",
"2. Read the CSV in chunks; in every chunk repair NaN, +Inf and negative values with per-class statistics.\n",
"3. Split every chunk 80/20 (stratified by label) and append the parts to the train/test CSV files."
]},
{"cell_type":"code","metadata":{"id":"O_7ydDKl6mHV","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1597105296083,"user_tz":-540,"elapsed":758,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}},"outputId":"54d057f4-610d-41c3-e2b3-43259a6ea452"},"source":[
"# Mount Google Drive\n",
"from google.colab import drive\n",
"drive.mount(\"/content/drive\")"
],"execution_count":2,"outputs":[{"output_type":"stream","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"],"name":"stdout"}]},
{"cell_type":"code","metadata":{"id":"focJJdl26mE0","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1597105297231,"user_tz":-540,"elapsed":1902,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}}},"source":[
"import os\n",
"import logging\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Log setting\n",
"logging.basicConfig(format=\"%(asctime)s %(levelname)s %(message)s\", datefmt=\"%H:%M:%S\", level=logging.INFO)\n",
"\n",
"# Change display.max_rows to show all features.\n",
"pd.set_option(\"display.max_rows\", 85)"
],"execution_count":3,"outputs":[]},
{"cell_type":"code","metadata":{"id":"6ludyWsl6mB3","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1597105297233,"user_tz":-540,"elapsed":1898,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}}},"source":[
"DIR_PATH = \"/content/drive/My Drive/CICIDS2017/MachineLearningCVE\"\n",
"PROCESSED_DIR_PATH = \"/content/drive/My Drive/CICIDS2017/ProcessedDataset\"\n",
"FILE_PATH = os.path.join(DIR_PATH, \"MachineLearningCVE.csv\")\n",
"\n",
"# Fixed seed so that the train/test split is the same on every run.\n",
"RANDOM_STATE = 42"
],"execution_count":4,"outputs":[]},
{"cell_type":"code","metadata":{"id":"YzHUgbYf6l_F","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1597105297234,"user_tz":-540,"elapsed":1895,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}}},"source":[
"def _label_encoding() -> LabelEncoder:\n",
"    \"\"\"Fit a LabelEncoder on the Label column of FILE_PATH and save its classes.\n",
"\n",
"    The fitted classes are written to label_encoder.npy in PROCESSED_DIR_PATH.\n",
"\n",
"    Returns:\n",
"        The fitted LabelEncoder.\n",
"    \"\"\"\n",
"    # Read only the Label column of the dataset file\n",
"    labels = pd.read_csv(FILE_PATH, usecols=['Label'], skipinitialspace=True)\n",
"\n",
"    # Fit the labels data to Label Encoder\n",
"    le = LabelEncoder()\n",
"    le.fit(labels.Label)\n",
"\n",
"    # Saving the label encoder; create the output directory on first use\n",
"    os.makedirs(PROCESSED_DIR_PATH, exist_ok=True)\n",
"    np.save(os.path.join(PROCESSED_DIR_PATH, 'label_encoder.npy'), le.classes_)\n",
"\n",
"    # Log the result.\n",
"    logging.info(\"Total rows: {}\".format(labels.shape))\n",
"    logging.info(\"Class distribution:\\n{}\\n\".format(labels.Label.value_counts()))\n",
"\n",
"    return le\n",
"\n",
"\n",
"def _fill_with_class_stat(df: pd.DataFrame, cells: pd.DataFrame, stat: str) -> None:\n",
"    \"\"\"Overwrite the flagged cells of df, in place, with a per-class column statistic.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a Label column; the classes are the Label values.\n",
"        cells: Boolean DataFrame with the index and columns of df, True where a cell is overwritten.\n",
"        stat: Name of the groupby aggregation to use, e.g. 'mean', 'max' or 'min'.\n",
"    \"\"\"\n",
"    # Only touch the columns that actually contain flagged cells\n",
"    for col in cells.columns[cells.any().values]:\n",
"        # Hide the flagged cells so that they do not take part in the statistic\n",
"        visible = df[col].mask(cells[col])\n",
"        class_stat = visible.groupby(df['Label']).transform(stat)\n",
"        df[col] = visible.mask(cells[col], class_stat)\n",
"\n",
"\n",
"def _process(df: pd.DataFrame, le: LabelEncoder) -> pd.DataFrame:\n",
"    \"\"\"Encode the labels and repair NaN, +Inf and negative values of one chunk.\n",
"\n",
"    Every statistic is taken per Label class over the rows of df only.\n",
"    df is modified in place and returned.\n",
"\n",
"    Args:\n",
"        df: Chunk of the dataset with a Label column. All other columns are\n",
"            compared with 0, so they are expected to be numeric.\n",
"        le: Fitted LabelEncoder used to encode the Label column.\n",
"\n",
"    Returns:\n",
"        df with an integer-encoded Label column and repaired values.\n",
"    \"\"\"\n",
"    # Label encoding\n",
"    df['Label'] = le.transform(df['Label'])\n",
"\n",
"    # Flag the NaN and +Inf cells before anything is filled. The encoded labels\n",
"    # are non-negative integers, so the Label column itself is never flagged.\n",
"    nan_cells = df.isna()\n",
"    inf_cells = df.isin([np.inf])\n",
"\n",
"    # Temporary replace inf with NaN. This must happen before the class average\n",
"    # is taken: the average of a class that still holds inf would itself be inf.\n",
"    for col in inf_cells.columns[inf_cells.any().values]:\n",
"        df[col] = df[col].mask(inf_cells[col])\n",
"\n",
"    # Fill NaN with average value of each class in this dataset\n",
"    nan_rows = int(nan_cells.any(axis=1).sum())\n",
"    logging.info(\"Fill NaN in {} rows with average value of each class.\".format(nan_rows))\n",
"    _fill_with_class_stat(df, nan_cells, 'mean')\n",
"\n",
"    # Replace inf with maximum value of each class in this dataset\n",
"    inf_rows = int(inf_cells.any(axis=1).sum())\n",
"    logging.info(\"Replace Inf in {} rows with maximum value of each class.\".format(inf_rows))\n",
"    _fill_with_class_stat(df, inf_cells, 'max')\n",
"\n",
"    # Replace negative value with minimum non-negative value of each class in this dataset\n",
"    logging.info(\"Replace negative values with minimum value of each class.\")\n",
"    _fill_with_class_stat(df, df < 0, 'min')\n",
"\n",
"    # A flagged cell stays NaN when its class has no usable value in that column of this chunk\n",
"    remaining = int(df.isna().sum().sum())\n",
"    if remaining:\n",
"        logging.warning(\"{} values are still NaN after the repair.\".format(remaining))\n",
"\n",
"    return df\n",
"\n",
"\n",
"def _split_train_test(df: pd.DataFrame, random_state: int = RANDOM_STATE) -> (pd.DataFrame, pd.DataFrame):\n",
"    \"\"\"Split df into a stratified 80% train part and 20% test part.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a Label column, used for stratification.\n",
"        random_state: Seed passed to train_test_split. The fixed default makes the split reproducible.\n",
"\n",
"    Returns:\n",
"        (train, test) DataFrames, each holding the feature columns followed by Label.\n",
"\n",
"    NOTE(review): stratification presumably needs at least two rows of every\n",
"    class present in df, so very small chunks may be rejected - confirm.\n",
"    \"\"\"\n",
"    x = df.loc[:, df.columns != 'Label']\n",
"    y = df[['Label']]\n",
"\n",
"    x_train, x_test, y_train, y_test = train_test_split(\n",
"        x, y, stratify=y, test_size=0.20, random_state=random_state)\n",
"\n",
"    train = pd.concat([x_train, y_train], axis=1, sort=False)\n",
"    test = pd.concat([x_test, y_test], axis=1, sort=False)\n",
"\n",
"    return train, test\n",
"\n",
"\n",
"def _to_csv(df: pd.DataFrame, saving_path: str):\n",
"    \"\"\"Write df to saving_path: create the file with a header, or append rows without one.\n",
"\n",
"    Args:\n",
"        df: DataFrame to write (the index is not written).\n",
"        saving_path: Path of the CSV file.\n",
"    \"\"\"\n",
"    # if file does not exist write header\n",
"    if not os.path.isfile(saving_path):\n",
"        df.to_csv(saving_path, index=False)\n",
"    # else it exists so append without writing the header\n",
"    else:\n",
"        df.to_csv(saving_path, index=False, mode='a', header=False)\n",
"\n",
"\n",
"def _preprocessing_all(le: LabelEncoder, chunksize=1000000):\n",
"    \"\"\"Process FILE_PATH chunk by chunk and write the train/test CSV files.\n",
"\n",
"    Args:\n",
"        le: Fitted LabelEncoder passed on to _process.\n",
"        chunksize: Number of rows read, repaired and split at a time.\n",
"    \"\"\"\n",
"    train_path = os.path.join(PROCESSED_DIR_PATH, \"train_MachineLearningCVE.csv\")\n",
"    test_path = os.path.join(PROCESSED_DIR_PATH, \"test_MachineLearningCVE.csv\")\n",
"\n",
"    os.makedirs(PROCESSED_DIR_PATH, exist_ok=True)\n",
"\n",
"    # _to_csv appends to an existing file, so remove the output of a previous\n",
"    # run first; otherwise running this again would duplicate every row.\n",
"    for path in (train_path, test_path):\n",
"        if os.path.isfile(path):\n",
"            os.remove(path)\n",
"\n",
"    # Preprocess all file\n",
"    for chunk in pd.read_csv(FILE_PATH, skipinitialspace=True, chunksize=chunksize):\n",
"        train, test = _split_train_test(_process(chunk, le))\n",
"        _to_csv(train, train_path)\n",
"        _to_csv(test, test_path)"
],"execution_count":5,"outputs":[]},
{"cell_type":"code","metadata":{"id":"a_DfeCt98Y5C","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":340},"executionInfo":{"status":"ok","timestamp":1597105315162,"user_tz":-540,"elapsed":19814,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}},"outputId":"4051ae00-c414-4451-c20b-42480ee84a6a"},"source":[
"label_encoder = _label_encoding()"
],"execution_count":6,"outputs":[{"output_type":"stream","text":["00:21:54 INFO Total rows: (2830743, 1)\n","00:21:54 INFO Class distribution:\n","BENIGN 2273097\n","DoS Hulk 231073\n","PortScan 158930\n","DDoS 128027\n","DoS GoldenEye 10293\n","FTP-Patator 7938\n","SSH-Patator 5897\n","DoS slowloris 5796\n","DoS Slowhttptest 5499\n","Bot 1966\n","Web Attack-Brute Force 1507\n","Web Attack-XSS 652\n","Infiltration 36\n","Web Attack-Sql Injection 21\n","Heartbleed 11\n","Name: Label, dtype: int64\n","\n"],"name":"stderr"}]},
{"cell_type":"code","metadata":{"id":"nMsAEoj56l8M","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":136},"executionInfo":{"status":"ok","timestamp":1597105553169,"user_tz":-540,"elapsed":257813,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}},"outputId":"aee9b3c4-d8df-4599-cbdb-a95c7f705ed8"},"source":[
"_preprocessing_all(label_encoder, 2500000)"
],"execution_count":7,"outputs":[{"output_type":"stream","text":["00:22:16 INFO Fill NaN in 1347 rows with average value of each class.\n","00:22:35 INFO Replace Inf in 2682 rows with maximum value of each class.\n","00:22:50 INFO Replace negative values with minimum value of each class.\n","00:22:50 INFO NumExpr defaulting to 2 threads.\n","00:25:31 INFO Fill NaN in 11 rows with average value of each class.\n","00:25:33 INFO Replace Inf in 185 rows with maximum value of each class.\n","00:25:34 INFO Replace negative values with minimum value of each class.\n"],"name":"stderr"}]},
{"cell_type":"code","metadata":{"id":"ok2M3maU0MTo","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1597105553171,"user_tz":-540,"elapsed":257808,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}},"outputId":"34406933-9d91-4ea8-d7c6-81ee06a87ed4"},"source":[
"logging.info(\"*** END ***\")"
],"execution_count":8,"outputs":[{"output_type":"stream","text":["00:25:52 INFO *** END ***\n"],"name":"stderr"}]}
]}