FlowPicRefresh/_reference/cicids2017-ml/4.2 Preprocess.ipynb


			
				
				
					
						
						
						
							
							
							{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"4.2 Preprocess.ipynb","provenance":[{"file_id":"1CXwHdwsQd_d5mwwI4WM8bxUnUnCFGWhu","timestamp":1595996348614}],"collapsed_sections":[],"authorship_tag":"ABX9TyOJJMMhEvyOTmePXKVtTT2n"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"IzBPeM9P6mJx","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1597105296081,"user_tz":-540,"elapsed":772,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}}},"source":["#!/usr/bin/env python3\n","# --------------------------------------------------------------\n","# Author: Mahendra Data - mahendra.data@dbms.cs.kumamoto-u.ac.jp\n","# License: BSD 3 clause\n","# --------------------------------------------------------------"],"execution_count":1,"outputs":[]},{"cell_type":"code","metadata":{"id":"O_7ydDKl6mHV","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1597105296083,"user_tz":-540,"elapsed":758,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}},"outputId":"54d057f4-610d-41c3-e2b3-43259a6ea452"},"source":["# Mount Google Drive\n","from google.colab import drive\n","drive.mount(\"/content/drive\")"],"execution_count":2,"outputs":[{"output_type":"stream","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"focJJdl26mE0","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1597105297231,"user_tz":-540,"elapsed":1902,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}}},"source":["import os\n","import logging\n","\n","import pandas as pd\n","import numpy as np\n","\n","from sklearn.preprocessing import LabelEncoder\n","from sklearn.model_selection import train_test_split\n","\n","# Log setting\n","logging.basicConfig(format=\"%(asctime)s %(levelname)s %(message)s\", datefmt=\"%H:%M:%S\", level=logging.INFO)\n","\n","# Change display.max_rows to show all features.\n","pd.set_option(\"display.max_rows\", 85)"],"execution_count":3,"outputs":[]},{"cell_type":"code","metadata":{"id":"6ludyWsl6mB3","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1597105297233,"user_tz":-540,"elapsed":1898,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}}},"source":["DIR_PATH = \"/content/drive/My Drive/CICIDS2017/MachineLearningCVE\"\n","PROCESSED_DIR_PATH = \"/content/drive/My Drive/CICIDS2017/ProcessedDataset\"\n","FILE_PATH = os.path.join(DIR_PATH, \"MachineLearningCVE.csv\")"],"execution_count":4,"outputs":[]},{"cell_type":"code","metadata":{"id":"YzHUgbYf6l_F","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1597105297234,"user_tz":-540,"elapsed":1895,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}}},"source":["def _label_encoding() -> LabelEncoder:\n","    # Create Label Encoder\n","    le = LabelEncoder()\n","\n","    # Read Label column from all dataset files\n","    labels = pd.read_csv(FILE_PATH, usecols=['Label'], skipinitialspace=True)\n","\n","    # Fit the labels data to Label Encoder\n","    le.fit(labels.Label)\n","\n","    # Saving the label encoder\n","    np.save(os.path.join(PROCESSED_DIR_PATH, 'label_encoder.npy'), le.classes_)\n","\n","    # Log the result.\n","    logging.info(\"Total rows: {}\".format(labels.shape))\n","    logging.info(\"Class distribution:\\n{}\\n\".format(labels.Label.value_counts()))\n","\n","    return le\n","\n","\n","def _process(df: pd.DataFrame, le: LabelEncoder) -> (np.ndarray, np.ndarray):\n","    # Label encoding\n","    df.Label = le.transform(df.Label)\n","\n","    # Fill NaN with average value of each class in this dataset\n","    nan_rows = df[df.isna().any(axis=1)].shape[0]\n","    logging.info(\"Fill NaN in {} rows with average value of each class.\".format(nan_rows))\n","    df.iloc[:, df.columns != \"Label\"] = df.groupby(\"Label\").transform(lambda x: x.fillna(x.mean()))\n","\n","    # Change inf value with maximum value of each class\n","    inf_rows = df[df.isin([np.inf]).any(axis=1)].shape[0]\n","    logging.info(\"Replace Inf in {} rows with maximum value of each class.\".format(inf_rows))\n","    # Temporary replace inf with NaN\n","    df = df.replace([np.inf], np.nan)\n","    # Replace inf with maximum value of each class in this dataset\n","    df.iloc[:, df.columns != \"Label\"] = df.groupby(\"Label\").transform(lambda x: x.fillna(x.max()))\n","\n","    # Change negative value with minimum positive value of each class\n","    logging.info(\"Replace negative values with minimum value of each class.\")\n","    # Temporary replace negative value with NaN\n","    df[df < 0] = np.nan\n","    # Replace negative value with minimum value of each class in this dataset\n","    df.iloc[:, df.columns != \"Label\"] = df.groupby(\"Label\").transform(lambda x: x.fillna(x.min()))\n","\n","    return df\n","\n","\n","def _split_train_test(df: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):\n","    # Sampling the dataset\n","    x = df.iloc[:, df.columns != 'Label']\n","    y = df[['Label']]\n","\n","    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.20,\n","                                                        random_state=np.random.randint(10))\n","\n","    del x, y\n","\n","    train = pd.concat([x_train, y_train], axis=1, sort=False)\n","    test = pd.concat([x_test, y_test], axis=1, sort=False)\n","\n","    return train, test\n","\n","\n","def _to_csv(df: pd.DataFrame, saving_path: str):\n","    # if file does not exist write header\n","    if not os.path.isfile(saving_path):\n","        df.to_csv(saving_path, index=False)\n","    # else it exists so append without writing the header\n","    else:\n","        df.to_csv(saving_path, index=False, mode='a', header=False)\n","\n","\n","def _preprocessing_all(le: LabelEncoder, chunksize=1000000):\n","    # Preprocess all file\n","    for chunk in pd.read_csv(FILE_PATH, skipinitialspace=True, chunksize=chunksize):\n","        train, test = _split_train_test(_process(chunk, le))\n","        _to_csv(train, os.path.join(PROCESSED_DIR_PATH, \"train_MachineLearningCVE.csv\"))\n","        _to_csv(test, os.path.join(PROCESSED_DIR_PATH, \"test_MachineLearningCVE.csv\"))"],"execution_count":5,"outputs":[]},{"cell_type":"code","metadata":{"id":"a_DfeCt98Y5C","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":340},"executionInfo":{"status":"ok","timestamp":1597105315162,"user_tz":-540,"elapsed":19814,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}},"outputId":"4051ae00-c414-4451-c20b-42480ee84a6a"},"source":["label_encoder = _label_encoding()"],"execution_count":6,"outputs":[{"output_type":"stream","text":["00:21:54 INFO Total rows: (2830743, 1)\n","00:21:54 INFO Class distribution:\n","BENIGN                      2273097\n","DoS Hulk                     231073\n","PortScan                     158930\n","DDoS                         128027\n","DoS GoldenEye                 10293\n","FTP-Patator                    7938\n","SSH-Patator                    5897\n","DoS slowloris                  5796\n","DoS Slowhttptest               5499\n","Bot                            1966\n","Web Attack-Brute Force         1507\n","Web Attack-XSS                  652\n","Infiltration                     36\n","Web Attack-Sql Injection         21\n","Heartbleed                       11\n","Name: Label, dtype: int64\n","\n"],"name":"stderr"}]},{"cell_type":"code","metadata":{"id":"nMsAEoj56l8M","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":136},"executionInfo":{"status":"ok","timestamp":1597105553169,"user_tz":-540,"elapsed":257813,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}},"outputId":"aee9b3c4-d8df-4599-cbdb-a95c7f705ed8"},"source":["_preprocessing_all(label_encoder, 2500000)"],"execution_count":7,"outputs":[{"output_type":"stream","text":["00:22:16 INFO Fill NaN in 1347 rows with average value of each class.\n","00:22:35 INFO Replace Inf in 2682 rows with maximum value of each class.\n","00:22:50 INFO Replace negative values with minimum value of each class.\n","00:22:50 INFO NumExpr defaulting to 2 threads.\n","00:25:31 INFO Fill NaN in 11 rows with average value of each class.\n","00:25:33 INFO Replace Inf in 185 rows with maximum value of each class.\n","00:25:34 INFO Replace negative values with minimum value of each class.\n"],"name":"stderr"}]},{"cell_type":"code","metadata":{"id":"ok2M3maU0MTo","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1597105553171,"user_tz":-540,"elapsed":257808,"user":{"displayName":"Mahendra Data","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Ghn7DAlkRKEg-Y82BqktrBT0ABMFy8r5576xhbKDQ=s64","userId":"08049029618478467489"}},"outputId":"34406933-9d91-4ea8-d7c6-81ee06a87ed4"},"source":["logging.info(\"*** END ***\")"],"execution_count":8,"outputs":[{"output_type":"stream","text":["00:25:52 INFO *** END ***\n"],"name":"stderr"}]}]}
						
						
					
				
				
					
						Reference in New Issue
					
					View Git Blame
					Copy Permalink