Compare commits
3 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 28fff1c924 | 2 years ago |
| | 984d403510 | 2 years ago |
| | 847780028c | 2 years ago |
@@ -0,0 +1,350 @@
#!/usr/bin/env python
# coding: utf-8

# # A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles

# This is the code for the paper entitled "**A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles**" accepted at the IEEE International Conference on Communications (IEEE ICC).

# Authors: Li Yang (lyang339@uwo.ca) and Abdallah Shami (Abdallah.Shami@uwo.ca)

# Organization: The Optimized Computing and Communications (OC2) Lab, ECE Department, Western University

# **Notebook 1: Data pre-processing**

# Procedures:
# 1) Read the dataset
# 2) Transform the tabular data into images
# 3) Display the transformed images
# 4) Split the training and test set


# ## Import libraries

# In[14]:


import numpy as np
import pandas as pd
import os
import cv2
import math
import random
import matplotlib.pyplot as plt
import shutil
from sklearn.preprocessing import QuantileTransformer
from PIL import Image
import warnings
warnings.filterwarnings("ignore")


# ## Read the Car-Hacking/CAN-Intrusion dataset

# The complete Car-Hacking dataset is publicly available at: https://ocslab.hksecurity.net/Datasets/CAN-intrusion-dataset

# In this repository, due to the file size limit of GitHub, we use the 5% subset.

# In[15]:


# Read the dataset
df = pd.read_csv('data/Car_Hacking_5%.csv')


# In[16]:


df


# In[17]:


# The labels of the dataset: "R" indicates normal patterns, and there are four
# types of attack (DoS, Fuzzy, gear spoofing, and RPM spoofing attacks)
df.Label.value_counts()


# ## Data Transformation

# Convert tabular data to images

# Procedures:
# 1. Use the quantile transform to map the original data samples into the scale of [0, 255], representing pixel values
# 2. Generate images for each category (Normal, DoS, Fuzzy, Gear, RPM). Each image consists of 27 data samples with 9 features each, i.e. 27 * 9 = 243 values, so the size of each image is 9*9*3: height 9, width 9, and 3 color channels (RGB).

# In[18]:


# Transform all features into the scale of [0,1]
numeric_features = df.dtypes[df.dtypes != 'object'].index
scaler = QuantileTransformer()
df[numeric_features] = scaler.fit_transform(df[numeric_features])


# In[19]:


# Multiply the feature values by 255 to transform them into the scale of [0,255]
df[numeric_features] = df[numeric_features].apply(lambda x: x * 255)


# In[20]:


df.describe()


# All features are now in the same scale of [0,255]

# ### Generate images for each class

# In[21]:


df0 = df[df['Label'] == 'R'].drop(['Label'], axis=1)
df1 = df[df['Label'] == 'RPM'].drop(['Label'], axis=1)
df2 = df[df['Label'] == 'gear'].drop(['Label'], axis=1)
df3 = df[df['Label'] == 'DoS'].drop(['Label'], axis=1)
df4 = df[df['Label'] == 'Fuzzy'].drop(['Label'], axis=1)


# In[22]:


# Generate 9*9 color images for class 0 (Normal)
count = 0
ims = []

image_path = "train/0/"
os.makedirs(image_path)

for i in range(0, len(df0)):
    count = count + 1
    if count <= 27:
        im = df0.iloc[i].values
        ims = np.append(ims, im)
    else:
        ims = np.array(ims).reshape(9, 9, 3)
        array = np.array(ims, dtype=np.uint8)
        new_image = Image.fromarray(array)
        new_image.save(image_path + str(i) + '.png')
        count = 0
        ims = []


# In[23]:


# Generate 9*9 color images for class 1 (RPM spoofing)
count = 0
ims = []

image_path = "train/1/"
os.makedirs(image_path)

for i in range(0, len(df1)):
    count = count + 1
    if count <= 27:
        im = df1.iloc[i].values
        ims = np.append(ims, im)
    else:
        ims = np.array(ims).reshape(9, 9, 3)
        array = np.array(ims, dtype=np.uint8)
        new_image = Image.fromarray(array)
        new_image.save(image_path + str(i) + '.png')
        count = 0
        ims = []


# In[24]:


# Generate 9*9 color images for class 2 (Gear spoofing)
count = 0
ims = []

image_path = "train/2/"
os.makedirs(image_path)

for i in range(0, len(df2)):
    count = count + 1
    if count <= 27:
        im = df2.iloc[i].values
        ims = np.append(ims, im)
    else:
        ims = np.array(ims).reshape(9, 9, 3)
        array = np.array(ims, dtype=np.uint8)
        new_image = Image.fromarray(array)
        new_image.save(image_path + str(i) + '.png')
        count = 0
        ims = []


# In[25]:


# Generate 9*9 color images for class 3 (DoS attack)
count = 0
ims = []

image_path = "train/3/"
os.makedirs(image_path)

for i in range(0, len(df3)):
    count = count + 1
    if count <= 27:
        im = df3.iloc[i].values
        ims = np.append(ims, im)
    else:
        ims = np.array(ims).reshape(9, 9, 3)
        array = np.array(ims, dtype=np.uint8)
        new_image = Image.fromarray(array)
        new_image.save(image_path + str(i) + '.png')
        count = 0
        ims = []


# In[26]:


# Generate 9*9 color images for class 4 (Fuzzy attack)
count = 0
ims = []

image_path = "train/4/"
os.makedirs(image_path)

for i in range(0, len(df4)):
    count = count + 1
    if count <= 27:
        im = df4.iloc[i].values
        ims = np.append(ims, im)
    else:
        ims = np.array(ims).reshape(9, 9, 3)
        array = np.array(ims, dtype=np.uint8)
        new_image = Image.fromarray(array)
        new_image.save(image_path + str(i) + '.png')
        count = 0
        ims = []
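

# A hedged aside (not in the original notebook): the five image-generation
# cells above repeat the same logic, so they can be folded into one helper.
# Minimal sketch; the helper name, the `exist_ok` flag, and the per-chunk
# file naming are our own illustrative choices.

# In[ ]:


def class_to_images(df_class, out_dir, rows_per_image=27):
    # 27 samples x 9 features = 243 values = one 9*9*3 RGB image
    os.makedirs(out_dir, exist_ok=True)
    values = df_class.values
    for k in range(len(values) // rows_per_image):
        chunk = values[k * rows_per_image:(k + 1) * rows_per_image]
        array = chunk.reshape(9, 9, 3).astype(np.uint8)
        Image.fromarray(array).save(os.path.join(out_dir, str(k) + '.png'))

# Example usage (illustrative):
#   for label, d in enumerate([df0, df1, df2, df3, df4]):
#       class_to_images(d, 'train/' + str(label) + '/')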


# ## Split the training and test set

# In[27]:


# Collect the paths of the generated images
Train_Dir = './train/'
Val_Dir = './test/'
allimgs = []
for subdir in os.listdir(Train_Dir):
    for filename in os.listdir(os.path.join(Train_Dir, subdir)):
        filepath = os.path.join(Train_Dir, subdir, filename)
        allimgs.append(filepath)
print(len(allimgs))  # Print the total number of images


# In[28]:


# Split a test set from the dataset, train/test size = 80%/20%
Numbers = len(allimgs) // 5  # Size of the test set (20%)

def mymovefile(srcfile, dstfile):
    if not os.path.isfile(srcfile):
        print("%s does not exist!" % srcfile)
    else:
        fpath, fname = os.path.split(dstfile)
        if not os.path.exists(fpath):
            os.makedirs(fpath)
        shutil.move(srcfile, dstfile)
        # print("move %s -> %s" % (srcfile, dstfile))


# In[29]:


# The size of the test set
Numbers


# In[30]:


# Create the test set
val_imgs = random.sample(allimgs, Numbers)
for img in val_imgs:
    dest_path = img.replace(Train_Dir, Val_Dir)
    mymovefile(img, dest_path)
print('Finished creating the test set')
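

# A hedged alternative (not in the original notebook) to the cell above:
# random.sample draws uniformly over all images, so class proportions in the
# test set are only approximately preserved. Splitting each class folder
# separately with scikit-learn makes the 80%/20% split exactly stratified.
# Sketch only; it would be run instead of, not after, the cell above.

# In[ ]:


# from sklearn.model_selection import train_test_split
# for subdir in os.listdir(Train_Dir):
#     files = [os.path.join(Train_Dir, subdir, f)
#              for f in os.listdir(os.path.join(Train_Dir, subdir))]
#     _, test_files = train_test_split(files, test_size=0.2, random_state=0)
#     for f in test_files:
#         mymovefile(f, f.replace(Train_Dir, Val_Dir))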


# In[31]:


# Resize the images to 224*224 for better CNN training
def get_224(folder, dstdir):
    imgfilepaths = []
    for root, dirs, imgs in os.walk(folder):
        for thisimg in imgs:
            thisimg_path = os.path.join(root, thisimg)
            imgfilepaths.append(thisimg_path)
    for thisimg_path in imgfilepaths:
        dir_name, filename = os.path.split(thisimg_path)
        dir_name = dir_name.replace(folder, dstdir)
        new_file_path = os.path.join(dir_name, filename)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        img = cv2.imread(thisimg_path)
        img = cv2.resize(img, (224, 224))
        cv2.imwrite(new_file_path, img)
    print('Finished resizing {}'.format(folder))


# In[32]:


DATA_DIR_224 = './train_224/'
get_224(folder='./train/', dstdir=DATA_DIR_224)


# In[33]:


DATA_DIR2_224 = './test_224/'
get_224(folder='./test/', dstdir=DATA_DIR2_224)


# ### Display samples for each category

# In[34]:


# Read one image per category; the file names may vary (27.png, 83.png, ...)
img1 = Image.open('./train_224/0/27.png')
img2 = Image.open('./train_224/1/83.png')
img3 = Image.open('./train_224/2/27.png')
img4 = Image.open('./train_224/3/27.png')
img5 = Image.open('./train_224/4/27.png')

plt.figure(figsize=(10, 10))
plt.subplot(1, 5, 1)
plt.imshow(img1)
plt.title("Normal")
plt.subplot(1, 5, 2)
plt.imshow(img2)
plt.title("RPM Spoofing")
plt.subplot(1, 5, 3)
plt.imshow(img3)
plt.title("Gear Spoofing")
plt.subplot(1, 5, 4)
plt.imshow(img4)
plt.title("DoS Attack")
plt.subplot(1, 5, 5)
plt.imshow(img5)
plt.title("Fuzzy Attack")
plt.show()  # Display the five sample images


# In[ ]:
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Western OC2 Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,81 @@
# Intrusion-Detection-System-Using-CNN-and-Transfer-Learning

This is the code for the paper entitled "**[A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles](https://arxiv.org/pdf/2201.11812.pdf)**" published in the **IEEE International Conference on Communications (IEEE ICC)**, doi: [10.1109/ICC45855.2022.9838780](https://ieeexplore.ieee.org/document/9838780).
- Authors: Li Yang and Abdallah Shami
- Organization: The Optimized Computing and Communications (OC2) Lab, ECE Department, Western University

This repository introduces how to use **convolutional neural networks (CNNs)** and **transfer learning** techniques to develop **intrusion detection systems**. **Ensemble learning** and **hyperparameter optimization techniques** are also used to achieve optimized model performance.

- Another **intrusion detection system development code** using **decision tree-based machine learning algorithms (decision tree, random forest, XGBoost, stacking, etc.)** can be found at: [Intrusion-Detection-System-Using-Machine-Learning](https://github.com/Western-OC2-Lab/Intrusion-Detection-System-Using-Machine-Learning)

- A comprehensive **hyperparameter optimization** tutorial code can be found at: [Hyperparameter-Optimization-of-Machine-Learning-Algorithms](https://github.com/LiYangHart/Hyperparameter-Optimization-of-Machine-Learning-Algorithms)

## Abstract of The Paper
Modern vehicles, including autonomous vehicles and connected vehicles, are increasingly connected to the external world, which enables various functionalities and services. However, the increasing connectivity also enlarges the attack surface of the Internet of Vehicles (IoV), making it vulnerable to cyber-threats. Due to the lack of authentication and encryption procedures in vehicular networks, Intrusion Detection Systems (IDSs) are essential approaches to protect modern vehicle systems from network attacks. In this paper, a transfer learning and ensemble learning-based IDS is proposed for IoV systems using convolutional neural networks (CNNs) and hyper-parameter optimization techniques. In the experiments, the proposed IDS has demonstrated over 99.25% detection rates and F1-scores on two well-known public benchmark IoV security datasets: the Car-Hacking dataset and the CICIDS2017 dataset. This shows the effectiveness of the proposed IDS for cyber-attack detection in both intra-vehicle and external vehicular networks.

<p float="left">
  <img src="https://github.com/Western-OC2-Lab/Intrusion-Detection-System-Using-CNN-and-Transfer-Learning/blob/main/framework.png" width="500" />
  <img src="https://github.com/Western-OC2-Lab/Intrusion-Detection-System-Using-CNN-and-Transfer-Learning/blob/main/CAN.png" width="400" />
</p>

## Implementation
### CNN Models
The following pretrained architectures are used as base models (a minimal transfer-learning sketch follows the list):
* VGG16
* VGG19
* Xception
* Inception
* Resnet
* InceptionResnet
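
As a point of reference, transfer learning with these models typically freezes an ImageNet-pretrained convolutional base and trains a small task-specific head. A minimal, hedged sketch with Keras' built-in VGG16 (the head size, learning rate, and other hyperparameters below are illustrative assumptions, not the paper's tuned configuration):

```python
from keras.applications import VGG16
from keras.models import Model
from keras.layers import Dense, Flatten
from keras.optimizers import Adam

# Freeze the ImageNet-pretrained convolutional base
base = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
for layer in base.layers:
    layer.trainable = False

# Small task-specific head; 5 outputs = Normal + 4 attack classes
x = Flatten()(base.output)
x = Dense(256, activation='relu')(x)
out = Dense(5, activation='softmax')(x)

model = Model(inputs=base.input, outputs=out)
model.compile(optimizer=Adam(lr=1e-4),
              loss='categorical_crossentropy', metrics=['accuracy'])
```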

### Ensemble Learning Models
Three ensemble strategies are used to combine the trained CNNs (a minimal probability-averaging sketch follows the list):
* Bagging
* Probability Averaging
* Concatenation
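
For illustration, probability averaging is the simplest of the three: each trained CNN votes with its softmax distribution, and the mean distribution decides the final class. A hedged sketch (assuming `models` is a list of trained Keras models and `x_test` a batch of images):

```python
import numpy as np

def average_predict(models, x_test):
    # Mean of the per-model softmax outputs, then the argmax class
    probs = np.mean([m.predict(x_test) for m in models], axis=0)
    return np.argmax(probs, axis=1)
```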

### Hyperparameter Optimization Methods
Two optimization methods are used to tune the CNN hyperparameters (a hyperopt sketch follows the list):
* Random Search (RS)
* Bayesian Optimization - Tree Parzen Estimator (BO-TPE)
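
Since hyperopt is listed among the libraries below, here is a hedged BO-TPE sketch over two common CNN hyperparameters. The search space and the `train_and_score` objective (a hypothetical helper that trains a CNN with the given parameters and returns `1 - validation_accuracy`) are illustrative assumptions:

```python
from hyperopt import fmin, tpe, hp, Trials

space = {
    'lr': hp.loguniform('lr', -10, -4),              # ~4.5e-5 to ~1.8e-2
    'batch_size': hp.choice('batch_size', [16, 32, 64, 128]),
}

trials = Trials()
best = fmin(fn=train_and_score,   # hypothetical objective, defined elsewhere
            space=space, algo=tpe.suggest, max_evals=20, trials=trials)
print(best)
```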

### Dataset
1. CAN-intrusion/Car-Hacking dataset, a benchmark network security dataset for intra-vehicle intrusion detection
   * Publicly available at: https://ocslab.hksecurity.net/Datasets/CAN-intrusion-dataset
   * Can be processed using the same code

2. CICIDS2017 dataset, a popular network traffic dataset for intrusion detection problems
   * Publicly available at: https://www.unb.ca/cic/datasets/ids-2017.html

For the purpose of displaying the experimental results in Jupyter Notebook, the sampled subset of the CAN-intrusion dataset is used in the sample code. The subsets are in the "[data](https://github.com/Western-OC2-Lab/Intrusion-Detection-System-Using-CNN-and-Transfer-Learning/tree/main/data)" folder.

### Code
* [1-Data_pre-processing_CAN.ipynb](https://github.com/Western-OC2-Lab/Intrusion-Detection-System-Using-CNN-and-Transfer-Learning/blob/main/1-Data_pre-processing_CAN.ipynb): code for data pre-processing and transformation (tabular data to images).
* [2-CNN_Model_Development&Hyperparameter Optimization.ipynb](https://github.com/Western-OC2-Lab/Intrusion-Detection-System-Using-CNN-and-Transfer-Learning/blob/main/2-CNN_Model_Development%26Hyperparameter%20Optimization.ipynb): code for the development of CNN models and their hyperparameter optimization.
* [3-Ensemble_Models-CAN.ipynb](https://github.com/Western-OC2-Lab/Intrusion-Detection-System-Using-CNN-and-Transfer-Learning/blob/main/3-Ensemble_Models-CAN.ipynb): code for the construction of three ensemble learning techniques.

### Libraries
* Python 3.5+
* [Keras 2.1.0+](https://keras.io/)
* [Tensorflow 1.10.0+](https://www.tensorflow.org/install/gpu)
* [OpenCV-python](https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html)
* [hyperopt](https://github.com/hyperopt/hyperopt)

## Contact-Info
Please feel free to contact us for any questions or cooperation opportunities. We will be happy to help.
* Email: [liyanghart@gmail.com](mailto:liyanghart@gmail.com) or [Abdallah.Shami@uwo.ca](mailto:Abdallah.Shami@uwo.ca)
* GitHub: [LiYangHart](https://github.com/LiYangHart) and [Western OC2 Lab](https://github.com/Western-OC2-Lab/)
* LinkedIn: [Li Yang](https://www.linkedin.com/in/li-yang-phd-65a190176/)
* Google Scholar: [Li Yang](https://scholar.google.com.eg/citations?user=XEfM7bIAAAAJ&hl=en) and [OC2 Lab](https://scholar.google.com.eg/citations?user=oiebNboAAAAJ&hl=en)

## Citation
If you find this repository useful in your research, please cite this article as:

L. Yang and A. Shami, "A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles," ICC 2022 - IEEE International Conference on Communications, 2022, pp. 2774-2779, doi: 10.1109/ICC45855.2022.9838780.

```
@INPROCEEDINGS{9838780,
  author={Yang, Li and Shami, Abdallah},
  booktitle={ICC 2022 - IEEE International Conference on Communications},
  title={A Transfer Learning and Optimized CNN Based Intrusion Detection System for Internet of Vehicles},
  year={2022},
  pages={2774-2779},
  doi={10.1109/ICC45855.2022.9838780}}
```
@@ -0,0 +1,3 @@
# The sampled datasets used for the experiments in the sample code

**Car_Hacking_5%.csv**: The 5% randomly sampled subset of the [Car Hacking dataset](https://ocslab.hksecurity.net/Datasets/CAN-intrusion-dataset)
@@ -0,0 +1 @@
# The code in this folder shows an example of the pre-processing of the Car-Hacking dataset.
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2020, Mahendra Data
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,2 @@
# CICIDS2017-ML
The purpose of this repository is to demonstrate the steps of processing the CICIDS2017 dataset using machine learning algorithms.
@@ -0,0 +1,30 @@
import pandas as pd


def merge():
    # Read the three CSV files
    df1 = pd.read_csv("../_dataset/TrafficLabelling_/Friday-WorkingHours-Morning.pcap_ISCX.csv")
    df2 = pd.read_csv("../_dataset/TrafficLabelling_/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")
    df3 = pd.read_csv("../_dataset/TrafficLabelling_/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
    # Concatenate them into a single DataFrame
    df = pd.concat([df1, df2, df3])
    # Save the result as a new CSV file
    df.to_csv("../_dataset/TrafficLabelling_/Friday-WorkingHours-merged.csv", index=False)


def select():
    # Keep only the rows labeled 'DDoS' (the label is the last column)
    df = pd.read_csv('../_dataset/TrafficLabelling_/Friday-WorkingHours-merged.csv')
    df_ddos = df[df.iloc[:, -1] == 'DDoS']
    df_ddos.to_csv('../_dataset/TrafficLabelling_/Friday-WorkingHours-DDoS.csv', index=False)


def search(query: str, row_name: str):
    # Print the rows whose `row_name` column contains `query`
    df = pd.read_csv('../_dataset/TrafficLabelling_/Friday-WorkingHours-merged.csv')
    result = df[df[row_name].str.contains(query)]
    print(result.head())


if __name__ == "__main__":
    # merge()
    # select()
    search("172.16.0.1-192.168.10.50-49533-80-6", "Flow ID")
@@ -1,2 +1,39 @@
-CSV_PATH = './_dataset/DDos3.csv'
+import datetime
+
+CSV_PATH = './_dataset/TrafficLabelling_/Friday-WorkingHours-merged.csv'
 BYPASS_COLUMNS= ('Destination Port', 'Label')
+UNIQUE_COLUMNS = [' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' CWE Flag Count',
+                  'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate',
+                  ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate', 'Label']
+IMG_SAVE_PATH = f'./saves/{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}'
+
+
+# Reference: the full column list of the merged CICIDS2017 CSV
+(['Flow ID', ' Source IP', ' Source Port', ' Destination IP',
+  ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration',
+  ' Total Fwd Packets', ' Total Backward Packets',
+  'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
+  ' Fwd Packet Length Max', ' Fwd Packet Length Min',
+  ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
+  'Bwd Packet Length Max', ' Bwd Packet Length Min',
+  ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
+  ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
+  ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
+  ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
+  ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
+  ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
+  ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
+  ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length',
+  ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance',
+  'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count',
+  ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count',
+  ' CWE Flag Count', ' ECE Flag Count', ' Down/Up Ratio',
+  ' Average Packet Size', ' Avg Fwd Segment Size',
+  ' Avg Bwd Segment Size', ' Fwd Header Length.1', 'Fwd Avg Bytes/Bulk',
+  ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk',
+  ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate', 'Subflow Fwd Packets',
+  ' Subflow Fwd Bytes', ' Subflow Bwd Packets', ' Subflow Bwd Bytes',
+  'Init_Win_bytes_forward', ' Init_Win_bytes_backward',
+  ' act_data_pkt_fwd', ' min_seg_size_forward', 'Active Mean',
+  ' Active Std', ' Active Max', ' Active Min', 'Idle Mean', ' Idle Std',
+  ' Idle Max', ' Idle Min', ' Label'])
@@ -1,267 +0,0 @@
#!/usr/bin/env python
"""
Read traffic_csv
"""
import os
import argparse
import csv
import glob
import re

FLAGS = None
INPUT = "../raw_csvs/classes/browsing/reg/CICNTTor_browsing.raw.csv"  # "../dataset/iscxNTVPN2016/CompletePCAPs"
INPUT_DIR = "../raw_csvs/classes/chat/vpn/"
CLASSES_DIR = "../raw_csvs/classes/**/**/"

# LABEL_IND = 1
TPS = 60       # TimePerSession in secs
DELTA_T = 60   # Delta T between split sessions
MIN_TPS = 50

import matplotlib.pyplot as plt
import numpy as np

MTU = 1500


def session_spectogram(ts, sizes, name=None):
    plt.scatter(ts, sizes, marker='.')
    plt.ylim(0, MTU)
    plt.xlim(ts[0], ts[-1])
    # plt.yticks(np.arange(0, MTU, 10))
    # plt.xticks(np.arange(int(ts[0]), int(ts[-1]), 10))
    plt.title(name + " Session Spectogram")
    plt.ylabel('Size [B]')
    plt.xlabel('Time [sec]')
    plt.grid(True)
    plt.show()


def session_atricle_spectogram(ts, sizes, fpath=None, show=True, tps=None):
    if tps is None:
        max_delta_time = ts[-1] - ts[0]
    else:
        max_delta_time = tps

    ts_norm = ((np.array(ts) - ts[0]) / max_delta_time) * MTU
    plt.figure()
    plt.scatter(ts_norm, sizes, marker=',', c='k', s=5)
    plt.ylim(0, MTU)
    plt.xlim(0, MTU)
    plt.ylabel('Packet Size [B]')
    plt.xlabel('Normalized Arrival Time')
    plt.set_cmap('binary')
    plt.axes().set_aspect('equal')
    plt.grid(False)
    if fpath is not None:
        plt.savefig(fpath, bbox_inches='tight')
    if show:
        plt.show()
    plt.close()


def session_histogram(sizes, plot=False):
    hist, bin_edges = np.histogram(sizes, bins=range(0, MTU + 1, 1))
    if plot:
        plt.bar(bin_edges[:-1], hist, width=1)
        plt.xlim(min(bin_edges), max(bin_edges) + 100)
        plt.show()
    return hist.astype(np.uint16)


def session_2d_histogram(ts, sizes, plot=False, tps=None):
    if tps is None:
        max_delta_time = ts[-1] - ts[0]
    else:
        max_delta_time = tps

    # Normalize arrival times to [0, MTU] so the histogram is square
    ts_norm = ((np.array(ts) - ts[0]) / max_delta_time) * MTU
    H, xedges, yedges = np.histogram2d(sizes, ts_norm, bins=(range(0, MTU + 1, 1), range(0, MTU + 1, 1)))

    if plot:
        plt.pcolormesh(xedges, yedges, H)
        plt.colorbar()
        plt.xlim(0, MTU)
        plt.ylim(0, MTU)
        plt.set_cmap('binary')
        plt.show()
    return H.astype(np.uint16)


def export_dataset(dataset):
    print("Start export dataset")
    np.save(os.path.splitext(INPUT)[0], dataset)
    print(dataset.shape)


def export_class_dataset(dataset, class_dir):
    print("Start export dataset")
    np.save(class_dir + "/" + "_".join(re.findall(r"[\w']+", class_dir)[-2:]), dataset)
    print(dataset.shape)


def import_dataset():
    print("Import dataset")
    dataset = np.load(os.path.splitext(INPUT)[0] + ".npy")
    print(dataset.shape)
    return dataset


def traffic_csv_converter(file_path):
    print("Running on " + file_path)
    dataset = []
    counter = 0
    with open(file_path, 'r') as csv_file:
        reader = csv.reader(csv_file)
        for i, row in enumerate(reader):
            session_tuple_key = tuple(row[:8])
            length = int(row[7])
            ts = np.array(row[8:8 + length], dtype=float)
            sizes = np.array(row[9 + length:], dtype=int)

            if length > 10:
                # Slide a TPS-second window over the session in DELTA_T steps
                for t in range(int(ts[-1] / DELTA_T - TPS / DELTA_T) + 1):
                    mask = ((ts >= t * DELTA_T) & (ts <= (t * DELTA_T + TPS)))
                    ts_mask = ts[mask]
                    sizes_mask = sizes[mask]
                    if len(ts_mask) > 10 and ts_mask[-1] - ts_mask[0] > MIN_TPS:
                        h = session_2d_histogram(ts_mask, sizes_mask)
                        dataset.append([h])
                        counter += 1
                        if counter % 100 == 0:
                            print(counter)

    return np.asarray(dataset)


def traffic_csv_converter_splitted(file_path):
    def split_converter(ts, sizes, dataset, counter):
        if ts[-1] - ts[0] > MIN_TPS and len(ts) > 20:
            h = session_2d_histogram(ts - ts[0], sizes)
            dataset.append([h])
            counter += 1

        total_time = ts[-1] - ts[0]
        if total_time > TPS:
            # Integer division: np.split needs integer indices in Python 3
            for ts_split, sizes_split in zip(np.split(ts, [len(ts) // 2]), np.split(sizes, [len(sizes) // 2])):
                split_converter(ts_split, sizes_split, dataset, counter)

    print("Running on " + file_path)
    dataset = []
    counter = 0
    with open(file_path, 'r') as csv_file:
        reader = csv.reader(csv_file)
        for i, row in enumerate(reader):
            session_tuple_key = tuple(row[:8])
            length = int(row[7])
            ts = np.array(row[8:8 + length], dtype=float)
            sizes = np.array(row[9 + length:], dtype=int)

            if length > 10:
                split_converter(ts, sizes, dataset, counter)

    return np.asarray(dataset)


def traffic_class_converter(dir_path):
    dataset_tuple = ()
    for file_path in [os.path.join(dir_path, fn) for fn in next(os.walk(dir_path))[2] if (".csv" in os.path.splitext(fn)[-1])]:
        dataset_tuple += (traffic_csv_converter(file_path),)

    return np.concatenate(dataset_tuple, axis=0)


def iterate_all_classes():
    for class_dir in glob.glob(CLASSES_DIR):
        if "other" not in class_dir:
            print("working on " + class_dir)
            dataset = traffic_class_converter(class_dir)
            print(dataset.shape)
            export_class_dataset(dataset, class_dir)


def random_sampling_dataset(input_array, size=2000):
    print("Import dataset " + input_array)
    dataset = np.load(input_array)
    print(dataset.shape)
    p = size * 1.0 / len(dataset)
    print(p)
    if p >= 1:
        raise Exception

    mask = np.random.choice([True, False], len(dataset), p=[p, 1 - p])
    dataset = dataset[mask]
    print("Start export dataset")

    np.save(os.path.splitext(input_array)[0] + "_samp", dataset)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, default=INPUT, help='Path to csv file')

    FLAGS = parser.parse_args()

    # iterate_all_classes()
    # dataset = traffic_class_converter(INPUT_DIR)
    # dataset = traffic_csv_converter(INPUT)

    input_array = "./_dataset/FlowPic/browsing_reg.npy"
    random_sampling_dataset(input_array)

    # export_class_dataset(dataset)
    # import_dataset()
@@ -0,0 +1,40 @@
import pandas as pd


def drop_unique_columns(df: pd.DataFrame):
    nunique = df.nunique()                       # Count the unique values in each column
    cols_to_drop = nunique[nunique == 1].index   # Find the columns with only one unique value
    df.drop(cols_to_drop, axis=1, inplace=True)  # Drop those columns
    print(cols_to_drop)                          # Print the names of the dropped columns
    return df


def select_label_rows(df: pd.DataFrame, label: str):
    # Keep only the rows whose label (the last column) equals `label`
    return df[df.iloc[:, -1] == label]


def drop_columns_with_fix_up(df: pd.DataFrame, columns: list):
    # Drop the columns both with and without the leading space that
    # appears in some CICIDS2017 column names
    columns = [w.lstrip() for w in columns]
    df = drop_columns(df, columns)
    columns = [" " + w for w in columns]
    df = drop_columns(df, columns)
    return df


def drop_columns(df: pd.DataFrame, columns: list):
    # Drop every column whose name matches one of the given patterns
    columns = [w.lstrip() for w in columns]
    for column_name in columns:
        cols_to_drop = df.filter(regex=column_name).columns
        df.drop(cols_to_drop, axis=1, inplace=True)
    return df


def get_ddos_df(df: pd.DataFrame):
    # Convenience wrapper: the DDoS-only subset of the frame
    return df[df.iloc[:, -1] == 'DDoS']


def is_in_bypass_list(column_name: str, bypass_list: tuple) -> bool:
    for bypass in bypass_list:
        if bypass in column_name:
            return True
    return False
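
# A hedged usage example for the helpers above (the toy DataFrame and its
# values are illustrative only, not from the repository):
#
#     toy = pd.DataFrame({'a': [1, 2, 3], 'const': [0, 0, 0],
#                         'Label': ['DDoS', 'BENIGN', 'DDoS']})
#     toy = drop_unique_columns(toy)    # drops 'const' (single unique value)
#     print(select_label_rows(toy, 'DDoS'))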

@@ -0,0 +1,29 @@
from scapy.all import *
from loguru import logger


def split_pcap(file_path: str, chunk_size: int, save_base_path: str = None):
    # Stream the pcap and write it back out in chunks of `chunk_size` packets
    packets = PcapReader(file_path)
    chunk = []
    counter = 1
    for packet in packets:
        # logger.info(packet.time)
        chunk.append(packet)
        if len(chunk) == chunk_size:
            wrpcap(f'{save_base_path}/chunk_{counter}.pcap', chunk)
            chunk = []
            logger.info(f'chunk_{counter}.pcap saved')
            counter += 1

    # Write any remaining packets as a final, smaller chunk
    if chunk:
        wrpcap(f'{save_base_path}/chunk_{counter}.pcap', chunk)


def get_packet_time(pkt: Packet):
    return pkt.time


if __name__ == '__main__':
    from utils.files import create_dir
    create_dir('../_dataset/pcap/Friday-WorkingHours')
    split_pcap('../_dataset/pcap/Friday-WorkingHours.pcap', 10000, '../_dataset/pcap/Friday-WorkingHours')