File size: 23,848 Bytes

43cec6e

{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Install Necessary Packages"
      ],
      "metadata": {
        "id": "GUB8N3k9fq-E"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zt59bSq5vcnA"
      },
      "outputs": [],
      "source": [
        "# Install dependencies into the environment of the running kernel.\n",
        "# %pip (rather than !pip) targets the interpreter this notebook is actually using.\n",
        "%pip install datasets evaluate \"transformers[sentencepiece]\"\n",
        "%pip install huggingface_hub\n",
        "%pip install pandas\n",
        "# The PyPI distribution is called imbalanced-learn; it is imported as `imblearn`.\n",
        "%pip install imbalanced-learn\n",
        "%pip install torch"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Load the Dataset"
      ],
      "metadata": {
        "id": "9lyEyWBic5RN"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QJDszQKe6oxK"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "from datasets import ClassLabel, Features, Value, load_dataset\n",
        "\n",
        "# Download the dataset from the Hugging Face Hub (all available splits)\n",
        "dataset = load_dataset(\"19kmunz/iot-23-preprocessed-minimumcolumns\")\n",
        "\n",
        "# Quick sanity check of the size of what was loaded\n",
        "print(dataset.shape)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wRjakUpXD3D9"
      },
      "source": [
        "# Oversample the Dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wzU5AHGxD2Ut"
      },
      "outputs": [],
      "source": [
        "from imblearn.over_sampling import SMOTE\n",
        "from sklearn.preprocessing import OneHotEncoder\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "from sklearn.model_selection import train_test_split"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mT027c7R1t7n"
      },
      "outputs": [],
      "source": [
        "# Work on the 'train' split as a pandas DataFrame from here on\n",
        "df = dataset[\"train\"].to_pandas()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "v2l9xGpr6bZc"
      },
      "outputs": [],
      "source": [
        "# Input columns and the target column\n",
        "features = ['id.resp_p', 'proto', 'conn_state', 'orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n",
        "\n",
        "X = df.loc[:, features]  # feature frame\n",
        "y = df.loc[:, 'label']   # target series"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SlFbgG_69B1K"
      },
      "source": [
        "The ADASYN and SMOTE oversampling algorithms expect numeric data, but features like `proto` are non-numeric categorical columns. SMOTE cannot handle string values like 'tcp' in those columns. So, I applied label encoding to the categorical columns and then applied SMOTE."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8zSNEGIiWjMZ"
      },
      "outputs": [],
      "source": [
        "# Categorical columns that need a numeric representation before oversampling\n",
        "cat_cols = ['proto', 'conn_state']\n",
        "\n",
        "# Keep the fitted encoders and the encoded arrays, keyed by the original column name\n",
        "label_encoders = {}\n",
        "label_encoded_columns = {}\n",
        "\n",
        "for col in cat_cols:\n",
        "    le = LabelEncoder()\n",
        "    label_encoded = le.fit_transform(df[col])\n",
        "    df[col + '_label'] = label_encoded  # adds <col>_label alongside the original column\n",
        "    label_encoders[col] = le\n",
        "    label_encoded_columns[col] = label_encoded\n",
        "\n",
        "# Numeric columns are used unchanged\n",
        "num_cols = ['id.resp_p','orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n",
        "X_num = df[num_cols]\n",
        "\n",
        "# Feature matrix: label-encoded categoricals first, then the numeric columns\n",
        "X_combined = pd.concat([df[['proto_label', 'conn_state_label']], X_num], axis=1)\n",
        "\n",
        "# Original string labels, and the binary target (0 = \"Benign\", 1 = everything else)\n",
        "y_os = df['label']\n",
        "y_os1 = df['label'].apply(lambda x: 0 if x == \"Benign\" else 1)\n",
        "\n",
        "# Oversample to 5000 rows per class.\n",
        "# random_state is fixed (same seed as the train/test split) so the synthetic rows are reproducible.\n",
        "# NOTE(review): SMOTE interpolates between neighbours, so the label-encoded categorical columns can\n",
        "# receive non-integer codes; consider SMOTENC if the codes must stay valid categories - confirm.\n",
        "smote = SMOTE(sampling_strategy={0: 5000, 1: 5000}, random_state=42)\n",
        "X_combined_os, Y_combined_os = smote.fit_resample(X_combined, y_os1)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Class counts after oversampling, followed by the shape of the feature matrix\n",
        "print(Y_combined_os.value_counts(), X_combined_os.shape, sep=\"\\n\")"
      ],
      "metadata": {
        "id": "mZ1iMnEIkVAj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Split the Dataset"
      ],
      "metadata": {
        "id": "oO9g2nhlbr3o"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OzJI6451n4tE"
      },
      "outputs": [],
      "source": [
        "# Column layout of the oversampled feature matrix and of the target\n",
        "column_names = ['proto_label', 'conn_state_label', 'id.resp_p','orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n",
        "result_column = ['label']\n",
        "\n",
        "# Wrap the oversampled outputs in DataFrames restricted to those columns\n",
        "X_combined_os_df = pd.DataFrame(X_combined_os, columns=column_names)\n",
        "Y_combined_os_df = pd.DataFrame(Y_combined_os, columns=result_column)\n",
        "\n",
        "# Shape first, then a preview of the first rows\n",
        "print(X_combined_os_df.shape, X_combined_os_df.head(), sep=\"\\n\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YwnVJ7RqKFRD"
      },
      "outputs": [],
      "source": [
        "# Step 1: keep 80% of the oversampled rows for training, hold out the remaining 20%\n",
        "X_train, X_temp, y_train, y_temp = train_test_split(\n",
        "    X_combined_os_df, Y_combined_os_df, test_size=0.2, random_state=42\n",
        ")\n",
        "\n",
        "# Step 2: divide the hold-out set - 80% of it becomes validation, 20% becomes test\n",
        "# (overall: 80% train / 16% validation / 4% test)\n",
        "# NOTE(review): confirm the small test share is intended rather than a 50/50 split of the hold-out\n",
        "X_val, X_test, y_val, y_test = train_test_split(\n",
        "    X_temp, y_temp, test_size=0.2, random_state=42\n",
        ")\n",
        "\n",
        "print(\"Oversampled dataset shape:\", X_combined_os.shape)\n",
        "print(\"X_train shape:\", X_train.shape)\n",
        "print(\"X_test shape:\", X_test.shape)\n",
        "print(\"X_val shape:\", X_val.shape)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WHobtry9LI_d"
      },
      "source": [
        "# Tokenize the Dataset"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Run one of the following cells if loading from local files. Otherwise, `X_train` and `y_train` are already defined."
      ],
      "metadata": {
        "id": "3UMlgohccAPg"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "\n",
        "# Reload the training split from CSV; the first CSV column is used as the index\n",
        "X_train = pd.read_csv('X_train.csv', index_col=0)\n",
        "y_train = pd.read_csv('y_train.csv', index_col=0)"
      ],
      "metadata": {
        "id": "z2BM318ufee_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# torch is imported here because this optional cell runs before the tokenization cells,\n",
        "# which hold the first `import torch` on the normal path.\n",
        "import torch\n",
        "\n",
        "# Reload previously saved encodings from the working directory\n",
        "train_encodings = torch.load('train_encodings.pt')\n",
        "val_encodings = torch.load('val_encodings.pt')\n",
        "test_encodings = torch.load('test_encodings.pt')"
      ],
      "metadata": {
        "id": "sVTr9fxIMZl9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Otherwise, Continue running here"
      ],
      "metadata": {
        "id": "j0FyKqdMezWv"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "U09fvCzaMn2P"
      },
      "outputs": [],
      "source": [
        "# Human-readable phrase for every feature column, used by make_sentence\n",
        "feature_names = {\n",
        "    'id.resp_p': 'response port',\n",
        "    'proto_label': 'transport protocol',\n",
        "    'orig_pkts': 'number of packets sent by the origin',\n",
        "    'conn_state_label': 'connection state',\n",
        "    'orig_ip_bytes': 'number of IP level bytes sent by the originator',\n",
        "    'resp_ip_bytes': 'number of IP level bytes sent by the responder',\n",
        "}\n",
        "\n",
        "\n",
        "def make_sentence(row):\n",
        "    \"\"\"Turn one observation into a dict of short English sentences.\n",
        "\n",
        "    Args:\n",
        "        row: mapping-like object (anything offering .keys() and item access)\n",
        "            from column name to value. A 'label' key, if present, is skipped.\n",
        "\n",
        "    Returns:\n",
        "        dict mapping each non-label column name to \"<phrase> is <value>.\".\n",
        "\n",
        "    Raises:\n",
        "        KeyError: if a non-label column has no entry in feature_names.\n",
        "    \"\"\"\n",
        "    return {\n",
        "        column: f\"{feature_names[column]} is {row[column]!s}.\"\n",
        "        for column in row.keys()\n",
        "        if column != 'label'\n",
        "    }"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Fe_vj8hO9dNw"
      },
      "outputs": [],
      "source": [
        "def make_paragraphs(ser):\n",
        "    \"\"\"Join the per-feature sentences of every observation into one paragraph.\n",
        "\n",
        "    Args:\n",
        "        ser: pandas Series whose values are dicts holding the six feature\n",
        "            sentences, keyed by column name.\n",
        "\n",
        "    Returns:\n",
        "        pandas Series of space-separated paragraphs sharing the index of `ser`.\n",
        "    \"\"\"\n",
        "    # Fixed order in which the sentences appear inside a paragraph\n",
        "    sentence_order = ('id.resp_p', 'proto_label', 'conn_state_label',\n",
        "                      'orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes')\n",
        "    paragraphs = [\n",
        "        \" \".join(sentences[key] for key in sentence_order)\n",
        "        for _, sentences in ser.items()\n",
        "    ]\n",
        "    return pd.Series(paragraphs, index=ser.index)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "bNyv9zOlGaBm"
      },
      "outputs": [],
      "source": [
        "from transformers import BertTokenizer\n",
        "\n",
        "# NOTE: this checkpoint has to be the same one the fine-tuned model is loaded from\n",
        "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")\n",
        "\n",
        "# Training split: row -> per-feature sentences -> one paragraph per row\n",
        "X_train_sentences = X_train.apply(make_sentence, axis=1)\n",
        "X_train_paragraphs = make_paragraphs(X_train_sentences)\n",
        "\n",
        "# Validation split\n",
        "X_val_sentences = X_val.apply(make_sentence, axis=1)\n",
        "X_val_paragraphs = make_paragraphs(X_val_sentences)\n",
        "\n",
        "# Test split\n",
        "X_test_sentences = X_test.apply(make_sentence, axis=1)\n",
        "X_test_paragraphs = make_paragraphs(X_test_sentences)\n",
        "\n",
        "# String versions of the labels, one list per split\n",
        "y_train_str = list(map(str, y_train['label'].tolist()))\n",
        "y_val_str = list(map(str, y_val['label'].tolist()))\n",
        "y_test_str = list(map(str, y_test['label'].tolist()))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "f5bT1RIEW0O7"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "\n",
        "\n",
        "def _tokenize(paragraphs):\n",
        "    \"\"\"Tokenize a Series of paragraphs, padding to the longest one in that Series.\"\"\"\n",
        "    return tokenizer(text=paragraphs.tolist(), padding='longest', truncation=True, return_tensors='pt')\n",
        "\n",
        "\n",
        "# Tokenize every split separately\n",
        "train_encodings = _tokenize(X_train_paragraphs)\n",
        "val_encodings = _tokenize(X_val_paragraphs)\n",
        "test_encodings = _tokenize(X_test_paragraphs)\n",
        "\n",
        "# Label tensors built from the 'label' column of each split\n",
        "y_train_tensor = torch.tensor(y_train['label'].values)\n",
        "y_val_tensor = torch.tensor(y_val['label'].values)\n",
        "y_test_tensor = torch.tensor(y_test['label'].values)\n",
        "\n",
        "# Store the labels next to the token tensors under the 'labels' key\n",
        "train_encodings['labels'] = y_train_tensor\n",
        "val_encodings['labels'] = y_val_tensor\n",
        "test_encodings['labels'] = y_test_tensor"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OV600RIVGlTi"
      },
      "outputs": [],
      "source": [
        "# Persist the encodings (token tensors plus labels) so tokenization can be skipped next time\n",
        "for _encodings, _path in (\n",
        "    (train_encodings, 'train_encodings.pt'),\n",
        "    (val_encodings, 'val_encodings.pt'),\n",
        "    (test_encodings, 'test_encodings.pt'),\n",
        "):\n",
        "    torch.save(_encodings, _path)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Finally, prepare dataset as Hugging Face Dataset"
      ],
      "metadata": {
        "id": "gev2VE5VcnaY"
      }
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZNmaJOCUifpD"
      },
      "source": [
        "### Optional: Load the training, validation, and test encodings from Drive or from local files"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Colab only: make Google Drive available under /content/drive\n",
        "from google.colab import drive\n",
        "\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "id": "7NlSBStpD_rO"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# %pip installs into the environment of the running kernel.\n",
        "# NOTE(review): if torch was already imported in this session, restart the runtime after this cell\n",
        "# so the pinned version is the one that gets loaded - confirm.\n",
        "%pip install torch==2.1.0\n",
        "%pip install -U \"transformers[torch]\"\n",
        "%pip install \"optimum[exporters]\""
      ],
      "metadata": {
        "id": "okamUGSAmBYN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "from transformers import BertTokenizer\n",
        "\n",
        "# Reload the saved encodings from the mounted Drive folder.\n",
        "# No separate label files are loaded: the tokenization section stores the labels\n",
        "# inside each encodings object under the 'labels' key before saving.\n",
        "train_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/train_encodings.pt\")\n",
        "val_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/val_encodings.pt\")\n",
        "test_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/test_encodings.pt\")"
      ],
      "metadata": {
        "id": "rVEX0OhgEAJT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "\n",
        "# Alternative to the Drive cell: read the encodings from the current working directory\n",
        "train_encodings = torch.load(\"train_encodings.pt\")\n",
        "val_encodings = torch.load(\"val_encodings.pt\")\n",
        "test_encodings = torch.load(\"test_encodings.pt\")"
      ],
      "metadata": {
        "id": "Jxbp-oouNHsT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# (number of examples, padded sequence length) of the training token ids\n",
        "print(train_encodings['input_ids'].shape)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YY7xwbuZlhK4",
        "outputId": "3faf0705-93f8-456e-8dbf-22b406314766"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "torch.Size([8000, 67])\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Otherwise, continue running here"
      ],
      "metadata": {
        "id": "dlTL3uj1fKF6"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import Dataset\n",
        "\n",
        "# The complete encodings are used for every split (no sub-sampling).\n",
        "# The validation encodings are passed directly instead of being bound to a variable\n",
        "# called `eval`, which would shadow Python's builtin eval() for the rest of the session.\n",
        "train = train_encodings\n",
        "test = test_encodings\n",
        "\n",
        "# Tokenizer output (token tensors + 'labels') -> Hugging Face Datasets for the Trainer\n",
        "train_dataset = Dataset.from_dict(train)\n",
        "eval_dataset = Dataset.from_dict(val_encodings)\n",
        "test_dataset = Dataset.from_dict(test)"
      ],
      "metadata": {
        "id": "llZN2akWHxe5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SRLNGcQFvJAa"
      },
      "source": [
        "# Fine-tune BERT for benign vs malicious"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xf2CGlW1dLlH"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import torch.nn as nn\n",
        "from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup\n",
        "from transformers import Trainer, TrainingArguments\n",
        "from torch.utils.data import DataLoader, TensorDataset, random_split\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.utils.class_weight import compute_class_weight"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import evaluate\n",
        "\n",
        "# One metric object that reports accuracy and F1 in a single compute() call\n",
        "combined_metrics = evaluate.combine([\"accuracy\", \"f1\"])"
      ],
      "metadata": {
        "id": "maPzffCsAS__"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def compute_metrics(eval_pred):\n",
        "    \"\"\"Compute and print accuracy and F1 for one evaluation pass.\n",
        "\n",
        "    Args:\n",
        "        eval_pred: pair that unpacks into (logits, labels); the predicted class\n",
        "            is the argmax over the last axis of the logits.\n",
        "\n",
        "    Returns:\n",
        "        The dict returned by combined_metrics.compute, holding at least the\n",
        "        'accuracy' and 'f1' entries.\n",
        "    \"\"\"\n",
        "    logits, labels = eval_pred\n",
        "    predictions = np.argmax(logits, axis=-1)\n",
        "    results = combined_metrics.compute(predictions=predictions, references=labels)\n",
        "    # Accuracy is reported as a fraction; the '%' format spec scales it by 100 so the\n",
        "    # printed number really is a percentage (a bare '%' after the fraction was misleading).\n",
        "    print(f\"Accuracy: {results['accuracy']:.1%} | F1: {results['f1']:.3f}\")\n",
        "    return results"
      ],
      "metadata": {
        "id": "Subi5OZxAvlh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load pretrained BERT model.\n",
        "# The checkpoint has to match the tokenizer that produced the encodings (bert-base-cased):\n",
        "# an uncased checkpoint uses a different vocabulary, so the token ids would select the wrong embeddings.\n",
        "model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)\n",
        "\n",
        "# OR Load local model\n",
        "# model = BertForSequenceClassification.from_pretrained('./model', num_labels=2)"
      ],
      "metadata": {
        "id": "OWLPaQn9ysMg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Training configuration\n",
        "training_args = TrainingArguments(\n",
        "    # where checkpoints and logs are written\n",
        "    output_dir='./results',\n",
        "    logging_dir='./logs',\n",
        "    # optimisation schedule\n",
        "    num_train_epochs=6,\n",
        "    per_device_train_batch_size=32,\n",
        "    warmup_steps=500,\n",
        "    weight_decay=0.01,\n",
        "    # log, evaluate and checkpoint once per epoch\n",
        "    logging_strategy='epoch',\n",
        "    evaluation_strategy='epoch',\n",
        "    save_strategy='epoch',\n",
        "    # NOTE(review): these step settings look redundant while both strategies are 'epoch' - confirm\n",
        "    eval_steps=0.10,\n",
        "    save_steps=0.10,\n",
        "    # keep at most two checkpoints and reload the best one when training finishes\n",
        "    save_total_limit=2,\n",
        "    load_best_model_at_end=True,\n",
        ")\n",
        "\n",
        "# Trainer wiring: model, arguments, train/validation data and the metric callback\n",
        "trainer = Trainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    train_dataset=train_dataset,\n",
        "    eval_dataset=eval_dataset,\n",
        "    compute_metrics=compute_metrics,\n",
        ")\n",
        "\n",
        "# Run fine-tuning\n",
        "trainer.train()"
      ],
      "metadata": {
        "id": "7a-zvoP0j8C8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Show the features and row count of the test Dataset\n",
        "print(test_dataset)"
      ],
      "metadata": {
        "id": "SzDiVYRf23dp"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "TlxpJByQXL_w"
      },
      "outputs": [],
      "source": [
        "# Evaluate on the test split (overrides the validation set the Trainer was built with)\n",
        "trainer.evaluate(eval_dataset=test_dataset)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Write the fine-tuned model to ./model so it can be reloaded with from_pretrained('./model')\n",
        "model.save_pretrained('./model')"
      ],
      "metadata": {
        "id": "dqMkv8aA5Tdk"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Save to Hugging Face"
      ],
      "metadata": {
        "id": "-qKGOqJTWt3a"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import create_repo"
      ],
      "metadata": {
        "id": "m0mCacsshEhy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# %pip installs into the environment of the running kernel\n",
        "%pip install cupy --upgrade"
      ],
      "metadata": {
        "id": "Ba-kOs8WqQTl"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# NOTE(review): this cell used to contain only the bare text below, which is not valid Python\n",
        "# and stops a Restart & Run All. Presumably it is a shared-library name pasted from a CUDA/cupy\n",
        "# error message - confirm and delete the cell if it is no longer needed.\n",
        "# libcuda.so.1"
      ],
      "metadata": {
        "id": "AK2rcA5-qyGh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# %pip installs into the environment of the running kernel\n",
        "%pip install onnxruntime\n",
        "import onnxruntime as rt\n",
        "import onnx\n",
        "import cv2"
      ],
      "metadata": {
        "id": "8z2pir6uo-vM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Export the fine-tuned classifier to ONNX with the Optimum CLI.\n",
        "# --model points at ./model, the directory the fine-tuned model was saved to;\n",
        "# --task is text-classification because the model is a sequence classifier.\n",
        "!optimum-cli export onnx --model ./model --task text-classification ./results/checkpoint-10"
      ],
      "metadata": {
        "id": "WlderhErraWX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# ORTModelForSequenceClassification is provided by Optimum's ONNX Runtime integration,\n",
        "# not by the onnxruntime package itself.\n",
        "from optimum.onnxruntime import ORTModelForSequenceClassification\n",
        "\n",
        "# from_pretrained takes a model id or directory (not a model object);\n",
        "# './model' is the directory the fine-tuned model was saved to.\n",
        "ort_model = ORTModelForSequenceClassification.from_pretrained('./model', export=True)\n",
        "\n",
        "ort_model.save_pretrained(\"./results/checkpoint-10\")"
      ],
      "metadata": {
        "id": "NzPr5eIkZfi7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Export model\n",
        "import torch\n",
        "\n",
        "# One tokenized example is enough to trace the graph; using the whole training set would run a\n",
        "# full forward pass over every row and bake that batch size into the exported model.\n",
        "# The example is moved to the model's device so tracing does not mix CPU and GPU tensors.\n",
        "input_ids = torch.tensor(train_dataset[:1]['input_ids']).to(model.device)\n",
        "\n",
        "torch.onnx.export(model,             # Model being run\n",
        "         input_ids,                  # Example model input\n",
        "         \"IoT23_Log_Prediction.onnx\",# Where to save the model\n",
        "         export_params=True,         # Store model parameters\n",
        "         input_names=['input_ids'],\n",
        "         output_names=['labels'],\n",
        "         # Leave batch size and sequence length variable in the exported graph\n",
        "         dynamic_axes={'input_ids': {0: 'batch', 1: 'sequence'},\n",
        "                       'labels': {0: 'batch'}},\n",
        "         opset_version=11,           # ONNX version\n",
        "         do_constant_folding=True)   # Optimize"
      ],
      "metadata": {
        "id": "ZM8xTkjeTm0c"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "td-xtcTdcoVO",
        "GUB8N3k9fq-E",
        "9lyEyWBic5RN",
        "wRjakUpXD3D9",
        "oO9g2nhlbr3o",
        "3UMlgohccAPg",
        "gev2VE5VcnaY",
        "ZNmaJOCUifpD",
        "L0eqXeQUTpXM"
      ]
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}