In [None]:
# Capture to supress the download ouput
%%capture
!pip install gradio
!pip install pandas
!pip install transformers
!pip install parsezeeklogs
!pip install elasticsearch

In [None]:
# Define imports for model use
import torch
from transformers import pipeline
from parsezeeklogs import ParseZeekLogs
from transformers import BertTokenizer
import gradio as gr
import pandas as pd

# Define model
# Builds the Hugging Face pipeline used by predictFromSentences: the model weights come from the
# Hub checkpoint named below and are paired with the stock "bert-base-cased" BertTokenizer.
# Running this line downloads both (see the progress bars in the cell output).
# NOTE(review): the ONNX export cell further down loads the same model name under the
# "19kmunz/" namespace instead of "yashika0998/" - confirm which checkpoint is intended.
pipe = pipeline(model="yashika0998/IoT-23-BERT-Network-Logs-Classification", tokenizer=BertTokenizer.from_pretrained("bert-base-cased"))

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# String constants shared by the prediction helpers and the Gradio UI.
# The three input modes double as widget labels / dictionary keys.
LOG, HEADER_TABLE, SENTENCES = "conn.log Output", "Headers Table", "Sentences"

# Key under which prediction results are returned
OUT = "out"

# Choices offered by the "Input" radio button
INPUT_TYPES = [LOG, HEADER_TABLE, SENTENCES]

# Intermediate tables the user may choose to display
STEPS = [HEADER_TABLE, SENTENCES]

# conn.log columns that are fed to the model
HEADERS = [
    'id.resp_p',
    'proto',
    'conn_state',
    'orig_pkts',
    'orig_ip_bytes',
    'resp_ip_bytes',
]

In [None]:
# Define sentence-ization functions
# Maps each supported column name to the phrase used for it in a generated sentence
feature_names = {
    'id.resp_p': 'response port',
    'proto': 'transport protocol',
    'orig_pkts': 'number of packets sent by the origin',
    'conn_state': 'connection state',
    'orig_ip_bytes': 'number of IP level bytes sent by the originator',
    'resp_ip_bytes': 'number of IP level bytes sent by the responder',
}

def make_sentence(row):
  """Turn one observation into a dict of per-feature sentences.

  Args:
    row: mapping-like object (anything exposing ``keys()`` and item access)
      from feature name to value.

  Returns:
    A dict with the same keys as ``row``. The ``'label'`` and ``'#'`` entries are
    copied through unchanged; every other value becomes
    ``"<phrase from feature_names> is <value>."``.

  Raises:
    KeyError: if ``row`` holds a key that is neither ``'label'``/``'#'`` nor
      present in ``feature_names``.
  """
  passthrough = ('label', '#')
  return {
      column: (
          row[column]
          if column in passthrough
          else f"{feature_names[column]} is {row[column]!s}."
      )
      for column in row.keys()
  }

# Take all sentence observations and make them into paragraph inputs
def make_paragraphs(ser):
  """Join each observation's feature sentences into one space-separated paragraph.

  Args:
    ser: pandas Series whose elements map the six feature names below to
      sentence strings (the dicts produced by ``make_sentence``).

  Returns:
    A single-column DataFrame named "Sentences" that keeps the index of ``ser``.
  """
  # Fixed order in which the feature sentences appear in a paragraph
  ordered_fields = ('id.resp_p', 'proto', 'conn_state', 'orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes')
  joined = [
      " ".join(obs[field] for field in ordered_fields)
      for _, obs in ser.items()
  ]
  paragraphs = pd.Series(joined, name="Sentences", index=ser.index)
  return paragraphs.to_frame()

In [None]:
# Define prediction Functions For Different Settings
def predictFromSentences(sentenceTable):
  """Classify every paragraph in the sentence table with the module-level ``pipe``.

  Args:
    sentenceTable: DataFrame with a ``SENTENCES`` column of text inputs.

  Returns:
    ``{OUT: DataFrame}`` where the frame has one "Output" column holding
    "Malicious" for predictions labelled "LABEL_0" and "Benign" for anything else.
  """
  predictions = pipe(sentenceTable[SENTENCES].tolist()) # This does the prediction!
  verdicts = []
  for prediction in predictions:
    if prediction['label'] == "LABEL_0":
      verdicts.append("Malicious")
    else:
      verdicts.append("Benign")
  return { OUT: pd.DataFrame({"Output": verdicts}) }

def predictFromHeaderTable(headerTable):
  """Convert header rows to paragraphs and classify them.

  Args:
    headerTable: DataFrame whose rows are passed one by one to ``make_sentence``.

  Returns:
    A dict with the generated paragraphs under ``SENTENCES`` and the prediction
    frame from ``predictFromSentences`` under ``OUT``.
  """
  # Row-wise: each row becomes a dict of sentences, then one paragraph per row
  paragraphs = make_paragraphs(headerTable.apply(make_sentence, axis=1))
  predictions = predictFromSentences(paragraphs)
  return {SENTENCES: paragraphs, OUT: predictions[OUT]}

def predictFromFileUpload(fileUpload):
  """Load an uploaded CSV or Zeek log, then run the header-table prediction on it.

  Args:
    fileUpload: path of the uploaded file as a string, or ``None`` when nothing
      was uploaded.

  Returns:
    A dict with the parsed columns under ``HEADER_TABLE``, the generated
    paragraphs under ``SENTENCES`` and the predictions under ``OUT``.

  Raises:
    gr.Error: if no file was uploaded or the file extension is not one of
      ``csv``, ``log`` or ``labeled``.
  """
  import io  # local import so this cell does not depend on an edit to the imports cell

  if fileUpload is None:
    raise gr.Error("No file uploaded")

  # Extension is compared case-insensitively so e.g. "CONN.LOG" is accepted too
  fileType = fileUpload.split('.')[-1].lower()
  if fileType == 'csv':
    dataFrame = pd.read_csv(fileUpload, usecols=HEADERS)
  elif fileType in ('log', 'labeled'):
    # Convert the Zeek log to CSV text in memory; a fixed scratch file on disk
    # could be overwritten by another request running at the same time.
    csvBuffer = io.StringIO()
    for log_record in ParseZeekLogs(fileUpload, output_format="csv", safe_headers=False, fields=HEADERS):
      if log_record is not None:
        csvBuffer.write(log_record + "\n")
    csvBuffer.seek(0)
    dataFrame = pd.read_csv(csvBuffer, names=HEADERS)
  else:
    # Previously fell through with `dataFrame` unbound and crashed with UnboundLocalError
    raise gr.Error("Unsupported file type '." + fileType + "': expected .csv, .log or .log.labeled")

  result = predictFromHeaderTable(dataFrame)
  return {
      HEADER_TABLE: dataFrame,
      SENTENCES: result[SENTENCES],
      OUT: result[OUT]
  }

In [None]:
def makeIndexColumn(allInputs):
  """Prepend a '#' column holding each frame's index to the known result tables.

  Only the ``SENTENCES``, ``HEADER_TABLE`` and ``OUT`` entries are touched, and
  only when present. The DataFrames passed in are not modified; each entry is
  replaced by a new frame.

  Args:
    allInputs: dict mapping table names to DataFrames.

  Returns:
    The same dict object, with the affected entries replaced.
  """
  def _makeIndexColumnFor(column):
    # Work on a copy so the caller's DataFrame is not mutated
    indexed = allInputs[column].copy()
    # Skip an existing '#' column, otherwise it would be listed twice below
    otherHeaders = [header for header in indexed.columns if header != '#']
    indexed['#'] = indexed.index
    allInputs[column] = indexed[['#', *otherHeaders]]

  for column in (SENTENCES, HEADER_TABLE, OUT):
    if column in allInputs:
      _makeIndexColumnFor(column)
  return allInputs

def predict(inputType, fileUpload, headerTable, sentenceTable, out):
  """Button callback: run the prediction path that matches the selected input type.

  Args:
    inputType: one of ``LOG``, ``HEADER_TABLE`` or ``SENTENCES``.
    fileUpload: uploaded file, used when ``inputType`` is ``LOG``.
    headerTable: header DataFrame, used when ``inputType`` is ``HEADER_TABLE``.
    sentenceTable: sentence DataFrame, used when ``inputType`` is ``SENTENCES``.
    out: current predictions table; accepted but not read.

  Returns:
    ``[header table, sentence table, predictions]``. Tables that the chosen path
    does not regenerate are passed back as received. Returns ``None`` when
    ``inputType`` matches none of the three known values.
  """
  if inputType == LOG:
    # Process File Upload
    fromFile = makeIndexColumn(predictFromFileUpload(fileUpload))
    return [fromFile[HEADER_TABLE], fromFile[SENTENCES], fromFile[OUT]]

  if inputType == HEADER_TABLE:
    # Process Header Table
    fromHeaders = makeIndexColumn(predictFromHeaderTable(headerTable))
    return [headerTable, fromHeaders[SENTENCES], fromHeaders[OUT]]

  if inputType == SENTENCES:
    # Process From Sentences
    fromSentences = makeIndexColumn(predictFromSentences(sentenceTable))
    return [headerTable, sentenceTable, fromSentences[OUT]]

  return None

# Update UI
def updateInputOutputBlocks(inputType, steps):
  """Rebuild the three input components so they reflect the current settings.

  Args:
    inputType: selected input mode (``LOG``, ``HEADER_TABLE`` or ``SENTENCES``).
    steps: collection of intermediate tables the user asked to display.

  Returns:
    ``(fileUpload, headerTable, sentenceTable)`` - new ``gr.File`` /
    ``gr.Dataframe`` objects. A component is interactive only when it is the
    selected input; the two tables are also visible when listed in ``steps``.
  """
  usesLog = inputType == LOG
  usesHeaders = inputType == HEADER_TABLE
  usesSentences = inputType == SENTENCES

  # Update visibility and Interactivity of Gradio Blocks based on Settings
  fileUpload = gr.File(visible=usesLog, interactive=int(usesLog))
  headerTable = gr.Dataframe(
      visible=(usesHeaders or HEADER_TABLE in steps),
      interactive=int(usesHeaders),
  )
  sentenceTable = gr.Dataframe(
      interactive=int(usesSentences),
      visible=(usesSentences or SENTENCES in steps),
  )
  return fileUpload, headerTable, sentenceTable

In [None]:
# Create Gradio UI
# Layout: the three input/intermediate components, the predictions table and the Run
# button, followed by a collapsed "Settings" accordion that controls which components
# are visible and editable.
with gr.Blocks() as app:
    gr.Markdown("""
    # Network Log Predictions
    Input log information below and click 'Run' to get predictions from our model!
    Access the settings at the bottom for different types of input and to see inbetween steps.
    """)
    # Inputs / Outputs
    fileUpload = gr.File(file_types=[".log", ".log.labeled", ".csv"], label="Zeek Log File", visible=False, file_count='single')
    headerTable = gr.Dataframe(row_count = (2, "dynamic"), col_count=(7,"fixed"), headers=['#', *HEADERS], label="Header Inputs", interactive=True)
    # The text column must be named SENTENCES: predictFromSentences reads sentenceTable[SENTENCES],
    # so any other header makes directly typed sentences fail with a KeyError.
    sentenceTable = gr.Dataframe(row_count = (2, "dynamic"), col_count=(2, "fixed"), headers=["#", SENTENCES], label="Sentences", interactive=False, visible=False)
    out = gr.Dataframe(row_count = (2, "dynamic"), col_count=(2, "fixed"), headers=['#', "Output"], label="Predictions", column_widths=["60px", "100%"])
    btn = gr.Button("Run")

    # Settings
    with gr.Accordion("Settings", open=False):
        # Default matches the header table being the only interactive input at start-up
        inputType = gr.Radio(INPUT_TYPES, value=HEADER_TABLE, label="Input")
        steps = gr.CheckboxGroup(STEPS, label="Display Intermediary Steps")
        # Both settings feed the same refresh callback
        inputType.change(
            fn=updateInputOutputBlocks,
            inputs=[inputType, steps],
            outputs=[fileUpload, headerTable, sentenceTable]
        )
        steps.change(
            fn=updateInputOutputBlocks,
            inputs=[inputType, steps],
            outputs=[fileUpload, headerTable, sentenceTable]
        )

    # Assign Callback
    btn.click(
        fn=predict,
        inputs=[inputType, fileUpload, headerTable, sentenceTable, out],
        outputs=[headerTable, sentenceTable, out]
    )


app.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://275d064dabebea6639.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




### Export the model in ONNX format


In [None]:
# Shell installs for the ONNX export section (versions are not pinned).
# NOTE(review): the export cells shown below call torch.onnx.export directly; optimum and
# onnxruntime are not referenced in the visible cells - confirm they are still needed.
!pip install transformers[onnx]
!pip install optimum
!pip install onnxruntime

In [None]:
import torch

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizer
# Tokenizer used to build the dummy export input (stock "bert-base-cased" vocabulary)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Sequence-classification model to export.
# NOTE(review): the Gradio pipeline earlier in this notebook loads the same model name under
# the "yashika0998/" namespace rather than "19kmunz/" - confirm which checkpoint is intended.
model = AutoModelForSequenceClassification.from_pretrained("19kmunz/IoT-23-BERT-Network-Logs-Classification")

In [None]:
# Example input for the ONNX export: one sample paragraph in the format produced by
# make_sentence/make_paragraphs, repeated model.config.max_position_embeddings times.
# NOTE(review): `[text] * N` builds a list of N identical sentences, so the tokenizer is given
# N separate inputs (presumably N rows in input_ids), not one sequence of N tokens - confirm
# this batch size is what the export is meant to use.
dummy_input = ["response port is 8081. transport protocol is tcp. connection state is S0. number of packets sent by the origin is 2. number of IP level bytes sent by the originator is 80. number of IP level bytes sent by the responder is 0"] * model.config.max_position_embeddings
# Keep only the token-id tensor; the other tokenizer outputs are discarded
dummy_input = tokenizer(dummy_input, return_tensors="pt").input_ids

In [None]:
import torch
# Export the model, including its parameters (export_params=True), to an ONNX file in the
# current working directory, using dummy_input as the example input.
# NOTE(review): no dynamic_axes/input_names are passed - presumably the exported graph is
# tied to dummy_input's exact shape; confirm that is acceptable for the consumer.
torch.onnx.export(model, dummy_input, "IoT-23-BERT-Network-Logs-Classification.onnx", export_params=True)

In [None]:
# Colab-only: mount Google Drive at /content/drive so the export below can write to it
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Google Drive folder that receives the exported ONNX model (requires Drive to be mounted)
export_dir = "/content/drive/MyDrive/CS513 Final Project/ONNX_model"
# Create the folder first: the export cannot write into a directory that does not exist
os.makedirs(export_dir, exist_ok=True)
torch.onnx.export(model, dummy_input, os.path.join(export_dir, "IoT-23-BERT-Network-Logs-Classification.onnx"), export_params=True)