CyberNative AI for CyberSecurity | Q/A Evaluation | Colibri_8b_v0.1 scored 74/100!

We tested Colibri_8b_v0.1 on our Cybersecurity Evaluation Dataset (500 Q/A pairs), which we use to measure the performance of cybersecurity models.
Correct (74.0): 370 | Incorrect: 130

We excluded the eval dataset from training to prevent contamination. However, some contamination is still possible due to the way our training dataset was created.
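
As an aside, an exact-match overlap check along the lines of the sketch below is one way to verify such an exclusion. This is illustrative only: the training file name and its "question" field are assumptions, and exact matching will not catch paraphrased duplicates.

# contamination_check.py -- illustrative sketch, not part of the original eval
import jsonlines

def normalize(text):
    return " ".join(text.lower().split())

# Collect eval questions (same file as used in run_eval_cybersec_colibri.py below)
eval_questions = set()
with jsonlines.open("cybersec_qa_eval_500_pairs.jsonl") as reader:
    for obj in reader:
        eval_questions.add(normalize(obj["question"]))

# "training_data.jsonl" and its "question" field are hypothetical placeholders
overlap = 0
with jsonlines.open("training_data.jsonl") as reader:
    for obj in reader:
        if normalize(obj.get("question", "")) in eval_questions:
            overlap += 1

print(f"Exact-match overlap with the eval set: {overlap}")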

eval_colibri_exl2.py

import time
import random
random.seed(time.time())
import torch

print(f"PyTorch version: {torch.__version__}")

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
)

from exllamav2.generator import (
    ExLlamaV2BaseGenerator,
    ExLlamaV2Sampler
)

# CyberNative-AI/Colibri_8b_v0.1
og_model_dir = "Colibri_8b_v0.1"

stop_tkn_id = 128256  # model-specific token id at which generation stops

config = ExLlamaV2Config()
config.debug_mode = True
config.model_dir = og_model_dir
config.prepare()
config.max_seq_len = 8192
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy = True)
model.load_autosplit(cache)
tokenizer = ExLlamaV2Tokenizer(config)
# Initialize generator
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)

def ai_complete(system_prompt, user_message, assistant_pre_message="", max_gen=4096, temperature=0.3):
    # Sampler settings: low temperature and nucleus sampling for mostly deterministic answers
    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = temperature
    settings.top_p = 0.7
    time_begin = time.time()
    system_prompt = system_prompt.strip()
    user_message = user_message.strip()
    # ChatML-style prompt; the assistant turn is pre-filled with assistant_pre_message
    prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n" + assistant_pre_message
    generator.warmup()
    random_seed = time_begin * random.randint(1, 1000) / random.randint(1, 1000)
    og_output = generator.generate_simple(prompt, settings, num_tokens=max_gen, seed=random_seed, stop_token=stop_tkn_id, decode_special_tokens=True, encode_special_tokens=True)
    # generate_simple returns prompt + completion; strip the prompt to keep only the completion
    og_output = og_output.replace(prompt, "").strip()
    return og_output

QA_EXTRACTOR_PROMPT = """You are Colibri, an advanced cybersecurity AI assistant developed by CyberNative AI. Answer with correct answer option only, do not say anything else."""

def Eval_QA(question):
    user_message = question
    # Pre-fill the assistant turn so the model only has to emit the option label
    assistant_pre = "The correct answer is:"
    predicted_text = ai_complete(QA_EXTRACTOR_PROMPT, user_message, assistant_pre, max_gen=5, temperature=0.1).strip()
    # Keep only the first word of the completion, e.g. "B)" rather than the full option text
    predicted_text = predicted_text.split(" ")[0].strip()
    return predicted_text
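
For illustration, a separate call like the one below shows the expected behavior. The question text is a made-up multiple-choice item; actual entries in the eval dataset may be formatted differently.

# Illustrative usage only -- run as a separate script; the question is a hypothetical example
import eval_colibri_exl2

example_question = (
    "Which of the following best describes a SQL injection attack?\n"
    "A) Overflowing a stack buffer\n"
    "B) Injecting malicious SQL through unsanitized input\n"
    "C) Spoofing ARP replies\n"
    "D) Brute-forcing SSH credentials"
)
print(eval_colibri_exl2.Eval_QA(example_question))  # expected to print just the option label, e.g. "B)"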

run_eval_cybersec_colibri.py

# https://huggingface.co/datasets/CyberNative/CyberSecurityEval
eval_dataset_file = "cybersec_qa_eval_500_pairs.jsonl"

import jsonlines

qa_pairs = []

with jsonlines.open(eval_dataset_file) as reader:
    for obj in reader:
        qa_pairs.append(obj)

print(len(qa_pairs))

import eval_colibri_exl2

SCORE_CORRECT = 0
SCORE_INCORRECT = 0

for pair in qa_pairs:
    print("===")
    question = pair["question"]
    answer = pair["answer"]
    answer = answer.replace("The correct answer is: ", "")
    print(f"Question: {question}")
    colibri_answer = eval_colibri_exl2.Eval_QA(question)
    print(f"OG Answer: {answer} | Colibri Answer: {colibri_answer}")
    # normalize both answers: strip "." and ")", lowercase, and trim before comparing
    answer = answer.replace(".", "").replace(")", "").lower().strip()
    colibri_answer = colibri_answer.replace(".", "").replace(")", "").lower().strip()
    if answer == colibri_answer:
        print("### Correct")
        SCORE_CORRECT += 1
    else:
        print("### Incorrect")
        SCORE_INCORRECT += 1

correct_percent = (SCORE_CORRECT / (SCORE_CORRECT + SCORE_INCORRECT)) * 100

print(f"Correct ({correct_percent}): {SCORE_CORRECT} | Incorrect: {SCORE_INCORRECT}")
