Helion-V2 / benchmark.py
"""
Benchmark script for evaluating Helion-V2 on standard benchmarks.
Includes MMLU, HellaSwag, ARC, TruthfulQA, GSM8K, and HumanEval.
"""
import argparse
import json
import re
from typing import Dict, List

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

class BenchmarkEvaluator:
"""Evaluator for running benchmarks on Helion-V2."""
def __init__(self, model_name: str, device: str = "cuda"):
"""Initialize evaluator with model."""
print(f"Loading model: {model_name}")
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16,
device_map=device,
)
self.model.eval()
self.device = device
    def evaluate_mmlu(self, num_shots: int = 5) -> float:
        """
        Evaluate on MMLU (Massive Multitask Language Understanding).

        Args:
            num_shots: Number of examples for few-shot prompting. Currently unused:
                the loop below evaluates zero-shot (see the _build_fewshot_prefix
                sketch after this method for one way to wire it in).

        Returns:
            Accuracy over the evaluated questions (capped at 1000 for speed)
        """
print("\n=== Evaluating MMLU ===")
dataset = load_dataset("cais/mmlu", "all", split="test")
correct = 0
total = 0
for item in tqdm(dataset, desc="MMLU"):
question = item["question"]
choices = item["choices"]
answer = item["answer"]
# Format prompt
prompt = f"Question: {question}\n"
for i, choice in enumerate(choices):
prompt += f"{chr(65+i)}. {choice}\n"
prompt += "Answer:"
            # Get model prediction via greedy decoding (temperature is irrelevant when do_sample=False)
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=1,
                    do_sample=False,
                )
            # Decode only the newly generated token, not the prompt
            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            ).strip()
            # Check whether the predicted letter matches the gold answer index
            predicted_letter = response[:1].upper()
            if predicted_letter in ("A", "B", "C", "D"):
                predicted_idx = ord(predicted_letter) - ord("A")
                if predicted_idx == answer:
                    correct += 1
total += 1
if total >= 1000: # Limit for testing
break
accuracy = correct / total if total > 0 else 0
print(f"MMLU Accuracy: {accuracy:.2%} ({correct}/{total})")
return accuracy
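    # Minimal sketch (not called anywhere above) of how the currently unused `num_shots`
    # argument could drive few-shot MMLU prompting. It assumes that "cais/mmlu" exposes a
    # "dev" split with the same question/choices/answer fields as the test split; the helper
    # name is ours, not part of the original script.
    def _build_fewshot_prefix(self, num_shots: int = 5) -> str:
        """Build a few-shot prefix from MMLU dev examples (illustrative sketch)."""
        dev = load_dataset("cais/mmlu", "all", split="dev")
        prefix = ""
        for item in dev.select(range(min(num_shots, len(dev)))):
            prefix += f"Question: {item['question']}\n"
            for i, choice in enumerate(item["choices"]):
                prefix += f"{chr(65 + i)}. {choice}\n"
            prefix += f"Answer: {chr(65 + item['answer'])}\n\n"
        return prefix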
def evaluate_hellaswag(self) -> float:
"""
Evaluate on HellaSwag (commonsense reasoning).
Returns:
Accuracy on HellaSwag
"""
print("\n=== Evaluating HellaSwag ===")
dataset = load_dataset("Rowan/hellaswag", split="validation")
correct = 0
total = 0
        # Slicing a HF Dataset returns a dict of columns, so use .select() to iterate examples
        for item in tqdm(dataset.select(range(min(1000, len(dataset)))), desc="HellaSwag"):
context = item["ctx"]
endings = item["endings"]
label = int(item["label"])
            # Score each ending by the mean log-likelihood of the full context+ending sequence
            # (a simple proxy; see the per-ending log-prob sketch after this method)
best_score = float('-inf')
best_idx = -1
for idx, ending in enumerate(endings):
full_text = context + " " + ending
inputs = self.tokenizer(full_text, return_tensors="pt").to(self.device)
with torch.no_grad():
outputs = self.model(**inputs, labels=inputs["input_ids"])
score = -outputs.loss.item()
if score > best_score:
best_score = score
best_idx = idx
if best_idx == label:
correct += 1
total += 1
accuracy = correct / total if total > 0 else 0
print(f"HellaSwag Accuracy: {accuracy:.2%} ({correct}/{total})")
return accuracy
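    # Minimal sketch (not called above) of the more standard HellaSwag scoring: sum the
    # log-probabilities of the ending tokens only, conditioned on the context, optionally
    # length-normalized. It assumes a causal LM whose logits at position t predict token t+1,
    # and that tokenizing the context yields a prefix of tokenizing context+ending, which is
    # approximately true for most BPE tokenizers. The helper name is ours.
    def _ending_logprob(self, context: str, ending: str, length_normalize: bool = True) -> float:
        """Log-probability of `ending` given `context` (illustrative sketch)."""
        ctx_ids = self.tokenizer(context, return_tensors="pt")["input_ids"].to(self.device)
        full_ids = self.tokenizer(context + " " + ending, return_tensors="pt")["input_ids"].to(self.device)
        with torch.no_grad():
            logits = self.model(full_ids).logits  # (1, seq_len, vocab_size)
        log_probs = torch.log_softmax(logits[:, :-1, :], dim=-1)  # position t predicts token t+1
        targets = full_ids[:, 1:]
        token_logps = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
        ending_logps = token_logps[:, ctx_ids.shape[1] - 1:]  # scores for the ending tokens only
        total = ending_logps.sum().item()
        return total / max(ending_logps.shape[1], 1) if length_normalize else total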
def evaluate_arc(self, challenge: bool = True) -> float:
"""
Evaluate on ARC (AI2 Reasoning Challenge).
Args:
challenge: Use ARC-Challenge (harder) vs ARC-Easy
Returns:
Accuracy on ARC
"""
subset = "ARC-Challenge" if challenge else "ARC-Easy"
print(f"\n=== Evaluating {subset} ===")
dataset = load_dataset("ai2_arc", subset, split="test")
correct = 0
total = 0
for item in tqdm(dataset, desc=subset):
question = item["question"]
choices = item["choices"]["text"]
labels = item["choices"]["label"]
answer_key = item["answerKey"]
# Format prompt
prompt = f"Question: {question}\n"
for label, choice in zip(labels, choices):
prompt += f"{label}. {choice}\n"
prompt += "Answer:"
# Get model prediction
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=5,
                    do_sample=False,
                )
response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
# Extract answer
predicted = response[0] if response else ""
if predicted.upper() == answer_key.upper():
correct += 1
total += 1
accuracy = correct / total if total > 0 else 0
print(f"{subset} Accuracy: {accuracy:.2%} ({correct}/{total})")
return accuracy
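    # Minimal sketch (not used above) of likelihood-based ARC scoring, which sidesteps parsing
    # generated text: pick the choice whose text is most probable given the question. It reuses
    # the _ending_logprob sketch defined earlier; both helper names are ours.
    def _arc_pick_by_likelihood(self, question: str, choice_texts: List[str], choice_labels: List[str]) -> str:
        """Return the label of the highest-likelihood choice (illustrative sketch)."""
        prompt = f"Question: {question}\nAnswer:"
        scores = [self._ending_logprob(prompt, text) for text in choice_texts]
        best_idx = max(range(len(scores)), key=lambda i: scores[i])
        return choice_labels[best_idx]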
def evaluate_gsm8k(self) -> float:
"""
Evaluate on GSM8K (grade school math).
Returns:
Accuracy on GSM8K
"""
print("\n=== Evaluating GSM8K ===")
dataset = load_dataset("gsm8k", "main", split="test")
correct = 0
total = 0
        # Slicing a HF Dataset returns a dict of columns, so use .select() to sample examples for speed
        for item in tqdm(dataset.select(range(min(500, len(dataset)))), desc="GSM8K"):
            question = item["question"]
            # The gold answer follows "####"; strip thousands separators like "1,234"
            answer = item["answer"].split("####")[-1].strip().replace(",", "")
# Format with chain-of-thought prompt
prompt = f"Question: {question}\nLet's solve this step by step:\n"
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=400,
                    do_sample=False,
                )
response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
            # Extract the last number in the response as the predicted answer
            numbers = re.findall(r"-?\d+\.?\d*", response.replace(",", ""))
            if numbers:
                predicted = numbers[-1]  # Take the last number
                try:
                    if float(predicted) == float(answer):
                        correct += 1
                except ValueError:
                    pass
total += 1
accuracy = correct / total if total > 0 else 0
print(f"GSM8K Accuracy: {accuracy:.2%} ({correct}/{total})")
return accuracy
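    # Minimal sketch (not used above) of a more forgiving final-answer extractor for GSM8K-style
    # generations: prefer an explicit "answer is X" pattern, and strip $, %, and commas before
    # matching. The phrasing patterns are assumptions about typical model output, not part of
    # the benchmark itself.
    @staticmethod
    def _extract_final_number(text: str) -> str:
        """Return the model's final numeric answer as a string, or "" if none is found."""
        cleaned = text.replace(",", "").replace("$", "").replace("%", "")
        match = re.search(r"answer is\s*(-?\d+\.?\d*)", cleaned, flags=re.IGNORECASE)
        if match:
            return match.group(1)
        numbers = re.findall(r"-?\d+\.?\d*", cleaned)
        return numbers[-1] if numbers else ""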
    def evaluate_truthfulqa(self) -> float:
        """
        Evaluate on TruthfulQA (truthfulness and informativeness).

        Returns:
            A rough generation-based multiple-choice accuracy. Note that this is
            not the official MC2 metric; see the _truthfulqa_mc2_score sketch
            after this method for the likelihood-based formulation.
        """
print("\n=== Evaluating TruthfulQA ===")
dataset = load_dataset("truthful_qa", "multiple_choice", split="validation")
correct = 0
total = 0
for item in tqdm(dataset, desc="TruthfulQA"):
question = item["question"]
mc2_targets = item["mc2_targets"]
choices = mc2_targets["choices"]
labels = mc2_targets["labels"]
# Format prompt
prompt = f"Question: {question}\n"
for i, choice in enumerate(choices):
prompt += f"{i+1}. {choice}\n"
prompt += "Select all correct answers:\n"
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                )
response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
            # Rough scoring heuristic: count the item correct if any true answer (or its 1-based
            # index) appears anywhere in the generation; substring matches over-credit, e.g. "1" in "10"
response_lower = response.lower()
found_correct = False
for idx, (choice, label) in enumerate(zip(choices, labels)):
if label == 1 and (choice.lower() in response_lower or str(idx+1) in response):
found_correct = True
break
if found_correct:
correct += 1
total += 1
accuracy = correct / total if total > 0 else 0
print(f"TruthfulQA MC2 Accuracy: {accuracy:.2%} ({correct}/{total})")
return accuracy
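    # Minimal sketch (not used above) of the official TruthfulQA MC2 score for a single question:
    # the probability mass assigned to the true answers, normalized over all mc2 choices. It
    # reuses the _ending_logprob sketch (without length normalization); the "Q:/A:" prompt format
    # is an assumption, and both helper names are ours.
    def _truthfulqa_mc2_score(self, question: str, choices: List[str], labels: List[int]) -> float:
        """Probability mass on true answers, normalized over all choices (illustrative sketch)."""
        prompt = f"Q: {question}\nA:"
        logps = torch.tensor([self._ending_logprob(prompt, c, length_normalize=False) for c in choices])
        probs = torch.softmax(logps, dim=0)  # exp(logp_i) / sum_j exp(logp_j)
        return sum(p.item() for p, lab in zip(probs, labels) if lab == 1)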
def run_all_benchmarks(self) -> Dict[str, float]:
"""
Run all available benchmarks.
Returns:
Dictionary of benchmark results
"""
        results = {}
        benchmark_fns = {
            "MMLU": self.evaluate_mmlu,
            "HellaSwag": self.evaluate_hellaswag,
            "ARC-Challenge": lambda: self.evaluate_arc(challenge=True),
            "GSM8K": self.evaluate_gsm8k,
            "TruthfulQA": self.evaluate_truthfulqa,
        }
        for name, evaluate_fn in benchmark_fns.items():
            try:
                results[name] = evaluate_fn()
            except Exception as e:
                print(f"{name} evaluation failed: {e}")
                results[name] = 0.0
        return results
def main():
parser = argparse.ArgumentParser(description="Benchmark Helion-V2")
parser.add_argument(
"--model",
type=str,
default="DeepXR/Helion-V2",
help="Model name or path"
)
parser.add_argument(
"--device",
type=str,
default="cuda",
help="Device to use"
)
parser.add_argument(
"--benchmark",
type=str,
choices=["all", "mmlu", "hellaswag", "arc", "gsm8k", "truthfulqa"],
default="all",
help="Benchmark to run"
)
parser.add_argument(
"--output",
type=str,
default="benchmark_results.json",
help="Output file for results"
)
args = parser.parse_args()
evaluator = BenchmarkEvaluator(args.model, args.device)
if args.benchmark == "all":
results = evaluator.run_all_benchmarks()
else:
benchmark_map = {
"mmlu": evaluator.evaluate_mmlu,
"hellaswag": evaluator.evaluate_hellaswag,
"arc": evaluator.evaluate_arc,
"gsm8k": evaluator.evaluate_gsm8k,
"truthfulqa": evaluator.evaluate_truthfulqa,
}
score = benchmark_map[args.benchmark]()
results = {args.benchmark: score}
# Save results
with open(args.output, 'w') as f:
json.dump(results, f, indent=2)
print(f"\n=== Final Results ===")
for benchmark, score in results.items():
print(f"{benchmark}: {score:.2%}")
print(f"\nResults saved to {args.output}")
if __name__ == "__main__":
main()
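# Example usage (assumes the dependencies above are installed and a CUDA GPU is available;
# the model ID and output path are simply the script's defaults, adjust as needed):
#
#   python benchmark.py --model DeepXR/Helion-V2 --device cuda --benchmark all
#   python benchmark.py --benchmark gsm8k --output gsm8k_results.json
#
# Results are written as a JSON mapping of benchmark name to accuracy, e.g.
# {"GSM8K": 0.42} (the value shown is a placeholder, not a measured score).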