|
|
""" |
|
|
YOFO Benchmark Script. |
|
|
|
|
|
This script runs a rigorous comparison between YOFO and standard baselines. |
|
|
It measures: |
|
|
1. Latency (Time per example) |
|
|
2. Token Usage (Input + Output tokens) |
|
|
3. Extrapolated Cost (Based on GPT-4 pricing) |
|
|
|
|
|
Baselines: |
|
|
- YOFO (Ours): Single forward pass |
|
|
- N-Call Judge: 12 separate API calls (one per requirement) |
|
|
- CoT Judge: 1 call generating detailed reasoning |
|
|
""" |
|
|
|
|
|
import os
import sys
import time

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

sys.path.append(os.getcwd())
from src.data.template import YOFOTemplateBuilder, YOFO_REQS, REQ_QUESTIONS

# GPT-4-class API pricing used for cost extrapolation (USD per 1K tokens).
PRICE_INPUT_1K = 0.01
PRICE_OUTPUT_1K = 0.03


class Benchmark:
    def __init__(self, model_id="Qwen/Qwen2.5-1.5B-Instruct"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Initializing benchmark on {self.device}...")

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map=self.device,
            trust_remote_code=True,
        )
        self.model.eval()
        self.builder = YOFOTemplateBuilder(self.tokenizer)

    def _count_tokens(self, text):
        """Count tokens without special tokens (used for cost extrapolation)."""
        return len(self.tokenizer.encode(text, add_special_tokens=False))

    def benchmark_yofo(self, prompt, response, n_repeats=5):
        """Measure YOFO performance (single forward pass)."""
        yofo_input = self.builder.build_template(prompt, response)

        actual_tokens = yofo_input.attention_mask.sum().item()
        print(f"DEBUG: YOFO actual tokens: {actual_tokens}")

        input_ids = yofo_input.input_ids.unsqueeze(0).to(self.device)

        # Warm-up pass so one-time allocation/compilation is excluded from the timing.
        with torch.no_grad():
            self.model(input_ids)

        latencies = []
        for _ in range(n_repeats):
            start = time.time()
            with torch.no_grad():
                self.model(input_ids)
            if self.device == "cuda":
                # Forward passes launch asynchronously on GPU; wait before stopping the clock.
                torch.cuda.synchronize()
            latencies.append(time.time() - start)

        avg_latency = sum(latencies) / len(latencies)

        return {
            "method": "YOFO (Ours)",
            "latency_ms": avg_latency * 1000,
            "input_tokens": actual_tokens,
            "output_tokens": 0,
            "calls": 1,
        }

    def benchmark_n_call(self, prompt, response):
        """Measure the N-Call baseline (12 separate calls, one per requirement)."""
        total_input_tokens = 0
        total_output_tokens = 12  # one Yes/No token generated per requirement

        base_context = f"User: {prompt}\nResponse: {response}\n"
        base_tokens = self._count_tokens(base_context)
        print(f"DEBUG: Base context tokens: {base_tokens}")

        # The shared context is re-sent with every call, so it is counted once per requirement.
        for req in YOFO_REQS:
            question_text = f"Question: {REQ_QUESTIONS[req]}\nAnswer (Yes/No):"
            q_tokens = self._count_tokens(question_text)
            total_input_tokens += (base_tokens + q_tokens)

        print(f"DEBUG: N-Call total input tokens: {total_input_tokens}")

        # Time one representative call and extrapolate to 12 sequential calls.
        sample_text = f"{base_context}Question: Violence?\nAnswer (Yes/No):"
        inputs = self.tokenizer(sample_text, return_tensors="pt").to(self.device)

        start = time.time()
        with torch.no_grad():
            self.model.generate(**inputs, max_new_tokens=1)
        one_call_time = time.time() - start

        return {
            "method": "N-Call Baseline",
            "latency_ms": one_call_time * 12 * 1000,
            "input_tokens": total_input_tokens,
            "output_tokens": total_output_tokens,
            "calls": 12,
        }

    def benchmark_cot(self, prompt, response):
        """Measure the CoT baseline (one call that generates free-form reasoning)."""
        text = f"User: {prompt}\nResponse: {response}\nAnalyze safety for all 12 requirements and explain your reasoning:"
        input_tokens = self._count_tokens(text)

        inputs = self.tokenizer(text, return_tensors="pt").to(self.device)

        start = time.time()
        with torch.no_grad():
            # Cap reasoning at 100 new tokens for the latency/cost measurement.
            output = self.model.generate(**inputs, max_new_tokens=100)
        latency = time.time() - start

        output_tokens = len(output[0]) - len(inputs['input_ids'][0])

        return {
            "method": "Chain-of-Thought",
            "latency_ms": latency * 1000,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "calls": 1,
        }


def calculate_cost(row):
    """Calculate cost per 1k evaluations based on tokens."""
    cost_in = (row['input_tokens'] / 1000) * PRICE_INPUT_1K
    cost_out = (row['output_tokens'] / 1000) * PRICE_OUTPUT_1K
    total_cost_per_eval = cost_in + cost_out
    return total_cost_per_eval * 1000
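
# Worked example for calculate_cost (illustrative token counts, not measured results):
# an evaluation with 600 input tokens and 12 output tokens works out to
#     (600 / 1000) * 0.01 + (12 / 1000) * 0.03 = 0.00636 USD per eval,
# i.e. roughly $6.36 per 1k evaluations at the GPT-4 prices assumed above, e.g.:
#
#     calculate_cost({'input_tokens': 600, 'output_tokens': 12})  # ≈ 6.36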
|
|
|
|
|
def run_benchmark():
    bench = Benchmark()

    # Synthetic long-context example: repeated sentences give several hundred tokens of context.
    prompt = "Explain the history of the Roman Empire and its fall. " * 10
    response = "The Roman Empire was one of the largest... " * 20

    print("\nRunning benchmarks with a long synthetic context (several hundred tokens)...")
    results = []

    print("1. Benchmarking YOFO...")
    results.append(bench.benchmark_yofo(prompt, response))

    print("2. Benchmarking N-Call Baseline...")
    results.append(bench.benchmark_n_call(prompt, response))

    print("3. Benchmarking CoT Baseline...")
    results.append(bench.benchmark_cot(prompt, response))

    df = pd.DataFrame(results)
    df['cost_per_1k_evals'] = df.apply(calculate_cost, axis=1)
    # Row 1 is the N-Call baseline; speedups are reported relative to it.
    df['speedup_vs_baseline'] = df.loc[1, 'latency_ms'] / df['latency_ms']

    print("\n" + "=" * 80)
    print("BENCHMARK RESULTS (Extrapolated to GPT-4 Pricing)")
    print("=" * 80)
    print(df[['method', 'input_tokens', 'latency_ms', 'cost_per_1k_evals', 'speedup_vs_baseline']].to_string(index=False))

    df.to_csv("benchmark_results.csv", index=False)
    print("\nSaved results to benchmark_results.csv")


if __name__ == "__main__":
    run_benchmark()
|
|
|