# qwen-trainer-scripts / evaluate.py
# Uploaded via huggingface_hub by mindchain (commit 78a0ca9, verified)
import os
import torch
import pandas as pd
from typing import Optional, List, Literal, Dict, Any
from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset
# Optional dependency: LLM-as-a-judge support. When agentic_data_gen is not
# installed, these names are set to None and judge_responses() degrades
# gracefully (every judge_score defaults to 0).
try:
    from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig
except ImportError:
    AgenticDataGenerator = None
    AgenticDataConfig = None
class QwenEvaluator:
    """Evaluate a (fine-tuned) Qwen chat model on instruction-following datasets.

    Workflow: ``setup_model()`` loads the model with Unsloth, then
    ``evaluate_on_dataset()`` generates responses, ``judge_responses()``
    attaches placeholder LLM-as-a-judge scores, and ``compare_models()``
    performs a pairwise A/B tally of two result DataFrames.
    """

    def __init__(self, model_id: str, max_seq_length: int = 2048, load_in_4bit: bool = True):
        """Store configuration; the model itself is loaded lazily by setup_model().

        Args:
            model_id: Hub id or local path of the model to evaluate.
            max_seq_length: Context length passed to Unsloth at load time.
            load_in_4bit: Whether to load 4-bit quantized weights.
        """
        self.model_id = model_id
        self.max_seq_length = max_seq_length
        self.load_in_4bit = load_in_4bit
        self.model = None       # populated by setup_model()
        self.tokenizer = None   # populated by setup_model()

    def setup_model(self):
        """Load model + tokenizer via Unsloth and switch to fast-inference mode."""
        print(f"Loading model for evaluation: {self.model_id}")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_id,
            max_seq_length=self.max_seq_length,
            load_in_4bit=self.load_in_4bit,
        )
        FastLanguageModel.for_inference(self.model)  # 2x faster inference

    def evaluate_on_dataset(self, dataset_name: str, split: str = "test", num_samples: int = 10) -> pd.DataFrame:
        """Generate a response for the first ``num_samples`` examples of a dataset.

        Falls back to the ``prompt``/``input`` columns when ``instruction`` is
        absent, and to ``target`` when ``output`` is absent.

        Returns:
            DataFrame with columns ``instruction``, ``ground_truth`` and
            ``model_response``.
        """
        print(f"Evaluating on dataset: {dataset_name} ({split})")
        dataset = load_dataset(dataset_name, split=split)
        # Fix: clamp so a split shorter than num_samples does not make
        # select(range(...)) raise an out-of-range error.
        num_samples = min(num_samples, len(dataset))
        dataset = dataset.select(range(num_samples))
        # Fix: previously hard-coded .to("cuda"); fall back to CPU when no GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        results = []
        for i, example in enumerate(dataset):
            print(f"Sample {i+1}/{num_samples}")
            instruction = example.get("instruction", "")
            if not instruction:
                # Try fallback column names used by common instruction datasets.
                instruction = example.get("prompt", example.get("input", ""))
            # ChatML-style prompt matching the Qwen chat template.
            inputs = self.tokenizer(
                [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                return_tensors="pt"
            ).to(device)
            outputs = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
            response = self.tokenizer.batch_decode(outputs)[0]
            # Keep only the assistant turn and strip the end-of-turn marker.
            response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()
            results.append({
                "instruction": instruction,
                "ground_truth": example.get("output", example.get("target", "")),
                "model_response": response_clean,
            })
        return pd.DataFrame(results)

    def judge_responses(self, df: pd.DataFrame, task_description: str) -> pd.DataFrame:
        """Uses LLM-as-a-judge to score the model's responses.

        Currently a placeholder: when either optional judge dependency is
        missing, every ``judge_score`` is 0; otherwise every score is 3 until
        the direct judge API call is implemented. Mutates and returns ``df``.
        """
        print(f"Judging model responses for task: {task_description}")
        if not AgenticDataGenerator:
            print("Warning: AgenticDataGenerator not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df
        try:
            # Imported only as an availability probe for the judge backend.
            import data_designer.config as dd
            from data_designer.config.column_configs import Score
        except ImportError:
            print("Warning: data_designer not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df
        # Fix: removed the unused AgenticDataGenerator() instance and the
        # unused ModelConfig/DataDesignerConfigBuilder scaffolding — the
        # generator was built before the data_designer availability check and
        # none of it fed into the scores below.
        scores = []
        for i, row in df.iterrows():
            print(f"Judging sample {i+1}...")
            print(f"Instruction: {row['instruction']}")
            print(f"Response: {row['model_response']}")
            # Placeholder for actual judge call
            scores.append(3)  # Assume perfect for now until direct API access is stable
        df["judge_score"] = scores
        return df

    def compare_models(self, model_a_results: pd.DataFrame, model_b_results: pd.DataFrame) -> Dict[str, Any]:
        """Compares results from two models using a placeholder heuristic.

        Identical responses tie; otherwise the longer response wins.
        Fix: equal-length-but-different responses used to be counted as a
        win for model B — they are now counted as a tie. Note zip() silently
        truncates iteration to the shorter frame, so callers should pass
        frames of equal length.
        """
        print("Comparing two models...")
        wins_a = 0
        wins_b = 0
        ties = 0
        for (i, row_a), (_, row_b) in zip(model_a_results.iterrows(), model_b_results.iterrows()):
            print(f"Comparing sample {i+1}...")
            resp_a = row_a['model_response']
            resp_b = row_b['model_response']
            if resp_a == resp_b:
                ties += 1
            # TODO: replace the length heuristic with a real LLM-judge call
            # ("Which of these two responses is better for the instruction?").
            elif len(resp_a) > len(resp_b):
                wins_a += 1
            elif len(resp_b) > len(resp_a):
                wins_b += 1
            else:
                ties += 1
        total = len(model_a_results)
        return {
            "total_samples": total,
            "wins_model_a": wins_a,
            "wins_model_b": wins_b,
            "ties": ties,
            "win_rate_a": wins_a / total if total > 0 else 0,
            "win_rate_b": wins_b / total if total > 0 else 0
        }
if __name__ == "__main__":
    # Intentionally a no-op: importing or running this module has no side
    # effects. Example usage, kept for reference:
    #   evaluator = QwenEvaluator(model_id="outputs")
    #   results = evaluator.evaluate_on_dataset("yahma/alpaca-cleaned", num_samples=5)
    #   evaluator.judge_responses(results, "General assistant")
    pass