# qwen-trainer-scripts / evaluate.py
# Uploaded via huggingface_hub by mindchain (commit 78a0ca9, verified)
import os
import torch
import pandas as pd
from typing import Optional, List, Literal, Dict, Any
from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset
# Optional dependency: LLM-as-a-judge support. When agentic_data_gen is not
# installed, these names are set to None and judge_responses() degrades
# gracefully (every judge_score defaults to 0).
try:
    from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig
except ImportError:
    AgenticDataGenerator = None
    AgenticDataConfig = None
class QwenEvaluator:
    """Evaluate a (fine-tuned) Qwen chat model on instruction-following datasets.

    Workflow: ``setup_model()`` loads the model with Unsloth, then
    ``evaluate_on_dataset()`` generates responses, ``judge_responses()``
    attaches placeholder LLM-as-a-judge scores, and ``compare_models()``
    performs a pairwise A/B tally of two result DataFrames.
    """

    def __init__(self, model_id: str, max_seq_length: int = 2048, load_in_4bit: bool = True):
        """Store configuration; the model itself is loaded lazily by setup_model().

        Args:
            model_id: Hub id or local path of the model to evaluate.
            max_seq_length: Context length passed to Unsloth at load time.
            load_in_4bit: Whether to load 4-bit quantized weights.
        """
        self.model_id = model_id
        self.max_seq_length = max_seq_length
        self.load_in_4bit = load_in_4bit
        self.model = None       # populated by setup_model()
        self.tokenizer = None   # populated by setup_model()

    def setup_model(self):
        """Load model + tokenizer via Unsloth and switch to fast-inference mode."""
        print(f"Loading model for evaluation: {self.model_id}")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_id,
            max_seq_length=self.max_seq_length,
            load_in_4bit=self.load_in_4bit,
        )
        FastLanguageModel.for_inference(self.model)  # 2x faster inference

    def evaluate_on_dataset(self, dataset_name: str, split: str = "test", num_samples: int = 10) -> pd.DataFrame:
        """Generate a response for the first ``num_samples`` examples of a dataset.

        Falls back to the ``prompt``/``input`` columns when ``instruction`` is
        absent, and to ``target`` when ``output`` is absent.

        Returns:
            DataFrame with columns ``instruction``, ``ground_truth`` and
            ``model_response``.
        """
        print(f"Evaluating on dataset: {dataset_name} ({split})")
        dataset = load_dataset(dataset_name, split=split)
        # Fix: clamp so a split shorter than num_samples does not make
        # select(range(...)) raise an out-of-range error.
        num_samples = min(num_samples, len(dataset))
        dataset = dataset.select(range(num_samples))
        # Fix: previously hard-coded .to("cuda"); fall back to CPU when no GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        results = []
        for i, example in enumerate(dataset):
            print(f"Sample {i+1}/{num_samples}")
            instruction = example.get("instruction", "")
            if not instruction:
                # Try fallback column names used by common instruction datasets.
                instruction = example.get("prompt", example.get("input", ""))
            # ChatML-style prompt matching the Qwen chat template.
            inputs = self.tokenizer(
                [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                return_tensors="pt"
            ).to(device)
            outputs = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
            response = self.tokenizer.batch_decode(outputs)[0]
            # Keep only the assistant turn and strip the end-of-turn marker.
            response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()
            results.append({
                "instruction": instruction,
                "ground_truth": example.get("output", example.get("target", "")),
                "model_response": response_clean,
            })
        return pd.DataFrame(results)

    def judge_responses(self, df: pd.DataFrame, task_description: str) -> pd.DataFrame:
        """Uses LLM-as-a-judge to score the model's responses.

        Currently a placeholder: when either optional judge dependency is
        missing, every ``judge_score`` is 0; otherwise every score is 3 until
        the direct judge API call is implemented. Mutates and returns ``df``.
        """
        print(f"Judging model responses for task: {task_description}")
        if not AgenticDataGenerator:
            print("Warning: AgenticDataGenerator not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df
        try:
            # Imported only as an availability probe for the judge backend.
            import data_designer.config as dd
            from data_designer.config.column_configs import Score
        except ImportError:
            print("Warning: data_designer not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df
        # Fix: removed the unused AgenticDataGenerator() instance and the
        # unused ModelConfig/DataDesignerConfigBuilder scaffolding — the
        # generator was built before the data_designer availability check and
        # none of it fed into the scores below.
        scores = []
        for i, row in df.iterrows():
            print(f"Judging sample {i+1}...")
            print(f"Instruction: {row['instruction']}")
            print(f"Response: {row['model_response']}")
            # Placeholder for actual judge call
            scores.append(3)  # Assume perfect for now until direct API access is stable
        df["judge_score"] = scores
        return df

    def compare_models(self, model_a_results: pd.DataFrame, model_b_results: pd.DataFrame) -> Dict[str, Any]:
        """Compares results from two models using a placeholder heuristic.

        Identical responses tie; otherwise the longer response wins.
        Fix: equal-length-but-different responses used to be counted as a
        win for model B — they are now counted as a tie. Note zip() silently
        truncates iteration to the shorter frame, so callers should pass
        frames of equal length.
        """
        print("Comparing two models...")
        wins_a = 0
        wins_b = 0
        ties = 0
        for (i, row_a), (_, row_b) in zip(model_a_results.iterrows(), model_b_results.iterrows()):
            print(f"Comparing sample {i+1}...")
            resp_a = row_a['model_response']
            resp_b = row_b['model_response']
            if resp_a == resp_b:
                ties += 1
            # TODO: replace the length heuristic with a real LLM-judge call
            # ("Which of these two responses is better for the instruction?").
            elif len(resp_a) > len(resp_b):
                wins_a += 1
            elif len(resp_b) > len(resp_a):
                wins_b += 1
            else:
                ties += 1
        total = len(model_a_results)
        return {
            "total_samples": total,
            "wins_model_a": wins_a,
            "wins_model_b": wins_b,
            "ties": ties,
            "win_rate_a": wins_a / total if total > 0 else 0,
            "win_rate_b": wins_b / total if total > 0 else 0
        }
if __name__ == "__main__":
    # Intentionally a no-op: importing or running this module has no side
    # effects. Example usage, kept for reference:
    #   evaluator = QwenEvaluator(model_id="outputs")
    #   results = evaluator.evaluate_on_dataset("yahma/alpaca-cleaned", num_samples=5)
    #   evaluator.judge_responses(results, "General assistant")
    pass