| """
|
| Science benchmarks for Vortex model.
|
| Evaluates performance across 7 science domains.
|
| """
|
|
|
import re
from dataclasses import dataclass
from typing import Dict, List, Tuple

import torch
|
|
|
|
|
@dataclass
class BenchmarkResult:
    """Results from a benchmark.

    Produced by ScienceBenchmark.evaluate(); consumed by
    run_all_benchmarks() and print_summary().
    """

    # Domain key, e.g. "physics" or "math" (used as the results-dict key).
    domain: str
    # Fraction of evaluated questions answered correctly; 0.0 for an empty set.
    accuracy: float
    # Number of questions actually evaluated (after max_samples truncation).
    total_questions: int
    # Count of questions whose extracted answer passed check_answer().
    correct_answers: int
    # Per-question records with keys: question, expected, generated, correct.
    details: List[Dict]
|
|
|
|
|
class ScienceBenchmark:
    """
    Base class for science benchmarks.

    Subclasses supply the domain-specific pieces: question loading,
    prompt formatting, answer extraction, and answer checking.
    """

    def __init__(self, name: str, domain: str):
        # name: human-readable benchmark identifier (used in progress logs).
        # domain: short domain key (used as the results-dict key).
        self.name = name
        self.domain = domain

    def load_questions(self) -> List[Dict]:
        """Load benchmark questions.

        Returns:
            List of dicts; each must contain at least "question" and
            "answer" keys (evaluate() reads both).
        """
        raise NotImplementedError

    def evaluate(
        self,
        model,
        tokenizer,
        device: torch.device,
        max_samples: int = 100,
    ) -> BenchmarkResult:
        """
        Evaluate model on benchmark.

        Args:
            model: Vortex model
            tokenizer: Tokenizer
            device: Torch device
            max_samples: Maximum samples to evaluate

        Returns:
            BenchmarkResult
        """
        questions = self.load_questions()[:max_samples]
        correct = 0

        details = []
        for q in questions:
            prompt = self.format_prompt(q)
            inputs = tokenizer(prompt, return_tensors="pt").to(device)

            with torch.no_grad():
                # Greedy decoding: do_sample=False alone selects it.
                # (Previously temperature=0.0 was also passed; HF generate
                # ignores temperature when do_sample=False and emits a
                # warning, so it has been removed.)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    do_sample=False,
                )

            # Decoded output includes the echoed prompt; subclasses'
            # extract_answer() is responsible for stripping it.
            generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
            answer = self.extract_answer(generated)

            is_correct = self.check_answer(answer, q["answer"])
            if is_correct:
                correct += 1

            details.append({
                "question": q["question"],
                "expected": q["answer"],
                "generated": answer,
                "correct": is_correct,
            })

        # Guard against division by zero when the question set is empty.
        accuracy = correct / len(questions) if questions else 0.0
        return BenchmarkResult(
            domain=self.domain,
            accuracy=accuracy,
            total_questions=len(questions),
            correct_answers=correct,
            details=details,
        )

    def format_prompt(self, question: Dict) -> str:
        """Format question into prompt."""
        raise NotImplementedError

    def extract_answer(self, text: str) -> str:
        """Extract answer from generated text."""
        raise NotImplementedError

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Check if predicted answer matches expected."""
        raise NotImplementedError
|
|
|
|
|
class PhysicsBenchmark(ScienceBenchmark):
    """Physics benchmark (Feynman Questions style)."""

    def __init__(self):
        super().__init__("physics_benchmark", "physics")

    def load_questions(self) -> List[Dict]:
        """Return the built-in physics question set."""
        return [
            {
                "question": "What is the formula for kinetic energy?",
                "answer": "KE = 1/2 mv^2",
                "type": "formula",
            },
            {
                "question": "Explain Newton's first law of motion.",
                "answer": "An object at rest stays at rest unless acted upon by a force.",
                "type": "conceptual",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        """Wrap the question in a Question/Answer template."""
        return f"Question: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Answer:' marker, if present."""
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Case-insensitive bidirectional substring match.

        An empty prediction is always wrong: without the guard below,
        `pred_lower in exp_lower` is trivially True for "" (Python treats
        the empty string as a substring of everything), so blank
        generations were scored as correct.
        """
        pred_lower = predicted.strip().lower()
        exp_lower = expected.strip().lower()
        if not pred_lower:
            return False
        return exp_lower in pred_lower or pred_lower in exp_lower
|
|
|
|
|
class MathBenchmark(ScienceBenchmark):
    """Math benchmark (MATH dataset style)."""

    def __init__(self):
        super().__init__("math_benchmark", "math")

    def load_questions(self) -> List[Dict]:
        """Return the built-in math question set."""
        algebra_item = {
            "question": "Solve for x: 2x + 5 = 15",
            "answer": "x = 5",
            "type": "algebra",
        }
        calculus_item = {
            "question": "What is the derivative of x^2?",
            "answer": "2x",
            "type": "calculus",
        }
        return [algebra_item, calculus_item]

    def format_prompt(self, question: Dict) -> str:
        """Wrap the question in a Problem/Solution template."""
        return f"Problem: {question['question']}\nSolution:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Solution:' marker, if present."""
        marker = "Solution:"
        if marker not in text:
            return text.strip()
        return text.rsplit(marker, 1)[-1].strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Exact match after lowercasing and collapsing whitespace."""
        def normalize(s: str) -> str:
            return " ".join(s.lower().split())

        return normalize(predicted) == normalize(expected)
|
|
|
|
|
class ChemistryBenchmark(ScienceBenchmark):
    """Chemistry benchmark."""

    def __init__(self):
        super().__init__("chemistry_benchmark", "chemistry")

    def load_questions(self) -> List[Dict]:
        """Return the built-in chemistry question set."""
        return [
            {
                "question": "What is the chemical formula for water?",
                "answer": "H2O",
                "type": "factual",
            },
            {
                "question": "How many protons does carbon have?",
                "answer": "6",
                "type": "factual",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        """Prefix the question with a domain tag and an Answer: cue."""
        return f"Chemistry question: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Answer:' marker, if present."""
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Whole-word, case-insensitive containment check.

        Matches on word boundaries rather than a raw substring test:
        the previous `exp in pred` scored a generated "16" as correct
        for the expected answer "6".
        """
        pred = predicted.strip().lower()
        exp = expected.strip().lower()
        if not exp:
            return False
        # Lookarounds rather than \b so answers that start/end with
        # non-word characters still anchor correctly.
        pattern = r"(?<!\w)" + re.escape(exp) + r"(?!\w)"
        return re.search(pattern, pred) is not None
|
|
|
|
|
class BiologyBenchmark(ScienceBenchmark):
    """Biology benchmark."""

    def __init__(self):
        super().__init__("biology_benchmark", "biology")

    def load_questions(self) -> List[Dict]:
        """Return the built-in biology question set."""
        items = []
        items.append({
            "question": "What is the powerhouse of the cell?",
            "answer": "mitochondria",
            "type": "factual",
        })
        items.append({
            "question": "What molecule carries genetic information?",
            "answer": "DNA",
            "type": "factual",
        })
        return items

    def format_prompt(self, question: Dict) -> str:
        """Prefix the question with a domain tag and an Answer: cue."""
        return f"Biology: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Answer:' marker, if present."""
        _, sep, tail = text.rpartition("Answer:")
        return tail.strip() if sep else text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """True when the expected answer appears within the prediction."""
        return expected.strip().lower() in predicted.strip().lower()
|
|
|
|
|
|
|
class EarthScienceBenchmark(ScienceBenchmark):
    """Earth science benchmark (question set not yet populated)."""

    def __init__(self):
        super().__init__("earth_science_benchmark", "earth")

    def load_questions(self) -> List[Dict]:
        # No questions yet; evaluate() reports 0.0 accuracy for empty sets.
        return []

    def format_prompt(self, question: Dict) -> str:
        """Prefix the question with a domain tag and an Answer: cue."""
        return f"Earth Science: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Answer:' marker, if present.

        Previously the full generation (echoed prompt included) was
        returned, so the exact-match check below could never succeed.
        This now mirrors the other benchmarks' extraction.
        """
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Strict case-insensitive exact match (after trimming)."""
        return predicted.strip().lower() == expected.strip().lower()
|
|
|
|
|
class SpaceScienceBenchmark(ScienceBenchmark):
    """Space science benchmark (question set not yet populated)."""

    def __init__(self):
        super().__init__("space_science_benchmark", "space")

    def load_questions(self) -> List[Dict]:
        # No questions yet; evaluate() reports 0.0 accuracy for empty sets.
        return []

    def format_prompt(self, question: Dict) -> str:
        """Prefix the question with a domain tag and an Answer: cue."""
        return f"Space Science: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Answer:' marker, if present.

        Previously the full generation (echoed prompt included) was
        returned, so the exact-match check below could never succeed.
        This now mirrors the other benchmarks' extraction.
        """
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Strict case-insensitive exact match (after trimming)."""
        return predicted.strip().lower() == expected.strip().lower()
|
|
|
|
|
class ZoologyBenchmark(ScienceBenchmark):
    """Zoology benchmark (question set not yet populated)."""

    def __init__(self):
        super().__init__("zoology_benchmark", "zoology")

    def load_questions(self) -> List[Dict]:
        # No questions yet; evaluate() reports 0.0 accuracy for empty sets.
        return []

    def format_prompt(self, question: Dict) -> str:
        """Prefix the question with a domain tag and an Answer: cue."""
        return f"Zoology: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Answer:' marker, if present.

        Previously the full generation (echoed prompt included) was
        returned, so the exact-match check below could never succeed.
        This now mirrors the other benchmarks' extraction.
        """
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Strict case-insensitive exact match (after trimming)."""
        return predicted.strip().lower() == expected.strip().lower()
|
|
|
|
|
def run_all_benchmarks(
    model,
    tokenizer,
    device: torch.device,
    max_samples_per_domain: int = 50,
) -> Dict[str, BenchmarkResult]:
    """
    Run all benchmarks and return results.

    Args:
        model: Vortex model
        tokenizer: Tokenizer
        device: Torch device
        max_samples_per_domain: Max samples per domain

    Returns:
        Dictionary mapping domain to results
    """
    benchmark_classes = [
        PhysicsBenchmark,
        MathBenchmark,
        ChemistryBenchmark,
        BiologyBenchmark,
        EarthScienceBenchmark,
        SpaceScienceBenchmark,
        ZoologyBenchmark,
    ]

    results: Dict[str, BenchmarkResult] = {}
    for bench_cls in benchmark_classes:
        bench = bench_cls()
        print(f"Running {bench.name}...")
        outcome = bench.evaluate(
            model, tokenizer, device, max_samples=max_samples_per_domain
        )
        results[bench.domain] = outcome
        print(f"  Accuracy: {outcome.accuracy:.2%} ({outcome.correct_answers}/{outcome.total_questions})")

    return results
|
|
|
|
|
def print_summary(results: Dict[str, BenchmarkResult]):
    """Print summary of benchmark results."""
    divider = "=" * 60
    print("\n" + divider)
    print("BENCHMARK RESULTS")
    print(divider)

    for domain, result in results.items():
        print(f"{domain:15} {result.accuracy:6.2%} ({result.correct_answers}/{result.total_questions})")

    # Overall accuracy is the unweighted mean over non-empty domains only.
    scored = [r.accuracy for r in results.values() if r.total_questions > 0]
    if scored:
        mean_accuracy = sum(scored) / len(scored)
        print(f"{'OVERALL':15} {mean_accuracy:6.2%}")
    print(divider)
|
|
|
|
|
# Script entry point: currently informational only — wire up a trained
# model/tokenizer and call run_all_benchmarks() to execute the suite.
if __name__ == "__main__":
    print("This script benchmarks the model across science domains.")
    print("To run full benchmarks, integrate with a trained model.")
|
|
|