# Provenance (Hugging Face upload metadata, kept as comments so the file parses):
# blue-tundra-42's picture
# Upload UNO Scorer (initial version)
# f1f682e verified
import os
import re
import json
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
from utils import EvaluationRecord
class BaseDataset(ABC):
    """Abstract base class for evaluation datasets.

    Subclasses load their data into ``self.evaluation_records`` (a list of
    ``EvaluationRecord`` objects), build inference / scoring request messages
    in OpenAI Chat Message format, and compute per-record scores plus final
    aggregated metrics.  This base class provides record persistence
    (``save_results`` / ``load_results``) and ``len()`` support.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: Arbitrary dataset-specific configuration; stored on
            ``self.kwargs`` for subclasses to consume.
        """
        # Populated by load_and_prepare() or load_results().
        # String forward reference: EvaluationRecord is imported from the
        # project-local ``utils`` module.
        self.evaluation_records: List["EvaluationRecord"] = []
        self.kwargs = kwargs

    def __len__(self):
        """Return the number of evaluation records currently held."""
        return len(self.evaluation_records)

    @abstractmethod
    def load_and_prepare(self):
        """
        Load data and populate the self.evaluation_records list.
        Each element is an EvaluationRecord object.
        """
        pass

    @abstractmethod
    def build_message(self) -> dict:
        """ Prepare the request message for inference and the format is OpenAI Chat Message Format:
        {"role": "user", "content": [{"type": "text", "text":"xxx"}, {"type": "image", "image": "xx.png"}, {"type":"audio", "audio":"xx.mp3"}]}
        """
        pass

    @abstractmethod
    def build_score_message(self, record: "EvaluationRecord") -> dict:
        """ Prepare the request message for scorer and the format is OpenAI Chat Message Format:
        {"role": "user", "content": [{"type": "text", "text":"xxx"}]}
        """
        pass

    @abstractmethod
    def compute_score(self, record: "EvaluationRecord") -> float:
        """
        Compute score for a single completed record.
        :param record: An EvaluationRecord object with prediction filled.
        :return: Score (float).
        """
        pass

    @abstractmethod
    def compute_metrics(self) -> Dict[str, Any]:
        """Compute final aggregated metrics based on all records."""
        pass

    def save_results(self, file_path: str):
        """Save detailed results and final scores.

        Creates the parent directory of *file_path* if it does not exist.

        :param file_path: Destination JSON file path.
        """
        directory = os.path.dirname(file_path)
        # exist_ok=True avoids the exists()/makedirs() race; the guard is
        # needed because os.makedirs('') raises FileNotFoundError when
        # file_path has no directory component (e.g. "results.json").
        if directory:
            os.makedirs(directory, exist_ok=True)
        EvaluationRecord.save_records_to_json(self.evaluation_records, file_path)
        print(f"Results saved to {file_path}")

    def load_results(self, file_path: str):
        """Load data from JSON file into evaluation_records.

        Silently returns (with a message) when the file does not exist,
        leaving ``self.evaluation_records`` untouched.

        :param file_path: Source JSON file path, as written by save_results.
        """
        if not os.path.exists(file_path):
            print(f"File {file_path} does not exist")
            return
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Rebuild records; fields written before the run completed fall back
        # to their 'pending'/empty defaults.
        self.evaluation_records = [
            EvaluationRecord(
                id=item['id'],
                question=item['question'],
                message=item['message'],
                answer=item['answer'],
                response=item.get('response'),
                request_status=item.get('request_status', 'pending'),
                score_response=item.get('score_response'),
                score_status=item.get('score_status', 'pending'),
                score=item.get('score'),
                extra_info=item.get('extra_info', {})
            )
            for item in data
        ]
        print(f"Loaded {len(self.evaluation_records)} records from {file_path}")