from collections import defaultdict
from typing import Dict, List

import mmengine
from mmengine import ConfigDict, track_parallel_progress

from opencompass.registry import EVALUATORS, MODELS
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg


@EVALUATORS.register_module()
class ModelEvaluator:
    """Compare multiple models by asking a judger model to rank their
    responses to the same questions.

    TODO: Finish the implementation.
    """

    def __init__(
        self,
        config: ConfigDict,
    ) -> None:
        self.tasks = []
        self.cfg = config
        self.parse_cfg(self.cfg.pop('evaluator', ConfigDict({})))
        self.dataset_abbrs = [
            dataset_abbr_from_cfg(d) for d in self.cfg['datasets']
        ]
        self.model_abbrs = [model_abbr_from_cfg(m) for m in self.cfg['models']]
        assert len(self.model_abbrs) > 1, (
            'Comparative evaluation requires at least two models.')

    def parse_cfg(self, cfg: ConfigDict):
        # Build the judger model used to rank the candidate responses
        self.judger = MODELS.build(cfg['judger'])
        # Maximum number of parallel workers for per-dataset evaluation
        self.max_num_workers = cfg.get('max_num_workers', 4)

    def evaluate(self):
        """Evaluate every dataset in parallel and accumulate the scores each
        model receives from the judger."""
        model_scores = defaultdict(int)
        all_partial_scores = track_parallel_progress(
            self._evaluate_dataset,
            self.dataset_abbrs,
            nproc=self.max_num_workers,
            keep_order=True)
        for partial_scores in all_partial_scores:
            for model_idx, score in partial_scores.items():
                model_scores[self.model_abbrs[model_idx]] += score
        print(model_scores)

    def _load_dataset(self, dataset_abbr: str):
        # Each output_model/{model_abbr}/{dataset_abbr}.json is expected to
        # map an example id to a dict containing at least 'origin_prompt'
        # and 'prediction'.
        original_datasets = []
        self.responses: List[List[str]] = []
        self.questions: List[str] = []
        for model_abbr in self.model_abbrs:
            filename = f'output_model/{model_abbr}/{dataset_abbr}.json'
            original_datasets.append(mmengine.load(filename))
        for key in original_datasets[-1].keys():
            self.questions.append(original_datasets[-1][key]['origin_prompt'])
            responses = []
            for i in range(len(self.model_abbrs)):
                responses.append(original_datasets[i][key]['prediction'])
            self.responses.append(responses)

    def _evaluate_dataset(self, dataset_abbr: str):
        self._load_dataset(dataset_abbr=dataset_abbr)
        model_scores = defaultdict(int)
        for question, responses in zip(self.questions, self.responses):
            prompt = self._make_prompt(question, responses)
            print(prompt)
            output = self.judger.generate(prompt,
                                          max_out_len=2 *
                                          len(self.model_abbrs))
            model_scores = self._rank_models(output, model_scores)
        return model_scores

    def _make_prompt(self, question: str, responses: List[str]) -> str:
        prompt = ('Below are a question and a set of answers, each numbered by'
                  ' a digit. Please sort the answers from least to most '
                  'appropriate to the question. Only return the digits '
                  'separated by a blank space. For example, when there are '
                  'three answers presented, you should say "1 0 2" when the '
                  'second answer is the worst and the third is the best.\n'
                  f'Q: {question}\n')
        for i, response in enumerate(responses):
            prompt += f'A{i}: {response}\n'
        return prompt

    def _rank_models(self, output: str,
                     model_scores: defaultdict) -> Dict[int, int]:
        """Update model scores from the judger's ranking.

        The judger lists answer indices from least to most appropriate, so
        each model earns as many points as its position in that ranking.
        """
        output = output.strip().split(' ')
        for score, model_idx in enumerate(output):
            model_scores[int(model_idx)] += score
        return model_scores
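

# A minimal usage sketch (assumptions: the predictions for every model have
# already been dumped to output_model/{model_abbr}/{dataset_abbr}.json, and
# the judger entry below is only illustrative -- any MODELS-registered model
# config should work):
#
#   from mmengine import ConfigDict
#
#   config = ConfigDict(
#       models=[...],     # two or more model cfgs to compare
#       datasets=[...],   # dataset cfgs whose predictions were dumped above
#       evaluator=dict(
#           judger=dict(...),   # hypothetical judger model cfg
#           max_num_workers=4,
#       ),
#   )
#   evaluator = ModelEvaluator(config)
#   evaluator.evaluate()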