File size: 13,162 Bytes
256a159 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 |
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
import numpy as np
from mmengine import ConfigDict
try:
from prettytable import from_csv
except ImportError:
from_csv = None
from opencompass.utils import model_abbr_from_cfg
from .subjective_post_process import post_process_autoj, post_process_judgelm
from .utils import get_judgeanswer_and_reference, get_outdir
# Capability taxonomy used as the default `categories` of
# get_capability_results: each top-level category ('中文推理' = Chinese
# reasoning, '中文语言' = Chinese language) maps to the sub-capabilities whose
# average scores are aggregated under it.
CATEGORIES = {
    '中文推理': ['数学计算', '逻辑推理'],
    '中文语言': ['基本任务', '中文理解', '综合问答', '文本写作', '角色扮演', '专业能力'],
}
# Default whitelist of rating keys handed to check_rating by the
# post-processing functions; '综合得分' is the overall-score key.
All_Dimensions = [
    '事实正确性', '满足用户需求', '安全无害', '清晰度', '逻辑性', '完备性', '创造性', '可负责程度', '逻辑连贯性',
    '公平与可负责程度', '丰富度', '综合得分'
]
# For each answer type reported by detect_mapping, the dimensions that
# extract_missing_rating looks for in a free-text judgement.
MAPPING = {
    '事实与解释型回答': ['事实正确性', '满足用户需求', '清晰度', '完备性'],
    '逻辑推理型回答': ['事实正确性', '满足用户需求', '逻辑连贯性', '完备性'],
    '生成型回答': ['事实正确性', '满足用户需求', '逻辑连贯性', '创造性', '丰富度'],
    '建议型回答': ['事实正确性', '满足用户需求', '公平与可负责程度', '创造性']
}
def detect_mapping(text):
    """Guess the answer type of a judgement from the dimensions it mentions.

    Each answer type is recognised by a pair of dimension names that must
    both occur somewhere in ``text``. The pairs are tried in a fixed order
    and the first one that is fully present wins.

    Args:
        text: The raw judgement text to inspect.

    Returns:
        The name of the detected answer type, or ``None`` when no pair of
        dimension names is fully present.
    """
    # Order matters: earlier rows take precedence over later ones.
    signature_table = (
        (('清晰度', '完备性'), '事实与解释型回答'),
        (('完备性', '逻辑连贯性'), '逻辑推理型回答'),
        (('创造性', '丰富度'), '生成型回答'),
        (('创造性', '公平与可负责程度'), '建议型回答'),
    )
    for required_words, answer_type in signature_table:
        if all(word in text for word in required_words):
            return answer_type
    return None
def extract_missing_rating(text, search_type, mapping=None):
    """Recover per-dimension ratings from a free-text judgement.

    For every dimension listed under ``search_type`` the text is scanned for
    lines that mention the dimension name. Going from the last such line
    backwards, the first line that contains a 1-2 digit number provides the
    rating (its last number is used). The overall score ('综合得分') is the
    last 1-2 digit number found anywhere in the text.

    Args:
        text: The raw judgement text.
        search_type: Key into ``mapping`` selecting the dimensions to look for.
        mapping: Optional dict from answer type to a list of dimension names.
            Defaults to the module-level ``MAPPING``.

    Returns:
        A dict with one entry per searched dimension (``None`` when no number
        was found for it) plus '综合得分', or an empty dict when the text
        contains no number at all.

    Raises:
        KeyError: If ``search_type`` is not a key of the mapping.
    """
    if mapping is None:
        mapping = MAPPING
    number_pattern = re.compile(r'\d{1,2}')

    result_dict = {}
    for k in mapping[search_type]:
        result_dict[k] = None
        # `.` never matches a newline, so `.*` captures the rest of the line.
        # Unlike a pattern that insists on a trailing '\n', this also finds a
        # dimension that sits on the very last line of the text.
        mentions = re.findall(rf'{re.escape(k)}.*', text)
        for mention in reversed(mentions):
            numbers = number_pattern.findall(mention)
            if numbers:
                result_dict[k] = int(numbers[-1])
                break

    overall_numbers = number_pattern.findall(text)
    if not overall_numbers:
        # No number anywhere: signal "nothing recoverable" to the caller.
        return {}
    result_dict['综合得分'] = int(overall_numbers[-1])
    return result_dict
def extract_rating_plus(text):
    """Extract a rating dict from a judgement, with a free-text fallback.

    When the text contains a brace-delimited span, every ``'name': <int>``
    pair inside it is collected into a dict (which may be empty). When there
    is no such span, the answer type is guessed with ``detect_mapping`` and
    the ratings are recovered line by line with ``extract_missing_rating``.

    Args:
        text: The raw judgement text.

    Returns:
        A dict of ratings, or ``None`` when the text has no brace-delimited
        span and no answer type could be detected.
    """
    braces = re.search(r'{(.*?)}(?![^{]*{)', text)  # match last brackets
    if braces is None:
        # No dict-like span at all: fall back to scanning the prose.
        match_type = detect_mapping(text=text)
        if match_type is None:
            return None
        return extract_missing_rating(text=text, search_type=match_type)

    pairs = re.findall(r"'(.*?)': (\d+)", braces.group(1))
    return {name: int(number) for name, number in pairs}
def extract_rating(text):
    """Extract ``'name': <int>`` pairs from the braces of a judgement.

    The regex looks for a ``{...}`` span whose closing brace is not followed
    by any further opening brace. ``.`` does not cross newlines, so the span
    must sit on a single line. If the text holds several brace groups, the
    captured span starts at the first opening brace and runs to the last
    closing one, so pairs from all groups are collected and later duplicates
    override earlier ones.

    Args:
        text: The raw judgement text.

    Returns:
        A dict mapping each quoted name to its integer value (possibly
        empty), or ``None`` when no brace-delimited span is found.
    """
    # match last brackets
    found = re.search(r'{(.*?)}(?![^{]*{)', text)
    if found is None:
        return None

    inner = found.group(1)
    parsed = {}
    for name, number in re.findall(r"'(.*?)': (\d+)", inner):
        parsed[name] = int(number)
    return parsed
def check_rating(rating, all_dimensions):
    """Validate a rating dict against the allowed dimensions and range.

    Args:
        rating: Dict mapping dimension names to their scores.
        all_dimensions: Collection of dimension names that are accepted.

    Returns:
        ``rating`` itself when every key is in ``all_dimensions`` and every
        value is a number between 0 and 10 inclusive; otherwise ``None``.
        An empty dict is returned unchanged.
    """
    for name, value in rating.items():
        # Make sure the value is a number and the dimension is a known one.
        is_known_number = (isinstance(value, (int, float))
                           and name in all_dimensions)
        if not is_known_number:
            return None
        if not (value >= 0 and value <= 10):
            return None
    return rating
def post_process_alignbench_plus(judgement: str,
                                 all_dimensions=All_Dimensions,
                                 possible_keys=['综合得分']):
    """Parse a judgement into ratings and an overall score, with fallback.

    Input a string like below:

    xxx{'事实正确性': 1, '满足用户需求': 1, '清晰度': 2, '完备性': 1, '综合得分': 1}xxx,

    and extract each score. Ratings come from ``extract_rating_plus``, which
    falls back to scanning the prose line by line when the text contains no
    brace-delimited span; for that reason newlines are NOT stripped here.

    Args:
        judgement: Raw text produced by the judge model.
        all_dimensions: Dimension names accepted by ``check_rating``.
        possible_keys: Candidate names of the overall-score entry, tried in
            order. The default list is only read, never mutated.

    Returns:
        ``{'rating': <dict>, 'score': <number>}`` when both the ratings and
        an overall score in [0, 10] could be extracted, otherwise ``None``.
    """

    def extract_score(text):
        """Find '<key>: <number>' in the raw text; return -1 if absent."""
        if not possible_keys:
            return -1
        keys_pattern = '|'.join(map(re.escape, possible_keys))
        match = re.search(rf'({keys_pattern}): (\d+(\.\d{{1,2}})?)', text)
        if match is None:
            return -1
        # group(1) is the matched key; the numeric value is group(2).
        return float(match.group(2))

    rating = extract_rating_plus(judgement)
    if rating is None:
        return None

    score = -1
    for key in possible_keys:
        candidate = rating.get(key, -1)
        # A None value (dimension found but no number) counts as missing.
        if candidate is not None and candidate != -1:
            score = candidate
            break
    if score == -1:
        # The rating dict lacks the overall score: look for it in the text.
        score = extract_score(judgement)
    if not (0 <= score <= 10):
        score = -1

    rating = check_rating(rating, all_dimensions)
    if rating is None or score == -1:
        return None
    return {'rating': rating, 'score': score}
def post_process_alignbench(judgement: str,
                            all_dimensions=All_Dimensions,
                            possible_keys=['综合得分']):
    """Parse a judgement into per-dimension ratings and an overall score.

    Input a string like below:

    xxx{'事实正确性': 1, '满足用户需求': 1, '清晰度': 2, '完备性': 1, '综合得分': 1}xxx,

    and extract each score. Newlines are removed first so that a rating dict
    spread over several lines can still be matched by ``extract_rating``.

    Args:
        judgement: Raw text produced by the judge model.
        all_dimensions: Dimension names accepted by ``check_rating``.
        possible_keys: Candidate names of the overall-score entry, tried in
            order. The default list is only read, never mutated.

    Returns:
        ``{'rating': <dict>, 'score': <number>}`` when both the ratings and
        an overall score in [0, 10] could be extracted, otherwise ``None``.
    """

    def extract_score(text):
        """Find '<key>: <number>' in the raw text; return -1 if absent."""
        if not possible_keys:
            return -1
        keys_pattern = '|'.join(map(re.escape, possible_keys))
        match = re.search(rf'({keys_pattern}): (\d+(\.\d{{1,2}})?)', text)
        if match is None:
            return -1
        # group(1) is the matched key; the numeric value is group(2).
        return float(match.group(2))

    judgement = judgement.replace('\n', '')
    rating = extract_rating(judgement)
    if rating is None:
        return None

    score = -1
    for key in possible_keys:
        candidate = rating.get(key, -1)
        if candidate is not None and candidate != -1:
            score = candidate
            break
    if score == -1:
        # The rating dict lacks the overall score: look for it in the text.
        score = extract_score(judgement)
    if not (0 <= score <= 10):
        score = -1

    rating = check_rating(rating, all_dimensions)
    if rating is None or score == -1:
        return None
    return {'rating': rating, 'score': score}
def get_dimension_results(judged_answers, references, fout, fout_flag, model):
    """Append one model's per-dimension average ratings to a CSV file.

    Args:
        judged_answers: Iterable of dicts with a 'rating' dict (dimension
            name -> number) and a 'score' number.
        references: Iterable paired with ``judged_answers``; only its length
            matters here (answers beyond it are ignored by ``zip``).
        fout: Path of the CSV file, opened in append mode.
        fout_flag: When 0, a header row ('模型' followed by the dimension
            names) is written before the data row.
        model: Label written in the first column of the data row.
    """
    overall_keys = ('综合得分', 'Overall Score')
    dimension_ratings = defaultdict(int)
    dimension_counts = defaultdict(int)
    for ans, _ref in zip(judged_answers, references):
        for k, v in ans['rating'].items():
            # The overall entry is accumulated from the validated
            # ans['score'] rather than from the raw rating value. (The former
            # `k != a or k != b` test was always true, which made this
            # branch unreachable.)
            dimension_ratings[k] += ans['score'] if k in overall_keys else v
            dimension_counts[k] += 1

    dimension_avg_ratings = {
        dimension: total_score / dimension_counts[dimension]
        for dimension, total_score in dimension_ratings.items()
    }
    columns = list(dimension_avg_ratings)

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            writer.writerow(['模型'] + columns)
        writer.writerow([model] +
                        [dimension_avg_ratings[column] for column in columns])
def get_capability_results(judged_answers,
                           references,
                           fout,
                           fout_flag,
                           model,
                           categories=CATEGORIES):
    """Append one model's capability-level average scores to a CSV file.

    Scores are averaged per ``ref['capability']``; every category in
    ``categories`` then gets a '<category>总分' entry (mean of its
    sub-capability averages) and '总分' is the mean of those category totals.

    Args:
        judged_answers: Iterable of dicts carrying a numeric 'score'.
        references: Iterable of dicts carrying a 'capability' name, paired
            with ``judged_answers``.
        fout: Path of the CSV file, opened in append mode.
        fout_flag: When 0, three header rows (column indices, category names,
            sub-capability names) are written before the data row.
        model: Label written in the first column of the data row.
        categories: Dict mapping each category to its sub-capabilities.
    """
    score_sums = defaultdict(int)
    sample_counts = defaultdict(int)
    for answer, reference in zip(judged_answers, references):
        capability = reference['capability']
        score_sums[capability] += answer['score']
        sample_counts[capability] += 1

    capability_avg_ratings = defaultdict(float)
    for capability in score_sums:
        capability_avg_ratings[capability] = (score_sums[capability] /
                                              sample_counts[capability])

    # Two leading columns (model, overall) plus, per category, one total
    # column and one column per sub-capability.
    total_column_num = 2 + sum(1 + len(subs) for subs in categories.values())

    # Sub-capabilities without any sample read as 0.0 from the defaultdict.
    category_total_keys = []
    for category, sub_categories in categories.items():
        total_key = category + '总分'
        capability_avg_ratings[total_key] = np.mean([
            np.mean(capability_avg_ratings[sub]) for sub in sub_categories
        ])
        category_total_keys.append(total_key)

    overall = 0
    for total_key in category_total_keys:
        overall += capability_avg_ratings[total_key]
    capability_avg_ratings['总分'] = overall / len(category_total_keys)

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            writer.writerow([str(index) for index in range(total_column_num)])
            header = ['模型', '总分']
            sub_header = ['模型', '总分']
            for category, sub_categories in categories.items():
                # The category name spans its total column; the remaining
                # sub-capability columns stay blank in this header row.
                header += [category] + [None] * len(sub_categories)
                sub_header += [category + '总分', *sub_categories]
            writer.writerow(header)
            writer.writerow(sub_header)

        row = [model, capability_avg_ratings['总分']]
        for category, sub_categories in categories.items():
            row.append(capability_avg_ratings[category + '总分'])
            row.extend(capability_avg_ratings[sub] for sub in sub_categories)
        writer.writerow(row)
class AlignmentBenchSummarizer:
    """Do the subjectivity analyze based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
        judge_type (str): Selects the post-processing function applied to the
            judge output; one of 'general', 'general_plus', 'autoj',
            'judgelm'.
    """

    def __init__(self, config: ConfigDict, judge_type='general') -> None:
        self.tasks = []
        self.cfg = config
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
        self.judge_type = judge_type
        assert self.judge_type in [
            'general', 'autoj', 'judgelm', 'general_plus'
        ]
        self.judge_map = {
            'general': post_process_alignbench,
            'general_plus': post_process_alignbench_plus,
            'autoj': post_process_autoj,
            'judgelm': post_process_judgelm
        }
        self.judge_function = self.judge_map[self.judge_type]
        self.category = CATEGORIES

    def summarize(self, time_str: str = None):
        """Summarize the subjectivity analysis based on evaluation results.

        For every evaluated model whose result directory exists, one row per
        dataset is appended to the capability CSV (and, for the 'general'
        judge type, to the dimension CSV). The resulting tables are printed.

        Args:
            time_str (str): Timestamp for file naming. When omitted, the
                current time is used (computed at call time, not at import
                time).

        Returns:
            None. Results are written to CSV files and printed.
        """
        if time_str is None:
            time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
        dataset_cfgs = self.cfg['datasets']
        output_dir, results_folder = get_outdir(self.cfg, time_str)

        # Both paths depend only on the judge model, so they are built once
        # up front; this also keeps them defined when no result directory
        # is found at all.
        write_dimension = self.judge_type == 'general'
        fout = osp.join(output_dir,
                        'judged-by--' + self.judge_abbr + '-dimension.csv')
        fout2 = osp.join(output_dir,
                         'judged-by--' + self.judge_abbr + '-capability.csv')

        fout_flag, fout_flag2 = 0, 0
        for eval_model_abbr in self.eval_model_abbrs:
            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
            subdir_path = os.path.join(results_folder, subdir)
            if not os.path.isdir(subdir_path):
                print(subdir_path + ' is not exist! please check!')
                continue
            for dataset in dataset_cfgs:
                judged_answers, references = get_judgeanswer_and_reference(
                    dataset, subdir_path, self.judge_function)
                if write_dimension:
                    get_dimension_results(judged_answers, references, fout,
                                          fout_flag, eval_model_abbr)
                    fout_flag += 1
                get_capability_results(judged_answers, references, fout2,
                                       fout_flag2, eval_model_abbr,
                                       self.category)
                fout_flag2 += 1

        if write_dimension:
            self._print_csv_table(fout)
        self._print_csv_table(fout2)

    @staticmethod
    def _print_csv_table(path):
        """Print the CSV at ``path`` as a table, if the file exists."""
        if not osp.isfile(path):
            # Nothing was written (e.g. every result directory was missing).
            print(path + ' was not generated; nothing to display.')
            return
        with open(path, 'r') as f:
            # from_csv is None when prettytable could not be imported; fall
            # back to the raw CSV text instead of failing.
            table = f.read() if from_csv is None else from_csv(f)
        print(table)
|