Update scorer.py
Browse files
scorer.py
CHANGED
@@ -18,6 +18,25 @@ def normalize_number_str(number_str: str) -> float:
|
|
18 |
return float("inf")
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
def split_string(
|
22 |
s: str,
|
23 |
char_list: list[str] = [",", ";"],
|
@@ -37,6 +56,57 @@ def question_scorer(
|
|
37 |
except ValueError:
|
38 |
return False
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
score = 0
|
41 |
if user_task["final_answer"] == val["Final answer"]:
|
42 |
score = val["Total score"]
|
|
|
18 |
return float("inf")
|
19 |
|
20 |
|
21 |
+
def normalize_answer(a):
    """Return a canonical, case-insensitive string form of an answer.

    The normalization is intentionally light: surrounding whitespace is
    stripped and the text is lower-cased. Internal whitespace and trailing
    punctuation are left untouched.

    Args:
        a: The answer to normalize. A string is used as-is; a list or tuple
            has its items concatenated with no separator; ``None`` is treated
            as an empty answer; any other value (e.g. a number) is converted
            with ``str()``.

    Returns:
        The stripped, lower-cased string.
    """
    if a is None:
        # A missing answer normalizes to the empty string instead of raising.
        return ""
    if isinstance(a, (list, tuple)):
        # Coerce each item so mixed content such as ["A", 1] does not make
        # str.join raise TypeError.
        a = "".join(str(part) for part in a)
    elif not isinstance(a, str):
        # Numeric or other scalar answers are compared by their text form.
        a = str(a)
    return a.strip().lower()
|
30 |
+
|
31 |
+
|
32 |
+
def exact_match(answer, ground_truth):
    """Tell whether an answer equals the ground truth after normalization.

    Both values are passed through ``normalize_answer`` before comparison, so
    the check ignores letter case and surrounding whitespace.

    Args:
        answer: The submitted answer.
        ground_truth: The reference answer.

    Returns:
        True when the two normalized values are equal, False otherwise.
    """
    candidate = normalize_answer(answer)
    expected = normalize_answer(ground_truth)
    return candidate == expected
|
34 |
+
|
35 |
+
|
36 |
+
def keyword_match(answer, ground_truth):
    """Tell whether the normalized ground truth occurs inside the answer.

    Both values are passed through ``normalize_answer`` and the ground truth
    is then searched for as a substring of the answer.

    Args:
        answer: The submitted answer to search in.
        ground_truth: The reference keyword to search for.

    Returns:
        True when the normalized ground truth is non-empty and is contained
        in the normalized answer, False otherwise.
    """
    keyword = normalize_answer(ground_truth)
    # The empty string is a substring of every string, so without this guard
    # an empty (or whitespace-only) ground truth would accept any answer.
    if not keyword:
        return False
    return keyword in normalize_answer(answer)
|
38 |
+
|
39 |
+
|
40 |
def split_string(
|
41 |
s: str,
|
42 |
char_list: list[str] = [",", ";"],
|
|
|
56 |
except ValueError:
|
57 |
return False
|
58 |
|
59 |
+
# 打分机制
|
60 |
+
level = 0
|
61 |
+
expertise = 0
|
62 |
+
reasoning = 0
|
63 |
+
comprehension = 0
|
64 |
+
|
65 |
+
final_answer = user_task["final_answer"]
|
66 |
+
expected_answer = val["Final answer"]
|
67 |
+
|
68 |
+
data = val["score"]
|
69 |
+
chat_score = []
|
70 |
+
for i in range(len(data['type'])):
|
71 |
+
item = {
|
72 |
+
'type': data['type'][i],
|
73 |
+
'question': data['question'][i],
|
74 |
+
'choices': data['choices'][i],
|
75 |
+
'answer': data['answer'][i],
|
76 |
+
'expertise': data['expertise'][i],
|
77 |
+
'reasoning': data['reasoning'][i],
|
78 |
+
'comprehension': data['comprehension'][i],
|
79 |
+
'score': data['score'][i]
|
80 |
+
}
|
81 |
+
chat_score.append(item)
|
82 |
+
|
83 |
+
for i, score_item in enumerate(chat_score):
|
84 |
+
answer_true = False
|
85 |
+
if score_item['type'].lower() == 'multiple choice':
|
86 |
+
if exact_match(user_task["score_answer"][i], score_item['answer']):
|
87 |
+
answer_true = True
|
88 |
+
elif score_item['type'].lower() == 'fill in the blanks':
|
89 |
+
if keyword_match(user_task["score_answer"][i], score_item['answer']):
|
90 |
+
answer_true = True
|
91 |
+
elif score_item['type'].lower() == 'short answer questions':
|
92 |
+
for ground_truth in score_item['answer']:
|
93 |
+
if keyword_match(user_task["score_answer"][i], ground_truth):
|
94 |
+
answer_true = True
|
95 |
+
break
|
96 |
+
# print(answer, score_item['answer'], answer_true)
|
97 |
+
# 加分
|
98 |
+
if answer_true:
|
99 |
+
expertise += score_item['expertise']
|
100 |
+
reasoning += score_item['reasoning']
|
101 |
+
comprehension += score_item['comprehension']
|
102 |
+
if score_item['score'] > level:
|
103 |
+
level = score_item['score']
|
104 |
+
print([level, expertise, reasoning, comprehension])
|
105 |
+
# final_answer正确 则满分,但是能力分不加了
|
106 |
+
if expected_answer and exact_match(final_answer, expected_answer):
|
107 |
+
level = 10
|
108 |
+
return [level, expertise, reasoning, comprehension]
|
109 |
+
|
110 |
score = 0
|
111 |
if user_task["final_answer"] == val["Final answer"]:
|
112 |
score = val["Total score"]
|