Zekun Wu committed
Commit ef3367f
1 Parent(s): 90100ff
add
- util/evaluator.py (+85 -11)

util/evaluator.py CHANGED
@@ -20,7 +20,7 @@ class evaluator:
         evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
         an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
         should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
-        and 1 indicates that the principle is fully satisfied.
+        and 1 indicates that the principle is fully satisfied. Additionally, provide a brief explanation for each score to justify your rating.
 
         Question:
         {question}

@@ -50,17 +50,37 @@ class evaluator:
         Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
         Score: (0-1) How well does the explanation provide or support multiple perspectives?
 
-        After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the
+        After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.
 
         Example JSON format:
-
-
-
-
-
+        {{
+            "Factually Correct": {{
+                "Justification": "The explanation is mostly accurate with only minor inaccuracies.",
+                "Score": 0.9
+            }},
+            "Useful": {{
+                "Justification": "The explanation is very helpful in understanding the main concept.",
+                "Score": 0.85
+            }},
+            "Context Specific": {{
+                "Justification": "The explanation is generally relevant to the specific context but lacks some detail.",
+                "Score": 0.8
+            }},
+            "User Specific": {{
+                "Justification": "The explanation is appropriate for the typical user but may be too technical for some.",
+                "Score": 0.75
+            }},
+            "Provides Pluralism": {{
+                "Justification": "The explanation considers multiple perspectives but could include more viewpoints.",
+                "Score": 0.7
+            }}
+        }}
+
+        Answer:
+        """
 
         response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
-
+
         print(response)
         try:
             scores = json.loads(response)
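For reference, here is a minimal sketch (not part of this commit) of how a response in the requested JSON format can be parsed. The sample_response literal is an invented example that follows the prompt's "Example JSON format", not real model output.

import json

# Hypothetical model output following the example format above (invented values).
sample_response = '''{
    "Factually Correct": {"Justification": "Mostly accurate, with minor slips.", "Score": 0.9},
    "Provides Pluralism": {"Justification": "Considers a couple of viewpoints.", "Score": 0.7}
}'''

scores = json.loads(sample_response)
for principle, details in scores.items():
    # Same access pattern the updated write_evaluation_commentary uses below.
    print(principle, details.get('Score', -1), '-', details.get('Justification', ''))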
@@ -139,12 +159,18 @@ class evaluator:
 
         return self.validate_scores(scores)
 
+
 def write_evaluation_commentary(scores):
     evaluation_details = []
-    for principle, score in scores.items():
+
+    for principle, details in scores.items():
+        score = details.get('Score', -1)
+        justification = details.get('Justification', '')
 
         if score == -1:
-            evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
+            evaluation_details.append(
+                {'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
+                 'Justification': justification})
             continue
 
         if principle == "Factually Correct":

@@ -183,8 +209,56 @@ def write_evaluation_commentary(scores):
         else:
             comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
 
-        evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
+        evaluation_details.append(
+            {'Principle': principle, 'Score': score, 'Commentary': comment, 'Justification': justification})
+
     return evaluation_details
+# def write_evaluation_commentary(scores):
+#     evaluation_details = []
+#     for principle, score in scores.items():
+#
+#         if score == -1:
+#             evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
+#             continue
+#
+#         if principle == "Factually Correct":
+#             if score >= 0.8:
+#                 comment = "Excellent accuracy! The information is precise and directly relevant to the question."
+#             elif score >= 0.5:
+#                 comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
+#             else:
+#                 comment = "The explanation contains significant inaccuracies or irrelevant information."
+#         elif principle == "Useful":
+#             if score >= 0.8:
+#                 comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
+#             elif score >= 0.5:
+#                 comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
+#             else:
+#                 comment = "The explanation does little to help understand or apply the information provided."
+#         elif principle == "Context Specific":
+#             if score >= 0.8:
+#                 comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
+#             elif score >= 0.5:
+#                 comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
+#             else:
+#                 comment = "Fails to address the context of the question, lacking relevance or specificity."
+#         elif principle == "User Specific":
+#             if score >= 0.8:
+#                 comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
+#             elif score >= 0.5:
+#                 comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
+#             else:
+#                 comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
+#         elif principle == "Provides Pluralism":
+#             if score >= 0.8:
+#                 comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
+#             elif score >= 0.5:
+#                 comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
+#             else:
+#                 comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
+#
+#         evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
+#     return evaluation_details
 
 if __name__ == '__main__':
 