Zekun Wu committed on
Commit
ef3367f
1 Parent(s): 90100ff
Files changed (1) hide show
  1. util/evaluator.py +85 -11
util/evaluator.py CHANGED
@@ -20,7 +20,7 @@ class evaluator:
20
  evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
21
  an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
22
  should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
23
- and 1 indicates that the principle is fully satisfied.
24
 
25
  Question:
26
  {question}
@@ -50,17 +50,37 @@ class evaluator:
50
  Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
51
  Score: (0-1) How well does the explanation provide or support multiple perspectives?
52
 
53
- After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
54
 
55
  Example JSON format:
56
-
57
- Answer:{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}
58
-
59
- Answer:
60
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
63
- #response = """{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}"""
64
  print(response)
65
  try:
66
  scores = json.loads(response)
@@ -139,12 +159,18 @@ class evaluator:
139
 
140
  return self.validate_scores(scores)
141
 
 
142
  def write_evaluation_commentary(scores):
143
  evaluation_details = []
144
- for principle, score in scores.items():
 
 
 
145
 
146
  if score == -1:
147
- evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
 
 
148
  continue
149
 
150
  if principle == "Factually Correct":
@@ -183,8 +209,56 @@ def write_evaluation_commentary(scores):
183
  else:
184
  comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
185
 
186
- evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
 
 
187
  return evaluation_details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  if __name__ == '__main__':
190
 
 
20
  evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
21
  an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
22
  should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
23
+ and 1 indicates that the principle is fully satisfied. Additionally, provide a brief explanation for each score to justify your rating.
24
 
25
  Question:
26
  {question}
 
50
  Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
51
  Score: (0-1) How well does the explanation provide or support multiple perspectives?
52
 
53
+ After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.
54
 
55
  Example JSON format:
56
+ {{
57
+ "Factually Correct": {{
58
+ "Justification": "The explanation is mostly accurate with only minor inaccuracies.",
59
+ "Score": 0.9
60
+ }},
61
+ "Useful": {{
62
+ "Justification": "The explanation is very helpful in understanding the main concept.",
63
+ "Score": 0.85
64
+ }},
65
+ "Context Specific": {{
66
+ "Justification": "The explanation is generally relevant to the specific context but lacks some detail.",
67
+ "Score": 0.8
68
+ }},
69
+ "User Specific": {{
70
+ "Justification": "The explanation is appropriate for the typical user but may be too technical for some.",
71
+ "Score": 0.75
72
+ }},
73
+ "Provides Pluralism": {{
74
+ "Justification": "The explanation considers multiple perspectives but could include more viewpoints.",
75
+ "Score": 0.7
76
+ }}
77
+ }}
78
+
79
+ Answer:
80
+ """
81
 
82
  response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
83
+
84
  print(response)
85
  try:
86
  scores = json.loads(response)
 
159
 
160
  return self.validate_scores(scores)
161
 
162
+
163
  def write_evaluation_commentary(scores):
164
  evaluation_details = []
165
+
166
+ for principle, details in scores.items():
167
+ score = details.get('Score', -1)
168
+ justification = details.get('Justification', '')
169
 
170
  if score == -1:
171
+ evaluation_details.append(
172
+ {'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
173
+ 'Justification': justification})
174
  continue
175
 
176
  if principle == "Factually Correct":
 
209
  else:
210
  comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
211
 
212
+ evaluation_details.append(
213
+ {'Principle': principle, 'Score': score, 'Commentary': comment, 'Justification': justification})
214
+
215
  return evaluation_details
216
+ # def write_evaluation_commentary(scores):
217
+ # evaluation_details = []
218
+ # for principle, score in scores.items():
219
+ #
220
+ # if score == -1:
221
+ # evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
222
+ # continue
223
+ #
224
+ # if principle == "Factually Correct":
225
+ # if score >= 0.8:
226
+ # comment = "Excellent accuracy! The information is precise and directly relevant to the question."
227
+ # elif score >= 0.5:
228
+ # comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
229
+ # else:
230
+ # comment = "The explanation contains significant inaccuracies or irrelevant information."
231
+ # elif principle == "Useful":
232
+ # if score >= 0.8:
233
+ # comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
234
+ # elif score >= 0.5:
235
+ # comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
236
+ # else:
237
+ # comment = "The explanation does little to help understand or apply the information provided."
238
+ # elif principle == "Context Specific":
239
+ # if score >= 0.8:
240
+ # comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
241
+ # elif score >= 0.5:
242
+ # comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
243
+ # else:
244
+ # comment = "Fails to address the context of the question, lacking relevance or specificity."
245
+ # elif principle == "User Specific":
246
+ # if score >= 0.8:
247
+ # comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
248
+ # elif score >= 0.5:
249
+ # comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
250
+ # else:
251
+ # comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
252
+ # elif principle == "Provides Pluralism":
253
+ # if score >= 0.8:
254
+ # comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
255
+ # elif score >= 0.5:
256
+ # comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
257
+ # else:
258
+ # comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
259
+ #
260
+ # evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
261
+ # return evaluation_details
262
 
263
  if __name__ == '__main__':
264