Zekun Wu committed
Commit ef3367f
1 Parent(s): 90100ff
add
- util/evaluator.py (+85 -11)

util/evaluator.py CHANGED
@@ -20,7 +20,7 @@ class evaluator:
         evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
         an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
         should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
-        and 1 indicates that the principle is fully satisfied.
+        and 1 indicates that the principle is fully satisfied. Additionally, provide a brief explanation for each score to justify your rating.
 
         Question:
         {question}

@@ -50,17 +50,37 @@ class evaluator:
         Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
         Score: (0-1) How well does the explanation provide or support multiple perspectives?
 
-        After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the
+        After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.
 
         Example JSON format:
-
-
-
-
-
+        {{
+            "Factually Correct": {{
+                "Justification": "The explanation is mostly accurate with only minor inaccuracies.",
+                "Score": 0.9
+            }},
+            "Useful": {{
+                "Justification": "The explanation is very helpful in understanding the main concept.",
+                "Score": 0.85
+            }},
+            "Context Specific": {{
+                "Justification": "The explanation is generally relevant to the specific context but lacks some detail.",
+                "Score": 0.8
+            }},
+            "User Specific": {{
+                "Justification": "The explanation is appropriate for the typical user but may be too technical for some.",
+                "Score": 0.75
+            }},
+            "Provides Pluralism": {{
+                "Justification": "The explanation considers multiple perspectives but could include more viewpoints.",
+                "Score": 0.7
+            }}
+        }}
+
+        Answer:
+        """
 
         response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
-
+
         print(response)
         try:
             scores = json.loads(response)
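For reference, here is a minimal sketch (not part of this commit) of how a response in the requested JSON format can be parsed. The sample_response literal is an invented example that follows the prompt's "Example JSON format", not real model output.

import json

# Hypothetical model output following the example format above (invented values).
sample_response = '''{
    "Factually Correct": {"Justification": "Mostly accurate, with minor slips.", "Score": 0.9},
    "Provides Pluralism": {"Justification": "Considers a couple of viewpoints.", "Score": 0.7}
}'''

scores = json.loads(sample_response)
for principle, details in scores.items():
    # Same access pattern the updated write_evaluation_commentary uses below.
    print(principle, details.get('Score', -1), '-', details.get('Justification', ''))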
@@ -139,12 +159,18 @@ class evaluator:
 
         return self.validate_scores(scores)
 
+
 def write_evaluation_commentary(scores):
     evaluation_details = []
-    for principle, score in scores.items():
+
+    for principle, details in scores.items():
+        score = details.get('Score', -1)
+        justification = details.get('Justification', '')
 
         if score == -1:
-            evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
+            evaluation_details.append(
+                {'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
+                 'Justification': justification})
             continue
 
         if principle == "Factually Correct":

@@ -183,8 +209,56 @@ def write_evaluation_commentary(scores):
         else:
             comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
 
-        evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
+        evaluation_details.append(
+            {'Principle': principle, 'Score': score, 'Commentary': comment, 'Justification': justification})
+
     return evaluation_details
+# def write_evaluation_commentary(scores):
+#     evaluation_details = []
+#     for principle, score in scores.items():
+#
+#         if score == -1:
+#             evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
+#             continue
+#
+#         if principle == "Factually Correct":
+#             if score >= 0.8:
+#                 comment = "Excellent accuracy! The information is precise and directly relevant to the question."
+#             elif score >= 0.5:
+#                 comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
+#             else:
+#                 comment = "The explanation contains significant inaccuracies or irrelevant information."
+#         elif principle == "Useful":
+#             if score >= 0.8:
+#                 comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
+#             elif score >= 0.5:
+#                 comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
+#             else:
+#                 comment = "The explanation does little to help understand or apply the information provided."
+#         elif principle == "Context Specific":
+#             if score >= 0.8:
+#                 comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
+#             elif score >= 0.5:
+#                 comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
+#             else:
+#                 comment = "Fails to address the context of the question, lacking relevance or specificity."
+#         elif principle == "User Specific":
+#             if score >= 0.8:
+#                 comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
+#             elif score >= 0.5:
+#                 comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
+#             else:
+#                 comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
+#         elif principle == "Provides Pluralism":
+#             if score >= 0.8:
+#                 comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
+#             elif score >= 0.5:
+#                 comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
+#             else:
+#                 comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
+#
+#         evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
+#     return evaluation_details
 
 if __name__ == '__main__':
 