Zekun Wu committed on
Commit
d3bca1f
1 Parent(s): 68b50ab
pages/1_Single_Evaluation.py CHANGED
@@ -80,7 +80,7 @@ else:
80
  if st.button('Evaluate Explanation'):
81
  if question and explanation:
82
  eval = evaluator(model_name)
83
- scores = eval(question, explanation)
84
  st.write('### Scores')
85
  details = write_evaluation_commentary(scores)
86
  df = pd.DataFrame(details)
 
80
if st.button('Evaluate Explanation'):
    if question and explanation:
        eval = evaluator(model_name)
        # BUG FIX: the previous line called bare `evaluate_single(...)`, which is
        # not defined at module scope — it is a method of the evaluator instance
        # (the batch page calls `eval_instance.evaluate_single(...)` the same way).
        scores = eval.evaluate_single(question, explanation)
        st.write('### Scores')
        details = write_evaluation_commentary(scores)
        df = pd.DataFrame(details)
pages/4_Batch_Evaluation.py CHANGED
@@ -32,7 +32,7 @@ def batch_evaluate(uploaded_file):
32
  for index, row in enumerate(df.itertuples(), start=1):
33
  question = row.question
34
  explanation = row.explanation
35
- scores = eval_instance(question, explanation) # Evaluate using the evaluator
36
  commentary_details = write_evaluation_commentary(scores) # Generate commentary based on scores
37
  results.append({
38
  'Question': question,
 
32
  for index, row in enumerate(df.itertuples(), start=1):
33
  question = row.question
34
  explanation = row.explanation
35
+ scores = eval_instance.evaluate_single(question, explanation) # Evaluate using the evaluator
36
  commentary_details = write_evaluation_commentary(scores) # Generate commentary based on scores
37
  results.append({
38
  'Question': question,
pages/5_Conversation_Evaluation.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os

import pandas as pd
import streamlit as st

# BUG FIX: util/evaluator.py defines the class as lowercase `evaluator`
# (see `class evaluator:` and `eval = evaluator()` in that module), so
# `from util.evaluator import Evaluator` raises ImportError at page load.
# Alias it on import so the rest of this page can keep using `Evaluator`.
from util.evaluator import evaluator as Evaluator, write_evaluation_commentary

# Predefined example conversations offered when the visitor does not paste
# their own JSON transcript: one deliberately clear assistant explanation
# ('good') and one vague one ('bad'), both about rainbows.
examples = {
    'good': [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What causes rainbows to appear in the sky?"},
        {"role": "assistant",
         "content": "Rainbows appear when sunlight is refracted, dispersed, and reflected inside water droplets in the atmosphere, resulting in a spectrum of light appearing in the sky."},
        {"role": "user", "content": "That's interesting! Why does it create so many colors?"}
    ],
    'bad': [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What causes rainbows to appear in the sky?"},
        {"role": "assistant",
         "content": "Rainbows happen because light in the sky gets mixed up and sometimes shows colors when it's raining or when there is water around."},
        {"role": "user", "content": "That doesn't seem very clear."}
    ]
}
25
+
26
+
27
# Gate the demo behind the shared password stored in the PASSWORD env var.
def check_password():
    """Render the password prompt and record success in session state."""

    def password_entered():
        # Button on_click callback: compares the typed value against the
        # PASSWORD environment variable and flags the session on a match.
        if password_input == os.getenv('PASSWORD'):
            st.session_state['password_correct'] = True
        else:
            st.error("Incorrect Password, please try again.")

    password_input = st.text_input("Enter Password:", type="password")

    clicked = st.button("Submit", on_click=password_entered)
    if clicked and not st.session_state.get('password_correct', False):
        st.error("Please enter a valid password to access the demo.")
40
+
41
+
42
# Page heading shown in the main pane.
st.title('Single Evaluation of Conversations')

# Sidebar copy: a short welcome plus the five evaluation principles.
_WELCOME = """
### Welcome to the Single Evaluation of Conversations Demo
This application allows you to evaluate the quality of conversations generated for various contexts using different language models. You can either use predefined examples or input your own conversations and contexts.
"""

_PRINCIPLES = """
### Explanation Principles
When evaluating conversations, consider the following principles mapped to user empowerment and regulatory compliance outcomes:

1. **Factually Correct**: The information should be accurate and relevant to empower users and meet external audit requirements.
2. **Useful**: Explanations should be clear and meaningful, helping users make informed decisions.
3. **Context Specific**: Explanations should be tailored to the context of use, enhancing their relevance and utility.
4. **User Specific**: Explanations should address the needs and preferences of the user, enabling better decision-making.
5. **Provide Pluralism**: Explanations should present diverse perspectives, allowing users to understand different viewpoints and make well-rounded decisions.
"""

st.sidebar.write(_WELCOME)
st.sidebar.write(_PRINCIPLES)
62
+
63
# Gate the rest of the page until the password has been validated.
if not st.session_state.get('password_correct', False):
    check_password()
else:
    st.sidebar.success("Password Verified. Proceed with the demo.")
    model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])

    # Let the visitor pick a canned example or paste their own transcript.
    input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))

    if input_type == 'Use predefined example':
        example_type = st.radio("Select an example type:", ('good', 'bad'))
        conversation = examples[example_type]
        context = "Example context"
    else:
        conversation_input = st.text_area('Enter your conversation (JSON format):',
                                          '[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who won the world series in 2020?"}, {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."}]')
        context_input = st.text_input('Enter your context:', 'general user')

        try:
            conversation = json.loads(conversation_input)
            context = context_input
        except json.JSONDecodeError:
            st.error("Invalid JSON format for conversation.")
            conversation = None
            context = None

    st.write('### Conversation')
    if conversation:
        st.write(conversation)
    else:
        st.write('No conversation entered yet.')

    st.write('### Context')
    if context:
        st.write(context)
    else:
        st.write('No context entered yet.')

    if st.button('Evaluate Conversation'):
        if conversation and context:
            # Renamed from `eval`, which shadowed the builtin.
            conversation_evaluator = Evaluator(model_name)
            scores = conversation_evaluator.evaluate_conversation(conversation, context)
            st.write('### Scores')
            # BUG FIX: evaluate_conversation returns the flat principle->score
            # dict (util/evaluator.py ends with `return self.validate_scores(scores)`),
            # so indexing scores["aggregate_scores"] raised KeyError.
            # write_evaluation_commentary iterates that flat dict directly.
            details = write_evaluation_commentary(scores)
            df = pd.DataFrame(details)
            st.write(df)

            # One row combining the inputs with the per-principle scores,
            # offered as a downloadable CSV.
            data = {
                'Conversation': conversation,
                'Context': context,
                **{detail['Principle']: detail['Score'] for detail in details}
            }
            df = pd.DataFrame([data])

            # Convert DataFrame to CSV for download
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download evaluation as CSV",
                data=csv,
                file_name='evaluation.csv',
                mime='text/csv',
            )
        else:
            st.error('Please enter both a conversation and a context to evaluate.')
util/evaluator.py CHANGED
@@ -15,7 +15,7 @@ class evaluator:
15
 
16
  return scores
17
 
18
- def __call__(self, question,explanation):
19
 
20
  evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
21
  an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
@@ -76,6 +76,69 @@ class evaluator:
76
 
77
  return self.validate_scores(scores)
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def write_evaluation_commentary(scores):
80
  evaluation_details = []
81
  for principle, score in scores.items():
@@ -124,7 +187,15 @@ def write_evaluation_commentary(scores):
124
  return evaluation_details
125
 
126
  if __name__ == '__main__':
 
127
  eval = evaluator()
128
- question = "What is the capital of France?"
129
- explanation = "The capital of France is Paris."
130
- print(eval(question, explanation))
 
 
 
 
 
 
 
 
15
 
16
  return scores
17
 
18
+ def evaluate_single(self, question,explanation):
19
 
20
  evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
21
  an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
 
76
 
77
  return self.validate_scores(scores)
78
 
79
def format_conversation(self, conversation):
    """Render a message list as one 'Role: content' line per turn."""
    rendered_lines = []
    for message in conversation:
        rendered_lines.append(f"{message['role'].capitalize()}: {message['content']}")
    return "\n".join(rendered_lines)
84
+
85
def evaluate_conversation(self, conversation, context):
    """Score a chatbot conversation against the five explanation principles.

    Args:
        conversation: list of {"role": ..., "content": ...} message dicts.
        context: free-text description of the user / usage context.

    Returns:
        dict mapping each principle name to a 0-1 score, passed through
        self.validate_scores; every value is -1 when the model's JSON
        response cannot be parsed even after repair.
    """
    formatted_conversation = self.format_conversation(conversation)
    evaluation_prompt = f"""
    You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the chatbot explanation in the conversation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.

    Conversation:
    {formatted_conversation}

    Context:
    {context}

    Evaluation Criteria:

    Factually Correct:
    Definition: The explanation must be accurate and relevant to the question and the subject matter.
    Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.

    Useful:
    Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
    Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?

    Context Specific:
    Definition: The explanation should be relevant to the specific context or scenario implied by the question.
    Score: (0-1) How well does the explanation address the specific context or scenario of the question?

    User Specific:
    Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
    Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?

    Provides Pluralism:
    Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
    Score: (0-1) How well does the explanation provide or support multiple perspectives?

    After evaluating the provided conversation based on the context and five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.

    Example JSON format:

    Answer: {{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}

    Answer:
    """
    # Removed a leftover debug `print(evaluation_prompt)` that spammed stdout
    # on every evaluation.

    # Deterministic scoring: temperature 0, bounded output length.
    response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
    try:
        scores = json.loads(response)
    except json.JSONDecodeError:
        # The model sometimes wraps or garbles the JSON; try automatic repair.
        repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
        try:
            scores = json.loads(repaired_json)
        except json.JSONDecodeError:
            # BUG FIX: the old message said "Skipping this batch." — copied
            # from the batch path; this method evaluates one conversation.
            print("Failed to decode JSON response even after repair attempt. Returning sentinel scores.")
            # Sentinel -1 per principle signals an unusable model response.
            return {key: -1 for key in ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]}

    return self.validate_scores(scores)
141
+
142
  def write_evaluation_commentary(scores):
143
  evaluation_details = []
144
  for principle, score in scores.items():
 
187
  return evaluation_details
188
 
189
if __name__ == '__main__':
    # Command-line smoke test for the conversation evaluator.
    # Renamed the local from `eval`, which shadowed the builtin.
    demo_evaluator = evaluator()
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": "Where was it played?"}
    ]
    context = "general user, user_background is sports enthusiast"
    results = demo_evaluator.evaluate_conversation(conversation, context)
    print(results)
    print(write_evaluation_commentary(results))