Zekun Wu committed on
Commit
0a026c0
1 Parent(s): e6cc5be
__pycache__/assistants.cpython-310.pyc ADDED
Binary file (1.31 kB)
 
app.py CHANGED
@@ -0,0 +1,29 @@
+import streamlit as st
+from evaluator import evaluator
+
+st.title('Natural Language Explanation Demo')
+
+model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
+
+question = st.text_input('Enter question:', '')
+explanation = st.text_input('Enter explanation:', '')
+
+if st.button('Evaluate Explanation'):
+    # Echo the inputs back to the user
+    st.write('### Question')
+    st.write(question)
+    st.write('### Explanation')
+    st.write(explanation)
+
+    # Score the explanation against the five principles
+    if question and explanation:
+        ev = evaluator(model_name)
+        scores = ev(question, explanation)
+        if scores:
+            st.write('### Scores')
+            for principle, score in scores.items():
+                st.write(f"{principle}: {score}")
+        else:
+            st.write('Could not parse scores from the model response.')
+    else:
+        st.write('Please enter a question and an explanation to evaluate')
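Before this page can run, the Azure credentials read by assistants.GPTAgent must be present in the environment; note also that the selectbox values ('gpt4-1106', 'gpt35-1106') are passed straight through as Azure deployment names, so they must match deployments in the target resource. A minimal pre-flight sketch, not part of the commit:

import os
import sys

# Sketch: fail fast if the credentials GPTAgent reads are not set.
required = ("AZURE_OPENAI_KEY", "AZURE_OPENAI_VERSION", "AZURE_OPENAI_ENDPOINT")
missing = [name for name in required if not os.getenv(name)]
if missing:
    sys.exit(f"Missing environment variables: {', '.join(missing)}")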
assistants.py ADDED
@@ -0,0 +1,29 @@
+from openai import AzureOpenAI
+import os
+
+
+class GPTAgent:
+    """Thin wrapper around an Azure OpenAI chat deployment."""
+
+    def __init__(self, model_name):
+        # Credentials come from the environment; model_name must match
+        # the name of an Azure OpenAI deployment.
+        self.client = AzureOpenAI(
+            api_key=os.getenv('AZURE_OPENAI_KEY'),
+            api_version=os.getenv('AZURE_OPENAI_VERSION'),
+            azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')
+        )
+        self.deployment_name = model_name
+
+    def invoke(self, text, **kwargs):
+        # Extra keyword arguments (temperature, max_tokens, ...) are
+        # forwarded to the chat completions API.
+        response = self.client.chat.completions.create(
+            model=self.deployment_name,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": text}
+            ],
+            **kwargs
+        )
+        return response.choices[0].message.content
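For reference, a minimal sketch of exercising GPTAgent directly, assuming the three AZURE_OPENAI_* variables are set ('gpt4-1106' is a placeholder deployment name):

from assistants import GPTAgent

# 'gpt4-1106' must name an existing Azure OpenAI deployment (placeholder).
agent = GPTAgent('gpt4-1106')
# Keyword arguments are forwarded to chat.completions.create.
print(agent.invoke('Say hello in one sentence.', temperature=0.2, max_tokens=30))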
evaluator.py CHANGED
@@ -0,0 +1,84 @@
+import json
+
+import json_repair
+
+from assistants import GPTAgent
+
+
+class evaluator:
+    def __init__(self, model_name='GPT4-turbo'):
+        self.model = GPTAgent(model_name)
+
+    def validate_scores(self, scores):
+        required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
+        for key in required_keys:
+            if key not in scores or not isinstance(scores[key], (int, float)) or not (0 <= scores[key] <= 1):
+                raise ValueError(f"Score for '{key}' is missing or out of range. Received: {scores.get(key)}")
+        return scores
+
+    def __call__(self, question, explanation):
+        evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
+an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
+should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
+and 1 indicates that the principle is fully satisfied.
+
+Question:
+{question}
+
+Provided Explanation:
+{explanation}
+
+Evaluation Criteria:
+
+Factually Correct:
+Definition: The explanation must be accurate and relevant to the question and the subject matter.
+Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
+
+Useful:
+Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
+Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
+
+Context Specific:
+Definition: The explanation should be relevant to the specific context or scenario implied by the question.
+Score: (0-1) How well does the explanation address the specific context or scenario of the question?
+
+User Specific:
+Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
+Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
+
+Provides Pluralism:
+Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
+Score: (0-1) How well does the explanation provide or support multiple perspectives?
+
+After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary.
+
+Example JSON format:
+
+{{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}
+
+Answer:
+"""
+
+        # The live model call is stubbed out with a fixed placeholder response for now.
+        # response = self.model.invoke(evaluation_prompt, temperature=0.8, max_tokens=60).strip()
+        response = """{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}"""
+
+        try:
+            scores = json.loads(response)
+        except json.JSONDecodeError:
+            # Attempt to repair the JSON if decoding fails
+            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
+            try:
+                scores = json.loads(repaired_json)
+            except json.JSONDecodeError:
+                print("Failed to decode JSON response even after repair attempt.")
+                return None
+
+        return self.validate_scores(scores)
+
+
+if __name__ == '__main__':
+    ev = evaluator()
+    question = "What is the capital of France?"
+    explanation = "The capital of France is Paris."
+    print(ev(question, explanation))
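A response that parses cleanly yields the dict below (the same example embedded in the prompt); validate_scores returns it unchanged and raises ValueError for any of the five principles that is missing, non-numeric, or outside [0, 1]:

scores = {
    "Factually Correct": 0.9,
    "Useful": 0.85,
    "Context Specific": 0.8,
    "User Specific": 0.75,
    "Provides Pluralism": 0.7,
}
# By contrast, {"Useful": 1.2} would fail validation: 1.2 is outside
# [0, 1] and the other four required keys are absent.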
requirements.txt CHANGED
@@ -0,0 +1,3 @@
+backoff
+openai
+json-repair
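Note that Streamlit itself is not pinned here even though app.py imports it; on a Streamlit-SDK Space the platform supplies it, while a local run needs it installed explicitly. A plausible local setup: pip install -r requirements.txt streamlit, set the three AZURE_OPENAI_* environment variables, then streamlit run app.py.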