Zekun Wu committed on
Commit
d3bca1f
1 Parent(s): 68b50ab
pages/1_Single_Evaluation.py CHANGED
@@ -80,7 +80,7 @@ else:
80
  if st.button('Evaluate Explanation'):
81
  if question and explanation:
82
  eval = evaluator(model_name)
83
- scores = eval(question, explanation)
84
  st.write('### Scores')
85
  details = write_evaluation_commentary(scores)
86
  df = pd.DataFrame(details)
 
80
if st.button('Evaluate Explanation'):
    if question and explanation:
        eval = evaluator(model_name)
        # BUG FIX: the previous line called bare `evaluate_single(...)`, which is
        # not defined at module scope — it is a method of the evaluator instance
        # (the batch page calls `eval_instance.evaluate_single(...)` the same way).
        scores = eval.evaluate_single(question, explanation)
        st.write('### Scores')
        details = write_evaluation_commentary(scores)
        df = pd.DataFrame(details)
pages/4_Batch_Evaluation.py CHANGED
@@ -32,7 +32,7 @@ def batch_evaluate(uploaded_file):
32
  for index, row in enumerate(df.itertuples(), start=1):
33
  question = row.question
34
  explanation = row.explanation
35
- scores = eval_instance(question, explanation) # Evaluate using the evaluator
36
  commentary_details = write_evaluation_commentary(scores) # Generate commentary based on scores
37
  results.append({
38
  'Question': question,
 
32
  for index, row in enumerate(df.itertuples(), start=1):
33
  question = row.question
34
  explanation = row.explanation
35
+ scores = eval_instance.evaluate_single(question, explanation) # Evaluate using the evaluator
36
  commentary_details = write_evaluation_commentary(scores) # Generate commentary based on scores
37
  results.append({
38
  'Question': question,
pages/5_Conversation_Evaluation.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os

import pandas as pd
import streamlit as st

# BUG FIX: util/evaluator.py defines the class as lowercase `evaluator`
# (see `class evaluator:` and `eval = evaluator()` in that module), so
# `from util.evaluator import Evaluator` raises ImportError at page load.
# Alias it on import so the rest of this page can keep using `Evaluator`.
from util.evaluator import evaluator as Evaluator, write_evaluation_commentary

# Predefined example conversations offered when the visitor does not paste
# their own JSON transcript: one deliberately clear assistant explanation
# ('good') and one vague one ('bad'), both about rainbows.
examples = {
    'good': [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What causes rainbows to appear in the sky?"},
        {"role": "assistant",
         "content": "Rainbows appear when sunlight is refracted, dispersed, and reflected inside water droplets in the atmosphere, resulting in a spectrum of light appearing in the sky."},
        {"role": "user", "content": "That's interesting! Why does it create so many colors?"}
    ],
    'bad': [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What causes rainbows to appear in the sky?"},
        {"role": "assistant",
         "content": "Rainbows happen because light in the sky gets mixed up and sometimes shows colors when it's raining or when there is water around."},
        {"role": "user", "content": "That doesn't seem very clear."}
    ]
}
25
+
26
+
27
# Gate the demo behind the shared password stored in the PASSWORD env var.
def check_password():
    """Render the password prompt and record success in session state."""

    def password_entered():
        # Button on_click callback: compares the typed value against the
        # PASSWORD environment variable and flags the session on a match.
        if password_input == os.getenv('PASSWORD'):
            st.session_state['password_correct'] = True
        else:
            st.error("Incorrect Password, please try again.")

    password_input = st.text_input("Enter Password:", type="password")

    clicked = st.button("Submit", on_click=password_entered)
    if clicked and not st.session_state.get('password_correct', False):
        st.error("Please enter a valid password to access the demo.")
40
+
41
+
42
# Page heading shown in the main pane.
st.title('Single Evaluation of Conversations')

# Sidebar copy: a short welcome plus the five evaluation principles.
_WELCOME = """
### Welcome to the Single Evaluation of Conversations Demo
This application allows you to evaluate the quality of conversations generated for various contexts using different language models. You can either use predefined examples or input your own conversations and contexts.
"""

_PRINCIPLES = """
### Explanation Principles
When evaluating conversations, consider the following principles mapped to user empowerment and regulatory compliance outcomes:

1. **Factually Correct**: The information should be accurate and relevant to empower users and meet external audit requirements.
2. **Useful**: Explanations should be clear and meaningful, helping users make informed decisions.
3. **Context Specific**: Explanations should be tailored to the context of use, enhancing their relevance and utility.
4. **User Specific**: Explanations should address the needs and preferences of the user, enabling better decision-making.
5. **Provide Pluralism**: Explanations should present diverse perspectives, allowing users to understand different viewpoints and make well-rounded decisions.
"""

st.sidebar.write(_WELCOME)
st.sidebar.write(_PRINCIPLES)
62
+
63
# Gate the rest of the page until the password has been validated.
if not st.session_state.get('password_correct', False):
    check_password()
else:
    st.sidebar.success("Password Verified. Proceed with the demo.")
    model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])

    # Let the visitor pick a canned example or paste their own transcript.
    input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))

    if input_type == 'Use predefined example':
        example_type = st.radio("Select an example type:", ('good', 'bad'))
        conversation = examples[example_type]
        context = "Example context"
    else:
        conversation_input = st.text_area('Enter your conversation (JSON format):',
                                          '[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who won the world series in 2020?"}, {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."}]')
        context_input = st.text_input('Enter your context:', 'general user')

        try:
            conversation = json.loads(conversation_input)
            context = context_input
        except json.JSONDecodeError:
            st.error("Invalid JSON format for conversation.")
            conversation = None
            context = None

    st.write('### Conversation')
    if conversation:
        st.write(conversation)
    else:
        st.write('No conversation entered yet.')

    st.write('### Context')
    if context:
        st.write(context)
    else:
        st.write('No context entered yet.')

    if st.button('Evaluate Conversation'):
        if conversation and context:
            # Renamed from `eval`, which shadowed the builtin.
            conversation_evaluator = Evaluator(model_name)
            scores = conversation_evaluator.evaluate_conversation(conversation, context)
            st.write('### Scores')
            # BUG FIX: evaluate_conversation returns the flat principle->score
            # dict (util/evaluator.py ends with `return self.validate_scores(scores)`),
            # so indexing scores["aggregate_scores"] raised KeyError.
            # write_evaluation_commentary iterates that flat dict directly.
            details = write_evaluation_commentary(scores)
            df = pd.DataFrame(details)
            st.write(df)

            # One row combining the inputs with the per-principle scores,
            # offered as a downloadable CSV.
            data = {
                'Conversation': conversation,
                'Context': context,
                **{detail['Principle']: detail['Score'] for detail in details}
            }
            df = pd.DataFrame([data])

            # Convert DataFrame to CSV for download
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download evaluation as CSV",
                data=csv,
                file_name='evaluation.csv',
                mime='text/csv',
            )
        else:
            st.error('Please enter both a conversation and a context to evaluate.')
util/evaluator.py CHANGED
@@ -15,7 +15,7 @@ class evaluator:
15
 
16
  return scores
17
 
18
- def __call__(self, question,explanation):
19
 
20
  evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
21
  an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
@@ -76,6 +76,69 @@ class evaluator:
76
 
77
  return self.validate_scores(scores)
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def write_evaluation_commentary(scores):
80
  evaluation_details = []
81
  for principle, score in scores.items():
@@ -124,7 +187,15 @@ def write_evaluation_commentary(scores):
124
  return evaluation_details
125
 
126
  if __name__ == '__main__':
 
127
  eval = evaluator()
128
- question = "What is the capital of France?"
129
- explanation = "The capital of France is Paris."
130
- print(eval(question, explanation))
 
 
 
 
 
 
 
 
15
 
16
  return scores
17
 
18
+ def evaluate_single(self, question,explanation):
19
 
20
  evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
21
  an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
 
76
 
77
  return self.validate_scores(scores)
78
 
79
def format_conversation(self, conversation):
    """Render a message list as one 'Role: content' line per turn."""
    rendered_lines = []
    for message in conversation:
        rendered_lines.append(f"{message['role'].capitalize()}: {message['content']}")
    return "\n".join(rendered_lines)
84
+
85
def evaluate_conversation(self, conversation, context):
    """Score a chatbot conversation against the five explanation principles.

    Args:
        conversation: list of {"role": ..., "content": ...} message dicts.
        context: free-text description of the user / usage context.

    Returns:
        dict mapping each principle name to a 0-1 score, passed through
        self.validate_scores; every value is -1 when the model's JSON
        response cannot be parsed even after repair.
    """
    formatted_conversation = self.format_conversation(conversation)
    evaluation_prompt = f"""
    You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the chatbot explanation in the conversation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.

    Conversation:
    {formatted_conversation}

    Context:
    {context}

    Evaluation Criteria:

    Factually Correct:
    Definition: The explanation must be accurate and relevant to the question and the subject matter.
    Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.

    Useful:
    Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
    Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?

    Context Specific:
    Definition: The explanation should be relevant to the specific context or scenario implied by the question.
    Score: (0-1) How well does the explanation address the specific context or scenario of the question?

    User Specific:
    Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
    Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?

    Provides Pluralism:
    Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
    Score: (0-1) How well does the explanation provide or support multiple perspectives?

    After evaluating the provided conversation based on the context and five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.

    Example JSON format:

    Answer: {{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}

    Answer:
    """
    # Removed a leftover debug `print(evaluation_prompt)` that spammed stdout
    # on every evaluation.

    # Deterministic scoring: temperature 0, bounded output length.
    response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
    try:
        scores = json.loads(response)
    except json.JSONDecodeError:
        # The model sometimes wraps or garbles the JSON; try automatic repair.
        repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
        try:
            scores = json.loads(repaired_json)
        except json.JSONDecodeError:
            # BUG FIX: the old message said "Skipping this batch." — copied
            # from the batch path; this method evaluates one conversation.
            print("Failed to decode JSON response even after repair attempt. Returning sentinel scores.")
            # Sentinel -1 per principle signals an unusable model response.
            return {key: -1 for key in ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]}

    return self.validate_scores(scores)
141
+
142
  def write_evaluation_commentary(scores):
143
  evaluation_details = []
144
  for principle, score in scores.items():
 
187
  return evaluation_details
188
 
189
if __name__ == '__main__':
    # Command-line smoke test for the conversation evaluator.
    # Renamed the local from `eval`, which shadowed the builtin.
    demo_evaluator = evaluator()
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": "Where was it played?"}
    ]
    context = "general user, user_background is sports enthusiast"
    results = demo_evaluator.evaluate_conversation(conversation, context)
    print(results)
    print(write_evaluation_commentary(results))