justinxzhao committed on
Commit
a056e0b
1 Parent(s): 312e7a9

Initial version of data tab browser.

Browse files
Files changed (2) hide show
  1. .gitignore +2 -1
  2. app.py +247 -2
.gitignore CHANGED
@@ -1 +1,2 @@
1
- env/
 
 
1
+ env/
2
+ .DS_Store
app.py CHANGED
@@ -1,4 +1,249 @@
1
  import streamlit as st
 
2
 
3
- x = st.slider("Select a value")
4
- st.write(x, "squared is", x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
 
4
# Labels a judge can assign to a pairwise comparison of completions A and B.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

# Grouped by winning side; the margin (major/minor) is ignored for consistency.
_A_WINS = frozenset({MAJOR_A_WIN, MINOR_A_WIN})
_B_WINS = frozenset({MAJOR_B_WIN, MINOR_B_WIN})


def is_consistent(rating, reverse_rating):
    """Return whether two ratings of the same pair, judged in opposite order, agree.

    The two ratings agree when one names side A as the winner and the other
    names side B (the same completion wins once the swapped presentation order
    is taken into account), or when both are ties. The size of the win
    (major vs. minor) does not matter.

    Args:
        rating: Pairwise label for one presentation order.
        reverse_rating: Pairwise label for the swapped presentation order.

    Returns:
        True if the ratings agree as described above, False otherwise. Any
        value that is not one of the known labels yields False.
    """
    if rating in _A_WINS:
        return reverse_rating in _B_WINS
    if rating in _B_WINS:
        return reverse_rating in _A_WINS
    # Only a tie on both sides is consistent; unknown labels fall through to False.
    return rating == TIE and reverse_rating == TIE
40
+
41
+
42
# Read the JSON Lines files that back the browser (one record per line).
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
df_responses = pd.read_json("data/responses.jsonl", lines=True)
df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)

# Scenario selector entries are labelled "<emobench_id>: <scenario>".
scenario_id_prefix = df_test_set["emobench_id"].astype(str) + ": "
df_test_set["scenario_option"] = scenario_id_prefix + df_test_set["scenario"]
scenario_options = df_test_set["scenario_option"].tolist()

# Model selector entries: every distinct responder, in order of first appearance.
model_options = df_responses["llm_responder"].drop_duplicates().tolist()

# Judge selector entries: every distinct judge, in order of first appearance.
judge_options = df_response_judging["llm_judge"].drop_duplicates().tolist()
58
+
59
# Browser-tab title and icon, plus the wide page layout.
st.set_page_config(
    page_title="Language Model Council",
    page_icon="🧊",
    layout="wide",
)

# Three equal-width columns that hold the link buttons below.
col1, col2, col3 = st.columns(3)

# Stretch every Streamlit button to the full width of its container.
full_width_button_css = """
<style>
    div.stButton > button {
        width: 100%;
    }
</style>
"""
st.markdown(full_width_button_css, unsafe_allow_html=True)

# One button per column; a click currently just writes a placeholder message.
link_buttons = (
    (col1, "Blog", "Button 1 clicked"),
    (col2, "Paper", "Button 2 clicked"),
    (col3, "Github", "Button 3 clicked"),
)
for column, button_label, clicked_message in link_buttons:
    with column:
        if st.button(button_label):
            st.write(clicked_message)

# Center every heading level, which includes the title and subheader below.
center_css = """
<style>
    h1, h2, h3, h4, h5, h6 {
        text-align: center;
    }
</style>
"""
st.markdown(center_css, unsafe_allow_html=True)

st.title("Language Model Council")
st.subheader("Applied to emotional intelligence")
101
+
102
# Horizontal tabs, indexed in this order: leaderboard, data samples, about.
tab_titles = ["Leaderboard Results", "Data Samples", "About Us"]
tabs = st.tabs(tab_titles)

# Leaderboard tab: placeholder table until real results are wired in.
leaderboard_tab = tabs[0]
with leaderboard_tab:
    st.write("This is the leaderboard results page.")
    leaderboard = {
        "Name": ["Alice", "Bob", "Charlie"],
        "Score": [95, 85, 75],
    }
    st.table(leaderboard)
111
+
112
def _first_row(frame):
    """Return the first row of ``frame`` as a Series, or ``None`` if it has no rows."""
    if frame.empty:
        return None
    return frame.iloc[0]


def _render_response(details):
    """Write a model's response text, or a fallback note when ``details`` is ``None``."""
    if details is None:
        st.write("No response found for the selected combination.")
        return
    st.write(details["response_string"])


def _render_judging(details):
    """Write one judging record, or a fallback note when ``details`` is ``None``."""
    if details is None:
        st.write("No judging details found for the selected combination.")
        return
    st.write(f"**Pairwise Choice:** {details['pairwise_choice']}")
    st.code(details["judging_response_string"])


# Data Samples tab: browse one scenario, two model responses, and a judge's verdicts.
with tabs[1]:
    selected_scenario = st.selectbox("Select Scenario", scenario_options)

    if selected_scenario:
        # Options are formatted "<emobench_id>: <scenario>"; recover the numeric id.
        selected_emobench_id = int(selected_scenario.split(": ")[0])
        # The id was taken from df_test_set itself, so a matching row exists.
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]

        st.write(scenario_details["detailed_dilemma"])
        with st.expander("Additional Information"):
            st.write(f"**LLM Author:** {scenario_details['llm_author']}")
            st.write(f"**Problem:** {scenario_details['problem']}")
            st.write(f"**Relationship:** {scenario_details['relationship']}")
            st.write(f"**Scenario:** {scenario_details['scenario']}")

    st.divider()

    # Left column: the fixed reference model. Right column: a user-chosen model.
    col1, col2 = st.columns(2)

    with col1:
        fixed_model = "qwen1.5-32B-Chat"
        st.selectbox("Select Model", [fixed_model], key="fixed_model")

        if selected_scenario:
            # A missing response is reported instead of raising IndexError.
            response_details_fixed = _first_row(
                df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == fixed_model)
                ]
            )
            _render_response(response_details_fixed)

    with col2:
        selected_model = st.selectbox(
            "Select Model", model_options, key="dynamic_model"
        )

        if selected_model and selected_scenario:
            response_details_dynamic = _first_row(
                df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == selected_model)
                ]
            )
            _render_response(response_details_dynamic)

    st.divider()

    selected_judge = st.selectbox("Select Judge", judge_options)

    if selected_judge and selected_scenario:
        col1, col2 = st.columns(2)

        judged_rows = df_response_judging["llm_judge"] == selected_judge
        # NOTE(review): the judging shown should belong to the scenario displayed
        # above. The filter is applied only when the judging data carries an
        # `emobench_id` column -- confirm against the real schema.
        if "emobench_id" in df_response_judging.columns:
            judged_rows &= df_response_judging["emobench_id"] == selected_emobench_id

        first_by = df_response_judging["first_completion_by"]
        second_by = df_response_judging["second_completion_by"]

        # The same pair in both presentation orders. Either lookup may be empty
        # (e.g. when the chosen model is the fixed model itself), so test for
        # emptiness before taking a row instead of calling `.iloc[0]` blindly.
        judging_details_left = _first_row(
            df_response_judging[
                judged_rows
                & (first_by == fixed_model)
                & (second_by == selected_model)
            ]
        )
        judging_details_right = _first_row(
            df_response_judging[
                judged_rows
                & (first_by == selected_model)
                & (second_by == fixed_model)
            ]
        )

        # Consistency is only meaningful when both orderings were judged.
        if judging_details_left is not None and judging_details_right is not None:
            if is_consistent(
                judging_details_left["pairwise_choice"],
                judging_details_right["pairwise_choice"],
            ):
                st.success("The judge ratings are consistent.", icon="✅")
            else:
                st.warning("The judge ratings are inconsistent.", icon="⚠️")

        with col1:
            st.write(f"**{fixed_model}** vs **{selected_model}**")
            _render_judging(judging_details_left)

        with col2:
            st.write(f"**{selected_model}** vs **{fixed_model}**")
            _render_judging(judging_details_right)

    st.divider()

    # Distribution of pairwise choices for this model pair, across all judges,
    # shown separately for each presentation order.
    col1, col2 = st.columns(2)

    with col1:
        pairwise_counts_left = df_response_judging[
            (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]["pairwise_choice"].value_counts()

        st.bar_chart(pairwise_counts_left)

    with col2:
        pairwise_counts_right = df_response_judging[
            (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]["pairwise_choice"].value_counts()

        st.bar_chart(pairwise_counts_right)
235
+
236
# About Us tab: static placeholder copy rendered as Markdown.
about_tab = tabs[2]
with about_tab:
    st.write("This is the about us page.")
    about_markdown = """
    **Our Mission:**
    To provide the best service and data insights.

    **Our Team:**
    - Alice
    - Bob
    - Charlie
    """
    st.write(about_markdown)