import streamlit as st
import pandas as pd
# Pairwise verdict labels. "A" is the first completion shown to the judge and
# "B" the second; ">>" marks a major win, ">" a minor win, "=" a tie.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

# Verdicts grouped by winner, built once so the check below is three lookups.
_A_WINS = frozenset({MAJOR_A_WIN, MINOR_A_WIN})
_B_WINS = frozenset({MAJOR_B_WIN, MINOR_B_WIN})
_TIES = frozenset({TIE})


def is_consistent(rating, reverse_rating):
    """Return True when two verdicts on the same pair agree after swapping order.

    ``rating`` is the verdict for one presentation order and ``reverse_rating``
    the verdict for the same two completions presented in the opposite order.
    Because the A/B positions are swapped between the two, the verdicts agree
    when one is an A-win and the other a B-win (win magnitude is ignored), or
    when both are ties.

    Args:
        rating: Verdict label for the first presentation order.
        reverse_rating: Verdict label for the swapped presentation order.

    Returns:
        bool: True if the verdicts are consistent, False otherwise. Any value
        that is not one of the known labels (e.g. ``None`` or NaN) yields
        False.
    """
    # Set membership (rather than ==) keeps unknown/missing values a plain
    # False instead of propagating NA-like comparison results.
    if rating in _A_WINS:
        return reverse_rating in _B_WINS
    if rating in _B_WINS:
        return reverse_rating in _A_WINS
    return rating in _TIES and reverse_rating in _TIES
def _load_jsonl(path):
    """Read the JSON-lines file at *path* into a DataFrame."""
    return pd.read_json(path, lines=True)


# The three JSON-lines datasets that back the app.
df_test_set = _load_jsonl("data/test_set.jsonl")
df_responses = _load_jsonl("data/responses.jsonl")
df_response_judging = _load_jsonl("data/response_judging.jsonl")

# Scenario selector entries are labelled "<emobench_id>: <scenario>"; the id
# prefix is what the selection handler later parses back out.
scenario_ids = df_test_set["emobench_id"].astype(str)
df_test_set["scenario_option"] = scenario_ids + ": " + df_test_set["scenario"]
scenario_options = df_test_set["scenario_option"].tolist()

# Model and judge selectors list each distinct name once, in order of first
# appearance in the data.
model_options = df_responses["llm_responder"].unique().tolist()
judge_options = df_response_judging["llm_judge"].unique().tolist()
# Page-level configuration: title, icon and wide layout.
st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")

# Three side-by-side columns hold the header buttons.
col1, col2, col3 = st.columns(3)

# CSS hook intended to make buttons take the full width; the stylesheet
# string is currently blank.
full_width_button_css = """
"""
st.markdown(full_width_button_css, unsafe_allow_html=True)

# One button per column; a click only echoes a fixed message.
for column, label, message in (
    (col1, "Blog", "Button 1 clicked"),
    (col2, "Paper", "Button 2 clicked"),
    (col3, "Github", "Button 3 clicked"),
):
    with column:
        if st.button(label):
            st.write(message)

# CSS hook intended to center the title and header; also currently blank.
center_css = """
"""
st.markdown(center_css, unsafe_allow_html=True)

st.title("Language Model Council")
st.subheader("Applied to emotional intelligence")
# The page body is split into three horizontal tabs.
tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])

# First tab: leaderboard, rendered from hard-coded rows.
leaderboard_tab = tabs[0]
with leaderboard_tab:
    st.write("This is the leaderboard results page.")
    leaderboard = dict(
        Name=["Alice", "Bob", "Charlie"],
        Score=[95, 85, 75],
    )
    st.table(leaderboard)
def _first_row_or_none(frame):
    """Return the first row of *frame* as a Series, or None if it has no rows."""
    if frame.empty:
        return None
    return frame.iloc[0]


def _render_judging(column, first_model, second_model, details):
    """Show one judgment (verdict and judge text) inside *column*.

    *details* is a row of the judging data, or None when no judgment exists
    for this model ordering, in which case a fallback message is shown.
    """
    with column:
        st.write(f"**{first_model}** vs **{second_model}**")
        if details is None:
            st.write("No judging details found for the selected combination.")
            return
        st.write(f"**Pairwise Choice:** {details['pairwise_choice']}")
        st.code(details["judging_response_string"])


# Second tab: browse a scenario, compare two model responses, and inspect how
# a chosen judge rated the pair in both presentation orders.
with tabs[1]:
    selected_scenario = st.selectbox("Select Scenario", scenario_options)

    if selected_scenario:
        # Options are formatted "<emobench_id>: <scenario>"; recover the id.
        selected_emobench_id = int(selected_scenario.split(": ")[0])
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]

        # Display the detailed dilemma and additional information
        st.write(scenario_details["detailed_dilemma"])
        with st.expander("Additional Information"):
            st.write(f"**LLM Author:** {scenario_details['llm_author']}")
            st.write(f"**Problem:** {scenario_details['problem']}")
            st.write(f"**Relationship:** {scenario_details['relationship']}")
            st.write(f"**Scenario:** {scenario_details['scenario']}")
    st.divider()

    # Left column: the fixed reference model. Right column: a selectable one.
    col1, col2 = st.columns(2)
    with col1:
        fixed_model = "qwen1.5-32B-Chat"
        st.selectbox("Select Model", [fixed_model], key="fixed_model")
        if selected_scenario:
            # A missing response must not crash the page, so fall back to a
            # message instead of indexing into an empty frame.
            response_details_fixed = _first_row_or_none(
                df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == fixed_model)
                ]
            )
            if response_details_fixed is None:
                st.write("No response found for the selected combination.")
            else:
                st.write(response_details_fixed["response_string"])
    with col2:
        selected_model = st.selectbox(
            "Select Model", model_options, key="dynamic_model"
        )
        if selected_model and selected_scenario:
            response_details_dynamic = _first_row_or_none(
                df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == selected_model)
                ]
            )
            if response_details_dynamic is None:
                st.write("No response found for the selected combination.")
            else:
                st.write(response_details_dynamic["response_string"])
    st.divider()

    selected_judge = st.selectbox("Select Judge", judge_options)

    if selected_judge and selected_scenario:
        # Columns are created before the consistency banner so the banner
        # renders below them, as before.
        col1, col2 = st.columns(2)

        # NOTE(review): these lookups match on judge and model order only and
        # are not narrowed to the selected scenario; if the judging data has
        # an emobench_id column they probably should be - confirm.
        judge_rows = df_response_judging[
            df_response_judging["llm_judge"] == selected_judge
        ]
        judging_details_left = _first_row_or_none(
            judge_rows[
                (judge_rows["first_completion_by"] == fixed_model)
                & (judge_rows["second_completion_by"] == selected_model)
            ]
        )
        judging_details_right = _first_row_or_none(
            judge_rows[
                (judge_rows["first_completion_by"] == selected_model)
                & (judge_rows["second_completion_by"] == fixed_model)
            ]
        )

        # Consistency is only meaningful when both orderings were judged
        # (e.g. there may be no judgment of a model against itself).
        if judging_details_left is not None and judging_details_right is not None:
            if is_consistent(
                judging_details_left["pairwise_choice"],
                judging_details_right["pairwise_choice"],
            ):
                st.success("The judge ratings are consistent.", icon="✅")
            else:
                st.warning("The judge ratings are inconsistent.", icon="⚠️")

        _render_judging(col1, fixed_model, selected_model, judging_details_left)
        _render_judging(col2, selected_model, fixed_model, judging_details_right)
    st.divider()

    # Bar charts of verdict counts across all judges, one per model ordering.
    col1, col2 = st.columns(2)
    with col1:
        pairwise_counts_left = df_response_judging[
            (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_left)
    with col2:
        pairwise_counts_right = df_response_judging[
            (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_right)
# Third tab: static "about us" text.
with tabs[2]:
    st.write("This is the about us page.")
    # Assemble the markdown line by line; the empty first and last entries
    # reproduce the leading and trailing newlines of the text.
    about_lines = [
        "",
        "**Our Mission:**",
        "To provide the best service and data insights.",
        "**Our Team:**",
        "- Alice",
        "- Bob",
        "- Charlie",
        "",
    ]
    st.write("\n".join(about_lines))