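# Streamlit demo for the Language Model Council (LMC): a leaderboard explorer
# for benchmarking LLMs on an open-ended emotional intelligence task
# (responding to interpersonal dilemmas), with per-scenario responses and
# pairwise judge ratings.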
import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import random
# Pairwise judging verdicts: ">>" marks a major win, ">" a minor win, "=" a tie.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"
def is_consistent(rating, reverse_rating):
    """Check that a judge's verdict flips when the response order is swapped.

    A rating pair is consistent if one order favors A while the reverse order
    favors B (or vice versa), or if both orders are ties.
    """
    a_wins = {MAJOR_A_WIN, MINOR_A_WIN}
    b_wins = {MAJOR_B_WIN, MINOR_B_WIN}
    if rating in a_wins and reverse_rating in b_wins:
        return True
    if rating in b_wins and reverse_rating in a_wins:
        return True
    return rating == TIE and reverse_rating == TIE
# Function to convert a PIL image to a base64-encoded PNG string
def pil_to_base64(img):
buffered = BytesIO()
img.save(buffered, format="PNG")
img_str = base64.b64encode(buffered.getvalue()).decode()
return img_str
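# Base64-embedding lets images be centered with raw HTML, which st.image does
# not offer a direct option for.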
# Load the test set scenarios, model responses, pairwise judgments, and leaderboard.
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
df_responses = pd.read_json("data/responses.jsonl", lines=True)
df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
df_leaderboard = (
pd.read_csv("data/leaderboard_6_11.csv").sort_values("Rank").reset_index(drop=True)
)
df_leaderboard = df_leaderboard.rename(
columns={"EI Score": "Council Arena EI Score (95% CI)"}
)
# Prepare the scenario selector options
df_test_set["scenario_option"] = (
df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
)
scenario_options = df_test_set["scenario_option"].tolist()
# Prepare the model selector options
model_options = df_responses["llm_responder"].unique().tolist()
# Prepare the judge selector options
judge_options = df_response_judging["llm_judge"].unique().tolist()
st.set_page_config(page_title="Language Model Council", page_icon="🏛️", layout="wide")
# Custom CSS to center title and header
center_css = """
<style>
h1, h2, h3, h6{
text-align: center;
}
</style>
"""
st.markdown(center_css, unsafe_allow_html=True)
# Title and subtitle.
st.title("Language Model Council")
st.markdown(
"### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
)
st.markdown(
"###### [Justin Zhao](https://www.justinxzhao.com/)¹, [Flor Miriam Plaza-del-Arco](https://fmplaza.github.io/)², [Amanda Cercas Curry](https://amandacurry.github.io/)²"
)
st.markdown("###### ¹ Predibase, ² Bocconi University")
# Create three columns
_, col1, col2, col3, col4, _ = st.columns([0.3, 0.1, 0.1, 0.1, 0.1, 0.3])
with col1:
st.link_button(
"Data",
"https://huggingface.co/datasets/llm-council/emotional_application",
use_container_width=True,
type="primary",
)
with col2:
st.link_button(
"Paper",
"https://openreview.net/forum?id=EgEMEYECXz",
use_container_width=True,
type="primary",
)
with col3:
st.link_button(
"Github",
"https://github.com/llm-council/llm-council",
use_container_width=True,
type="primary",
)
with col4:
st.link_button(
"Website", "https://llm-council.com/", use_container_width=True, type="primary"
)
# Render hero image.
with open("img/hero.svg", "r") as file:
svg_content = file.read()
left_co, cent_co, last_co = st.columns([0.2, 0.6, 0.2])
with cent_co:
st.image(svg_content, use_column_width=True)
with cent_co.expander("Abstract"):
st.markdown(
"""The rapid advancement of Large Language Models (LLMs) necessitates robust
and challenging benchmarks. Leaderboards like Chatbot Arena rank LLMs based
on how well their responses align with human preferences. However, many tasks
such as those related to emotional intelligence, creative writing, or persuasiveness,
are highly subjective and often lack majoritarian human agreement. Judges may
have irreconcilable disagreements about what constitutes a better response. To
address the challenge of ranking LLMs on highly subjective tasks, we propose
a novel benchmarking framework, the Language Model Council (LMC). The
LMC operates through a democratic process to: 1) formulate a test set through
equal participation, 2) administer the test among council members, and 3) evaluate
responses as a collective jury. We deploy a council of 20 of the newest LLMs on an
open-ended emotional intelligence task: responding to interpersonal dilemmas.
Our results show that the LMC produces rankings that are more separable, robust,
and less biased than those from any individual LLM judge, and is more consistent
with a human-established leaderboard compared to other benchmarks."""
)
st.markdown(
"This leaderboard comes from deploying a Council of 20 LLMs on an **open-ended emotional intelligence task: responding to interpersonal dilemmas**."
)
# Create horizontal tabs
tabs = st.tabs(
[
"Leaderboard Results",
"Interpersonal Conflicts",
"Analysis",
"About Us",
]
)
# Define content for each tab
with tabs[0]:
_, mid_column, _ = st.columns([0.2, 0.6, 0.2])
mid_column.dataframe(df_leaderboard)
# HTML and CSS to create a text box with a specified background color
def colored_text_box(text, background_color, text_color="black"):
html_code = f"""
<div style="
background-color: {background_color};
color: {text_color};
padding: 10px;
border-radius: 10px;
">
{text}
</div>
"""
return html_code
# Initialize session state variables if they do not exist.
if "selected_scenario" not in st.session_state:
st.session_state.selected_scenario = None
if "selected_model" not in st.session_state:
st.session_state.selected_model = None
if "selected_judge" not in st.session_state:
st.session_state.selected_judge = None
# Define callback functions to update session state
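# Streamlit reruns the whole script on every interaction, so the callbacks
# write the latest widget choices into session state to keep selections in sync.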
def update_scenario():
st.session_state.selected_scenario = st.session_state.scenario_selector
def update_model():
st.session_state.selected_model = st.session_state.model_selector
def update_judge():
st.session_state.selected_judge = st.session_state.judge_selector
def randomize_selection():
st.session_state.selected_scenario = random.choice(scenario_options)
st.session_state.selected_model = random.choice(model_options)
st.session_state.selected_judge = random.choice(judge_options)
with tabs[1]:
# Add a randomize button at the top of the tab
_, mid_column, _ = st.columns([0.4, 0.2, 0.4])
mid_column.button(
":game_die: Randomize!",
on_click=randomize_selection,
type="primary",
use_container_width=True,
)
st.markdown("#### 1. Select a scenario.")
# Create the selectors
st.session_state.selected_scenario = st.selectbox(
"Select Scenario",
scenario_options,
label_visibility="hidden",
key="scenario_selector",
on_change=update_scenario,
index=(
scenario_options.index(st.session_state.selected_scenario)
if st.session_state.selected_scenario
else 0
),
)
# Get the selected scenario details
if st.session_state.selected_scenario:
selected_emobench_id = int(st.session_state.selected_scenario.split(": ")[0])
scenario_details = df_test_set[
df_test_set["emobench_id"] == selected_emobench_id
].iloc[0]
# Display the detailed dilemma and additional information
st.markdown(
colored_text_box(
scenario_details["detailed_dilemma"],
"#01204E",
"white",
),
unsafe_allow_html=True,
)
with st.expander("Additional Information"):
st.write(
{
"LLM Author": scenario_details["llm_author"],
"Problem": scenario_details["problem"],
"Relationship": scenario_details["relationship"],
"Scenario": scenario_details["scenario"],
}
)
st.divider()
st.markdown("#### 2. View responses.")
# Create two columns for model selectors
col1, col2 = st.columns(2)
with col1:
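        # Pin one model as a fixed baseline so every comparison shares it.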
fixed_model = "qwen1.5-32B-Chat"
st.selectbox(
"Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
)
# Get the response string for the fixed model
if st.session_state.selected_scenario:
response_details_fixed = df_responses[
(df_responses["emobench_id"] == selected_emobench_id)
& (df_responses["llm_responder"] == fixed_model)
].iloc[0]
# Display the response string
st.markdown(
colored_text_box(
response_details_fixed["response_string"],
"#028391",
"white",
),
unsafe_allow_html=True,
)
with col2:
st.session_state.selected_model = st.selectbox(
"Select Model",
model_options,
key="model_selector",
on_change=update_model,
index=(
model_options.index(st.session_state.selected_model)
if st.session_state.selected_model
else 0
),
)
# Get the response string for the selected model
if st.session_state.selected_model and st.session_state.selected_scenario:
response_details_dynamic = df_responses[
(df_responses["emobench_id"] == selected_emobench_id)
& (df_responses["llm_responder"] == st.session_state.selected_model)
].iloc[0]
# Display the response string
st.markdown(
colored_text_box(
response_details_dynamic["response_string"],
"#028391",
"white",
),
unsafe_allow_html=True,
)
st.divider()
st.markdown("#### 3. Response judging")
st.markdown("##### All council members")
col1, col2 = st.columns(2)
with col1:
st.write(f"**{fixed_model}** vs **{st.session_state.selected_model}**")
pairwise_counts_left = df_response_judging[
(df_response_judging["first_completion_by"] == fixed_model)
& (
df_response_judging["second_completion_by"]
== st.session_state.selected_model
)
]["pairwise_choice"].value_counts()
st.bar_chart(pairwise_counts_left)
with col2:
st.write(f"**{st.session_state.selected_model}** vs **{fixed_model}**")
pairwise_counts_right = df_response_judging[
(
df_response_judging["first_completion_by"]
== st.session_state.selected_model
)
& (df_response_judging["second_completion_by"] == fixed_model)
]["pairwise_choice"].value_counts()
st.bar_chart(pairwise_counts_right)
# Create the llm_judge selector
st.markdown("##### Individual LLM judges")
st.session_state.selected_judge = st.selectbox(
"Select Judge",
judge_options,
label_visibility="hidden",
key="judge_selector",
on_change=update_judge,
index=(
judge_options.index(st.session_state.selected_judge)
if st.session_state.selected_judge
else 0
),
)
# Get the judging details for the selected judge and models
if st.session_state.selected_judge and st.session_state.selected_scenario:
col1, col2 = st.columns(2)
        # Filter judgments for both presentation orders; either may be empty.
        judging_details_left = df_response_judging[
            (df_response_judging["llm_judge"] == st.session_state.selected_judge)
            & (df_response_judging["first_completion_by"] == fixed_model)
            & (
                df_response_judging["second_completion_by"]
                == st.session_state.selected_model
            )
        ]
        judging_details_right = df_response_judging[
            (df_response_judging["llm_judge"] == st.session_state.selected_judge)
            & (
                df_response_judging["first_completion_by"]
                == st.session_state.selected_model
            )
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]
        # Render consistency only when both orderings were judged.
        if not judging_details_left.empty and not judging_details_right.empty:
            if is_consistent(
                judging_details_left.iloc[0]["pairwise_choice"],
                judging_details_right.iloc[0]["pairwise_choice"],
            ):
                st.success(
                    f"{st.session_state.selected_judge}'s ratings are consistent on this example.",
                    icon="✅",
                )
            else:
                st.warning(
                    f"{st.session_state.selected_judge}'s ratings are inconsistent on this example.",
                    icon="⚠️",
                )
        # Display the judging details for each ordering.
        with col1:
            if not judging_details_left.empty:
                details = judging_details_left.iloc[0]
                st.write(f"**Pairwise Choice:** {details['pairwise_choice']}")
                st.markdown(
                    colored_text_box(
                        details["judging_response_string"],
                        "#FEAE6F",
                        "black",
                    ),
                    unsafe_allow_html=True,
                )
            else:
                st.write("No judging details found for the selected combination.")
        with col2:
            if not judging_details_right.empty:
                details = judging_details_right.iloc[0]
                st.write(f"**Pairwise Choice:** {details['pairwise_choice']}")
                st.markdown(
                    colored_text_box(
                        details["judging_response_string"],
                        "#FEAE6F",
                        "black",
                    ),
                    unsafe_allow_html=True,
                )
            else:
                st.write("No judging details found for the selected combination.")
with tabs[2]:
st.markdown("### Battles (Respondent vs. Respondent)")
st.markdown("###### Expected win rates based on Terry-Bradley coefficients")
image = Image.open("img/llm_vs_llm_win_rates.png")
img_base64 = pil_to_base64(image)
centered_image_html = f"""
<div style="text-align: center;">
<img src="data:image/png;base64,{img_base64}" width="1000"/>
</div>
"""
st.markdown(centered_image_html, unsafe_allow_html=True)
st.divider()
st.markdown("### Affinities (Judge vs. Respondent)")
st.markdown("###### Raw affinities")
image = Image.open("img/raw.png")
img_base64 = pil_to_base64(image)
centered_image_html = f"""
<div style="text-align: center;">
<img src="data:image/png;base64,{img_base64}" width="1000"/>
</div>
"""
st.markdown(centered_image_html, unsafe_allow_html=True)
# Some extra space.
st.text("")
st.text("")
st.text("")
st.markdown("###### Council-Normalized")
image = Image.open("img/council_normalized.png")
img_base64 = pil_to_base64(image)
centered_image_html = f"""
<div style="text-align: center;">
<img src="data:image/png;base64,{img_base64}" width="1000"/>
</div>
"""
st.markdown(centered_image_html, unsafe_allow_html=True)
st.divider()
st.markdown("### Agreement (Judge vs. Judge)")
st.markdown("###### Sidewise Cohen's Kappa:")
image = Image.open("img/judge_agreement.sidewise_cohen_kappa.png")
img_base64 = pil_to_base64(image)
centered_image_html = f"""
<div style="text-align: center;">
<img src="data:image/png;base64,{img_base64}" width="1000"/>
</div>
"""
st.markdown(centered_image_html, unsafe_allow_html=True)
st.write("Check out the paper for more detailed analysis!")
with tabs[-1]:
st.write(
"""
Please reach out if you are interested in collaborating!
**Our Team:**
- Justin Zhao (justinxzhao@gmail.com)
- Flor Plaza (flor.plaza@unibocconi.it)
- Amanda Cercas Curry (amanda.cercas@unibocconi.it)
"""
)
# st.markdown("#### Citation")
with st.expander("Citation"):
st.write(
"Please cite the following paper if you find our leaderboard, dataset, or framework helpful."
)
st.code(
"""@misc{zhao2024council,
title = {Language Model Council: Benchmarking Foundation Models on Highly Subjective Tasks by Consensus},
url = {https://llm-council.com/},
  author = {Justin Zhao and Flor Miriam Plaza-del-Arco and Amanda Cercas Curry},
month = {June},
year = {2024}
}"""
)