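"""Streamlit demo app for the Language Model Council.

Browse test-set scenarios, compare model responses side by side, and inspect
pairwise judging results from the council of LLM judges.
"""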
import base64
from io import BytesIO

import pandas as pd
import streamlit as st
from PIL import Image
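
# Pairwise rating labels: ">>" marks a decisive win, ">" a narrow one.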
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"


def is_consistent(rating, reverse_rating):
    """Return True when a judge's two ratings of the same pair agree.

    ``reverse_rating`` comes from judging the same response pair in the
    swapped presentation order, so a consistent judge flips A-wins to
    B-wins (and vice versa) and keeps ties as ties.
    """
    a_wins = {MAJOR_A_WIN, MINOR_A_WIN}
    b_wins = {MAJOR_B_WIN, MINOR_B_WIN}
    if rating in a_wins and reverse_rating in b_wins:
        return True
    if rating in b_wins and reverse_rating in a_wins:
        return True
    # Ties are consistent only with ties; any other combination is not.
    return rating == TIE and reverse_rating == TIE
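
# For example, "A>>B" paired with "B>A" from the swapped order is consistent,
# while "A>>B" paired with "A=B" is not:
#   is_consistent(MAJOR_A_WIN, MINOR_B_WIN)  # True
#   is_consistent(MAJOR_A_WIN, TIE)          # False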


def pil_to_base64(img):
    """Encode a PIL image as a base64 PNG string for inline HTML embedding."""
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()
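

# Load the test scenarios, model responses, pairwise judgments, and leaderboard.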
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
df_responses = pd.read_json("data/responses.jsonl", lines=True)
df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
df_leaderboard = (
    pd.read_csv("data/leaderboard_6_11.csv").sort_values("Rank").reset_index(drop=True)
)
df_leaderboard = df_leaderboard.rename(columns={"EI Score": "EI Score (95% CI)"})
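
# Build the dropdown options for the "Data Samples" tab.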
df_test_set["scenario_option"] = (
    df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
)
scenario_options = df_test_set["scenario_option"].tolist()
model_options = df_responses["llm_responder"].unique().tolist()
judge_options = df_response_judging["llm_judge"].unique().tolist()

st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")
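
# Top navigation: three equal columns with full-width buttons.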
col1, col2, col3 = st.columns(3)

full_width_button_css = """
<style>
div.stButton > button {
    width: 100%;
}
</style>
"""
st.markdown(full_width_button_css, unsafe_allow_html=True)

# Placeholder actions for the navigation buttons.
with col1:
    if st.button("Blog"):
        st.write("Button 1 clicked")

with col2:
    if st.button("Paper"):
        st.write("Button 2 clicked")

with col3:
    if st.button("Github"):
        st.write("Button 3 clicked")

# Center the page title and subtitle.
center_css = """
<style>
h1, h2 {
    text-align: center;
}
</style>
"""
st.markdown(center_css, unsafe_allow_html=True)
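
# Embed the logo as base64 so it can be centered with inline HTML.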
image = Image.open("img/lmc_icon.png")
img_base64 = pil_to_base64(image)
centered_image_html = f"""
<div style="text-align: center;">
    <img src="data:image/png;base64,{img_base64}" width="50"/>
</div>
"""
st.markdown(centered_image_html, unsafe_allow_html=True)

st.title("Language Model Council")
st.subheader("Benchmarking Foundation Models on Highly Subjective Tasks by Consensus")

tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])

with tabs[0]:
    st.dataframe(df_leaderboard)

with tabs[1]:
    st.markdown("### 1. Select a scenario.")
    selected_scenario = st.selectbox(
        "Select Scenario", scenario_options, label_visibility="hidden"
    )

    if selected_scenario:
        # Scenario options are formatted as "<emobench_id>: <scenario>".
        selected_emobench_id = int(selected_scenario.split(": ")[0])
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]

        st.write(scenario_details["detailed_dilemma"])
        with st.expander("Additional Information"):
            st.write(f"**LLM Author:** {scenario_details['llm_author']}")
            st.write(f"**Problem:** {scenario_details['problem']}")
            st.write(f"**Relationship:** {scenario_details['relationship']}")
            st.write(f"**Scenario:** {scenario_details['scenario']}")

    st.divider()

    st.markdown("### 2. View responses.")
    col1, col2 = st.columns(2)

    with col1:
        # The left column is pinned to a single reference model.
        fixed_model = "qwen1.5-32B-Chat"
        st.selectbox(
            "Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
        )

        if selected_scenario:
            response_details_fixed = df_responses[
                (df_responses["emobench_id"] == selected_emobench_id)
                & (df_responses["llm_responder"] == fixed_model)
            ].iloc[0]
            st.write(response_details_fixed["response_string"])

    with col2:
        selected_model = st.selectbox(
            "Select Model", model_options, key="dynamic_model"
        )

        if selected_model and selected_scenario:
            response_details_dynamic = df_responses[
                (df_responses["emobench_id"] == selected_emobench_id)
                & (df_responses["llm_responder"] == selected_model)
            ].iloc[0]
            st.write(response_details_dynamic["response_string"])

    st.divider()

    st.markdown("### 3. Response judging")

    st.markdown("#### All council members")
    col1, col2 = st.columns(2)
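
    # Each response pair is judged in both presentation orders; the charts
    # show the council-wide vote split for each order.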
    with col1:
        st.write(f"**{fixed_model}** vs **{selected_model}**")
        pairwise_counts_left = df_response_judging[
            (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_left)

    with col2:
        st.write(f"**{selected_model}** vs **{fixed_model}**")
        pairwise_counts_right = df_response_judging[
            (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_right)

    st.markdown("#### Individual LLM judges")
    selected_judge = st.selectbox(
        "Select Judge", judge_options, label_visibility="hidden"
    )

    if selected_judge and selected_scenario:
        col1, col2 = st.columns(2)

        judging_details_left = df_response_judging[
            (df_response_judging["llm_judge"] == selected_judge)
            & (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]
        judging_details_right = df_response_judging[
            (df_response_judging["llm_judge"] == selected_judge)
            & (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]

        # Defer .iloc[0] until after the emptiness checks so a missing
        # judge/model combination cannot raise IndexError.
        if not judging_details_left.empty and not judging_details_right.empty:
            if is_consistent(
                judging_details_left.iloc[0]["pairwise_choice"],
                judging_details_right.iloc[0]["pairwise_choice"],
            ):
                st.success("The judge ratings are consistent.", icon="✅")
            else:
                st.warning("The judge ratings are inconsistent.", icon="⚠️")

        with col1:
            if not judging_details_left.empty:
                left_row = judging_details_left.iloc[0]
                st.write(f"**Pairwise Choice:** {left_row['pairwise_choice']}")
                st.code(left_row["judging_response_string"])
            else:
                st.write("No judging details found for the selected combination.")

        with col2:
            if not judging_details_right.empty:
                right_row = judging_details_right.iloc[0]
                st.write(f"**Pairwise Choice:** {right_row['pairwise_choice']}")
                st.code(right_row["judging_response_string"])
            else:
                st.write("No judging details found for the selected combination.")

with tabs[2]:
    st.write("This is the about us page.")
    st.write(
        """
        **Our Mission:**
        To provide the best service and data insights.

        **Our Team:**
        - Alice
        - Bob
        - Charlie
        """
    )