Spaces:

llm-council
/

emotional-intelligence-arena

Running

App Files Files Community

emotional-intelligence-arena / app.py

justinxzhao

Add index.html shim, and add hero.svg

29e2769 8 months ago

raw

history blame

14.8 kB

	import streamlit as st
	import pandas as pd
	from PIL import Image
	import base64
	from io import BytesIO
	import random

	# Pairwise verdict labels compared by is_consistent().
	MAJOR_A_WIN = "A>>B"
	MINOR_A_WIN = "A>B"
	MINOR_B_WIN = "B>A"
	MAJOR_B_WIN = "B>>A"
	TIE = "A=B"


	def is_consistent(rating, reverse_rating):
	    """Return True when two verdicts for the same pair agree with each other.

	    ``rating`` is the verdict for the pair in one presentation order and
	    ``reverse_rating`` the verdict for the same pair with the order swapped.
	    Because the A/B positions are swapped, the verdicts agree when one names A
	    as the winner and the other names B (major vs. minor strength is ignored),
	    or when both are ties. Every other combination, including values that are
	    not one of the five labels, is reported as inconsistent.
	    """
	    a_wins = {MAJOR_A_WIN, MINOR_A_WIN}
	    b_wins = {MAJOR_B_WIN, MINOR_B_WIN}

	    if rating in a_wins:
	        return reverse_rating in b_wins
	    if rating in b_wins:
	        return reverse_rating in a_wins
	    # Only a tie (or an unknown label) is left: it is consistent solely with
	    # another tie.
	    return rating == TIE and reverse_rating == TIE


	def pil_to_base64(img):
	    """Save ``img`` as PNG into memory and return the bytes base64-encoded as text.

	    ``img`` only needs a ``save(fp, format=...)`` method, which is how it is
	    used here.
	    """
	    with BytesIO() as png_buffer:
	        img.save(png_buffer, format="PNG")
	        return base64.b64encode(png_buffer.getvalue()).decode()


	def pil_svg_to_base64(img):
	    """Save ``img`` with ``format="SVG"`` into memory and return it base64-encoded.

	    NOTE(review): Pillow does not appear to provide an SVG writer, so calling
	    this with a real PIL image presumably fails inside ``img.save`` - confirm
	    before relying on it.
	    """
	    with BytesIO() as svg_buffer:
	        img.save(svg_buffer, format="SVG")
	        return base64.b64encode(svg_buffer.getvalue()).decode()


	# Read the three JSON-lines datasets used by the page.
	df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
	df_responses = pd.read_json("data/responses.jsonl", lines=True)
	df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)

	# Leaderboard: sorted by "Rank", re-indexed from zero, score column relabelled.
	df_leaderboard = (
	    pd.read_csv("data/leaderboard_6_11.csv")
	    .sort_values("Rank")
	    .reset_index(drop=True)
	    .rename(columns={"EI Score": "Council Arena EI Score (95% CI)"})
	)

	# Scenario selector labels have the form "<emobench_id>: <scenario>".
	emobench_id_text = df_test_set["emobench_id"].astype(str)
	df_test_set["scenario_option"] = emobench_id_text + ": " + df_test_set["scenario"]
	scenario_options = df_test_set["scenario_option"].tolist()

	# Distinct responders and judges, in order of first appearance.
	model_options = df_responses["llm_responder"].unique().tolist()
	judge_options = df_response_judging["llm_judge"].unique().tolist()

	# Page-wide settings.
	st.set_page_config(
	    page_title="Language Model Council",
	    page_icon="🧊",
	    layout="wide",
	)

	# Three equal columns hold the header buttons.
	col1, col2, col3 = st.columns(3)

	# Stretch every st.button to the full width of its container.
	full_width_button_css = """
	<style>
	div.stButton > button {
	width: 100%;
	}
	</style>
	"""
	st.markdown(full_width_button_css, unsafe_allow_html=True)

	# Left column: link to the dataset.
	col1.link_button(
	    "Data",
	    "https://huggingface.co/datasets/llm-council/emotional_application",
	    use_container_width=True,
	)

	# Middle column: a plain button that only echoes a placeholder message.
	# NOTE(review): looks unfinished - presumably meant to link to the paper; confirm.
	if col2.button("Paper"):
	    col2.write("Button 2 clicked")

	# Right column: link to the source repository.
	col3.link_button(
	    "Github",
	    "https://github.com/llm-council/llm-council",
	    use_container_width=True,
	)

	# Custom CSS to center the h1, h2 and h6 headings.
	center_css = """
	<style>
	h1, h2, h6{
	text-align: center;
	}
	</style>
	"""

	st.markdown(center_css, unsafe_allow_html=True)

	# Centered icon (currently disabled).
	# image = Image.open("img/lmc_icon.png")
	# img_base64 = pil_to_base64(image)
	# centered_image_html = f"""
	# <div style="text-align: center;">
	# <img src="data:image/png;base64,{img_base64}" width="50"/>
	# </div>
	# """
	# st.markdown(centered_image_html, unsafe_allow_html=True)

	# Title and subtitle.
	st.title("Language Model Council")
	st.markdown(
	    "###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
	)

	# Render hero image.
	# The SVG is read as text and handed to st.image as markup. Decode it
	# explicitly as UTF-8 so the result does not depend on the platform's default
	# encoding (assumes the file is UTF-8, the XML default).
	with open("img/hero.svg", "r", encoding="utf-8") as file:
	    svg_content = file.read()

	# Narrow side columns keep the image in the middle 60% of the page.
	left_co, cent_co, last_co = st.columns([0.2, 0.6, 0.2])
	with cent_co:
	    st.image(svg_content, use_column_width=True)


	# Collapsible "Abstract" section, created on the centre column (cent_co).
	with cent_co.expander("Abstract"):
	st.markdown(
	"""The rapid advancement of Large Language Models (LLMs) necessitates robust
	and challenging benchmarks. Leaderboards like Chatbot Arena rank LLMs based
	on how well their responses align with human preferences. However, many tasks
	such as those related to emotional intelligence, creative writing, or persuasiveness,
	are highly subjective and often lack majoritarian human agreement. Judges may
	have irreconcilable disagreements about what constitutes a better response. To
	address the challenge of ranking LLMs on highly subjective tasks, we propose
	a novel benchmarking framework, the Language Model Council (LMC). The
	LMC operates through a democratic process to: 1) formulate a test set through
	equal participation, 2) administer the test among council members, and 3) evaluate
	responses as a collective jury. We deploy a council of 20 newest LLMs on an
	open-ended emotional intelligence task: responding to interpersonal dilemmas.
	Our results show that the LMC produces rankings that are more separable, robust,
	and less biased than those from any individual LLM judge, and is more consistent
	with a human-established leaderboard compared to other benchmarks."""
	)
	# NOTE(review): block indentation is missing from this copy of the file, so it
	# cannot be told whether the call below belongs inside the "Abstract" expander
	# or is written directly to the page - confirm against the original source.
	st.markdown(
	"This leaderboard comes from deploying a Council of 20 LLMs on an open-ended emotional intelligence task: responding to interpersonal dilemmas."
	)

	# Create horizontal tabs; they are addressed by position (tabs[0], tabs[1], tabs[2]).
	tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])

	# Define content for each tab
	# First tab: show the leaderboard dataframe.
	with tabs[0]:
	st.dataframe(df_leaderboard)


	def colored_text_box(text, background_color, text_color="black"):
	    """Return HTML for a padded, rounded ``<div>`` that shows ``text``.

	    ``background_color`` and ``text_color`` are placed into the inline style
	    and ``text`` is interpolated as-is (no HTML escaping is applied).
	    """
	    return f"""
	<div style="
	background-color: {background_color};
	color: {text_color};
	padding: 10px;
	border-radius: 5px;
	">
	{text}
	</div>
	"""


	# Seed the three selection slots with None the first time the script runs in a
	# session; values that already exist are left untouched.
	for _state_key in ("selected_scenario", "selected_model", "selected_judge"):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = None


	# Widget callback: mirrors a selectbox's keyed value into session state.
	def update_scenario():
	    """Copy the ``scenario_selector`` widget value into ``selected_scenario``."""
	    state = st.session_state
	    state.selected_scenario = state.scenario_selector


	def update_model():
	    """Copy the ``model_selector`` widget value into ``selected_model``."""
	    state = st.session_state
	    state.selected_model = state.model_selector


	def update_judge():
	    """Copy the ``judge_selector`` widget value into ``selected_judge``."""
	    state = st.session_state
	    state.selected_judge = state.judge_selector


	def randomize_selection():
	    """Store a randomly chosen scenario, model and judge in session state."""
	    state = st.session_state
	    state.selected_scenario = random.choice(scenario_options)
	    state.selected_model = random.choice(model_options)
	    state.selected_judge = random.choice(judge_options)


	# "Data Samples" tab: pick a scenario, compare the fixed model's response with a
	# selectable model's response, then look at how the judges rated that pair.
	# NOTE(review): block indentation is missing from this copy of the file, so the
	# nesting of the statements below cannot be read off the text - confirm against
	# the original source before restructuring anything here.
	with tabs[1]:
	# Add randomize button at the top of the app
	# (placed in the narrow middle column of a 0.4 / 0.2 / 0.4 split).
	_, mid_column, _ = st.columns([0.4, 0.2, 0.4])
	mid_column.button(
	":game_die: Randomize!", on_click=randomize_selection, type="primary"
	)

	st.markdown("### 1. Select a scenario.")
	# Create the selectors
	# The selectbox's return value is written back to session state; `index`
	# preselects the stored scenario and falls back to the first option when
	# nothing is stored yet.
	st.session_state.selected_scenario = st.selectbox(
	"Select Scenario",
	scenario_options,
	label_visibility="hidden",
	key="scenario_selector",
	on_change=update_scenario,
	index=(
	scenario_options.index(st.session_state.selected_scenario)
	if st.session_state.selected_scenario
	else 0
	),
	)

	# Get the selected scenario details
	if st.session_state.selected_scenario:
	# Options are formatted "<emobench_id>: <scenario>", so the id is the text
	# before the first ": ".
	selected_emobench_id = int(st.session_state.selected_scenario.split(": ")[0])
	scenario_details = df_test_set[
	df_test_set["emobench_id"] == selected_emobench_id
	].iloc[0]

	# Display the detailed dilemma and additional information
	st.markdown(
	colored_text_box(
	scenario_details["detailed_dilemma"], "#eeeeeeff", "black"
	),
	unsafe_allow_html=True,
	)
	with st.expander("Additional Information"):
	st.write(f"LLM Author: {scenario_details['llm_author']}")
	st.write(f"Problem: {scenario_details['problem']}")
	st.write(f"Relationship: {scenario_details['relationship']}")
	st.write(f"Scenario: {scenario_details['scenario']}")

	st.divider()

	st.markdown("### 2. View responses.")

	# Create two columns for model selectors
	col1, col2 = st.columns(2)

	with col1:
	# The left column always shows this one model; its selectbox offers a single
	# option.
	fixed_model = "qwen1.5-32B-Chat"
	st.selectbox(
	"Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
	)

	# Get the response string for the fixed model
	# NOTE(review): `.iloc[0]` raises IndexError when the filter matches no row -
	# confirm every scenario has a response from the fixed model.
	if st.session_state.selected_scenario:
	response_details_fixed = df_responses[
	(df_responses["emobench_id"] == selected_emobench_id)
	& (df_responses["llm_responder"] == fixed_model)
	].iloc[0]

	# Display the response string
	st.markdown(
	colored_text_box(
	response_details_fixed["response_string"], "#eeeeeeff", "black"
	),
	unsafe_allow_html=True,
	)

	with col2:
	# Right column: any responder can be chosen; the choice is stored in session
	# state in the same way as the scenario.
	st.session_state.selected_model = st.selectbox(
	"Select Model",
	model_options,
	key="model_selector",
	on_change=update_model,
	index=(
	model_options.index(st.session_state.selected_model)
	if st.session_state.selected_model
	else 0
	),
	)

	# Get the response string for the selected model
	if st.session_state.selected_model and st.session_state.selected_scenario:
	response_details_dynamic = df_responses[
	(df_responses["emobench_id"] == selected_emobench_id)
	& (df_responses["llm_responder"] == st.session_state.selected_model)
	].iloc[0]

	# Display the response string
	st.markdown(
	colored_text_box(
	response_details_dynamic["response_string"], "#eeeeeeff", "black"
	),
	unsafe_allow_html=True,
	)

	st.divider()

	# Add bar charts for value counts of pairwise choices over all judges
	# NOTE(review): both charts filter on the model pair only, not on the selected
	# scenario - presumably an aggregate over every scenario; confirm.
	st.markdown("### 3. Response judging")

	st.markdown("#### All council members")
	col1, col2 = st.columns(2)

	with col1:
	# Fixed model presented first, selected model second.
	st.write(f"{fixed_model} vs {st.session_state.selected_model}")
	pairwise_counts_left = df_response_judging[
	(df_response_judging["first_completion_by"] == fixed_model)
	& (
	df_response_judging["second_completion_by"]
	== st.session_state.selected_model
	)
	]["pairwise_choice"].value_counts()
	st.bar_chart(pairwise_counts_left)

	with col2:
	# Same pair with the presentation order swapped.
	st.write(f"{st.session_state.selected_model} vs {fixed_model}")
	pairwise_counts_right = df_response_judging[
	(
	df_response_judging["first_completion_by"]
	== st.session_state.selected_model
	)
	& (df_response_judging["second_completion_by"] == fixed_model)
	]["pairwise_choice"].value_counts()
	st.bar_chart(pairwise_counts_right)

	# Create the llm_judge selector
	st.markdown("#### Individual LLM judges")
	st.session_state.selected_judge = st.selectbox(
	"Select Judge",
	judge_options,
	label_visibility="hidden",
	key="judge_selector",
	on_change=update_judge,
	index=(
	judge_options.index(st.session_state.selected_judge)
	if st.session_state.selected_judge
	else 0
	),
	)

	# Get the judging details for the selected judge and models
	if st.session_state.selected_judge and st.session_state.selected_scenario:
	col1, col2 = st.columns(2)

	# NOTE(review): the two lookups below filter on judge and model pair only;
	# nothing restricts them to the selected scenario, so `.iloc[0]` takes the
	# first matching row - confirm this is intended.
	# NOTE(review): `.iloc[0]` raises IndexError when no row matches, presumably
	# the case when the selected model is the fixed model itself - verify.
	judging_details_left = df_response_judging[
	(df_response_judging["llm_judge"] == st.session_state.selected_judge)
	& (df_response_judging["first_completion_by"] == fixed_model)
	& (
	df_response_judging["second_completion_by"]
	== st.session_state.selected_model
	)
	].iloc[0]

	judging_details_right = df_response_judging[
	(df_response_judging["llm_judge"] == st.session_state.selected_judge)
	& (
	df_response_judging["first_completion_by"]
	== st.session_state.selected_model
	)
	& (df_response_judging["second_completion_by"] == fixed_model)
	].iloc[0]

	# Render consistency.
	# The verdict for one presentation order is compared with the verdict for the
	# swapped order.
	if is_consistent(
	judging_details_left["pairwise_choice"],
	judging_details_right["pairwise_choice"],
	):
	st.success("The judge ratings are consistent.", icon="✅")
	else:
	st.warning("The judge ratings are inconsistent.", icon="⚠️")

	# Display the judging details
	with col1:
	# NOTE(review): judging_details_left is a single row taken with `.iloc[0]`,
	# so this emptiness test is unlikely ever to be true; a missing match would
	# already have raised in the lookup above.
	if not judging_details_left.empty:
	st.write(
	f"Pairwise Choice: {judging_details_left['pairwise_choice']}"
	)
	st.markdown(
	colored_text_box(
	judging_details_left["judging_response_string"],
	"#eeeeeeff",
	"black",
	),
	unsafe_allow_html=True,
	)
	else:
	st.write("No judging details found for the selected combination.")

	with col2:
	# NOTE(review): same remark as for the left column - the row comes from
	# `.iloc[0]`, so the else branch looks unreachable in practice.
	if not judging_details_right.empty:
	st.write(
	f"Pairwise Choice: {judging_details_right['pairwise_choice']}"
	)
	st.markdown(
	colored_text_box(
	judging_details_right["judging_response_string"],
	"#eeeeeeff",
	"black",
	),
	unsafe_allow_html=True,
	)
	else:
	st.write("No judging details found for the selected combination.")

	# "About Us" tab: contact details, written straight onto the third tab.
	tabs[2].write(
	    """
	Please reach out if you are interested in collaborating!

	Our Team:
	- Justin Zhao (justinxzhao@gmail.com)
	- Flor Plaza (flor.plaza@unibocconi.it)
	- Amanda Cercas Curry (amanda.cercas@unibocconi.it)
	"""
	)