Spaces:

llm-council
/

emotional-intelligence-arena

Running

App Files Files Community

emotional-intelligence-arena / app.py

justinxzhao

Add icon, and reorganize data samples.

7ee6d4e 8 months ago

raw

history blame

9.18 kB

	import streamlit as st
	import pandas as pd
	from PIL import Image
	import base64
	from io import BytesIO

	# Define constants
	# Pairwise verdict labels; per the constant names, ">>" is a major win,
	# ">" a minor win and "=" a tie.
	MAJOR_A_WIN = "A>>B"
	MINOR_A_WIN = "A>B"
	MINOR_B_WIN = "B>A"
	MAJOR_B_WIN = "B>>A"
	TIE = "A=B"

	# Verdicts grouped by winner; is_consistent ignores how strong the win was.
	_A_WINS = frozenset({MAJOR_A_WIN, MINOR_A_WIN})
	_B_WINS = frozenset({MAJOR_B_WIN, MINOR_B_WIN})


	def is_consistent(rating, reverse_rating):
	    """Return True when two verdicts on the same pair agree after a position swap.

	    ``rating`` is the verdict for one presentation order and ``reverse_rating``
	    the verdict with the two completions swapped, so the labels "A" and "B"
	    refer to opposite completions in the two arguments.

	    The pair is consistent when one verdict is an A-win and the other a B-win
	    (major/minor strength is ignored), or when both are ties. Every other
	    combination, including values that are not one of the five known labels,
	    is inconsistent.

	    Args:
	        rating: Verdict label for the original order.
	        reverse_rating: Verdict label for the swapped order.

	    Returns:
	        bool: True if the verdicts are consistent, False otherwise.
	    """
	    if rating in _A_WINS:
	        return reverse_rating in _B_WINS
	    if rating in _B_WINS:
	        return reverse_rating in _A_WINS
	    # Neither side won in ``rating``; only a tie on both sides is consistent.
	    return rating == TIE and reverse_rating == TIE


	# Function to convert PIL image to base64
	def pil_to_base64(img):
	    """Encode *img* as PNG and return the PNG bytes as a base64 text string.

	    Args:
	        img: An object with a PIL-style ``save(fp, format=...)`` method.

	    Returns:
	        str: Base64 text of the PNG data, without any ``data:`` URI prefix.
	    """
	    # The in-memory buffer is only needed until its bytes have been encoded.
	    with BytesIO() as buffered:
	        img.save(buffered, format="PNG")
	        # Base64 output is pure ASCII, so decoding it cannot fail.
	        return base64.b64encode(buffered.getvalue()).decode("ascii")


	# Load your dataframes
	# Streamlit re-executes this script on every widget interaction, so the file
	# reads are cached. show_spinner=False keeps the cached calls from emitting any
	# UI element, since st.set_page_config further down is expected to be the first
	# Streamlit command that renders anything.
	@st.cache_data(show_spinner=False)
	def _load_jsonl(path):
	    """Read the JSON-lines file at *path* into a DataFrame."""
	    return pd.read_json(path, lines=True)


	@st.cache_data(show_spinner=False)
	def _load_leaderboard(path):
	    """Read the leaderboard CSV at *path*, ordered by its "Rank" column."""
	    return pd.read_csv(path).sort_values("Rank").reset_index(drop=True)


	df_test_set = _load_jsonl("data/test_set.jsonl")
	df_responses = _load_jsonl("data/responses.jsonl")
	df_response_judging = _load_jsonl("data/response_judging.jsonl")
	df_leaderboard = _load_leaderboard("data/leaderboard_6_11.csv")
	df_leaderboard = df_leaderboard.rename(columns={"EI Score": "EI Score (95% CI)"})

	# Prepare the scenario selector options, formatted as "<emobench_id>: <scenario>".
	# st.cache_data hands back a fresh copy on each call, so adding a column here
	# does not modify the cached frame.
	df_test_set["scenario_option"] = (
	    df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
	)
	scenario_options = df_test_set["scenario_option"].tolist()

	# Prepare the model selector options (distinct responders, in order of appearance)
	model_options = df_responses["llm_responder"].unique().tolist()

	# Prepare the judge selector options (distinct judges, in order of appearance)
	judge_options = df_response_judging["llm_judge"].unique().tolist()

	st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")

	# Three columns, one per header button
	col1, col2, col3 = st.columns(3)

	# CSS that stretches every Streamlit button to the full width of its container
	full_width_button_css = """
	<style>
	div.stButton > button {
	width: 100%;
	}
	</style>
	"""

	st.markdown(full_width_button_css, unsafe_allow_html=True)

	# (column, button label, text written when the button is clicked).
	# The click handlers only echo a placeholder message.
	header_buttons = (
	    (col1, "Blog", "Button 1 clicked"),
	    (col2, "Paper", "Button 2 clicked"),
	    (col3, "Github", "Button 3 clicked"),
	)
	for column, label, clicked_message in header_buttons:
	    with column:
	        if st.button(label):
	            st.write(clicked_message)

	# Center every h1/h2 heading (covers the title rendered below)
	st.markdown(
	    """
	<style>
	h1, h2{
	text-align: center;
	}
	</style>
	""",
	    unsafe_allow_html=True,
	)

	# Load the icon and embed it as an inline base64 PNG inside a centered <div>,
	# rendered as raw HTML
	icon = Image.open("img/lmc_icon.png")
	st.markdown(
	    f"""
	<div style="text-align: center;">
	<img src="data:image/png;base64,{pil_to_base64(icon)}" width="50"/>
	</div>
	""",
	    unsafe_allow_html=True,
	)

	# Page heading and tagline
	st.title("Language Model Council")
	st.subheader("Benchmarking Foundation Models on Highly Subjective Tasks by Consensus")

	# Create horizontal tabs; later sections address them by position
	tab_labels = ["Leaderboard Results", "Data Samples", "About Us"]
	tabs = st.tabs(tab_labels)

	# First tab: the leaderboard table as loaded
	tabs[0].dataframe(df_leaderboard)

	# --- Helpers for the "Data Samples" tab -------------------------------------


	def _first_row(frame):
	    """Return the first row of *frame* as a Series, or None when it has no rows."""
	    if frame.empty:
	        return None
	    return frame.iloc[0]


	def _response_row(emobench_id, responder):
	    """Return *responder*'s response row for scenario *emobench_id*, or None."""
	    return _first_row(
	        df_responses[
	            (df_responses["emobench_id"] == emobench_id)
	            & (df_responses["llm_responder"] == responder)
	        ]
	    )


	def _show_response(row):
	    """Write the response text of *row*, or a notice when no row was found."""
	    if row is None:
	        st.write("No response found for the selected combination.")
	    else:
	        st.write(row["response_string"])


	def _pairwise_choice_counts(first_model, second_model):
	    """Count verdict labels over all judging rows for one ordered model pair."""
	    # NOTE(review): like the original, this aggregates over every row for the
	    # pair and is not restricted to the selected scenario - confirm intent.
	    return df_response_judging[
	        (df_response_judging["first_completion_by"] == first_model)
	        & (df_response_judging["second_completion_by"] == second_model)
	    ]["pairwise_choice"].value_counts()


	def _judging_row(judge, first_model, second_model, emobench_id):
	    """Return *judge*'s judging row for one ordered model pair, or None."""
	    mask = (
	        (df_response_judging["llm_judge"] == judge)
	        & (df_response_judging["first_completion_by"] == first_model)
	        & (df_response_judging["second_completion_by"] == second_model)
	    )
	    # Without a scenario filter the first matching row is shown no matter which
	    # scenario is selected.
	    # NOTE(review): assumes the judging data carries an "emobench_id" column
	    # comparable to the one in the test set; the filter is skipped if it is absent.
	    if "emobench_id" in df_response_judging.columns:
	        mask &= df_response_judging["emobench_id"] == emobench_id
	    return _first_row(df_response_judging[mask])


	def _show_judging(column, details):
	    """Render one judging row (or a not-found notice) inside *column*."""
	    with column:
	        if details is None:
	            st.write("No judging details found for the selected combination.")
	        else:
	            st.write(f"Pairwise Choice: {details['pairwise_choice']}")
	            st.code(details["judging_response_string"])


	with tabs[1]:
	    st.markdown("### 1. Select a scenario.")
	    # Create the selectors
	    selected_scenario = st.selectbox(
	        "Select Scenario", scenario_options, label_visibility="hidden"
	    )

	    # Get the selected scenario details
	    if selected_scenario:
	        # Options are formatted "<emobench_id>: <scenario>"; recover the id.
	        selected_emobench_id = int(selected_scenario.split(": ")[0])
	        scenario_details = df_test_set[
	            df_test_set["emobench_id"] == selected_emobench_id
	        ].iloc[0]

	        # Display the detailed dilemma and additional information
	        st.write(scenario_details["detailed_dilemma"])
	        with st.expander("Additional Information"):
	            st.write(f"LLM Author: {scenario_details['llm_author']}")
	            st.write(f"Problem: {scenario_details['problem']}")
	            st.write(f"Relationship: {scenario_details['relationship']}")
	            st.write(f"Scenario: {scenario_details['scenario']}")

	    st.divider()

	    st.markdown("### 2. View responses.")

	    # Create two columns for model selectors
	    col1, col2 = st.columns(2)

	    with col1:
	        # The left-hand side is pinned to a single reference model.
	        fixed_model = "qwen1.5-32B-Chat"
	        st.selectbox(
	            "Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
	        )

	        # Show the fixed model's response to the selected scenario
	        if selected_scenario:
	            _show_response(_response_row(selected_emobench_id, fixed_model))

	    with col2:
	        selected_model = st.selectbox(
	            "Select Model", model_options, key="dynamic_model"
	        )

	        # Show the selected model's response to the selected scenario
	        if selected_model and selected_scenario:
	            _show_response(_response_row(selected_emobench_id, selected_model))

	    st.divider()

	    # Add bar charts for value counts of pairwise choices over all judges
	    st.markdown("### 3. Response judging")

	    st.markdown("#### All council members")
	    col1, col2 = st.columns(2)

	    with col1:
	        st.write(f"{fixed_model} vs {selected_model}")
	        st.bar_chart(_pairwise_choice_counts(fixed_model, selected_model))

	    with col2:
	        st.write(f"{selected_model} vs {fixed_model}")
	        st.bar_chart(_pairwise_choice_counts(selected_model, fixed_model))

	    # Create the llm_judge selector
	    st.markdown("#### Individual LLM judges")
	    selected_judge = st.selectbox(
	        "Select Judge", judge_options, label_visibility="hidden"
	    )

	    # Get the judging details for the selected judge and models
	    if selected_judge and selected_scenario:
	        col1, col2 = st.columns(2)

	        # "left" has the fixed model presented first, "right" the selected model.
	        judging_details_left = _judging_row(
	            selected_judge, fixed_model, selected_model, selected_emobench_id
	        )
	        judging_details_right = _judging_row(
	            selected_judge, selected_model, fixed_model, selected_emobench_id
	        )

	        # Consistency can only be assessed when both orderings were judged.
	        if judging_details_left is not None and judging_details_right is not None:
	            if is_consistent(
	                judging_details_left["pairwise_choice"],
	                judging_details_right["pairwise_choice"],
	            ):
	                st.success("The judge ratings are consistent.", icon="✅")
	            else:
	                st.warning("The judge ratings are inconsistent.", icon="⚠️")

	        # Display the judging details
	        _show_judging(col1, judging_details_left)
	        _show_judging(col2, judging_details_right)

	# Third tab: static placeholder "about us" text
	about_tab = tabs[2]
	about_tab.write("This is the about us page.")
	# Add your about us content here
	about_tab.write(
	    """
	Our Mission:
	To provide the best service and data insights.

	Our Team:
	- Alice
	- Bob
	- Charlie
	"""
	)