Spaces:

llm-council
/

emotional-intelligence-arena

Running

App Files Files Community

emotional-intelligence-arena / app.py

justinxzhao

Initial version of data tab browser.

a056e0b 4 months ago

raw

history blame

No virus

7.94 kB

	import streamlit as st
	import pandas as pd

	# Pairwise verdict labels compared by is_consistent().
	MAJOR_A_WIN = "A>>B"
	MINOR_A_WIN = "A>B"
	MINOR_B_WIN = "B>A"
	MAJOR_B_WIN = "B>>A"
	TIE = "A=B"

	# Verdicts grouped by winning side; the size of the win does not matter
	# when checking consistency.
	_A_WINS = frozenset({MAJOR_A_WIN, MINOR_A_WIN})
	_B_WINS = frozenset({MAJOR_B_WIN, MINOR_B_WIN})


	def is_consistent(rating, reverse_rating):
	    """Return True when two order-swapped verdicts on the same pair agree.

	    ``rating`` is the verdict with the two completions in one order and
	    ``reverse_rating`` is the verdict with the completions swapped. Because
	    the swap flips the A/B labels, the verdicts agree when one favours A
	    and the other favours B (major vs. minor is ignored), or when both
	    are ties.

	    Args:
	        rating: Verdict label for the original ordering.
	        reverse_rating: Verdict label for the swapped ordering.

	    Returns:
	        True if the verdicts are consistent, False otherwise. Any value
	        that is not one of the known labels is treated as inconsistent.
	    """
	    if rating in _A_WINS:
	        return reverse_rating in _B_WINS
	    if rating in _B_WINS:
	        return reverse_rating in _A_WINS
	    # Only a tie on both sides remains as a consistent outcome.
	    return rating == TIE and reverse_rating == TIE


	# Read the three JSON-lines datasets that back the app.
	df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
	df_responses = pd.read_json("data/responses.jsonl", lines=True)
	df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)

	# Scenario selector entries look like "<emobench_id>: <scenario>".
	emobench_id_text = df_test_set["emobench_id"].astype(str)
	df_test_set["scenario_option"] = emobench_id_text + ": " + df_test_set["scenario"]
	scenario_options = df_test_set["scenario_option"].tolist()

	# Model selector entries: each distinct responder, in order of first appearance.
	responder_names = df_responses["llm_responder"].unique()
	model_options = responder_names.tolist()

	# Judge selector entries: each distinct judge, in order of first appearance.
	judge_names = df_response_judging["llm_judge"].unique()
	judge_options = judge_names.tolist()

	st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")

	# Three side-by-side columns, one per link button.
	col1, col2, col3 = st.columns(3)

	# Stylesheet that stretches every Streamlit button to the width of its column.
	full_width_button_css = """
	<style>
	div.stButton > button {
	width: 100%;
	}
	</style>
	"""

	st.markdown(full_width_button_css, unsafe_allow_html=True)

	# One button per column; a click currently only echoes a placeholder message.
	for button_column, button_label, clicked_message in (
	    (col1, "Blog", "Button 1 clicked"),
	    (col2, "Paper", "Button 2 clicked"),
	    (col3, "Github", "Button 3 clicked"),
	):
	    with button_column:
	        if st.button(button_label):
	            st.write(clicked_message)

	# Stylesheet that centers every heading level, including the title below.
	center_css = """
	<style>
	h1, h2, h3, h4, h5, h6 {
	text-align: center;
	}
	</style>
	"""

	st.markdown(center_css, unsafe_allow_html=True)

	st.title("Language Model Council")
	st.subheader("Applied to emotional intelligence")

	# Top-level navigation: one horizontal tab per page of the app.
	tab_labels = ["Leaderboard Results", "Data Samples", "About Us"]
	tabs = st.tabs(tab_labels)

	# Leaderboard page: static placeholder rows rendered as a table.
	with tabs[0]:
	    st.write("This is the leaderboard results page.")
	    placeholder_rows = [("Alice", 95), ("Bob", 85), ("Charlie", 75)]
	    leaderboard = {
	        "Name": [name for name, _ in placeholder_rows],
	        "Score": [score for _, score in placeholder_rows],
	    }
	    st.table(leaderboard)

	def _first_row(frame):
	    """Return the first row of ``frame`` as a Series, or ``None`` if it is empty.

	    Used instead of a bare ``.iloc[0]``, which raises ``IndexError`` when a
	    filter matches no rows.

	    Args:
	        frame: A (possibly empty) filtered DataFrame.

	    Returns:
	        The first row, or ``None`` when ``frame`` has no rows.
	    """
	    if frame.empty:
	        return None
	    return frame.iloc[0]


	# Data browser page: pick a scenario, compare two models' responses, and
	# inspect how a chosen judge rated the pair in both presentation orders.
	with tabs[1]:
	    # Create the selectors
	    selected_scenario = st.selectbox("Select Scenario", scenario_options)

	    # Get the selected scenario details
	    if selected_scenario:
	        # Options are formatted as "<emobench_id>: <scenario>"; recover the id.
	        selected_emobench_id = int(selected_scenario.split(": ")[0])
	        scenario_details = df_test_set[
	            df_test_set["emobench_id"] == selected_emobench_id
	        ].iloc[0]

	        # Display the detailed dilemma and additional information
	        st.write(scenario_details["detailed_dilemma"])
	        with st.expander("Additional Information"):
	            st.write(f"LLM Author: {scenario_details['llm_author']}")
	            st.write(f"Problem: {scenario_details['problem']}")
	            st.write(f"Relationship: {scenario_details['relationship']}")
	            st.write(f"Scenario: {scenario_details['scenario']}")

	    st.divider()

	    # Create two columns for model selectors
	    col1, col2 = st.columns(2)

	    with col1:
	        # The left-hand model is pinned; the selectbox only displays it.
	        fixed_model = "qwen1.5-32B-Chat"
	        st.selectbox("Select Model", [fixed_model], key="fixed_model")

	        # Get the response string for the fixed model
	        if selected_scenario:
	            response_details_fixed = _first_row(
	                df_responses[
	                    (df_responses["emobench_id"] == selected_emobench_id)
	                    & (df_responses["llm_responder"] == fixed_model)
	                ]
	            )

	            # Display the response string, or say so when there is none.
	            if response_details_fixed is not None:
	                st.write(response_details_fixed["response_string"])
	            else:
	                st.write("No response found for the selected combination.")

	    with col2:
	        selected_model = st.selectbox(
	            "Select Model", model_options, key="dynamic_model"
	        )

	        # Get the response string for the selected model
	        if selected_model and selected_scenario:
	            response_details_dynamic = _first_row(
	                df_responses[
	                    (df_responses["emobench_id"] == selected_emobench_id)
	                    & (df_responses["llm_responder"] == selected_model)
	                ]
	            )

	            # Display the response string, or say so when there is none.
	            if response_details_dynamic is not None:
	                st.write(response_details_dynamic["response_string"])
	            else:
	                st.write("No response found for the selected combination.")

	    st.divider()

	    # Create the llm_judge selector
	    selected_judge = st.selectbox("Select Judge", judge_options)

	    # Get the judging details for the selected judge and models
	    if selected_judge and selected_scenario:
	        col1, col2 = st.columns(2)

	        judged_rows = df_response_judging[
	            df_response_judging["llm_judge"] == selected_judge
	        ]
	        # Restrict to the selected scenario so the verdict shown belongs to the
	        # responses displayed above. NOTE(review): assumes the judging data
	        # carries an "emobench_id" column like the other two files; if it
	        # does not, the lookup falls back to the first row for the pair.
	        if "emobench_id" in judged_rows.columns:
	            judged_rows = judged_rows[
	                judged_rows["emobench_id"] == selected_emobench_id
	            ]

	        # Left: fixed model presented first. Right: the same pair, swapped.
	        judging_details_left = _first_row(
	            judged_rows[
	                (judged_rows["first_completion_by"] == fixed_model)
	                & (judged_rows["second_completion_by"] == selected_model)
	            ]
	        )
	        judging_details_right = _first_row(
	            judged_rows[
	                (judged_rows["first_completion_by"] == selected_model)
	                & (judged_rows["second_completion_by"] == fixed_model)
	            ]
	        )

	        # Consistency can only be assessed when both orderings were judged.
	        if judging_details_left is not None and judging_details_right is not None:
	            if is_consistent(
	                judging_details_left["pairwise_choice"],
	                judging_details_right["pairwise_choice"],
	            ):
	                st.success("The judge ratings are consistent.", icon="✅")
	            else:
	                st.warning("The judge ratings are inconsistent.", icon="⚠️")

	        # Display the judging details
	        for column, first_model, second_model, details in (
	            (col1, fixed_model, selected_model, judging_details_left),
	            (col2, selected_model, fixed_model, judging_details_right),
	        ):
	            with column:
	                st.write(f"{first_model} vs {second_model}")
	                if details is not None:
	                    st.write(f"Pairwise Choice: {details['pairwise_choice']}")
	                    st.code(details["judging_response_string"])
	                else:
	                    st.write("No judging details found for the selected combination.")

	    st.divider()

	    # Add bar charts for value counts of pairwise choices over all judges
	    # NOTE(review): these counts are filtered by model pair only, so they
	    # also span every scenario - confirm that is the intended aggregation.
	    col1, col2 = st.columns(2)

	    with col1:
	        pairwise_counts_left = df_response_judging[
	            (df_response_judging["first_completion_by"] == fixed_model)
	            & (df_response_judging["second_completion_by"] == selected_model)
	        ]["pairwise_choice"].value_counts()

	        st.bar_chart(pairwise_counts_left)

	    with col2:
	        pairwise_counts_right = df_response_judging[
	            (df_response_judging["first_completion_by"] == selected_model)
	            & (df_response_judging["second_completion_by"] == fixed_model)
	        ]["pairwise_choice"].value_counts()

	        st.bar_chart(pairwise_counts_right)

	# About page: an intro line followed by a static mission/team blurb.
	with tabs[2]:
	    st.write("This is the about us page.")
	    # Placeholder copy; replace with the real about-us content.
	    about_text = """
	Our Mission:
	To provide the best service and data insights.

	Our Team:
	- Alice
	- Bob
	- Charlie
	"""
	    st.write(about_text)