"""Streamlit app showing the Language Model Council leaderboard and data samples."""

import base64
from io import BytesIO

import pandas as pd
import streamlit as st
from PIL import Image

# Pairwise-choice labels as they appear in the "pairwise_choice" column.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

_A_WINS = frozenset({MAJOR_A_WIN, MINOR_A_WIN})
_B_WINS = frozenset({MAJOR_B_WIN, MINOR_B_WIN})


def is_consistent(rating: str, reverse_rating: str) -> bool:
    """Return whether two ratings of the same pair, judged in both orders, agree.

    The second rating comes from a comparison with the A/B positions swapped,
    so an "A wins" label must be mirrored by a "B wins" label (the margin,
    major or minor, is ignored) and a tie must be mirrored by a tie.

    Args:
        rating: Pairwise choice for the original ordering.
        reverse_rating: Pairwise choice for the swapped ordering.

    Returns:
        True when the two labels describe the same outcome; False otherwise,
        including when either label is not one of the known constants.
    """
    if rating in _A_WINS:
        return reverse_rating in _B_WINS
    if rating in _B_WINS:
        return reverse_rating in _A_WINS
    return rating == TIE and reverse_rating == TIE


def pil_to_base64(img: Image.Image) -> str:
    """Encode a PIL image as a base64 string of its PNG serialization."""
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()


def _first_row_or_none(frame):
    """Return the first row of *frame*, or None when the frame has no rows."""
    return None if frame.empty else frame.iloc[0]


# Load the dataframes.
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
df_responses = pd.read_json("data/responses.jsonl", lines=True)
df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
df_leaderboard = (
    pd.read_csv("data/leaderboard_6_11.csv").sort_values("Rank").reset_index(drop=True)
)
df_leaderboard = df_leaderboard.rename(columns={"EI Score": "EI Score (95% CI)"})

# Scenario selector options are "<emobench_id>: <scenario>"; the id prefix is
# parsed back out of the selected option further down.
df_test_set["scenario_option"] = (
    df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
)
scenario_options = df_test_set["scenario_option"].tolist()

# Model and judge selector options.
model_options = df_responses["llm_responder"].unique().tolist()
judge_options = df_response_judging["llm_judge"].unique().tolist()

st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")

# Three columns for the link buttons.
col1, col2, col3 = st.columns(3)

# CSS intended to make the buttons take the full column width.
# NOTE(review): the stylesheet body is blank here, so this call currently has
# no visible effect; restore the intended <style> rules.
full_width_button_css = """ """
st.markdown(full_width_button_css, unsafe_allow_html=True)

# Place a button in each column.
with col1:
    if st.button("Blog"):
        st.write("Button 1 clicked")
with col2:
    if st.button("Paper"):
        st.write("Button 2 clicked")
with col3:
    if st.button("Github"):
        st.write("Button 3 clicked")

# CSS intended to center the title and header.
# NOTE(review): blank as well; restore the intended <style> rules.
center_css = """ """
st.markdown(center_css, unsafe_allow_html=True)

# Load the icon and embed it as a base64 data URI.
image = Image.open("img/lmc_icon.png")
img_base64 = pil_to_base64(image)

# NOTE(review): this markup is a minimal reconstruction (centered <img> fed by
# the base64 payload); confirm the intended sizing/styling.
centered_image_html = f"""
<div style="text-align: center;">
    <img src="data:image/png;base64,{img_base64}" alt="Language Model Council icon">
</div>
"""
st.markdown(centered_image_html, unsafe_allow_html=True)

st.title("Language Model Council")
st.subheader("Benchmarking Foundation Models on Highly Subjective Tasks by Consensus")

tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])

with tabs[0]:
    st.dataframe(df_leaderboard)

with tabs[1]:
    st.markdown("### 1. Select a scenario.")
    selected_scenario = st.selectbox(
        "Select Scenario", scenario_options, label_visibility="hidden"
    )

    if selected_scenario:
        # The option text starts with "<emobench_id>: ".
        selected_emobench_id = int(selected_scenario.split(": ")[0])
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]

        # Display the detailed dilemma and additional information.
        st.write(scenario_details["detailed_dilemma"])
        with st.expander("Additional Information"):
            st.write(f"**LLM Author:** {scenario_details['llm_author']}")
            st.write(f"**Problem:** {scenario_details['problem']}")
            st.write(f"**Relationship:** {scenario_details['relationship']}")
            st.write(f"**Scenario:** {scenario_details['scenario']}")

    st.divider()

    st.markdown("### 2. View responses.")

    # Left column: fixed reference model; right column: user-selected model.
    col1, col2 = st.columns(2)

    with col1:
        fixed_model = "qwen1.5-32B-Chat"
        st.selectbox(
            "Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
        )

        if selected_scenario:
            response_details_fixed = _first_row_or_none(
                df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == fixed_model)
                ]
            )
            # Guard against a missing row instead of raising IndexError.
            if response_details_fixed is not None:
                st.write(response_details_fixed["response_string"])
            else:
                st.write("No response found for the selected combination.")

    with col2:
        selected_model = st.selectbox(
            "Select Model", model_options, key="dynamic_model"
        )

        if selected_model and selected_scenario:
            response_details_dynamic = _first_row_or_none(
                df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == selected_model)
                ]
            )
            if response_details_dynamic is not None:
                st.write(response_details_dynamic["response_string"])
            else:
                st.write("No response found for the selected combination.")

    st.divider()

    # Bar charts of pairwise-choice counts for each presentation order.
    # NOTE(review): these counts (and the per-judge lookups below) are not
    # filtered by the selected scenario; confirm whether that is intended.
    st.markdown("### 3. Response judging")

    st.markdown("#### All council members")
    col1, col2 = st.columns(2)

    with col1:
        st.write(f"**{fixed_model}** vs **{selected_model}**")
        pairwise_counts_left = df_response_judging[
            (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_left)

    with col2:
        st.write(f"**{selected_model}** vs **{fixed_model}**")
        pairwise_counts_right = df_response_judging[
            (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_right)

    st.markdown("#### Individual LLM judges")
    selected_judge = st.selectbox(
        "Select Judge", judge_options, label_visibility="hidden"
    )

    if selected_judge and selected_scenario:
        col1, col2 = st.columns(2)

        # Look up the judge's verdict for each presentation order; either may
        # be missing, in which case the helper yields None.
        judging_details_left = _first_row_or_none(
            df_response_judging[
                (df_response_judging["llm_judge"] == selected_judge)
                & (df_response_judging["first_completion_by"] == fixed_model)
                & (df_response_judging["second_completion_by"] == selected_model)
            ]
        )
        judging_details_right = _first_row_or_none(
            df_response_judging[
                (df_response_judging["llm_judge"] == selected_judge)
                & (df_response_judging["first_completion_by"] == selected_model)
                & (df_response_judging["second_completion_by"] == fixed_model)
            ]
        )

        # Consistency can only be assessed when both orderings were judged.
        if judging_details_left is not None and judging_details_right is not None:
            if is_consistent(
                judging_details_left["pairwise_choice"],
                judging_details_right["pairwise_choice"],
            ):
                st.success("The judge ratings are consistent.", icon="✅")
            else:
                st.warning("The judge ratings are inconsistent.", icon="⚠️")

        with col1:
            if judging_details_left is not None:
                st.write(
                    f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
                )
                st.code(judging_details_left["judging_response_string"])
            else:
                st.write("No judging details found for the selected combination.")

        with col2:
            if judging_details_right is not None:
                st.write(
                    f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
                )
                st.code(judging_details_right["judging_response_string"])
            else:
                st.write("No judging details found for the selected combination.")

with tabs[2]:
    st.write("This is the about us page.")
    # Placeholder about-us content.
    st.write(
        "**Our Mission:**\n"
        "To provide the best service and data insights.\n"
        "\n"
        "**Our Team:**\n"
        "- Alice\n"
        "- Bob\n"
        "- Charlie\n"
    )