Spaces:

llm-council
/

emotional-intelligence-arena

Running

App Files Community

justinxzhao committed on Jun 20

Commit

bd4620c

•

1 Parent(s): 30e2346

Add leaderboard graph and update about page.

Browse files

Files changed (2) hide show

app.py +69 -7
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from PIL import Image
 import base64
 from io import BytesIO
 import random
 # Define constants
 MAJOR_A_WIN = "A>>B"
@@ -173,6 +174,63 @@ tabs = st.tabs(
 # Define content for each tab
 with tabs[0]:
     _, mid_column, _ = st.columns([0.2, 0.6, 0.2])
     mid_column.dataframe(df_leaderboard)
@@ -503,14 +561,18 @@ with tabs[2]:
     st.write("Check out the paper for more detailed analysis!")
 with tabs[-1]:
-    st.write(
-        """
-    Please reach out if you are interested in collaborating!
-    **Our Team:**
-    - Justin Zhao (justinxzhao@gmail.com)
-    - Flor Plaza (flor.plaza@unibocconi.it)
-    - Amanda Cercas Curry (amanda.cercas@unibocconi.it)
     """
     )

 import base64
 from io import BytesIO
 import random
+import plotly.graph_objects as go
 # Define constants
 MAJOR_A_WIN = "A>>B"
 # Define content for each tab
 with tabs[0]:
     _, mid_column, _ = st.columns([0.2, 0.6, 0.2])
+    mid_column.markdown("#### Leaderboard Graph")
+    df = df_leaderboard.copy()
+    df["Score"] = df["Council Arena EI Score (95% CI)"].apply(
+        lambda x: float(x.split(" ")[0])
+    )
+    df["Lower"] = df["Council Arena EI Score (95% CI)"].apply(
+        lambda x: float(x.split(" ")[1][1:-1])
+    )
+    df["Upper"] = df["Council Arena EI Score (95% CI)"].apply(
+        lambda x: float(x.split(" ")[2][:-1])
+    )
+    # Sort the DataFrame by Score in descending order
+    df = df.sort_values(by="Score", ascending=False)
+    # Create the bar chart
+    fig = go.Figure()
+    # Generate rainbow colors
+    num_bars = len(df)
+    colors = [f"hsl({int(360 / num_bars * i)}, 100%, 50%)" for i in range(num_bars)]
+    fig.add_trace(
+        go.Bar(
+            x=df["Score"],
+            y=df["LLM"],
+            orientation="h",
+            error_x=dict(
+                type="data",
+                array=df["Upper"],
+                arrayminus=-1 * df["Lower"],
+                thickness=0.5,
+                width=3,
+                color="black",
+            ),
+            marker=dict(color=colors, opacity=0.8),
+        )
+    )
+    fig.update_layout(
+        xaxis=dict(title="Council Emotional Intelligence Score", showgrid=True),
+        yaxis_title="LLM",
+        yaxis=dict(autorange="reversed"),
+        template="presentation",
+        width=1000,
+        height=700,
+    )
+    # Display the plot in Streamlit
+    mid_column.plotly_chart(fig)
+    mid_column.divider()
+    mid_column.markdown("#### Leaderboard Table")
+    # Display the table.
     mid_column.dataframe(df_leaderboard)
     st.write("Check out the paper for more detailed analysis!")
 with tabs[-1]:
+    st.markdown(
+        """**Motivation**:
+Good LLM evaluations are [really hard](https://www.jasonwei.net/blog/evals), and newly released models often make their own claims about being the best at something, often citing its position on a benchmark or a leaderboard. But what if we let the models themselves decide who's the best?
+**Main collaborators**:
+- [Justin Zhao](https://x.com/justinxzhao)
+- [Flor Plaza](https://x.com/florplaza22)
+- [Sam Paech](https://x.com/sam_paech)
+- [Federico Bianchi](https://x.com/federicobianchy)
+- [Sahand Sabour](https://x.com/SahandSabour)
+- [Amanda Cercas Curry](https://x.com/CurriedAmanda)
     """
     )

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- streamlit


1	+ streamlit
2	+ plotly