Spaces:

meghsn
/

WebAgent-Leaderboard

Sleeping

App Files Files Community

megh1211 committed on Jul 15

Commit

a59bcfa

•

1 Parent(s): 29ea9a5

Init push

Browse files

Files changed (5) hide show

.gitignore +2 -0
Dockerfile +11 -0
app.py +389 -0
requirements.txt +6 -0
results.json +178 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__/
2	+ *.pyc

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+FROM python:3.9
+WORKDIR /code
+# Install dependencies before copying the application files, so this layer is
+# reused from the build cache when only app.py or results.json change.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+COPY ./app.py /code/app.py
+COPY ./results.json /code/results.json
+# Start the Streamlit app, listening on all interfaces on port 7860.
+CMD ["streamlit", "run", "/code/app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,389 @@

+import json
+import re
+import streamlit as st
+import requests
+import pandas as pd
+from io import StringIO
+import plotly.graph_objs as go
+from huggingface_hub import HfApi
+from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
+@st.cache_data
+def get_model_info(df):
+    api = HfApi()
+    # Initialize new columns for likes and tags
+    df['Likes'] = None
+    df['Tags'] = None
+    # Iterate through DataFrame rows
+    for index, row in df.iterrows():
+        model = row['Model'].strip()
+        try:
+            model_info = api.model_info(repo_id=str(model))
+            df.loc[index, 'Likes'] = model_info.likes
+            df.loc[index, 'Tags'] = ', '.join(model_info.tags)
+        except (RepositoryNotFoundError, RevisionNotFoundError):
+            df.loc[index, 'Likes'] = -1
+            df.loc[index, 'Tags'] = ''
+    return df
+def create_bar_chart(df, category):
+    """Create and display a bar chart for a given category."""
+    st.write(f"### {category} Scores")
+    # Sort the DataFrame based on the category score
+    sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
+    # Create the bar chart with a color gradient (using 'Viridis' color scale as an example)
+    fig = go.Figure(go.Bar(
+        x=sorted_df[category],
+        y=sorted_df['Model'],
+        orientation='h',
+        marker=dict(color=sorted_df[category], colorscale='Inferno')
+    ))
+    # Update layout for better readability
+    fig.update_layout(
+        margin=dict(l=20, r=20, t=20, b=20)
+    )
+    # Adjust the height of the chart based on the number of rows in the DataFrame
+    st.plotly_chart(fig, use_container_width=True, height=35)
+def main():
+    st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")
+    with open("results.json") as f:
+        all_results = json.load(f)
+    st.title("🏆 WebAgent Leaderboard")
+    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
+    # content = create_yall()
+    tab1, tab2, tab3, tab4 = st.tabs(["🏆 WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "📝 About"])
+    # Leaderboard tab
+    with tab1:
+        score_columns = ['WorkArena-L1', 'WorkArena++-L2', 'WorkArena++-L3', 'MiniWoB', 'WebArena']
+        full_df = pd.DataFrame.from_dict(all_results["workarena_agent_curriculum"])
+        df = pd.DataFrame(columns=full_df.columns)
+        dfs_to_concat = []
+        dfs_to_concat.append(full_df)
+        # Concatenate the DataFrames
+        if dfs_to_concat:
+            df = pd.concat(dfs_to_concat, ignore_index=True)
+        df['Average'] = sum(df[column] for column in score_columns)/len(score_columns)
+        # Sort values
+        df = df.sort_values(by='Average', ascending=False)
+        # Add a search bar
+        search_query = st.text_input("Search models", "", key="search_main")
+        # Filter the DataFrame based on the search query
+        if search_query:
+            df = df[df['Model'].str.contains(search_query, case=False)]
+        # Display the filtered DataFrame or the entire leaderboard
+        st.dataframe(
+            df[['Model'] + score_columns + ['Average']],
+            use_container_width=True,
+            column_config={
+                "WorkArena-L1": {'alignment': 'center'},
+                "WorkArena++-L2": {'alignment': 'center'},
+                "WorkArena++-L3": {'alignment': 'center'},
+                "MiniWoB": {'alignment': 'center'},
+                "WebArena": {'alignment': 'center'},
+            },
+            hide_index=True,
+            # height=int(len(df) * 36.2),
+        )
+        # Comparison between models
+        selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+        comparison_df = df[df['Model'].isin(selected_models)]
+        st.dataframe(
+            comparison_df.style.highlight_max(axis=0, subset=df.columns[1:]).format({"WorkArena-L1": "{:.2f}".format, "MiniWoB": "{:.2f}".format, "WorkArena++-L2": "{:.2f}".format, "WorkArena++-L3": "{:.2f}".format, "WebArena": "{:.2f}".format}),
+            use_container_width=True,
+            # column_config={
+            #     "L1": {'alignment': 'center'},
+            #     "L2-Memory": {'alignment': 'center'},
+            #     "L2-Retrieval": {'alignment': 'center'},
+            #     "L3-Memory": {'alignment': 'center'},
+            #     "L3-Retrieval": {'alignment': 'center'},
+            # },
+            hide_index=True,
+        )
+        # Add a button to export data to CSV
+        if st.button("Export to CSV", key="export_main"):
+            # Export the DataFrame to CSV
+            csv_data = df.to_csv(index=False)
+            # Create a link to download the CSV file
+            st.download_button(
+                label="Download CSV",
+                data=csv_data,
+                file_name="leaderboard.csv",
+                key="download-csv",
+                help="Click to download the CSV file",
+            )
+        # # Human curriculum
+        # score_columns = ['WorkArena++-L2', 'WorkArena++-L3']
+        # st.markdown('''
+        #     ### Human subset results
+        # ''')
+        # full_df = pd.DataFrame.from_dict(all_results["workarena_human_curriculum"])
+        # df = pd.DataFrame(columns=full_df.columns)
+        # # Create a DataFrame based on selected filters
+        # dfs_to_concat = []
+        # dfs_to_concat.append(full_df)
+        # # Concatenate the DataFrames
+        # if dfs_to_concat:
+        #     df = pd.concat(dfs_to_concat, ignore_index=True)
+        # # Sort values
+        # df = df.sort_values(by='WorkArena++-L2', ascending=False)
+        # # Display the filtered DataFrame or the entire leaderboard
+        # st.dataframe(
+        #     df[['Model'] + score_columns],
+        #     use_container_width=True,
+        #     column_config={
+        #         "WorkArena-L1": {'alignment': 'center'},
+        #         "WorkArena++-L2": {'alignment': 'center'},
+        #         "WorkArena++-L3": {'alignment': 'center'},
+        #         "MiniWoB": {'alignment': 'center'},
+        #         "WebArena": {'alignment': 'center'},
+        #     },
+        #     hide_index=True,
+        #     # height=int(len(df) * 36.2),
+        # )
+    with tab2:
+        score_columns = ['Overall', 'Contextual Understanding', 'Data-driven Decision Making', 'Planning and Problem Solving', 'Information Retrieval', 'Sophisticated Memorization']
+        full_df = pd.DataFrame.from_dict(all_results["workarena_l2_agent_curriculum"])
+        df = pd.DataFrame(columns=full_df.columns)
+        dfs_to_concat = []
+        dfs_to_concat.append(full_df)
+        # Concatenate the DataFrames
+        if dfs_to_concat:
+            df = pd.concat(dfs_to_concat, ignore_index=True)
+        # Sort values
+        df = df.sort_values(by='Overall', ascending=False)
+        # Add a search bar
+        search_query = st.text_input("Search models", "", key="search_l2")
+        # Filter the DataFrame based on the search query
+        if search_query:
+            df = df[df['Model'].str.contains(search_query, case=False)]
+        # Display the filtered DataFrame or the entire leaderboard
+        st.dataframe(
+            df[['Model'] + score_columns],
+            use_container_width=True,
+            column_config={
+                "Overall": {'alignment': 'center'},
+                "Contextual Understanding": {'alignment': 'center'},
+                "Data-driven Decision Making": {'alignment': 'center'},
+                "Planning and Problem Solving": {'alignment': 'center'},
+                "Information Retrieval": {'alignment': 'center'},
+                "Sophisticated Memorization": {'alignment': 'center'},
+            },
+            hide_index=True,
+            # height=int(len(df) * 36.2),
+        )
+        # Comparison between models
+        selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+        comparison_df = df[df['Model'].isin(selected_models)]
+        st.dataframe(
+            comparison_df.style.highlight_max(axis=0, subset=df.columns[1:]).format({"Overall": "{:.2f}".format, "Contextual Understanding": "{:.2f}".format, "Data-driven Decision Making": "{:.2f}".format, "Planning and Problem Solving": "{:.2f}".format, "Information Retrieval": "{:.2f}".format, "Sophisticated Memorization": "{:.2f}".format}),
+            use_container_width=True,
+            # column_config={
+                # "Overall": {'alignment': 'center'},
+                # "Contextual Understanding": {'alignment': 'center'},
+                # "Data-driven Decision Making": {'alignment': 'center'},
+                # "Planning and Problem Solving": {'alignment': 'center'},
+                # "Information Retrieval": {'alignment': 'center'},
+                # "Sophisticated Memorization": {'alignment': 'center'},
+            # },
+            hide_index=True,
+        )
+        # Add a button to export data to CSV
+        if st.button("Export to CSV", key="export_l2"):
+            # Export the DataFrame to CSV
+            csv_data = df.to_csv(index=False)
+            # Create a link to download the CSV file
+            st.download_button(
+                label="Download CSV",
+                data=csv_data,
+                file_name="leaderboard.csv",
+                key="download-csv",
+                help="Click to download the CSV file",
+            )
+        # Human curriculum
+        st.markdown('''
+            ### Human subset results
+        ''')
+        full_df = pd.DataFrame.from_dict(all_results["workarena_l2_human_curriculum"])
+        df = pd.DataFrame(columns=full_df.columns)
+        # Create a DataFrame based on selected filters
+        dfs_to_concat = []
+        dfs_to_concat.append(full_df)
+        # Concatenate the DataFrames
+        if dfs_to_concat:
+            df = pd.concat(dfs_to_concat, ignore_index=True)
+        # Sort values
+        df = df.sort_values(by='Overall', ascending=False)
+        # Display the filtered DataFrame or the entire leaderboard
+        st.dataframe(
+            df[['Model'] + score_columns],
+            use_container_width=True,
+            column_config={
+                "Overall": {'alignment': 'center'},
+                "Contextual Understanding": {'alignment': 'center'},
+                "Data-driven Decision Making": {'alignment': 'center'},
+                "Planning and Problem Solving": {'alignment': 'center'},
+                "Information Retrieval": {'alignment': 'center'},
+                "Sophisticated Memorization": {'alignment': 'center'},
+            },
+            hide_index=True,
+            # height=int(len(df) * 36.2),
+        )
+    with tab3:
+        score_columns = ['Overall', 'Contextual Understanding', 'Data-driven Decision Making', 'Planning and Problem Solving', 'Information Retrieval', 'Sophisticated Memorization']
+        full_df = pd.DataFrame.from_dict(all_results["workarena_l3_agent_curriculum"])
+        df = pd.DataFrame(columns=full_df.columns)
+        dfs_to_concat = []
+        dfs_to_concat.append(full_df)
+        # Concatenate the DataFrames
+        if dfs_to_concat:
+            df = pd.concat(dfs_to_concat, ignore_index=True)
+        # Sort values
+        df = df.sort_values(by='Overall', ascending=False)
+        # Add a search bar
+        search_query = st.text_input("Search models", "", key="search_l3")
+        # Filter the DataFrame based on the search query
+        if search_query:
+            df = df[df['Model'].str.contains(search_query, case=False)]
+        # Display the filtered DataFrame or the entire leaderboard
+        st.dataframe(
+            df[['Model'] + score_columns],
+            use_container_width=True,
+            column_config={
+                "Overall": {'alignment': 'center'},
+                "Contextual Understanding": {'alignment': 'center'},
+                "Data-driven Decision Making": {'alignment': 'center'},
+                "Planning and Problem Solving": {'alignment': 'center'},
+                "Information Retrieval": {'alignment': 'center'},
+                "Sophisticated Memorization": {'alignment': 'center'},
+            },
+            hide_index=True,
+            # height=int(len(df) * 36.2),
+        )
+        # Comparison between models
+        selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+        comparison_df = df[df['Model'].isin(selected_models)]
+        st.dataframe(
+            comparison_df.style.highlight_max(axis=0, subset=df.columns[1:]).format({"Overall": "{:.2f}".format, "Contextual Understanding": "{:.2f}".format, "Data-driven Decision Making": "{:.2f}".format, "Planning and Problem Solving": "{:.2f}".format, "Information Retrieval": "{:.2f}".format, "Sophisticated Memorization": "{:.2f}".format}),
+            use_container_width=True,
+            # column_config={
+                # "Overall": {'alignment': 'center'},
+                # "Contextual Understanding": {'alignment': 'center'},
+                # "Data-driven Decision Making": {'alignment': 'center'},
+                # "Planning and Problem Solving": {'alignment': 'center'},
+                # "Information Retrieval": {'alignment': 'center'},
+                # "Sophisticated Memorization": {'alignment': 'center'},
+            # },
+            hide_index=True,
+        )
+        # Add a button to export data to CSV
+        if st.button("Export to CSV", key="export_l3"):
+            # Export the DataFrame to CSV
+            csv_data = df.to_csv(index=False)
+            # Create a link to download the CSV file
+            st.download_button(
+                label="Download CSV",
+                data=csv_data,
+                file_name="leaderboard.csv",
+                key="download-csv",
+                help="Click to download the CSV file",
+            )
+        # Human curriculum
+        st.markdown('''
+            ### Human subset results
+        ''')
+        full_df = pd.DataFrame.from_dict(all_results["workarena_l3_human_curriculum"])
+        df = pd.DataFrame(columns=full_df.columns)
+        # Create a DataFrame based on selected filters
+        dfs_to_concat = []
+        dfs_to_concat.append(full_df)
+        # Concatenate the DataFrames
+        if dfs_to_concat:
+            df = pd.concat(dfs_to_concat, ignore_index=True)
+        # Sort values
+        df = df.sort_values(by='Overall', ascending=False)
+        # Display the filtered DataFrame or the entire leaderboard
+        st.dataframe(
+            df[['Model'] + score_columns],
+            use_container_width=True,
+            column_config={
+                "Overall": {'alignment': 'center'},
+                "Contextual Understanding": {'alignment': 'center'},
+                "Data-driven Decision Making": {'alignment': 'center'},
+                "Planning and Problem Solving": {'alignment': 'center'},
+                "Information Retrieval": {'alignment': 'center'},
+                "Sophisticated Memorization": {'alignment': 'center'},
+            },
+            hide_index=True,
+            # height=int(len(df) * 36.2),
+        )
+    # About tab
+    with tab4:
+        st.markdown('''
+            ### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.
+        ''')
+# Build the page only when this file is executed as a script (the Dockerfile
+# starts it with `streamlit run`), not when it is imported as a module.
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit==1.23
+pandas
+requests
+plotly
+gistyc
+huggingface_hub

results.json ADDED Viewed

	@@ -0,0 +1,178 @@

+{
+    "workarena_agent_curriculum": [
+        {
+            "Model": "GPT-3.5",
+            "WorkArena-L1": 6.1,
+            "WorkArena++-L2": 0.0,
+            "WorkArena++-L3": 0.0,
+            "MiniWoB": 43.4,
+            "WebArena": 6.7
+        },
+        {
+            "Model": "GPT-4o",
+            "WorkArena-L1": 42.7,
+            "WorkArena++-L2": 3.0,
+            "WorkArena++-L3": 0.0,
+            "MiniWoB": 71.3,
+            "WebArena": 23.5
+        },
+        {
+            "Model": "GPT-4o-V",
+            "WorkArena-L1": 41.8,
+            "WorkArena++-L2": 3.8,
+            "WorkArena++-L3": 0.0,
+            "MiniWoB": 72.5,
+            "WebArena": 24.0
+        },
+        {
+            "Model": "LLaMA-3-70b",
+            "WorkArena-L1": 17.9,
+            "WorkArena++-L2": 0.0,
+            "WorkArena++-L3": 0.0,
+            "MiniWoB": 68.2,
+            "WebArena": 11.0
+        },
+        {
+            "Model": "Mixtral-8x22b",
+            "WorkArena-L1": 12.4,
+            "WorkArena++-L2": 0.0,
+            "WorkArena++-L3": 0.0,
+            "MiniWoB": 62.4,
+            "WebArena": 12.6
+        }
+    ],
+    "workarena_l2_agent_curriculum": [
+        {
+            "Model": "GPT-3.5",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        },
+        {
+            "Model": "GPT-4o",
+            "Overall": 3.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 14.6
+        },
+        {
+            "Model": "GPT-4o-V",
+            "Overall": 3.8,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 3.6,
+            "Sophisticated Memorization": 14.6
+        },
+        {
+            "Model": "LLaMA-3-70b",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        },
+        {
+            "Model": "Mixtral-8x22b",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        }
+    ],
+    "workarena_l2_human_curriculum": [
+        {
+            "Model": "Human",
+            "Overall": 93.9,
+            "Contextual Understanding": 100.0,
+            "Data-driven Decision Making": 84.6,
+            "Planning and Problem Solving": 100.0,
+            "Information Retrieval": 100.0,
+            "Sophisticated Memorization": 91.7
+        },
+        {
+            "Model": "GPT-4o",
+            "Overall": 2.1,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 8.3
+        }
+    ],
+    "workarena_l3_agent_curriculum": [
+        {
+            "Model": "GPT-3.5",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        },
+        {
+            "Model": "GPT-4o",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        },
+        {
+            "Model": "GPT-4o-V",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        },
+        {
+            "Model": "LLaMA-3-70b",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        },
+        {
+            "Model": "Mixtral-8x22b",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        }
+    ],
+    "workarena_l3_human_curriculum": [
+        {
+            "Model": "Human",
+            "Overall": 93.9,
+            "Contextual Understanding": 87.5,
+            "Data-driven Decision Making": 100.0,
+            "Planning and Problem Solving": 87.5,
+            "Information Retrieval": 100.0,
+            "Sophisticated Memorization": 91.7
+        },
+        {
+            "Model": "GPT-4o",
+            "Overall": 0.0,
+            "Contextual Understanding": 0.0,
+            "Data-driven Decision Making": 0.0,
+            "Planning and Problem Solving": 0.0,
+            "Information Retrieval": 0.0,
+            "Sophisticated Memorization": 0.0
+        }
+    ]
+}