Spaces:
Running
Running
add deep search benchmark
Browse files
- app.py +28 -1
- deepsearch_result.jsonl +16 -0
app.py
CHANGED
|
@@ -3,6 +3,8 @@ import pandas as pd
|
|
| 3 |
import gradio as gr
|
| 4 |
from content import *
|
| 5 |
from css import *
|
|
|
|
|
|
|
| 6 |
|
| 7 |
NONE_COL = "Ranking"
|
| 8 |
|
|
@@ -90,6 +92,26 @@ rag_df = pd.DataFrame.from_records(rag_df, columns=RAG_COLS)
|
|
| 90 |
rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
|
| 91 |
rag_df = rag_df[RAG_COLS]
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
demo = gr.Blocks(css=CUSTOM_CSS)
|
| 94 |
with demo:
|
| 95 |
gr.HTML(TITLE)
|
|
@@ -97,7 +119,12 @@ with demo:
|
|
| 97 |
gr.Markdown(HOW_TO, elem_classes="markdown-text")
|
| 98 |
gr.Markdown("## Leaderboard")
|
| 99 |
with gr.Group():
|
| 100 |
-
with gr.Tab("Results: Agent
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
leaderboard_table_test = gr.components.Dataframe(
|
| 102 |
value=agent_df, datatype=AGENT_TYPES, interactive=False,
|
| 103 |
column_widths = ["20%"] * len(agent_df.columns)
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
from content import *
|
| 5 |
from css import *
|
| 6 |
+
def model_hyperlink(link: str, model_name: str) -> str:
    """Return an HTML anchor labelled ``model_name`` that opens ``link`` in a new tab.

    Both values are HTML-escaped before being interpolated, so a quote,
    ``<`` or ``&`` in either one cannot terminate the ``href`` attribute
    early or inject markup into the rendered table cell.

    Args:
        link: Target URL placed in the ``href`` attribute.
        model_name: Visible link text.

    Returns:
        The ``<a>`` element as a string.
    """
    # Imported locally because the file's import header is only partly visible here.
    from html import escape

    href = escape(str(link), quote=True)
    label = escape(str(model_name))
    return f'<a target="_blank" href="{href}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{label}</a>'
|
| 8 |
|
| 9 |
NONE_COL = "Ranking"
|
| 10 |
|
|
|
|
| 92 |
rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
|
| 93 |
rag_df = rag_df[RAG_COLS]
|
| 94 |
|
| 95 |
+
# Build the Deep Search leaderboard table from the JSONL results file.
# NOTE(review): relies on `json` being imported at the top of the file; that
# line is not visible in this diff, so confirm it is present.
DEEP_SEARCH_COLS = ["Organisation", "Method", "Backbone", "Overall", "Link", NONE_COL]

# Parse the file once. Blank lines (e.g. a trailing newline) are skipped so
# they do not make json.loads raise.
with open("deepsearch_result.jsonl", "r", encoding="utf-8") as f:
    deep_search_items = [json.loads(line) for line in f if line.strip()]

# [method, model, overall] triples ordered from the lowest to the highest score.
deep_search_ranking = sorted(
    ([item["method"], item["model"], item["overall"]] for item in deep_search_items),
    key=lambda x: x[2],
)

# Map each score to its 0-based position in that ascending order, so a higher
# score gets a larger value. Entries with an identical score share the
# position of the last of them.
ranking_dict = {}
for i, (method, model, score) in enumerate(deep_search_ranking):
    ranking_dict[score] = i

# One display row per result; "Overall" is shown as a percentage with two decimals.
deep_search_df = [
    [
        item["org"],
        item["method"],
        item["model"],
        f"{item['overall'] * 100:.2f}",
        item["link"],
        ranking_dict[item["overall"]],
    ]
    for item in deep_search_items
]

deep_search_df = pd.DataFrame.from_records(deep_search_df, columns=DEEP_SEARCH_COLS)
# "Overall" holds formatted strings, so sort on their numeric value; a plain
# string sort would misplace values such as "9.50" or "100.00". The stable
# sort keeps tied rows in file order.
deep_search_df = deep_search_df.sort_values(
    by=["Overall"],
    ascending=False,
    key=lambda col: col.astype(float),
    kind="mergesort",
)
deep_search_df = deep_search_df[DEEP_SEARCH_COLS]
|
| 113 |
+
|
| 114 |
+
|
| 115 |
demo = gr.Blocks(css=CUSTOM_CSS)
|
| 116 |
with demo:
|
| 117 |
gr.HTML(TITLE)
|
|
|
|
| 119 |
gr.Markdown(HOW_TO, elem_classes="markdown-text")
|
| 120 |
gr.Markdown("## Leaderboard")
|
| 121 |
with gr.Group():
|
| 122 |
+
with gr.Tab("Results: Deep Search Agent 🤖🔎"):
|
| 123 |
+
leaderboard_table_test = gr.components.Dataframe(
|
| 124 |
+
value=deep_search_df, datatype=AGENT_TYPES, interactive=False,
|
| 125 |
+
column_widths = ["10%", "18%", "18%", "10%"]
|
| 126 |
+
)
|
| 127 |
+
with gr.Tab("Results: Web Traversal Agent 🤖️"):
|
| 128 |
leaderboard_table_test = gr.components.Dataframe(
|
| 129 |
value=agent_df, datatype=AGENT_TYPES, interactive=False,
|
| 130 |
column_widths = ["20%"] * len(agent_df.columns)
|
deepsearch_result.jsonl
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"org": "RUC","link": "https://github.com/RUC-NLPIR/WebThinker","method": "WebThinker-Base", "model": "qwq-32B", "overall": 0.419}
|
| 2 |
+
{"org": "RUC","link": "https://github.com/RUC-NLPIR/WebThinker","method": "WebThinker-RL", "model": "qwq-32B", "overall": 0.465}
|
| 3 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwen2.5-7b-instruct", "overall": 0.36}
|
| 4 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwen2.5-32b-instruct", "overall": 0.384}
|
| 5 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwq-32b", "overall": 0.479}
|
| 6 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwen2.5-32b-instruct", "overall": 0.514}
|
| 7 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwq-32b", "overall": 0.497}
|
| 8 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwen2.5-72b-instruct", "overall": 0.522}
|
| 9 |
+
{"org": "Tencent","link": "https://github.com/TencentCloudADP/youtu-agent","method": "Youtu-agent", "model": "deepseek-v3.1", "overall": 0.7147}
|
| 10 |
+
{"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-SFT-v0.1", "model": "qwen3-8b", "overall": 0.413}
|
| 11 |
+
{"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-DPO-v0.1", "model": "qwen3-8b", "overall": 0.457}
|
| 12 |
+
{"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-SFT-v0.1", "model": "qwen3-32b", "overall": 0.457}
|
| 13 |
+
{"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-DPO-v0.1", "model": "qwen3-32b", "overall": 0.493}
|
| 14 |
+
{"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-SFT", "model": "qwen2.5-32b-instruct", "overall": 0.615}
|
| 15 |
+
{"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-RL", "model": "qwen2.5-32b-instruct", "overall": 0.630}
|
| 16 |
+
{"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-RL", "model": "qwen2.5-7b-instruct", "overall": 0.556}
|