--wip-- [skip ci]
Browse files
benchmarks/grounding/qualifire-Benchmark-real-world-use-cases-grounding-judges-metrics.csv
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
judge_id,judge_name,dataset,f1,bacc,avg_latency,total_time,count,correct
|
| 2 |
+
qualifire-eval,Qualifire,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.9198711755233495,0.281710901260376,28.1710901260376,100,91
|
| 3 |
+
meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8260869565217391,7.698865675926209,769.8865675926208,100,84
|
| 4 |
+
meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8152173913043479,1.173613429069519,117.3613429069519,100,83
|
| 5 |
+
meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8152173913043479,0.7524095869064331,75.24095869064331,100,83
|
| 6 |
+
meta-llama-3.3-70B-instruct-turbo,Meta Llama 3.3 70B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8385668276972624,9.492682189941407,949.2682189941406,100,85
|
| 7 |
+
meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8727858293075684,0.9671694731712341,96.71694731712341,100,88
|
| 8 |
+
gemma-2-27b-it,Gemma 2 27B,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8293075684380031,7.022996637821198,702.2996637821198,100,84
|
| 9 |
+
gemma-2-9b-it,Gemma 2 9B,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.0,1.0131394577026367,101.31394577026367,100,0
|
| 10 |
+
mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8695652173913043,0.6952290463447571,69.52290463447571,100,88
|
| 11 |
+
o3-mini,o3-mini,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.75,4.5483531618118285,454.83531618118286,100,77
|
| 12 |
+
gpt-4.1,GPT-4.1,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.7532206119162641,2.0433401608467103,204.33401608467102,100,77
|
| 13 |
+
gpt-4o,GPT-4o,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.7749597423510467,1.2855332112312317,128.55332112312317,100,79
|
| 14 |
+
gpt-4-turbo,GPT-4 Turbo,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8043478260869565,1.7661374664306642,176.6137466430664,100,82
|
| 15 |
+
gpt-3.5-turbo,GPT-3.5 Turbo,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8385668276972624,6.820291919708252,682.0291919708252,100,85
|
| 16 |
+
claude-3-haiku-20240307,Claude 3 Haiku,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8727858293075684,1.873636977672577,187.3636977672577,100,88
|
| 17 |
+
claude-3-sonnet-20240229,Claude 3 Sonnet,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8820450885668277,1.6580136108398438,165.80136108398438,100,89
|
| 18 |
+
claude-3-opus-latest,Claude 3 Opus,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8276972624798712,2.1441323065757754,214.41323065757751,100,84
|
| 19 |
+
claude-3-5-sonnet-latest,Claude 3.5 Sonnet,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.7826086956521738,2.122197768688202,212.2197768688202,100,80
|
| 20 |
+
claude-3-5-haiku-latest,Claude 3.5 Haiku,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8385668276972624,1.9041582226753235,190.41582226753235,100,85
|
| 21 |
+
qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8478260869565217,1.130689218044281,113.0689218044281,100,86
|
| 22 |
+
qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8385668276972624,6.882997851371766,688.2997851371765,100,85
|
| 23 |
+
deepseek-v3,DeepSeek V3,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8820450885668277,9.833151006698609,983.3151006698608,100,89
|
| 24 |
+
deepseek-r1,DeepSeek R1,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8075684380032206,7.401882836818695,740.1882836818695,100,82
|
data/leaderboard.csv
CHANGED
|
@@ -2,7 +2,7 @@ judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
|
|
| 2 |
qualifire-eval,Qualifire,1724.8384234654231,40.0,4.0,44.0,Qualifire,Proprietary,400M
|
| 3 |
claude-3-haiku-20240307,Claude 3 Haiku,1558.9789022015404,4.0,1.0,5.0,Anthropic,Proprietary,
|
| 4 |
claude-3-5-haiku-latest,Claude 3.5 Haiku,1553.2109613480875,3.0,0.0,3.0,Anthropic,Proprietary,
|
| 5 |
-
qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1543.
|
| 6 |
meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1535.5696544480506,6.0,3.0,9.0,Meta,Open Source,
|
| 7 |
gpt-3.5-turbo,GPT-3.5 Turbo,1530.628203437139,2.0,0.0,2.0,OpenAI,Proprietary,
|
| 8 |
claude-3-sonnet-20240229,Claude 3 Sonnet,1528.1056355333478,2.0,1.0,3.0,Anthropic,Proprietary,
|
|
@@ -13,8 +13,8 @@ judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial,
|
|
| 13 |
qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source,
|
| 14 |
gpt-4-turbo,GPT-4 Turbo,1499.7217358602074,1.0,1.0,2.0,OpenAI,Proprietary,
|
| 15 |
gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
|
| 16 |
-
claude-3-opus-latest,Claude 3 Opus,1483.
|
| 17 |
-
gpt-4o,GPT-4o,1483.
|
| 18 |
meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1480.7273197431043,1.0,5.0,6.0,Meta,Open Source,
|
| 19 |
mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1478.3323551088422,0.0,2.0,2.0,Mistral AI,Open Source,
|
| 20 |
claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1477.6257758061242,2.0,4.0,6.0,Anthropic,Proprietary,
|
|
|
|
| 2 |
qualifire-eval,Qualifire,1724.8384234654231,40.0,4.0,44.0,Qualifire,Proprietary,400M
|
| 3 |
claude-3-haiku-20240307,Claude 3 Haiku,1558.9789022015404,4.0,1.0,5.0,Anthropic,Proprietary,
|
| 4 |
claude-3-5-haiku-latest,Claude 3.5 Haiku,1553.2109613480875,3.0,0.0,3.0,Anthropic,Proprietary,
|
| 5 |
+
qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1543.37554446099,3.0,0.0,3.0,Alibaba,Open Source,
|
| 6 |
meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1535.5696544480506,6.0,3.0,9.0,Meta,Open Source,
|
| 7 |
gpt-3.5-turbo,GPT-3.5 Turbo,1530.628203437139,2.0,0.0,2.0,OpenAI,Proprietary,
|
| 8 |
claude-3-sonnet-20240229,Claude 3 Sonnet,1528.1056355333478,2.0,1.0,3.0,Anthropic,Proprietary,
|
|
|
|
| 13 |
qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source,
|
| 14 |
gpt-4-turbo,GPT-4 Turbo,1499.7217358602074,1.0,1.0,2.0,OpenAI,Proprietary,
|
| 15 |
gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
|
| 16 |
+
claude-3-opus-latest,Claude 3 Opus,1483.8496849577325,1.0,3.0,4.0,Anthropic,Proprietary,
|
| 17 |
+
gpt-4o,GPT-4o,1483.5476042607663,1.0,3.0,4.0,OpenAI,Proprietary,
|
| 18 |
meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1480.7273197431043,1.0,5.0,6.0,Meta,Open Source,
|
| 19 |
mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1478.3323551088422,0.0,2.0,2.0,Mistral AI,Open Source,
|
| 20 |
claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1477.6257758061242,2.0,4.0,6.0,Anthropic,Proprietary,
|
data/models.jsonl
CHANGED
|
@@ -17,11 +17,10 @@
|
|
| 17 |
{"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
|
| 18 |
{"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
|
| 19 |
|
| 20 |
-
{"id": "claude-3-
|
| 21 |
-
{"id": "claude-3-
|
| 22 |
-
{"id": "claude-
|
| 23 |
-
{"id": "claude-
|
| 24 |
-
{"id": "claude-3-5-haiku-latest", "name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
|
| 25 |
|
| 26 |
|
| 27 |
{"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
|
|
|
|
| 17 |
{"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
|
| 18 |
{"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
|
| 19 |
|
| 20 |
+
{"id": "claude-3-7-sonnet-latest", "name": "Claude 3.7 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
|
| 21 |
+
{"id": "claude-3-7-haiku-latest", "name": "Claude 3.7 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
|
| 22 |
+
{"id": "claude-4-sonnet-latest", "name": "Claude 4 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-4-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
|
| 23 |
+
{"id": "claude-4-haiku-latest", "name": "Claude 4 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-4-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
{"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
|
models.jsonl
CHANGED
|
@@ -17,11 +17,10 @@
|
|
| 17 |
{"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
|
| 18 |
{"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
|
| 19 |
|
| 20 |
-
{"id": "claude-3-
|
| 21 |
-
{"id": "claude-3-
|
| 22 |
-
{"id": "claude-
|
| 23 |
-
{"id": "claude-
|
| 24 |
-
{"id": "claude-3-5-haiku-latest", "name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
|
| 25 |
|
| 26 |
|
| 27 |
{"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
|
|
|
|
| 17 |
{"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
|
| 18 |
{"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
|
| 19 |
|
| 20 |
+
{"id": "claude-3-7-sonnet-latest", "name": "Claude 3.7 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
|
| 21 |
+
{"id": "claude-3-7-haiku-latest", "name": "Claude 3.7 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
|
| 22 |
+
{"id": "claude-4-sonnet-latest", "name": "Claude 4 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-4-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
|
| 23 |
+
{"id": "claude-4-haiku-latest", "name": "Claude 4 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-4-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
{"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
|
run_benchmarks.py
CHANGED
|
@@ -223,6 +223,7 @@ def evaluate_dataset(
|
|
| 223 |
"context",
|
| 224 |
"text",
|
| 225 |
"adversarial",
|
|
|
|
| 226 |
]
|
| 227 |
for possible_name in possible_input_names:
|
| 228 |
matches = [col for col in column_names if possible_name in col.lower()]
|
|
@@ -245,6 +246,7 @@ def evaluate_dataset(
|
|
| 245 |
"response",
|
| 246 |
"completion",
|
| 247 |
"generation",
|
|
|
|
| 248 |
]
|
| 249 |
for possible_name in possible_output_names:
|
| 250 |
matches = [col for col in column_names if possible_name in col.lower()]
|
|
|
|
| 223 |
"context",
|
| 224 |
"text",
|
| 225 |
"adversarial",
|
| 226 |
+
"doc",
|
| 227 |
]
|
| 228 |
for possible_name in possible_input_names:
|
| 229 |
matches = [col for col in column_names if possible_name in col.lower()]
|
|
|
|
| 246 |
"response",
|
| 247 |
"completion",
|
| 248 |
"generation",
|
| 249 |
+
"claim",
|
| 250 |
]
|
| 251 |
for possible_name in possible_output_names:
|
| 252 |
matches = [col for col in column_names if possible_name in col.lower()]
|