dror44 committed on
Commit
a0c1734
·
1 Parent(s): db51bf1

--wip-- [skip ci]

Browse files
benchmarks/grounding/qualifire-Benchmark-real-world-use-cases-grounding-judges-metrics.csv ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ judge_id,judge_name,dataset,f1,bacc,avg_latency,total_time,count,correct
2
+ qualifire-eval,Qualifire,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.9198711755233495,0.281710901260376,28.1710901260376,100,91
3
+ meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8260869565217391,7.698865675926209,769.8865675926208,100,84
4
+ meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8152173913043479,1.173613429069519,117.3613429069519,100,83
5
+ meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8152173913043479,0.7524095869064331,75.24095869064331,100,83
6
+ meta-llama-3.3-70B-instruct-turbo,Meta Llama 3.3 70B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8385668276972624,9.492682189941407,949.2682189941406,100,85
7
+ meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8727858293075684,0.9671694731712341,96.71694731712341,100,88
8
+ gemma-2-27b-it,Gemma 2 27B,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8293075684380031,7.022996637821198,702.2996637821198,100,84
9
+ gemma-2-9b-it,Gemma 2 9B,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.0,1.0131394577026367,101.31394577026367,100,0
10
+ mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8695652173913043,0.6952290463447571,69.52290463447571,100,88
11
+ o3-mini,o3-mini,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.75,4.5483531618118285,454.83531618118286,100,77
12
+ gpt-4.1,GPT-4.1,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.7532206119162641,2.0433401608467103,204.33401608467102,100,77
13
+ gpt-4o,GPT-4o,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.7749597423510467,1.2855332112312317,128.55332112312317,100,79
14
+ gpt-4-turbo,GPT-4 Turbo,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8043478260869565,1.7661374664306642,176.6137466430664,100,82
15
+ gpt-3.5-turbo,GPT-3.5 Turbo,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8385668276972624,6.820291919708252,682.0291919708252,100,85
16
+ claude-3-haiku-20240307,Claude 3 Haiku,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8727858293075684,1.873636977672577,187.3636977672577,100,88
17
+ claude-3-sonnet-20240229,Claude 3 Sonnet,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8820450885668277,1.6580136108398438,165.80136108398438,100,89
18
+ claude-3-opus-latest,Claude 3 Opus,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8276972624798712,2.1441323065757754,214.41323065757751,100,84
19
+ claude-3-5-sonnet-latest,Claude 3.5 Sonnet,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.7826086956521738,2.122197768688202,212.2197768688202,100,80
20
+ claude-3-5-haiku-latest,Claude 3.5 Haiku,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8385668276972624,1.9041582226753235,190.41582226753235,100,85
21
+ qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8478260869565217,1.130689218044281,113.0689218044281,100,86
22
+ qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8385668276972624,6.882997851371766,688.2997851371765,100,85
23
+ deepseek-v3,DeepSeek V3,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8820450885668277,9.833151006698609,983.3151006698608,100,89
24
+ deepseek-r1,DeepSeek R1,qualifire/Benchmark-real-world-use-cases-grounding,0.0,0.8075684380032206,7.401882836818695,740.1882836818695,100,82
data/leaderboard.csv CHANGED
@@ -2,7 +2,7 @@ judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
2
  qualifire-eval,Qualifire,1724.8384234654231,40.0,4.0,44.0,Qualifire,Proprietary,400M
3
  claude-3-haiku-20240307,Claude 3 Haiku,1558.9789022015404,4.0,1.0,5.0,Anthropic,Proprietary,
4
  claude-3-5-haiku-latest,Claude 3.5 Haiku,1553.2109613480875,3.0,0.0,3.0,Anthropic,Proprietary,
5
- qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1543.3755444609899,3.0,0.0,3.0,Alibaba,Open Source,
6
  meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1535.5696544480506,6.0,3.0,9.0,Meta,Open Source,
7
  gpt-3.5-turbo,GPT-3.5 Turbo,1530.628203437139,2.0,0.0,2.0,OpenAI,Proprietary,
8
  claude-3-sonnet-20240229,Claude 3 Sonnet,1528.1056355333478,2.0,1.0,3.0,Anthropic,Proprietary,
@@ -13,8 +13,8 @@ judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial,
13
  qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source,
14
  gpt-4-turbo,GPT-4 Turbo,1499.7217358602074,1.0,1.0,2.0,OpenAI,Proprietary,
15
  gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
16
- claude-3-opus-latest,Claude 3 Opus,1483.8496849577323,1.0,3.0,4.0,Anthropic,Proprietary,
17
- gpt-4o,GPT-4o,1483.5476042607665,1.0,3.0,4.0,OpenAI,Proprietary,
18
  meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1480.7273197431043,1.0,5.0,6.0,Meta,Open Source,
19
  mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1478.3323551088422,0.0,2.0,2.0,Mistral AI,Open Source,
20
  claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1477.6257758061242,2.0,4.0,6.0,Anthropic,Proprietary,
 
2
  qualifire-eval,Qualifire,1724.8384234654231,40.0,4.0,44.0,Qualifire,Proprietary,400M
3
  claude-3-haiku-20240307,Claude 3 Haiku,1558.9789022015404,4.0,1.0,5.0,Anthropic,Proprietary,
4
  claude-3-5-haiku-latest,Claude 3.5 Haiku,1553.2109613480875,3.0,0.0,3.0,Anthropic,Proprietary,
5
+ qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1543.37554446099,3.0,0.0,3.0,Alibaba,Open Source,
6
  meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1535.5696544480506,6.0,3.0,9.0,Meta,Open Source,
7
  gpt-3.5-turbo,GPT-3.5 Turbo,1530.628203437139,2.0,0.0,2.0,OpenAI,Proprietary,
8
  claude-3-sonnet-20240229,Claude 3 Sonnet,1528.1056355333478,2.0,1.0,3.0,Anthropic,Proprietary,
 
13
  qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source,
14
  gpt-4-turbo,GPT-4 Turbo,1499.7217358602074,1.0,1.0,2.0,OpenAI,Proprietary,
15
  gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
16
+ claude-3-opus-latest,Claude 3 Opus,1483.8496849577325,1.0,3.0,4.0,Anthropic,Proprietary,
17
+ gpt-4o,GPT-4o,1483.5476042607663,1.0,3.0,4.0,OpenAI,Proprietary,
18
  meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1480.7273197431043,1.0,5.0,6.0,Meta,Open Source,
19
  mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1478.3323551088422,0.0,2.0,2.0,Mistral AI,Open Source,
20
  claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1477.6257758061242,2.0,4.0,6.0,Anthropic,Proprietary,
data/models.jsonl CHANGED
@@ -17,11 +17,10 @@
17
  {"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
18
  {"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
19
 
20
- {"id": "claude-3-haiku-20240307", "name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307", "provider": "anthropic", "parameters": "N/A"}
21
- {"id": "claude-3-sonnet-20240229", "name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229", "provider": "anthropic", "parameters": "N/A"}
22
- {"id": "claude-3-opus-latest", "name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest", "provider": "anthropic", "parameters": "N/A"}
23
- {"id": "claude-3-5-sonnet-latest", "name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
24
- {"id": "claude-3-5-haiku-latest", "name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
25
 
26
 
27
  {"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
 
17
  {"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
18
  {"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
19
 
20
+ {"id": "claude-3-7-sonnet-latest", "name": "Claude 3.7 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
21
+ {"id": "claude-3-7-haiku-latest", "name": "Claude 3.7 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
22
+ {"id": "claude-4-sonnet-latest", "name": "Claude 4 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-4-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
23
+ {"id": "claude-4-haiku-latest", "name": "Claude 4 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-4-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
 
24
 
25
 
26
  {"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
models.jsonl CHANGED
@@ -17,11 +17,10 @@
17
  {"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
18
  {"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
19
 
20
- {"id": "claude-3-haiku-20240307", "name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307", "provider": "anthropic", "parameters": "N/A"}
21
- {"id": "claude-3-sonnet-20240229", "name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229", "provider": "anthropic", "parameters": "N/A"}
22
- {"id": "claude-3-opus-latest", "name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest", "provider": "anthropic", "parameters": "N/A"}
23
- {"id": "claude-3-5-sonnet-latest", "name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
24
- {"id": "claude-3-5-haiku-latest", "name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
25
 
26
 
27
  {"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
 
17
  {"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
18
  {"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
19
 
20
+ {"id": "claude-3-7-sonnet-latest", "name": "Claude 3.7 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
21
+ {"id": "claude-3-7-haiku-latest", "name": "Claude 3.7 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
22
+ {"id": "claude-4-sonnet-latest", "name": "Claude 4 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-4-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
23
+ {"id": "claude-4-haiku-latest", "name": "Claude 4 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-4-haiku-latest", "provider": "anthropic", "parameters": "N/A"}
 
24
 
25
 
26
  {"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
run_benchmarks.py CHANGED
@@ -223,6 +223,7 @@ def evaluate_dataset(
223
  "context",
224
  "text",
225
  "adversarial",
 
226
  ]
227
  for possible_name in possible_input_names:
228
  matches = [col for col in column_names if possible_name in col.lower()]
@@ -245,6 +246,7 @@ def evaluate_dataset(
245
  "response",
246
  "completion",
247
  "generation",
 
248
  ]
249
  for possible_name in possible_output_names:
250
  matches = [col for col in column_names if possible_name in col.lower()]
 
223
  "context",
224
  "text",
225
  "adversarial",
226
+ "doc",
227
  ]
228
  for possible_name in possible_input_names:
229
  matches = [col for col in column_names if possible_name in col.lower()]
 
246
  "response",
247
  "completion",
248
  "generation",
249
+ "claim",
250
  ]
251
  for possible_name in possible_output_names:
252
  matches = [col for col in column_names if possible_name in col.lower()]