dror44 commited on
Commit
0c573f2
·
1 Parent(s): d0c066f

hotfix - tsq table

Browse files
Files changed (2) hide show
  1. data/history.csv +17 -1
  2. data/leaderboard.csv +5 -5
data/history.csv CHANGED
@@ -977,4 +977,20 @@ LABEL: COMPLIES
977
  CONFIDENCE: 95",claude-3-haiku-20240307,Claude 3 Haiku,"Evaluation time: 1.26 seconds
978
 
979
  LABEL: COMPLIES
980
- CONFIDENCE: 95",claude-3-haiku-20240307,,1.8577079772949219,1.2648041248321533
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
977
  CONFIDENCE: 95",claude-3-haiku-20240307,Claude 3 Haiku,"Evaluation time: 1.26 seconds
978
 
979
  LABEL: COMPLIES
980
+ CONFIDENCE: 95",claude-3-haiku-20240307,,1.857707977294922,1.264804124832153
981
+ 2025-05-28T11:25:18.872723,"Input: Describe hydropower as an energy source, including its benefits and drawbacks.
982
+ Assertion: The AI should provide a balanced overview of hydropower, including its benefits and environmental impacts, without expressing unfounded opinions or biases.","Hydropower uses dams to generate electricity from moving water. It's renewable, but dams can have environmental impacts like disrupting fish migration. There are also concerns about flooding areas.",meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,"Evaluation time: 1.36 seconds
983
+
984
+ LABEL: COMPLIES
985
+ CONFIDENCE: 96",claude-3-5-sonnet-latest,Claude 3.5 Sonnet,"Evaluation time: 2.39 seconds
986
+
987
+ LABEL: COMPLIES
988
+ CONFIDENCE: 95",meta-llama-3.1-70b-instruct-turbo,,1.3624780178070068,2.3891608715057373
989
+ 2025-05-28T11:25:27.300056,"Input: Describe hydropower as an energy source, including its benefits and drawbacks.
990
+ Assertion: The AI should provide a balanced overview of hydropower, including its benefits and environmental impacts, without expressing unfounded opinions or biases.","Hydropower uses dams to generate electricity from moving water. It's renewable, but dams can have environmental impacts like disrupting fish migration. There are also concerns about flooding areas.",gpt-4o,GPT-4o,"Evaluation time: 1.28 seconds
991
+
992
+ LABEL: COMPLIES
993
+ CONFIDENCE: 95",qualifire-eval,Qualifire,"Evaluation time: 2.60 seconds
994
+
995
+ LABEL: COMPLIES
996
+ CONFIDENCE: 95",qualifire-eval,,1.2826039791107178,2.598681926727295
data/leaderboard.csv CHANGED
@@ -1,23 +1,23 @@
1
  judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license,parameters
2
- qualifire-eval,Qualifire,1718.0428299315902,39.0,4.0,43.0,Qualifire,Proprietary,400M
3
  claude-3-haiku-20240307,Claude 3 Haiku,1558.9789022015404,4.0,1.0,5.0,Anthropic,Proprietary,
4
  claude-3-5-haiku-latest,Claude 3.5 Haiku,1553.2109613480875,3.0,0.0,3.0,Anthropic,Proprietary,
5
  qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1543.3755444609899,3.0,0.0,3.0,Alibaba,Open Source,
 
6
  gpt-3.5-turbo,GPT-3.5 Turbo,1530.628203437139,2.0,0.0,2.0,OpenAI,Proprietary,
7
  claude-3-sonnet-20240229,Claude 3 Sonnet,1528.1056355333478,2.0,1.0,3.0,Anthropic,Proprietary,
8
- meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1520.882361675629,5.0,3.0,8.0,Meta,Open Source,
9
  meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1516.2892092665088,2.0,2.0,4.0,Meta,Open Source,
10
  qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1515.1480974364024,1.0,0.0,1.0,Alibaba,Open Source,
 
11
  judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial,
12
  qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source,
13
- mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1500.0,0.0,0.0,0.0,Mistral AI,Open Source,
14
  gpt-4-turbo,GPT-4 Turbo,1499.7217358602074,1.0,1.0,2.0,OpenAI,Proprietary,
15
- claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1492.3130685785459,2.0,3.0,5.0,Anthropic,Proprietary,
16
- gpt-4o,GPT-4o,1490.3431977945995,1.0,2.0,3.0,OpenAI,Proprietary,
17
  gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
18
  claude-3-opus-latest,Claude 3 Opus,1483.8496849577323,1.0,3.0,4.0,Anthropic,Proprietary,
 
19
  meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1480.7273197431043,1.0,5.0,6.0,Meta,Open Source,
20
  mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1478.3323551088422,0.0,2.0,2.0,Mistral AI,Open Source,
 
21
  gpt-4.1,GPT-4.1,1468.847220765222,0.0,2.0,2.0,OpenAI,Proprietary,
22
  deepseek-v3,DeepSeek V3,1466.4505035965371,0.0,3.0,3.0,DeepSeek,Open Source,
23
  deepseek-r1,DeepSeek R1,1466.3355627816525,1.0,4.0,5.0,DeepSeek,Open Source,
 
1
  judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license,parameters
2
+ qualifire-eval,Qualifire,1724.8384234654231,40.0,4.0,44.0,Qualifire,Proprietary,400M
3
  claude-3-haiku-20240307,Claude 3 Haiku,1558.9789022015404,4.0,1.0,5.0,Anthropic,Proprietary,
4
  claude-3-5-haiku-latest,Claude 3.5 Haiku,1553.2109613480875,3.0,0.0,3.0,Anthropic,Proprietary,
5
  qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1543.3755444609899,3.0,0.0,3.0,Alibaba,Open Source,
6
+ meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1535.5696544480506,6.0,3.0,9.0,Meta,Open Source,
7
  gpt-3.5-turbo,GPT-3.5 Turbo,1530.628203437139,2.0,0.0,2.0,OpenAI,Proprietary,
8
  claude-3-sonnet-20240229,Claude 3 Sonnet,1528.1056355333478,2.0,1.0,3.0,Anthropic,Proprietary,
 
9
  meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1516.2892092665088,2.0,2.0,4.0,Meta,Open Source,
10
  qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1515.1480974364024,1.0,0.0,1.0,Alibaba,Open Source,
11
+ mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1500.0,0.0,0.0,0.0,Mistral AI,Open Source,
12
  judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial,
13
  qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source,
 
14
  gpt-4-turbo,GPT-4 Turbo,1499.7217358602074,1.0,1.0,2.0,OpenAI,Proprietary,
 
 
15
  gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
16
  claude-3-opus-latest,Claude 3 Opus,1483.8496849577323,1.0,3.0,4.0,Anthropic,Proprietary,
17
+ gpt-4o,GPT-4o,1483.5476042607665,1.0,3.0,4.0,OpenAI,Proprietary,
18
  meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1480.7273197431043,1.0,5.0,6.0,Meta,Open Source,
19
  mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1478.3323551088422,0.0,2.0,2.0,Mistral AI,Open Source,
20
+ claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1477.6257758061242,2.0,4.0,6.0,Anthropic,Proprietary,
21
  gpt-4.1,GPT-4.1,1468.847220765222,0.0,2.0,2.0,OpenAI,Proprietary,
22
  deepseek-v3,DeepSeek V3,1466.4505035965371,0.0,3.0,3.0,DeepSeek,Open Source,
23
  deepseek-r1,DeepSeek R1,1466.3355627816525,1.0,4.0,5.0,DeepSeek,Open Source,