hotfix - tsq table
Browse files- data/history.csv +17 -1
- data/leaderboard.csv +5 -5
data/history.csv
CHANGED
|
@@ -977,4 +977,20 @@ LABEL: COMPLIES
|
|
| 977 |
CONFIDENCE: 95",claude-3-haiku-20240307,Claude 3 Haiku,"Evaluation time: 1.26 seconds
|
| 978 |
|
| 979 |
LABEL: COMPLIES
|
| 980 |
-
CONFIDENCE: 95",claude-3-haiku-20240307,,1.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 977 |
CONFIDENCE: 95",claude-3-haiku-20240307,Claude 3 Haiku,"Evaluation time: 1.26 seconds
|
| 978 |
|
| 979 |
LABEL: COMPLIES
|
| 980 |
+
CONFIDENCE: 95",claude-3-haiku-20240307,,1.857707977294922,1.264804124832153
|
| 981 |
+
2025-05-28T11:25:18.872723,"Input: Describe hydropower as an energy source, including its benefits and drawbacks.
|
| 982 |
+
Assertion: The AI should provide a balanced overview of hydropower, including its benefits and environmental impacts, without expressing unfounded opinions or biases.","Hydropower uses dams to generate electricity from moving water. It's renewable, but dams can have environmental impacts like disrupting fish migration. There are also concerns about flooding areas.",meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,"Evaluation time: 1.36 seconds
|
| 983 |
+
|
| 984 |
+
LABEL: COMPLIES
|
| 985 |
+
CONFIDENCE: 96",claude-3-5-sonnet-latest,Claude 3.5 Sonnet,"Evaluation time: 2.39 seconds
|
| 986 |
+
|
| 987 |
+
LABEL: COMPLIES
|
| 988 |
+
CONFIDENCE: 95",meta-llama-3.1-70b-instruct-turbo,,1.3624780178070068,2.3891608715057373
|
| 989 |
+
2025-05-28T11:25:27.300056,"Input: Describe hydropower as an energy source, including its benefits and drawbacks.
|
| 990 |
+
Assertion: The AI should provide a balanced overview of hydropower, including its benefits and environmental impacts, without expressing unfounded opinions or biases.","Hydropower uses dams to generate electricity from moving water. It's renewable, but dams can have environmental impacts like disrupting fish migration. There are also concerns about flooding areas.",gpt-4o,GPT-4o,"Evaluation time: 1.28 seconds
|
| 991 |
+
|
| 992 |
+
LABEL: COMPLIES
|
| 993 |
+
CONFIDENCE: 95",qualifire-eval,Qualifire,"Evaluation time: 2.60 seconds
|
| 994 |
+
|
| 995 |
+
LABEL: COMPLIES
|
| 996 |
+
CONFIDENCE: 95",qualifire-eval,,1.2826039791107178,2.598681926727295
|
data/leaderboard.csv
CHANGED
|
@@ -1,23 +1,23 @@
|
|
| 1 |
judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license,parameters
|
| 2 |
-
qualifire-eval,Qualifire,
|
| 3 |
claude-3-haiku-20240307,Claude 3 Haiku,1558.9789022015404,4.0,1.0,5.0,Anthropic,Proprietary,
|
| 4 |
claude-3-5-haiku-latest,Claude 3.5 Haiku,1553.2109613480875,3.0,0.0,3.0,Anthropic,Proprietary,
|
| 5 |
qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1543.3755444609899,3.0,0.0,3.0,Alibaba,Open Source,
|
|
|
|
| 6 |
gpt-3.5-turbo,GPT-3.5 Turbo,1530.628203437139,2.0,0.0,2.0,OpenAI,Proprietary,
|
| 7 |
claude-3-sonnet-20240229,Claude 3 Sonnet,1528.1056355333478,2.0,1.0,3.0,Anthropic,Proprietary,
|
| 8 |
-
meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1520.882361675629,5.0,3.0,8.0,Meta,Open Source,
|
| 9 |
meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1516.2892092665088,2.0,2.0,4.0,Meta,Open Source,
|
| 10 |
qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1515.1480974364024,1.0,0.0,1.0,Alibaba,Open Source,
|
|
|
|
| 11 |
judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial,
|
| 12 |
qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source,
|
| 13 |
-
mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1500.0,0.0,0.0,0.0,Mistral AI,Open Source,
|
| 14 |
gpt-4-turbo,GPT-4 Turbo,1499.7217358602074,1.0,1.0,2.0,OpenAI,Proprietary,
|
| 15 |
-
claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1492.3130685785459,2.0,3.0,5.0,Anthropic,Proprietary,
|
| 16 |
-
gpt-4o,GPT-4o,1490.3431977945995,1.0,2.0,3.0,OpenAI,Proprietary,
|
| 17 |
gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
|
| 18 |
claude-3-opus-latest,Claude 3 Opus,1483.8496849577323,1.0,3.0,4.0,Anthropic,Proprietary,
|
|
|
|
| 19 |
meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1480.7273197431043,1.0,5.0,6.0,Meta,Open Source,
|
| 20 |
mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1478.3323551088422,0.0,2.0,2.0,Mistral AI,Open Source,
|
|
|
|
| 21 |
gpt-4.1,GPT-4.1,1468.847220765222,0.0,2.0,2.0,OpenAI,Proprietary,
|
| 22 |
deepseek-v3,DeepSeek V3,1466.4505035965371,0.0,3.0,3.0,DeepSeek,Open Source,
|
| 23 |
deepseek-r1,DeepSeek R1,1466.3355627816525,1.0,4.0,5.0,DeepSeek,Open Source,
|
|
|
|
| 1 |
judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license,parameters
|
| 2 |
+
qualifire-eval,Qualifire,1724.8384234654231,40.0,4.0,44.0,Qualifire,Proprietary,400M
|
| 3 |
claude-3-haiku-20240307,Claude 3 Haiku,1558.9789022015404,4.0,1.0,5.0,Anthropic,Proprietary,
|
| 4 |
claude-3-5-haiku-latest,Claude 3.5 Haiku,1553.2109613480875,3.0,0.0,3.0,Anthropic,Proprietary,
|
| 5 |
qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1543.3755444609899,3.0,0.0,3.0,Alibaba,Open Source,
|
| 6 |
+
meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1535.5696544480506,6.0,3.0,9.0,Meta,Open Source,
|
| 7 |
gpt-3.5-turbo,GPT-3.5 Turbo,1530.628203437139,2.0,0.0,2.0,OpenAI,Proprietary,
|
| 8 |
claude-3-sonnet-20240229,Claude 3 Sonnet,1528.1056355333478,2.0,1.0,3.0,Anthropic,Proprietary,
|
|
|
|
| 9 |
meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1516.2892092665088,2.0,2.0,4.0,Meta,Open Source,
|
| 10 |
qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1515.1480974364024,1.0,0.0,1.0,Alibaba,Open Source,
|
| 11 |
+
mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1500.0,0.0,0.0,0.0,Mistral AI,Open Source,
|
| 12 |
judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial,
|
| 13 |
qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source,
|
|
|
|
| 14 |
gpt-4-turbo,GPT-4 Turbo,1499.7217358602074,1.0,1.0,2.0,OpenAI,Proprietary,
|
|
|
|
|
|
|
| 15 |
gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
|
| 16 |
claude-3-opus-latest,Claude 3 Opus,1483.8496849577323,1.0,3.0,4.0,Anthropic,Proprietary,
|
| 17 |
+
gpt-4o,GPT-4o,1483.5476042607665,1.0,3.0,4.0,OpenAI,Proprietary,
|
| 18 |
meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1480.7273197431043,1.0,5.0,6.0,Meta,Open Source,
|
| 19 |
mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1478.3323551088422,0.0,2.0,2.0,Mistral AI,Open Source,
|
| 20 |
+
claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1477.6257758061242,2.0,4.0,6.0,Anthropic,Proprietary,
|
| 21 |
gpt-4.1,GPT-4.1,1468.847220765222,0.0,2.0,2.0,OpenAI,Proprietary,
|
| 22 |
deepseek-v3,DeepSeek V3,1466.4505035965371,0.0,3.0,3.0,DeepSeek,Open Source,
|
| 23 |
deepseek-r1,DeepSeek R1,1466.3355627816525,1.0,4.0,5.0,DeepSeek,Open Source,
|