cyberosa
committed on
Commit
•
0016913
1
Parent(s):
06920f7
updating the leaderboard with the new superforcaster tool
Browse files
- formatted_data.csv +3 -0
formatted_data.csv
CHANGED
@@ -2,6 +2,7 @@ Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
|
|
2 |
prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
|
3 |
prediction-request-reasoning,claude-3-5-sonnet-20240620,0.7830508474576271,231,295,2815.3050847457625,0.0209063491525423
|
4 |
prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
|
|
|
5 |
prediction-request-rag,claude-3-5-sonnet-20240620,0.7582089552238805,254,335,2893.865671641791,0.0156288805970149
|
6 |
prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
|
7 |
prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
|
@@ -19,6 +20,7 @@ prediction-request-reasoning,claude-3-opus-20240229,0.7337278106508875,248,338,2
|
|
19 |
prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
|
20 |
prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
|
21 |
prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
|
|
|
22 |
prediction-offline,gpt-4o-2024-08-06,0.7164179104477612,240,335,732.0776119402985,0.0081607761194029
|
23 |
prediction-request-rag,gpt-4-0125-preview,0.7161716171617162,217,303,1240.980198019802,0.013809207920792
|
24 |
prediction-online,gpt-4-0125-preview,0.713855421686747,237,332,1549.8524096385545,0.0172735843373493
|
@@ -31,6 +33,7 @@ prediction-online,claude-2,0.6600660066006601,200,303,1505.3135313531352,0.01334
|
|
31 |
prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928
|
32 |
prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.0021127275641025
|
33 |
prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176
|
|
|
34 |
prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999
|
35 |
prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746
|
36 |
prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.535593220338983,158,295,2921.172881355932,0.0015774333559321
|
|
|
2 |
prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
|
3 |
prediction-request-reasoning,claude-3-5-sonnet-20240620,0.7830508474576271,231,295,2815.3050847457625,0.0209063491525423
|
4 |
prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
|
5 |
+
superforcaster,gpt-4o-2024-08-06,0.7638036809815951,249,326,2131.929447852761,0.0221592944785276
|
6 |
prediction-request-rag,claude-3-5-sonnet-20240620,0.7582089552238805,254,335,2893.865671641791,0.0156288805970149
|
7 |
prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
|
8 |
prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
|
|
|
20 |
prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
|
21 |
prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
|
22 |
prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
|
23 |
+
superforcaster,gpt-4-0125-preview,0.7169230769230769,233,325,2143.230769230769,0.0222704615384615
|
24 |
prediction-offline,gpt-4o-2024-08-06,0.7164179104477612,240,335,732.0776119402985,0.0081607761194029
|
25 |
prediction-request-rag,gpt-4-0125-preview,0.7161716171617162,217,303,1240.980198019802,0.013809207920792
|
26 |
prediction-online,gpt-4-0125-preview,0.713855421686747,237,332,1549.8524096385545,0.0172735843373493
|
|
|
33 |
prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928
|
34 |
prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.0021127275641025
|
35 |
prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176
|
36 |
+
superforcaster,gpt-3.5-turbo-0125,0.5648148148148148,183,324,2142.33024691358,0.0011131651234567
|
37 |
prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999
|
38 |
prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746
|
39 |
prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.535593220338983,158,295,2921.172881355932,0.0015774333559321
|