cyberosa committed on
Commit
0016913
1 Parent(s): 06920f7

updating the leaderboard with the new superforcaster tool

Browse files
Files changed (1) hide show
  1. formatted_data.csv +3 -0
formatted_data.csv CHANGED
@@ -2,6 +2,7 @@ Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
2
  prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
3
  prediction-request-reasoning,claude-3-5-sonnet-20240620,0.7830508474576271,231,295,2815.3050847457625,0.0209063491525423
4
  prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
 
5
  prediction-request-rag,claude-3-5-sonnet-20240620,0.7582089552238805,254,335,2893.865671641791,0.0156288805970149
6
  prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
7
  prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
@@ -19,6 +20,7 @@ prediction-request-reasoning,claude-3-opus-20240229,0.7337278106508875,248,338,2
19
  prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
20
  prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
21
  prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
 
22
  prediction-offline,gpt-4o-2024-08-06,0.7164179104477612,240,335,732.0776119402985,0.0081607761194029
23
  prediction-request-rag,gpt-4-0125-preview,0.7161716171617162,217,303,1240.980198019802,0.013809207920792
24
  prediction-online,gpt-4-0125-preview,0.713855421686747,237,332,1549.8524096385545,0.0172735843373493
@@ -31,6 +33,7 @@ prediction-online,claude-2,0.6600660066006601,200,303,1505.3135313531352,0.01334
31
  prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928
32
  prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.0021127275641025
33
  prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176
 
34
  prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999
35
  prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746
36
  prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.535593220338983,158,295,2921.172881355932,0.0015774333559321
 
2
  prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
3
  prediction-request-reasoning,claude-3-5-sonnet-20240620,0.7830508474576271,231,295,2815.3050847457625,0.0209063491525423
4
  prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
5
+ superforcaster,gpt-4o-2024-08-06,0.7638036809815951,249,326,2131.929447852761,0.0221592944785276
6
  prediction-request-rag,claude-3-5-sonnet-20240620,0.7582089552238805,254,335,2893.865671641791,0.0156288805970149
7
  prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
8
  prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
 
20
  prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
21
  prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
22
  prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
23
+ superforcaster,gpt-4-0125-preview,0.7169230769230769,233,325,2143.230769230769,0.0222704615384615
24
  prediction-offline,gpt-4o-2024-08-06,0.7164179104477612,240,335,732.0776119402985,0.0081607761194029
25
  prediction-request-rag,gpt-4-0125-preview,0.7161716171617162,217,303,1240.980198019802,0.013809207920792
26
  prediction-online,gpt-4-0125-preview,0.713855421686747,237,332,1549.8524096385545,0.0172735843373493
 
33
  prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928
34
  prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.0021127275641025
35
  prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176
36
+ superforcaster,gpt-3.5-turbo-0125,0.5648148148148148,183,324,2142.33024691358,0.0011131651234567
37
  prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999
38
  prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746
39
  prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.535593220338983,158,295,2921.172881355932,0.0015774333559321