cyberosa committed on
Commit
b01451f
1 Parent(s): a9bd212

updating ranking with two models: gpt4o and claude3_5_sonnet

Browse files
Files changed (1) hide show
  1. formatted_data.csv +38 -29
formatted_data.csv CHANGED
@@ -1,31 +1,40 @@
1
  Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
2
- prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945289,0.0033739787234042555
3
- prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647028
4
- prediction-request-reasoning,claude-3-sonnet-20240229,0.753125,241,320,2645.509375,0.019254515624999982
5
- prediction-offline,gpt-4-0125-preview,0.7507692307692307,244,325,727.1846153846154,0.008048953846153844
6
- prediction-offline-sme,gpt-4-0125-preview,0.7484848484848485,247,330,1416.8484848484848,0.018169212121212114
7
- prediction-request-reasoning,gpt-4-0125-preview,0.7483221476510067,223,298,1980.7281879194632,0.02567674496644293
8
- prediction-online,claude-3-sonnet-20240229,0.7411764705882353,252,340,2832.7617647058823,0.00959039117647058
 
 
 
 
 
 
 
9
  prediction-url-cot,claude-3-sonnet-20240229,0.7355623100303952,242,329,14789.27963525836,0.0510609574468085
10
- prediction-request-reasoning,claude-3-opus-20240229,0.7337278106508875,248,338,2773.284023668639,0.10624464497041416
11
- prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.01465865337423311
12
- prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.006891669724770637
13
- prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461746
14
- prediction-request-rag,gpt-4-0125-preview,0.7161716171617162,217,303,1240.980198019802,0.013809207920792065
15
- prediction-offline,databricks/dbrx-instruct:nitro,0.7118055555555556,205,288,755.9895833333334,0.0006803906249999573
16
- prediction-request-reasoning,claude-3-haiku-20240307,0.6982248520710059,236,338,2700.6508875739646,0.0016877189349112328
17
- prediction-with-research-bold,gpt-4-1106-preview,0.6938775510204082,34,49,9319.244897959185,0.11741489795918365
18
- prediction-online,gpt-4-0125-preview,0.713855421686747,237,332,1549.8524096385543,0.017273584337349383
19
- prediction-online-sme,gpt-4-0125-preview,0.7012195121951219,230,328,2237.868902439024,0.027385884146341445
20
- prediction-offline,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.6866952789699571,160,233,951.5751072961374,0.0005138505579399129
21
- prediction-online,claude-2,0.6600660066006601,200,303,1505.3135313531352,0.013348171617161701
22
- prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928988
23
- prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.002112727564102551
24
- prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176408
25
- prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999664
26
- prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746164
27
- prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.535593220338983,158,295,2921.172881355932,0.0015774333559321892
28
- prediction-request-rag,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.5018587360594795,135,269,3099.4869888475837,0.001673722973977683
29
- prediction-request-rag,databricks/dbrx-instruct:nitro,0.5,5,10,2651.8,0.00238661999999997
30
- prediction-online-sme,gpt-3.5-turbo-0125,0.49411764705882355,168,340,2189.1882352941175,0.002402523529411752
31
- prediction-online,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.4666666666666667,147,315,3143.4285714285716,0.001697451428571419
 
 
 
1
  Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
2
+ prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
3
+ prediction-request-reasoning,claude-3-5-sonnet-20240620,0.7830508474576271,231,295,2815.3050847457625,0.0209063491525423
4
+ prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
5
+ prediction-request-rag,claude-3-5-sonnet-20240620,0.7582089552238805,254,335,2893.865671641791,0.0156288805970149
6
+ prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
7
+ prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
8
+ prediction-request-reasoning,claude-3-sonnet-20240229,0.753125,241,320,2645.509375,0.0192545156249999
9
+ prediction-offline,gpt-4-0125-preview,0.7507692307692307,244,325,727.1846153846154,0.0080489538461538
10
+ prediction-online,gpt-4o-2024-08-06,0.7507418397626113,253,337,2484.0830860534124,0.0268839169139465
11
+ prediction-offline-sme,gpt-4o-2024-08-06,0.7485207100591716,253,338,1381.3727810650887,0.0173823668639053
12
+ prediction-offline-sme,gpt-4-0125-preview,0.7484848484848485,247,330,1416.8484848484848,0.0181692121212121
13
+ prediction-request-reasoning,gpt-4-0125-preview,0.7483221476510067,223,298,1980.7281879194632,0.0256767449664429
14
+ prediction-offline,claude-3-5-sonnet-20240620,0.7433628318584071,252,339,815.4306784660766,0.0030337964601769
15
+ prediction-online,claude-3-sonnet-20240229,0.7411764705882353,252,340,2832.7617647058823,0.0095903911764705
16
  prediction-url-cot,claude-3-sonnet-20240229,0.7355623100303952,242,329,14789.27963525836,0.0510609574468085
17
+ prediction-online,claude-3-5-sonnet-20240620,0.7337278106508875,248,338,2773.6745562130177,0.0099932130177514
18
+ prediction-request-reasoning,claude-3-opus-20240229,0.7337278106508875,248,338,2773.284023668639,0.1062446449704141
19
+ prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
20
+ prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
21
+ prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
22
+ prediction-offline,gpt-4o-2024-08-06,0.7164179104477612,240,335,732.0776119402985,0.0081607761194029
23
+ prediction-request-rag,gpt-4-0125-preview,0.7161716171617162,217,303,1240.980198019802,0.013809207920792
24
+ prediction-online,gpt-4-0125-preview,0.713855421686747,237,332,1549.8524096385545,0.0172735843373493
25
+ prediction-offline,databricks/dbrx-instruct:nitro,0.7118055555555556,205,288,755.9895833333334,0.0006803906249999
26
+ prediction-online-sme,gpt-4-0125-preview,0.7012195121951219,230,328,2237.868902439024,0.0273858841463414
27
+ prediction-request-reasoning,claude-3-haiku-20240307,0.6982248520710059,236,338,2700.6508875739646,0.0016877189349112
28
+ prediction-with-research-bold,gpt-4-1106-preview,0.6938775510204082,34,49,9319.244897959185,0.1174148979591836
29
+ prediction-offline,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.6866952789699571,160,233,951.5751072961374,0.0005138505579399
30
+ prediction-online,claude-2,0.6600660066006601,200,303,1505.3135313531352,0.0133481716171617
31
+ prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928
32
+ prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.0021127275641025
33
+ prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176
34
+ prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999
35
+ prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746
36
+ prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.535593220338983,158,295,2921.172881355932,0.0015774333559321
37
+ prediction-request-rag,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.5018587360594795,135,269,3099.4869888475837,0.0016737229739776
38
+ prediction-request-rag,databricks/dbrx-instruct:nitro,0.5,5,10,2651.8,0.0023866199999999
39
+ prediction-online-sme,gpt-3.5-turbo-0125,0.4941176470588235,168,340,2189.1882352941175,0.0024025235294117
40
+ prediction-online,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.4666666666666667,147,315,3143.4285714285716,0.0016974514285714