leaderboard-gradio / formatted_data.csv
arshy's picture
initial commit
59b2d96
Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
claude-prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945289,0.0033739787234042555
claude-prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647028
prediction-request-reasoning-claude,claude-3-sonnet-20240229,0.753125,241,320,2645.509375,0.019254515624999982
prediction-offline,gpt-4-0125-preview,0.7507692307692307,244,325,727.1846153846154,0.008048953846153844
prediction-offline-sme,gpt-4-0125-preview,0.7484848484848485,247,330,1416.8484848484848,0.018169212121212114
prediction-request-reasoning,gpt-4-0125-preview,0.7483221476510067,223,298,1980.7281879194632,0.02567674496644293
claude-prediction-online,claude-3-sonnet-20240229,0.7411764705882353,252,340,2832.7617647058823,0.00959039117647058
prediction-url-cot-claude,claude-3-sonnet-20240229,0.7355623100303952,242,329,14789.27963525836,0.0510609574468085
prediction-request-reasoning-claude,claude-3-opus-20240229,0.7337278106508875,248,338,2773.284023668639,0.10624464497041416
prediction-request-rag-claude,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.01465865337423311
claude-prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.006891669724770637
prediction-request-rag,gpt-4-0125-preview,0.7161716171617162,217,303,1240.980198019802,0.013809207920792065
prediction-request-reasoning-claude,claude-3-haiku-20240307,0.6982248520710059,236,338,2700.6508875739646,0.0016877189349112328
prediction-with-research-bold,gpt-4-1106-preview,0.6938775510204082,34,49,9319.244897959185,0.11741489795918365
prediction-online,gpt-4-0125-preview,0.713855421686747,237,332,1549.8524096385543,0.017273584337349383
prediction-online-sme,gpt-4-0125-preview,0.7012195121951219,230,328,2237.868902439024,0.027385884146341445
claude-prediction-online,claude-2,0.6600660066006601,200,303,1505.3135313531352,0.013348171617161701
prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928988
prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.002112727564102551
prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176408
prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746164
prediction-online-sme,gpt-3.5-turbo-0125,0.49411764705882355,168,340,2189.1882352941175,0.002402523529411752