jhao committed on
Commit
476f6a4
1 Parent(s): 4f872f6

Update results

Browse files
Files changed (1) hide show
  1. assets/gtbench_results.csv +23 -23
assets/gtbench_results.csv CHANGED
@@ -1,23 +1,23 @@
1
- Model,Agent,Opponent Model,Opponent Agent,Tic-Tac-Toe,Connect Four,Breakthrough,Liar's Dice,Blind Auction,Negotiation,Kuhn Poker,Nim,Pig,Iterated Prisoner's Dilemma
2
- GPT-3.5-turbo,Prompt,GPT-3.5-turbo-1106,prompt agent,0,0,0,0,0,0,0,0,0,0
3
- GPT-4,Prompt,GPT-3.5-turbo-1106,prompt agent,-0.1111111111,0.08,0.32,0.8,0.04,-0.2807881773,0.4,0.08,-0.04,0.004191114837
4
- GPT-4,CoT,GPT-3.5-turbo-1106,prompt agent,-0.02222222222,-0.08,0.56,0.24,0.06896551724,0.1345565749,0.44,0.04,0.04,-0.1601239669
5
- GPT-3.5-turbo,CoT,GPT-3.5-turbo-1106,prompt agent,0.2765957447,-0.32,-0.12,0.44,0.1153846154,-0.2069767442,0.12,-0.04,-0.16,0.1261829653
6
- GPT-3.5-turbo,SC-CoT,GPT-3.5-turbo-1106,prompt agent,0.4090909091,-0.04,-0.16,0.52,-0.12,-0.3146796431,-0.08,0,-0.08,-0.1554828151
7
- GPT-3.5-turbo,ToT,GPT-3.5-turbo-1106,prompt agent,-0.04545454545,0.24,0.16,0,-0.12,0.1827697262,0,0.12,-0.4,-0.1914893617
8
- Codellama-34b-instruct,Prompt,GPT-3.5-turbo-1106,prompt agent,0.3333333333,-0.1,-0.8,-0.4,-0.25,0.2162849873,-0.16,0.36,0.12,0.6
9
- Llama-2-70b-chat,SC-CoT,GPT-3.5-turbo-1106,prompt agent,-0.4693877551,-0.16,-0.68,0.16,-0.04,0.05194593714,0.12,0.04,0.04,0.2961783439
10
- Codellama-34b-instruct,CoT,GPT-3.5-turbo-1106,prompt agent,0.3157894737,-0.36,-0.76,-0.32,-0.2682926829,0.0849338455,0,0.48,-0.08,0.03158844765
11
- Llama-2-70b-chat,CoT,GPT-3.5-turbo-1106,prompt agent,-0.5,0.08,-0.8,0.2653061224,-0.08641975309,0.1280026324,-0.2,0.0612244898,-0.16,0.3242677824
12
- Mistral-7b-Orca,CoT,GPT-3.5-turbo-1106,prompt agent,-0.07692307692,-0.12,-0.32,-0.56,0.1333333333,0.07843137255,0,0.36,-0.68,0.05470459519
13
- Codellama-34b-instruct,SC-CoT,GPT-3.5-turbo-1106,prompt agent,0.1219512195,-0.6,-0.56,-0.28,-0.3483146067,0.09466811752,0,0.16,0.12,0.007955449483
14
- Mistral-7b-Orca,SC-CoT,GPT-3.5-turbo-1106,prompt agent,-0.2,-0.08,-0.4,-0.64,0.08196721311,0.3636363636,-0.04,0.44,-0.84,0.01265822785
15
- Codellama-34b-instruct,ToT,GPT-3.5-turbo-1106,prompt agent,-0.02127659574,-0.16,-0.6,-0.52,-0.3043478261,0.09764918626,0,-0.04,-0.16,0.2366609294
16
- Llama-2-70b-chat,Prompt,GPT-3.5-turbo-1106,prompt agent,-0.3658536585,-1,-0.44,-0.16,-0.07462686567,-0.03333333333,-0.04,0.8,-0.02040816327,-0.7118834081
17
- Mistral-7b-Orca,ToT,GPT-3.5-turbo-1106,prompt agent,-0.1794871795,-0.8,-0.32,-0.44,-0.04651162791,0.2987012987,-0.2,-0.08,-0.84,0.1615445232
18
- Mistral-7b-Orca,Prompt,GPT-3.5-turbo-1106,prompt agent,-0.4285714286,-0.84,-0.68,-0.68,-0.06896551724,-0.1138211382,-0.04,-0.08,0,-0.1818181818
19
- GPT-4,Prompt,GPT-4,prompt agent,0,0,0,0,0,0,0,0,0,0
20
- Codellama-34b-instruct,Prompt,GPT-4,prompt agent,-0.06382978723,0.72,-0.6,-0.64,-0.1481481481,0,0.08,0.16,0.04,0.3424657534
21
- Codellama-34b-instruct,CoT,GPT-4,prompt agent,0.02222222222,0.56,-1,-0.8,0.4489795918,-0.07765344184,0.08,0.2,-0.08,0.2237654321
22
- Llama-2-70b-chat,Prompt,GPT-4,prompt agent,-0.9375,0.96,-0.92,-0.72,-0.25,0,-0.04,0.36,0.2,0.3333333333
23
- Llama-2-70b-chat,CoT,GPT-4,prompt agent,-0.2857142857,0.2,-0.88,-0.9166666667,-0.4166666667,0.2011982027,0,-0.02564102564,-0.36,0.1729106628
 
1
+ Model,Agent,Opponent Model,Opponent Agent,Tic-Tac-Toe,Connect Four,Breakthrough,Liar's Dice,Blind Auction,Negotiation,Kuhn Poker,Nim,Pig,Iterated Prisoner's Dilemma,
2
+ GPT-3.5-turbo,Prompt,GPT-3.5-turbo-1106,prompt agent,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
3
+ GPT-4,Prompt,GPT-3.5-turbo-1106,prompt agent,-0.111,0.080,0.320,0.800,0.040,-0.281,0.400,0.080,-0.040,0.004,0.129
4
+ GPT-4,CoT,GPT-3.5-turbo-1106,prompt agent,-0.022,-0.080,0.560,0.240,0.069,0.135,0.440,0.040,0.040,-0.160,0.126
5
+ GPT-3.5-turbo,CoT,GPT-3.5-turbo-1106,prompt agent,0.277,-0.320,-0.120,0.440,0.115,-0.207,0.120,-0.040,-0.160,0.126,0.023
6
+ GPT-3.5-turbo,SC-CoT,GPT-3.5-turbo-1106,prompt agent,0.409,-0.040,-0.160,0.520,-0.120,-0.315,-0.080,0.000,-0.080,-0.155,-0.002
7
+ GPT-3.5-turbo,ToT,GPT-3.5-turbo-1106,prompt agent,-0.045,0.240,0.160,0.000,-0.120,0.183,0.000,0.120,-0.400,-0.191,-0.005
8
+ Codellama-34b-instruct,Prompt,GPT-3.5-turbo-1106,prompt agent,0.333,-0.100,-0.800,-0.400,-0.250,0.216,-0.160,0.360,0.120,0.600,-0.008
9
+ Llama-2-70b-chat,SC-CoT,GPT-3.5-turbo-1106,prompt agent,-0.469,-0.160,-0.680,0.160,-0.040,0.052,0.120,0.040,0.040,0.296,-0.064
10
+ Codellama-34b-instruct,CoT,GPT-3.5-turbo-1106,prompt agent,0.316,-0.360,-0.760,-0.320,-0.268,0.085,0.000,0.480,-0.080,0.032,-0.088
11
+ Llama-2-70b-chat,CoT,GPT-3.5-turbo-1106,prompt agent,-0.500,0.080,-0.800,0.265,-0.086,0.128,-0.200,0.061,-0.160,0.324,-0.089
12
+ Mistral-7b-Orca,CoT,GPT-3.5-turbo-1106,prompt agent,-0.077,-0.120,-0.320,-0.560,0.133,0.078,0.000,0.360,-0.680,0.055,-0.113
13
+ Codellama-34b-instruct,SC-CoT,GPT-3.5-turbo-1106,prompt agent,0.122,-0.600,-0.560,-0.280,-0.348,0.095,0.000,0.160,0.120,0.008,-0.128
14
+ Mistral-7b-Orca,SC-CoT,GPT-3.5-turbo-1106,prompt agent,-0.200,-0.080,-0.400,-0.640,0.082,0.364,-0.040,0.440,-0.840,0.013,-0.130
15
+ Codellama-34b-instruct,ToT,GPT-3.5-turbo-1106,prompt agent,-0.021,-0.160,-0.600,-0.520,-0.304,0.098,0.000,-0.040,-0.160,0.237,-0.147
16
+ Llama-2-70b-chat,Prompt,GPT-3.5-turbo-1106,prompt agent,-0.366,-1.000,-0.440,-0.160,-0.075,-0.033,-0.040,0.800,-0.020,-0.712,-0.205
17
+ Mistral-7b-Orca,ToT,GPT-3.5-turbo-1106,prompt agent,-0.179,-0.800,-0.320,-0.440,-0.047,0.299,-0.200,-0.080,-0.840,0.162,-0.245
18
+ Mistral-7b-Orca,Prompt,GPT-3.5-turbo-1106,prompt agent,-0.429,-0.840,-0.680,-0.680,-0.069,-0.114,-0.040,-0.080,0.000,-0.182,-0.311
19
+ GPT-4,Prompt,GPT-4,prompt agent,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
20
+ Codellama-34b-instruct,Prompt,GPT-4,prompt agent,-0.064,0.720,-0.600,-0.640,-0.148,0.000,0.080,0.160,0.040,0.342,-0.011
21
+ Codellama-34b-instruct,CoT,GPT-4,prompt agent,0.022,0.560,-1.000,-0.800,0.449,-0.078,0.080,0.200,-0.080,0.224,-0.042
22
+ Llama-2-70b-chat,Prompt,GPT-4,prompt agent,-0.938,0.960,-0.920,-0.720,-0.250,0.000,-0.040,0.360,0.200,0.333,-0.101
23
+ Llama-2-70b-chat,CoT,GPT-4,prompt agent,-0.286,0.200,-0.880,-0.917,-0.417,0.201,0.000,-0.026,-0.360,0.173,-0.231