Spaces:
Build error
Build error
counted few-shot prompts for all models
Browse files
data/all_model_token_counts.csv
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model_name,num_shots,max,min,mean,std
|
2 |
+
Mistral-7B-v0.3-Chinese-Chat,0,928,694,799.354,15.567384660085061
|
3 |
+
internlm2_5-7b-chat,0,511,426,461.91766666666666,7.767732430462529
|
4 |
+
internlm2_5-7b-chat-1m,0,511,426,461.91766666666666,7.767732430462529
|
5 |
+
Qwen2-7B-Instruct,0,517,426,465.33866666666665,8.617118029244592
|
6 |
+
Llama3.1-8B-Chinese-Chat,0,652,512,571.091,9.115687078710652
|
7 |
+
internlm2_5-20b-chat,0,511,426,461.91766666666666,7.767732430462529
|
8 |
+
Llama3.1-70B-Chinese-Chat,0,652,512,571.091,9.115687078710652
|
9 |
+
Qwen2-72B-Instruct,0,517,426,465.33866666666665,8.617118029244592
|
10 |
+
Mistral-7B-v0.3-Chinese-Chat,5,2573,2339,2444.354,15.567384660085061
|
11 |
+
internlm2_5-7b-chat,5,1351,1266,1301.9176666666667,7.767732430462529
|
12 |
+
internlm2_5-7b-chat-1m,5,1351,1266,1301.9176666666667,7.767732430462529
|
13 |
+
Qwen2-7B-Instruct,5,1381,1290,1329.3386666666668,8.617118029244592
|
14 |
+
Llama3.1-8B-Chinese-Chat,5,1818,1678,1737.091,9.115687078710652
|
15 |
+
internlm2_5-20b-chat,5,1351,1266,1301.9176666666667,7.767732430462529
|
16 |
+
Llama3.1-70B-Chinese-Chat,5,1818,1678,1737.091,9.115687078710652
|
17 |
+
Qwen2-72B-Instruct,5,1381,1290,1329.3386666666668,8.617118029244592
|
18 |
+
Mistral-7B-v0.3-Chinese-Chat,10,4119,3885,3990.354,15.567384660085061
|
19 |
+
internlm2_5-7b-chat,10,2245,2160,2195.9176666666667,7.767732430462529
|
20 |
+
internlm2_5-7b-chat-1m,10,2245,2160,2195.9176666666667,7.767732430462529
|
21 |
+
Qwen2-7B-Instruct,10,2289,2198,2237.3386666666665,8.617118029244592
|
22 |
+
Llama3.1-8B-Chinese-Chat,10,2914,2774,2833.091,9.115687078710652
|
23 |
+
internlm2_5-20b-chat,10,2245,2160,2195.9176666666667,7.767732430462529
|
24 |
+
Llama3.1-70B-Chinese-Chat,10,2914,2774,2833.091,9.115687078710652
|
25 |
+
Qwen2-72B-Instruct,10,2289,2198,2237.3386666666665,8.617118029244592
|
26 |
+
Mistral-7B-v0.3-Chinese-Chat,20,7392,7158,7263.354,15.567384660085061
|
27 |
+
internlm2_5-7b-chat,20,4065,3980,4015.9176666666667,7.767732430462529
|
28 |
+
internlm2_5-7b-chat-1m,20,4065,3980,4015.9176666666667,7.767732430462529
|
29 |
+
Qwen2-7B-Instruct,20,4176,4085,4124.3386666666665,8.617118029244592
|
30 |
+
Llama3.1-8B-Chinese-Chat,20,5283,5143,5202.091,9.115687078710652
|
31 |
+
internlm2_5-20b-chat,20,4065,3980,4015.9176666666667,7.767732430462529
|
32 |
+
Llama3.1-70B-Chinese-Chat,20,5283,5143,5202.091,9.115687078710652
|
33 |
+
Qwen2-72B-Instruct,20,4176,4085,4124.3386666666665,8.617118029244592
|
34 |
+
Mistral-7B-v0.3-Chinese-Chat,30,10804,10570,10675.354,15.567384660085061
|
35 |
+
internlm2_5-7b-chat,30,5903,5818,5853.917666666666,7.767732430462529
|
36 |
+
internlm2_5-7b-chat-1m,30,5903,5818,5853.917666666666,7.767732430462529
|
37 |
+
Qwen2-7B-Instruct,30,6107,6016,6055.3386666666665,8.617118029244592
|
38 |
+
Llama3.1-8B-Chinese-Chat,30,7768,7628,7687.091,9.115687078710652
|
39 |
+
internlm2_5-20b-chat,30,5903,5818,5853.917666666666,7.767732430462529
|
40 |
+
Llama3.1-70B-Chinese-Chat,30,7768,7628,7687.091,9.115687078710652
|
41 |
+
Qwen2-72B-Instruct,30,6107,6016,6055.3386666666665,8.617118029244592
|
42 |
+
Mistral-7B-v0.3-Chinese-Chat,40,14152,13918,14023.354,15.567384660085061
|
43 |
+
internlm2_5-7b-chat,40,7709,7624,7659.917666666666,7.767732430462529
|
44 |
+
internlm2_5-7b-chat-1m,40,7709,7624,7659.917666666666,7.767732430462529
|
45 |
+
Qwen2-7B-Instruct,40,8010,7919,7958.3386666666665,8.617118029244592
|
46 |
+
Llama3.1-8B-Chinese-Chat,40,10217,10077,10136.091,9.115687078710652
|
47 |
+
internlm2_5-20b-chat,40,7709,7624,7659.917666666666,7.767732430462529
|
48 |
+
Llama3.1-70B-Chinese-Chat,40,10217,10077,10136.091,9.115687078710652
|
49 |
+
Qwen2-72B-Instruct,40,8010,7919,7958.3386666666665,8.617118029244592
|
50 |
+
Mistral-7B-v0.3-Chinese-Chat,50,17588,17354,17459.354,15.567384660085061
|
51 |
+
internlm2_5-7b-chat,50,9561,9476,9511.917666666666,7.767732430462529
|
52 |
+
internlm2_5-7b-chat-1m,50,9561,9476,9511.917666666666,7.767732430462529
|
53 |
+
Qwen2-7B-Instruct,50,9961,9870,9909.338666666667,8.617118029244592
|
54 |
+
Llama3.1-8B-Chinese-Chat,50,12719,12579,12638.091,9.115687078710652
|
55 |
+
internlm2_5-20b-chat,50,9561,9476,9511.917666666666,7.767732430462529
|
56 |
+
Llama3.1-70B-Chinese-Chat,50,12719,12579,12638.091,9.115687078710652
|
57 |
+
Qwen2-72B-Instruct,50,9961,9870,9909.338666666667,8.617118029244592
|
58 |
+
gpt-4o,0,606,464,524.8063333333333,10.057594723695004
|
59 |
+
gpt-4o-mini,0,606,464,524.8063333333333,10.057594723695004
|
60 |
+
o1-preview,0,925,682,797.5953333333333,16.41724967580933
|
61 |
+
o1-mini,0,925,682,797.5953333333333,16.41724967580933
|
62 |
+
gpt-4o,5,1711,1569,1629.8063333333334,10.057594723695004
|
63 |
+
gpt-4o-mini,5,1711,1569,1629.8063333333334,10.057594723695004
|
64 |
+
o1-preview,5,2649,2406,2521.595333333333,16.41724967580933
|
65 |
+
o1-mini,5,2649,2406,2521.595333333333,16.41724967580933
|
66 |
+
gpt-4o,10,2781,2639,2699.8063333333334,10.057594723695004
|
67 |
+
gpt-4o-mini,10,2781,2639,2699.8063333333334,10.057594723695004
|
68 |
+
o1-preview,10,4282,4039,4154.595333333334,16.41724967580933
|
69 |
+
o1-mini,10,4282,4039,4154.595333333334,16.41724967580933
|
70 |
+
gpt-4o,20,5110,4968,5028.806333333333,10.057594723695004
|
71 |
+
gpt-4o-mini,20,5110,4968,5028.806333333333,10.057594723695004
|
72 |
+
o1-preview,20,7781,7538,7653.595333333334,16.41724967580933
|
73 |
+
o1-mini,20,7781,7538,7653.595333333334,16.41724967580933
|
74 |
+
gpt-4o,30,7572,7430,7490.806333333333,10.057594723695004
|
75 |
+
gpt-4o-mini,30,7572,7430,7490.806333333333,10.057594723695004
|
76 |
+
o1-preview,30,11459,11216,11331.595333333333,16.41724967580933
|
77 |
+
o1-mini,30,11459,11216,11331.595333333333,16.41724967580933
|
78 |
+
gpt-4o,40,9998,9856,9916.806333333334,10.057594723695004
|
79 |
+
gpt-4o-mini,40,9998,9856,9916.806333333334,10.057594723695004
|
80 |
+
o1-preview,40,15061,14818,14933.595333333333,16.41724967580933
|
81 |
+
o1-mini,40,15061,14818,14933.595333333333,16.41724967580933
|
82 |
+
gpt-4o,50,12476,12334,12394.806333333334,10.057594723695004
|
83 |
+
gpt-4o-mini,50,12476,12334,12394.806333333334,10.057594723695004
|
84 |
+
o1-preview,50,18760,18517,18632.595333333335,16.41724967580933
|
85 |
+
o1-mini,50,18760,18517,18632.595333333335,16.41724967580933
|
data/openai_metrics.csv
CHANGED
@@ -14,7 +14,10 @@ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
|
14 |
40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
|
15 |
50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
|
16 |
0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
|
|
|
17 |
10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
|
|
|
18 |
0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
|
|
|
19 |
10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
|
20 |
-
|
|
|
14 |
40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
|
15 |
50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
|
16 |
0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
|
17 |
+
5,o1-mini,o1-mini/shots-05,0.724,0.7905045610386181,0.724,0.7482963122126776,0.9966666666666667
|
18 |
10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
|
19 |
+
20,o1-mini,o1-mini/shots-20,0.7343333333333333,0.786101455887261,0.7343333333333333,0.7535300565051624,0.9946666666666667
|
20 |
0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
|
21 |
+
5,o1-preview,o1-preview/shots-05,0.7313333333333333,0.7878283093765627,0.7313333333333333,0.7535489719321234,0.979
|
22 |
10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
|
23 |
+
20,o1-preview,o1-preview/shots-20,0.7443333333333333,0.7911442834260676,0.7443333333333333,0.7625144090816939,0.9853333333333333
|
llm_toolkit/logical_reasoning_utils.py
CHANGED
@@ -269,6 +269,7 @@ def load_logical_reasoning_dataset(
|
|
269 |
chinese_prompt=True,
|
270 |
test_data=None,
|
271 |
num_shots=0,
|
|
|
272 |
):
|
273 |
postfix = "" if chinese_prompt else "_en"
|
274 |
train_data_file = data_path + f"/train{postfix}.csv"
|
@@ -319,10 +320,16 @@ def load_logical_reasoning_dataset(
|
|
319 |
texts.append(prompt + output + tokenizer.eos_token if output else "")
|
320 |
return {"train_text": texts, "prompt": prompts}
|
321 |
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
326 |
|
327 |
print(datasets)
|
328 |
return datasets
|
|
|
269 |
chinese_prompt=True,
|
270 |
test_data=None,
|
271 |
num_shots=0,
|
272 |
+
format_test_only=False,
|
273 |
):
|
274 |
postfix = "" if chinese_prompt else "_en"
|
275 |
train_data_file = data_path + f"/train{postfix}.csv"
|
|
|
320 |
texts.append(prompt + output + tokenizer.eos_token if output else "")
|
321 |
return {"train_text": texts, "prompt": prompts}
|
322 |
|
323 |
+
if format_test_only:
|
324 |
+
datasets["test"] = datasets["test"].map(
|
325 |
+
formatting_prompts_func,
|
326 |
+
batched=True,
|
327 |
+
)
|
328 |
+
else:
|
329 |
+
datasets = datasets.map(
|
330 |
+
formatting_prompts_func,
|
331 |
+
batched=True,
|
332 |
+
)
|
333 |
|
334 |
print(datasets)
|
335 |
return datasets
|
notebooks/04b_OpenAI-Models_analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/06b_Open-Source-Models_analysis.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|