dh-mc committed on
Commit
a8683cf
·
1 Parent(s): 0baa6cc

counted few-shot prompts for all models

Browse files
data/all_model_token_counts.csv ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,num_shots,max,min,mean,std
2
+ Mistral-7B-v0.3-Chinese-Chat,0,928,694,799.354,15.567384660085061
3
+ internlm2_5-7b-chat,0,511,426,461.91766666666666,7.767732430462529
4
+ internlm2_5-7b-chat-1m,0,511,426,461.91766666666666,7.767732430462529
5
+ Qwen2-7B-Instruct,0,517,426,465.33866666666665,8.617118029244592
6
+ Llama3.1-8B-Chinese-Chat,0,652,512,571.091,9.115687078710652
7
+ internlm2_5-20b-chat,0,511,426,461.91766666666666,7.767732430462529
8
+ Llama3.1-70B-Chinese-Chat,0,652,512,571.091,9.115687078710652
9
+ Qwen2-72B-Instruct,0,517,426,465.33866666666665,8.617118029244592
10
+ Mistral-7B-v0.3-Chinese-Chat,5,2573,2339,2444.354,15.567384660085061
11
+ internlm2_5-7b-chat,5,1351,1266,1301.9176666666667,7.767732430462529
12
+ internlm2_5-7b-chat-1m,5,1351,1266,1301.9176666666667,7.767732430462529
13
+ Qwen2-7B-Instruct,5,1381,1290,1329.3386666666668,8.617118029244592
14
+ Llama3.1-8B-Chinese-Chat,5,1818,1678,1737.091,9.115687078710652
15
+ internlm2_5-20b-chat,5,1351,1266,1301.9176666666667,7.767732430462529
16
+ Llama3.1-70B-Chinese-Chat,5,1818,1678,1737.091,9.115687078710652
17
+ Qwen2-72B-Instruct,5,1381,1290,1329.3386666666668,8.617118029244592
18
+ Mistral-7B-v0.3-Chinese-Chat,10,4119,3885,3990.354,15.567384660085061
19
+ internlm2_5-7b-chat,10,2245,2160,2195.9176666666667,7.767732430462529
20
+ internlm2_5-7b-chat-1m,10,2245,2160,2195.9176666666667,7.767732430462529
21
+ Qwen2-7B-Instruct,10,2289,2198,2237.3386666666665,8.617118029244592
22
+ Llama3.1-8B-Chinese-Chat,10,2914,2774,2833.091,9.115687078710652
23
+ internlm2_5-20b-chat,10,2245,2160,2195.9176666666667,7.767732430462529
24
+ Llama3.1-70B-Chinese-Chat,10,2914,2774,2833.091,9.115687078710652
25
+ Qwen2-72B-Instruct,10,2289,2198,2237.3386666666665,8.617118029244592
26
+ Mistral-7B-v0.3-Chinese-Chat,20,7392,7158,7263.354,15.567384660085061
27
+ internlm2_5-7b-chat,20,4065,3980,4015.9176666666667,7.767732430462529
28
+ internlm2_5-7b-chat-1m,20,4065,3980,4015.9176666666667,7.767732430462529
29
+ Qwen2-7B-Instruct,20,4176,4085,4124.3386666666665,8.617118029244592
30
+ Llama3.1-8B-Chinese-Chat,20,5283,5143,5202.091,9.115687078710652
31
+ internlm2_5-20b-chat,20,4065,3980,4015.9176666666667,7.767732430462529
32
+ Llama3.1-70B-Chinese-Chat,20,5283,5143,5202.091,9.115687078710652
33
+ Qwen2-72B-Instruct,20,4176,4085,4124.3386666666665,8.617118029244592
34
+ Mistral-7B-v0.3-Chinese-Chat,30,10804,10570,10675.354,15.567384660085061
35
+ internlm2_5-7b-chat,30,5903,5818,5853.917666666666,7.767732430462529
36
+ internlm2_5-7b-chat-1m,30,5903,5818,5853.917666666666,7.767732430462529
37
+ Qwen2-7B-Instruct,30,6107,6016,6055.3386666666665,8.617118029244592
38
+ Llama3.1-8B-Chinese-Chat,30,7768,7628,7687.091,9.115687078710652
39
+ internlm2_5-20b-chat,30,5903,5818,5853.917666666666,7.767732430462529
40
+ Llama3.1-70B-Chinese-Chat,30,7768,7628,7687.091,9.115687078710652
41
+ Qwen2-72B-Instruct,30,6107,6016,6055.3386666666665,8.617118029244592
42
+ Mistral-7B-v0.3-Chinese-Chat,40,14152,13918,14023.354,15.567384660085061
43
+ internlm2_5-7b-chat,40,7709,7624,7659.917666666666,7.767732430462529
44
+ internlm2_5-7b-chat-1m,40,7709,7624,7659.917666666666,7.767732430462529
45
+ Qwen2-7B-Instruct,40,8010,7919,7958.3386666666665,8.617118029244592
46
+ Llama3.1-8B-Chinese-Chat,40,10217,10077,10136.091,9.115687078710652
47
+ internlm2_5-20b-chat,40,7709,7624,7659.917666666666,7.767732430462529
48
+ Llama3.1-70B-Chinese-Chat,40,10217,10077,10136.091,9.115687078710652
49
+ Qwen2-72B-Instruct,40,8010,7919,7958.3386666666665,8.617118029244592
50
+ Mistral-7B-v0.3-Chinese-Chat,50,17588,17354,17459.354,15.567384660085061
51
+ internlm2_5-7b-chat,50,9561,9476,9511.917666666666,7.767732430462529
52
+ internlm2_5-7b-chat-1m,50,9561,9476,9511.917666666666,7.767732430462529
53
+ Qwen2-7B-Instruct,50,9961,9870,9909.338666666667,8.617118029244592
54
+ Llama3.1-8B-Chinese-Chat,50,12719,12579,12638.091,9.115687078710652
55
+ internlm2_5-20b-chat,50,9561,9476,9511.917666666666,7.767732430462529
56
+ Llama3.1-70B-Chinese-Chat,50,12719,12579,12638.091,9.115687078710652
57
+ Qwen2-72B-Instruct,50,9961,9870,9909.338666666667,8.617118029244592
58
+ gpt-4o,0,606,464,524.8063333333333,10.057594723695004
59
+ gpt-4o-mini,0,606,464,524.8063333333333,10.057594723695004
60
+ o1-preview,0,925,682,797.5953333333333,16.41724967580933
61
+ o1-mini,0,925,682,797.5953333333333,16.41724967580933
62
+ gpt-4o,5,1711,1569,1629.8063333333334,10.057594723695004
63
+ gpt-4o-mini,5,1711,1569,1629.8063333333334,10.057594723695004
64
+ o1-preview,5,2649,2406,2521.595333333333,16.41724967580933
65
+ o1-mini,5,2649,2406,2521.595333333333,16.41724967580933
66
+ gpt-4o,10,2781,2639,2699.8063333333334,10.057594723695004
67
+ gpt-4o-mini,10,2781,2639,2699.8063333333334,10.057594723695004
68
+ o1-preview,10,4282,4039,4154.595333333334,16.41724967580933
69
+ o1-mini,10,4282,4039,4154.595333333334,16.41724967580933
70
+ gpt-4o,20,5110,4968,5028.806333333333,10.057594723695004
71
+ gpt-4o-mini,20,5110,4968,5028.806333333333,10.057594723695004
72
+ o1-preview,20,7781,7538,7653.595333333334,16.41724967580933
73
+ o1-mini,20,7781,7538,7653.595333333334,16.41724967580933
74
+ gpt-4o,30,7572,7430,7490.806333333333,10.057594723695004
75
+ gpt-4o-mini,30,7572,7430,7490.806333333333,10.057594723695004
76
+ o1-preview,30,11459,11216,11331.595333333333,16.41724967580933
77
+ o1-mini,30,11459,11216,11331.595333333333,16.41724967580933
78
+ gpt-4o,40,9998,9856,9916.806333333334,10.057594723695004
79
+ gpt-4o-mini,40,9998,9856,9916.806333333334,10.057594723695004
80
+ o1-preview,40,15061,14818,14933.595333333333,16.41724967580933
81
+ o1-mini,40,15061,14818,14933.595333333333,16.41724967580933
82
+ gpt-4o,50,12476,12334,12394.806333333334,10.057594723695004
83
+ gpt-4o-mini,50,12476,12334,12394.806333333334,10.057594723695004
84
+ o1-preview,50,18760,18517,18632.595333333335,16.41724967580933
85
+ o1-mini,50,18760,18517,18632.595333333335,16.41724967580933
data/openai_metrics.csv CHANGED
@@ -14,7 +14,10 @@ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
14
  40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
15
  50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
16
  0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
 
17
  10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
 
18
  0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
 
19
  10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
20
- 10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996
 
14
  40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
15
  50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
16
  0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
17
+ 5,o1-mini,o1-mini/shots-05,0.724,0.7905045610386181,0.724,0.7482963122126776,0.9966666666666667
18
  10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
19
+ 20,o1-mini,o1-mini/shots-20,0.7343333333333333,0.786101455887261,0.7343333333333333,0.7535300565051624,0.9946666666666667
20
  0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
21
+ 5,o1-preview,o1-preview/shots-05,0.7313333333333333,0.7878283093765627,0.7313333333333333,0.7535489719321234,0.979
22
  10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
23
+ 20,o1-preview,o1-preview/shots-20,0.7443333333333333,0.7911442834260676,0.7443333333333333,0.7625144090816939,0.9853333333333333
llm_toolkit/logical_reasoning_utils.py CHANGED
@@ -269,6 +269,7 @@ def load_logical_reasoning_dataset(
269
  chinese_prompt=True,
270
  test_data=None,
271
  num_shots=0,
 
272
  ):
273
  postfix = "" if chinese_prompt else "_en"
274
  train_data_file = data_path + f"/train{postfix}.csv"
@@ -319,10 +320,16 @@ def load_logical_reasoning_dataset(
319
  texts.append(prompt + output + tokenizer.eos_token if output else "")
320
  return {"train_text": texts, "prompt": prompts}
321
 
322
- datasets = datasets.map(
323
- formatting_prompts_func,
324
- batched=True,
325
- )
 
 
 
 
 
 
326
 
327
  print(datasets)
328
  return datasets
 
269
  chinese_prompt=True,
270
  test_data=None,
271
  num_shots=0,
272
+ format_test_only=False,
273
  ):
274
  postfix = "" if chinese_prompt else "_en"
275
  train_data_file = data_path + f"/train{postfix}.csv"
 
320
  texts.append(prompt + output + tokenizer.eos_token if output else "")
321
  return {"train_text": texts, "prompt": prompts}
322
 
323
+ if format_test_only:
324
+ datasets["test"] = datasets["test"].map(
325
+ formatting_prompts_func,
326
+ batched=True,
327
+ )
328
+ else:
329
+ datasets = datasets.map(
330
+ formatting_prompts_func,
331
+ batched=True,
332
+ )
333
 
334
  print(datasets)
335
  return datasets
notebooks/04b_OpenAI-Models_analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/06b_Open-Source-Models_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff