mllm-dev committed on
Commit: b9aeffd
Parent: 8274b9e

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
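
The commit message says the folder was pushed with `huggingface_hub`. A minimal sketch of the kind of call that produces a commit like this one; the repo id and folder path below are illustrative assumptions, not taken from this commit:

```python
# Sketch: upload a local training-output folder to the Hub in one commit.
# folder_path and repo_id are hypothetical placeholders.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="YELP_full",        # local Trainer output dir (assumed)
    repo_id="mllm-dev/yelp-gpt2",   # hypothetical target repo
    commit_message="Upload folder using huggingface_hub",
)
```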
Files changed (50)
  1. checkpoint-10158/config.json +55 -0
  2. checkpoint-10158/merges.txt +0 -0
  3. checkpoint-10158/model.safetensors +3 -0
  4. checkpoint-10158/optimizer.pt +3 -0
  5. checkpoint-10158/rng_state.pth +3 -0
  6. checkpoint-10158/scheduler.pt +3 -0
  7. checkpoint-10158/special_tokens_map.json +6 -0
  8. checkpoint-10158/tokenizer.json +0 -0
  9. checkpoint-10158/tokenizer_config.json +20 -0
  10. checkpoint-10158/trainer_state.json +188 -0
  11. checkpoint-10158/training_args.bin +3 -0
  12. checkpoint-10158/vocab.json +0 -0
  13. checkpoint-13544/config.json +55 -0
  14. checkpoint-13544/merges.txt +0 -0
  15. checkpoint-13544/model.safetensors +3 -0
  16. checkpoint-13544/optimizer.pt +3 -0
  17. checkpoint-13544/rng_state.pth +3 -0
  18. checkpoint-13544/scheduler.pt +3 -0
  19. checkpoint-13544/special_tokens_map.json +6 -0
  20. checkpoint-13544/tokenizer.json +0 -0
  21. checkpoint-13544/tokenizer_config.json +20 -0
  22. checkpoint-13544/trainer_state.json +246 -0
  23. checkpoint-13544/training_args.bin +3 -0
  24. checkpoint-13544/vocab.json +0 -0
  25. checkpoint-16930/config.json +55 -0
  26. checkpoint-16930/merges.txt +0 -0
  27. checkpoint-16930/model.safetensors +3 -0
  28. checkpoint-16930/optimizer.pt +3 -0
  29. checkpoint-16930/rng_state.pth +3 -0
  30. checkpoint-16930/scheduler.pt +3 -0
  31. checkpoint-16930/special_tokens_map.json +6 -0
  32. checkpoint-16930/tokenizer.json +0 -0
  33. checkpoint-16930/tokenizer_config.json +20 -0
  34. checkpoint-16930/trainer_state.json +297 -0
  35. checkpoint-16930/training_args.bin +3 -0
  36. checkpoint-16930/vocab.json +0 -0
  37. checkpoint-3386/config.json +55 -0
  38. checkpoint-3386/merges.txt +0 -0
  39. checkpoint-3386/model.safetensors +3 -0
  40. checkpoint-3386/optimizer.pt +3 -0
  41. checkpoint-3386/rng_state.pth +3 -0
  42. checkpoint-3386/scheduler.pt +3 -0
  43. checkpoint-3386/special_tokens_map.json +6 -0
  44. checkpoint-3386/tokenizer.json +0 -0
  45. checkpoint-3386/tokenizer_config.json +20 -0
  46. checkpoint-3386/trainer_state.json +72 -0
  47. checkpoint-3386/training_args.bin +3 -0
  48. checkpoint-3386/vocab.json +0 -0
  49. checkpoint-6772/config.json +55 -0
  50. checkpoint-6772/merges.txt +0 -0
checkpoint-10158/config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "_name_or_path": "openai-community/gpt2",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2ForSequenceClassification"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2",
+ "3": "LABEL_3",
+ "4": "LABEL_4"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2,
+ "LABEL_3": 3,
+ "LABEL_4": 4
+ },
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "pad_token_id": 50256,
+ "problem_type": "single_label_classification",
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "torch_dtype": "float32",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
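
The config identifies each checkpoint as a `GPT2ForSequenceClassification` head with five labels, fine-tuned from `openai-community/gpt2`. A minimal sketch of loading one of these checkpoints for inference, assuming the checkpoint directory has been downloaded locally:

```python
# Sketch: load the 5-way GPT-2 classifier from a local checkpoint directory.
# Assumes "checkpoint-10158" (config, tokenizer files, safetensors weights)
# has already been fetched from the repo.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("checkpoint-10158")
model = AutoModelForSequenceClassification.from_pretrained("checkpoint-10158")
model.eval()

inputs = tokenizer("The food was great but service was slow.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.argmax(dim=-1).item())  # predicted label index, 0..4
```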
checkpoint-10158/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10158/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:caf34bb723748831a50d7a838d07cf5dbce7e280a8adce1f568f9d8736662ee6
+ size 497789648
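
Large binaries are stored as Git LFS pointers, so the diff shows only each file's `oid` (a SHA-256 digest) and `size`. A quick standard-library sketch for checking that a downloaded file matches its pointer:

```python
# Sketch: verify a downloaded LFS object against the sha256 oid and size
# recorded in its pointer file.
import hashlib
import os

def verify_lfs(path: str, expected_oid: str, expected_size: int) -> bool:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
            h.update(chunk)
    return h.hexdigest() == expected_oid and os.path.getsize(path) == expected_size

print(verify_lfs(
    "checkpoint-10158/model.safetensors",
    "caf34bb723748831a50d7a838d07cf5dbce7e280a8adce1f568f9d8736662ee6",
    497789648,
))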
checkpoint-10158/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27523f88ca90f6c9d301317e27c51a8eaba315917a5aaed103c0da9b9f6df4e3
+ size 995673413
checkpoint-10158/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f48c8214578b302af98bd03444b843112101f97f026ddaea0bb3026c94fe93d
+ size 14575
checkpoint-10158/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55be64e2bec5b72412353305297ffd6ec4dd5e6bac48ac857e6058bf2b4dbb6d
+ size 627
checkpoint-10158/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<|endoftext|>",
+ "eos_token": "<|endoftext|>",
+ "pad_token": "<|endoftext|>",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-10158/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10158/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1024,
+ "pad_token": "<|endoftext|>",
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
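
GPT-2 ships without a dedicated padding token, so the tokenizer files reuse `<|endoftext|>` for `pad_token`, matching `pad_token_id: 50256` in the model config; this is what makes batched classification work. A small sketch of batched tokenization under this setup:

```python
# Sketch: batched encoding relies on pad_token being set to <|endoftext|>.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("checkpoint-10158")
batch = tokenizer(
    ["Terrible experience.", "Absolutely loved it, five stars!"],
    padding=True,        # pads shorter sequences with <|endoftext|> (id 50256)
    truncation=True,
    max_length=1024,     # matches model_max_length / n_positions
    return_tensors="pt",
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```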
checkpoint-10158/trainer_state.json ADDED
@@ -0,0 +1,188 @@
+ {
+ "best_metric": 0.6810389161109924,
+ "best_model_checkpoint": "YELP_full/checkpoint-10158",
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 10158,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.15,
+ "grad_norm": 8.398588180541992,
+ "learning_rate": 5.8227997637330186e-05,
+ "loss": 0.9245,
+ "step": 500
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 3.5803122520446777,
+ "learning_rate": 5.645599527466036e-05,
+ "loss": 0.7913,
+ "step": 1000
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 3.3800671100616455,
+ "learning_rate": 5.468399291199055e-05,
+ "loss": 0.7613,
+ "step": 1500
+ },
+ {
+ "epoch": 0.59,
+ "grad_norm": 1.9862334728240967,
+ "learning_rate": 5.291199054932074e-05,
+ "loss": 0.7362,
+ "step": 2000
+ },
+ {
+ "epoch": 0.74,
+ "grad_norm": 2.043750524520874,
+ "learning_rate": 5.113998818665092e-05,
+ "loss": 0.7262,
+ "step": 2500
+ },
+ {
+ "epoch": 0.89,
+ "grad_norm": 2.8348655700683594,
+ "learning_rate": 4.93679858239811e-05,
+ "loss": 0.7165,
+ "step": 3000
+ },
+ {
+ "epoch": 1.0,
+ "eval_accuracy": 0.69042,
+ "eval_loss": 0.6984841227531433,
+ "eval_runtime": 141.1206,
+ "eval_samples_per_second": 354.307,
+ "eval_steps_per_second": 1.849,
+ "step": 3386
+ },
+ {
+ "epoch": 1.03,
+ "grad_norm": 2.2309489250183105,
+ "learning_rate": 4.7595983461311283e-05,
+ "loss": 0.7047,
+ "step": 3500
+ },
+ {
+ "epoch": 1.18,
+ "grad_norm": 2.665053129196167,
+ "learning_rate": 4.582398109864147e-05,
+ "loss": 0.6764,
+ "step": 4000
+ },
+ {
+ "epoch": 1.33,
+ "grad_norm": 2.590282678604126,
+ "learning_rate": 4.4051978735971645e-05,
+ "loss": 0.6733,
+ "step": 4500
+ },
+ {
+ "epoch": 1.48,
+ "grad_norm": 2.890204429626465,
+ "learning_rate": 4.227997637330183e-05,
+ "loss": 0.6712,
+ "step": 5000
+ },
+ {
+ "epoch": 1.62,
+ "grad_norm": 2.0544567108154297,
+ "learning_rate": 4.050797401063202e-05,
+ "loss": 0.6692,
+ "step": 5500
+ },
+ {
+ "epoch": 1.77,
+ "grad_norm": 1.7416001558303833,
+ "learning_rate": 3.87359716479622e-05,
+ "loss": 0.6693,
+ "step": 6000
+ },
+ {
+ "epoch": 1.92,
+ "grad_norm": 2.1172029972076416,
+ "learning_rate": 3.696396928529238e-05,
+ "loss": 0.6641,
+ "step": 6500
+ },
+ {
+ "epoch": 2.0,
+ "eval_accuracy": 0.69884,
+ "eval_loss": 0.6872273683547974,
+ "eval_runtime": 141.4186,
+ "eval_samples_per_second": 353.56,
+ "eval_steps_per_second": 1.846,
+ "step": 6772
+ },
+ {
+ "epoch": 2.07,
+ "grad_norm": 2.3501951694488525,
+ "learning_rate": 3.5191966922622565e-05,
+ "loss": 0.6458,
+ "step": 7000
+ },
+ {
+ "epoch": 2.22,
+ "grad_norm": 1.6436865329742432,
+ "learning_rate": 3.341996455995275e-05,
+ "loss": 0.624,
+ "step": 7500
+ },
+ {
+ "epoch": 2.36,
+ "grad_norm": 1.550398588180542,
+ "learning_rate": 3.164796219728293e-05,
+ "loss": 0.6243,
+ "step": 8000
+ },
+ {
+ "epoch": 2.51,
+ "grad_norm": 3.080960512161255,
+ "learning_rate": 2.9875959834613114e-05,
+ "loss": 0.6246,
+ "step": 8500
+ },
+ {
+ "epoch": 2.66,
+ "grad_norm": 2.6615045070648193,
+ "learning_rate": 2.81039574719433e-05,
+ "loss": 0.6246,
+ "step": 9000
+ },
+ {
+ "epoch": 2.81,
+ "grad_norm": 1.667477011680603,
+ "learning_rate": 2.633195510927348e-05,
+ "loss": 0.622,
+ "step": 9500
+ },
+ {
+ "epoch": 2.95,
+ "grad_norm": 2.1684648990631104,
+ "learning_rate": 2.4559952746603663e-05,
+ "loss": 0.62,
+ "step": 10000
+ },
+ {
+ "epoch": 3.0,
+ "eval_accuracy": 0.70262,
+ "eval_loss": 0.6810389161109924,
+ "eval_runtime": 141.0454,
+ "eval_samples_per_second": 354.496,
+ "eval_steps_per_second": 1.85,
+ "step": 10158
+ }
+ ],
+ "logging_steps": 500,
+ "max_steps": 16930,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 500,
+ "total_flos": 9.360569379258778e+17,
+ "train_batch_size": 192,
+ "trial_name": null,
+ "trial_params": null
+ }
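
`trainer_state.json` carries the full `log_history`, so the training curve and the best checkpoint can be recovered without any logging backend. A small sketch that pulls the per-epoch eval metrics out of this file:

```python
# Sketch: extract per-epoch eval metrics from a Trainer state file.
import json

with open("checkpoint-10158/trainer_state.json") as f:
    state = json.load(f)

print("best:", state["best_metric"], "at", state["best_model_checkpoint"])
for entry in state["log_history"]:
    if "eval_accuracy" in entry:  # eval entries; others are training logs
        print(f'epoch {entry["epoch"]}: '
              f'acc={entry["eval_accuracy"]:.4f} loss={entry["eval_loss"]:.4f}')
```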
checkpoint-10158/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:202a5542d25d5172e8161c0589eaf5ec460cc8e907a908b0e1388d20c67b502b
+ size 4539
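
`training_args.bin` is a pickled `TrainingArguments` object rather than a tensor file (note the identical oid across all checkpoints), so it is read back with `torch.load`. A hedged sketch; recent PyTorch defaults to `weights_only=True`, which must be disabled for pickled objects, and only files you trust should be unpickled:

```python
# Sketch: inspect the pickled TrainingArguments saved by the Trainer.
# weights_only=False is needed on recent PyTorch because this is an
# arbitrary pickled object, not a tensor checkpoint.
import torch

args = torch.load("checkpoint-10158/training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
```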
checkpoint-10158/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-13544/config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "_name_or_path": "openai-community/gpt2",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2ForSequenceClassification"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2",
+ "3": "LABEL_3",
+ "4": "LABEL_4"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2,
+ "LABEL_3": 3,
+ "LABEL_4": 4
+ },
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "pad_token_id": 50256,
+ "problem_type": "single_label_classification",
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "torch_dtype": "float32",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
checkpoint-13544/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-13544/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef9ec89a8247ddf709a0a0e52e045356905e70d0ce97535cffe691c275a88262
+ size 497789648
checkpoint-13544/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ff1542742a74e499ef868950098a77dcd4a27f02443bf2cbb14acfb780edc65
+ size 995673413
checkpoint-13544/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24349b3cba32146fb2236623464a9cbabb82dcb22863c17febbf55a8f97037ea
+ size 14575
checkpoint-13544/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aad367127a4eb985fd0e54992f2fe136d3de932fba87aafb1454cd94e28a5fc3
+ size 627
checkpoint-13544/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<|endoftext|>",
+ "eos_token": "<|endoftext|>",
+ "pad_token": "<|endoftext|>",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-13544/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-13544/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1024,
+ "pad_token": "<|endoftext|>",
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-13544/trainer_state.json ADDED
@@ -0,0 +1,246 @@
+ {
+ "best_metric": 0.6810389161109924,
+ "best_model_checkpoint": "YELP_full/checkpoint-10158",
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 13544,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.15,
+ "grad_norm": 8.398588180541992,
+ "learning_rate": 5.8227997637330186e-05,
+ "loss": 0.9245,
+ "step": 500
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 3.5803122520446777,
+ "learning_rate": 5.645599527466036e-05,
+ "loss": 0.7913,
+ "step": 1000
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 3.3800671100616455,
+ "learning_rate": 5.468399291199055e-05,
+ "loss": 0.7613,
+ "step": 1500
+ },
+ {
+ "epoch": 0.59,
+ "grad_norm": 1.9862334728240967,
+ "learning_rate": 5.291199054932074e-05,
+ "loss": 0.7362,
+ "step": 2000
+ },
+ {
+ "epoch": 0.74,
+ "grad_norm": 2.043750524520874,
+ "learning_rate": 5.113998818665092e-05,
+ "loss": 0.7262,
+ "step": 2500
+ },
+ {
+ "epoch": 0.89,
+ "grad_norm": 2.8348655700683594,
+ "learning_rate": 4.93679858239811e-05,
+ "loss": 0.7165,
+ "step": 3000
+ },
+ {
+ "epoch": 1.0,
+ "eval_accuracy": 0.69042,
+ "eval_loss": 0.6984841227531433,
+ "eval_runtime": 141.1206,
+ "eval_samples_per_second": 354.307,
+ "eval_steps_per_second": 1.849,
+ "step": 3386
+ },
+ {
+ "epoch": 1.03,
+ "grad_norm": 2.2309489250183105,
+ "learning_rate": 4.7595983461311283e-05,
+ "loss": 0.7047,
+ "step": 3500
+ },
+ {
+ "epoch": 1.18,
+ "grad_norm": 2.665053129196167,
+ "learning_rate": 4.582398109864147e-05,
+ "loss": 0.6764,
+ "step": 4000
+ },
+ {
+ "epoch": 1.33,
+ "grad_norm": 2.590282678604126,
+ "learning_rate": 4.4051978735971645e-05,
+ "loss": 0.6733,
+ "step": 4500
+ },
+ {
+ "epoch": 1.48,
+ "grad_norm": 2.890204429626465,
+ "learning_rate": 4.227997637330183e-05,
+ "loss": 0.6712,
+ "step": 5000
+ },
+ {
+ "epoch": 1.62,
+ "grad_norm": 2.0544567108154297,
+ "learning_rate": 4.050797401063202e-05,
+ "loss": 0.6692,
+ "step": 5500
+ },
+ {
+ "epoch": 1.77,
+ "grad_norm": 1.7416001558303833,
+ "learning_rate": 3.87359716479622e-05,
+ "loss": 0.6693,
+ "step": 6000
+ },
+ {
+ "epoch": 1.92,
+ "grad_norm": 2.1172029972076416,
+ "learning_rate": 3.696396928529238e-05,
+ "loss": 0.6641,
+ "step": 6500
+ },
+ {
+ "epoch": 2.0,
+ "eval_accuracy": 0.69884,
+ "eval_loss": 0.6872273683547974,
+ "eval_runtime": 141.4186,
+ "eval_samples_per_second": 353.56,
+ "eval_steps_per_second": 1.846,
+ "step": 6772
+ },
+ {
+ "epoch": 2.07,
+ "grad_norm": 2.3501951694488525,
+ "learning_rate": 3.5191966922622565e-05,
+ "loss": 0.6458,
+ "step": 7000
+ },
+ {
+ "epoch": 2.22,
+ "grad_norm": 1.6436865329742432,
+ "learning_rate": 3.341996455995275e-05,
+ "loss": 0.624,
+ "step": 7500
+ },
+ {
+ "epoch": 2.36,
+ "grad_norm": 1.550398588180542,
+ "learning_rate": 3.164796219728293e-05,
+ "loss": 0.6243,
+ "step": 8000
+ },
+ {
+ "epoch": 2.51,
+ "grad_norm": 3.080960512161255,
+ "learning_rate": 2.9875959834613114e-05,
+ "loss": 0.6246,
+ "step": 8500
+ },
+ {
+ "epoch": 2.66,
+ "grad_norm": 2.6615045070648193,
+ "learning_rate": 2.81039574719433e-05,
+ "loss": 0.6246,
+ "step": 9000
+ },
+ {
+ "epoch": 2.81,
+ "grad_norm": 1.667477011680603,
+ "learning_rate": 2.633195510927348e-05,
+ "loss": 0.622,
+ "step": 9500
+ },
+ {
+ "epoch": 2.95,
+ "grad_norm": 2.1684648990631104,
+ "learning_rate": 2.4559952746603663e-05,
+ "loss": 0.62,
+ "step": 10000
+ },
+ {
+ "epoch": 3.0,
+ "eval_accuracy": 0.70262,
+ "eval_loss": 0.6810389161109924,
+ "eval_runtime": 141.0454,
+ "eval_samples_per_second": 354.496,
+ "eval_steps_per_second": 1.85,
+ "step": 10158
+ },
+ {
+ "epoch": 3.1,
+ "grad_norm": 2.221029043197632,
+ "learning_rate": 2.2787950383933847e-05,
+ "loss": 0.5931,
+ "step": 10500
+ },
+ {
+ "epoch": 3.25,
+ "grad_norm": 2.1362805366516113,
+ "learning_rate": 2.1015948021264028e-05,
+ "loss": 0.5843,
+ "step": 11000
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.233245849609375,
+ "learning_rate": 1.9243945658594212e-05,
+ "loss": 0.5826,
+ "step": 11500
+ },
+ {
+ "epoch": 3.54,
+ "grad_norm": 2.0913615226745605,
+ "learning_rate": 1.7471943295924396e-05,
+ "loss": 0.5824,
+ "step": 12000
+ },
+ {
+ "epoch": 3.69,
+ "grad_norm": 2.6865012645721436,
+ "learning_rate": 1.5699940933254577e-05,
+ "loss": 0.5758,
+ "step": 12500
+ },
+ {
+ "epoch": 3.84,
+ "grad_norm": 2.226073741912842,
+ "learning_rate": 1.3927938570584761e-05,
+ "loss": 0.5822,
+ "step": 13000
+ },
+ {
+ "epoch": 3.99,
+ "grad_norm": 3.137810468673706,
+ "learning_rate": 1.2155936207914945e-05,
+ "loss": 0.5852,
+ "step": 13500
+ },
+ {
+ "epoch": 4.0,
+ "eval_accuracy": 0.70036,
+ "eval_loss": 0.6996423602104187,
+ "eval_runtime": 141.3356,
+ "eval_samples_per_second": 353.768,
+ "eval_steps_per_second": 1.847,
+ "step": 13544
+ }
+ ],
+ "logging_steps": 500,
+ "max_steps": 16930,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 500,
+ "total_flos": 1.247513087841362e+18,
+ "train_batch_size": 192,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-13544/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:202a5542d25d5172e8161c0589eaf5ec460cc8e907a908b0e1388d20c67b502b
+ size 4539
checkpoint-13544/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-16930/config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "_name_or_path": "openai-community/gpt2",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2ForSequenceClassification"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2",
+ "3": "LABEL_3",
+ "4": "LABEL_4"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2,
+ "LABEL_3": 3,
+ "LABEL_4": 4
+ },
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "pad_token_id": 50256,
+ "problem_type": "single_label_classification",
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "torch_dtype": "float32",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
checkpoint-16930/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-16930/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4090092241945e27e2abddb1c370884d514e406fd988c3308a6db168642a07ae
+ size 497789648
checkpoint-16930/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd58cd81d8dc515dde664037a5a1a99bb4df7e00b9f3a18de8978f647c3f2b3c
+ size 995673413
checkpoint-16930/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f189957a14b9cf4fca550bee4ebfa68b91a25d1c61fd3f255dc5c5505dd1c2e8
+ size 14575
checkpoint-16930/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc5c6d1c88ebca9de906f01f74db9e6c8595c2692ae71faebebdcdbaa6651375
+ size 627
checkpoint-16930/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<|endoftext|>",
+ "eos_token": "<|endoftext|>",
+ "pad_token": "<|endoftext|>",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-16930/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-16930/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1024,
+ "pad_token": "<|endoftext|>",
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-16930/trainer_state.json ADDED
@@ -0,0 +1,297 @@
+ {
+ "best_metric": 0.6810389161109924,
+ "best_model_checkpoint": "YELP_full/checkpoint-10158",
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 16930,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.15,
+ "grad_norm": 8.398588180541992,
+ "learning_rate": 5.8227997637330186e-05,
+ "loss": 0.9245,
+ "step": 500
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 3.5803122520446777,
+ "learning_rate": 5.645599527466036e-05,
+ "loss": 0.7913,
+ "step": 1000
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 3.3800671100616455,
+ "learning_rate": 5.468399291199055e-05,
+ "loss": 0.7613,
+ "step": 1500
+ },
+ {
+ "epoch": 0.59,
+ "grad_norm": 1.9862334728240967,
+ "learning_rate": 5.291199054932074e-05,
+ "loss": 0.7362,
+ "step": 2000
+ },
+ {
+ "epoch": 0.74,
+ "grad_norm": 2.043750524520874,
+ "learning_rate": 5.113998818665092e-05,
+ "loss": 0.7262,
+ "step": 2500
+ },
+ {
+ "epoch": 0.89,
+ "grad_norm": 2.8348655700683594,
+ "learning_rate": 4.93679858239811e-05,
+ "loss": 0.7165,
+ "step": 3000
+ },
+ {
+ "epoch": 1.0,
+ "eval_accuracy": 0.69042,
+ "eval_loss": 0.6984841227531433,
+ "eval_runtime": 141.1206,
+ "eval_samples_per_second": 354.307,
+ "eval_steps_per_second": 1.849,
+ "step": 3386
+ },
+ {
+ "epoch": 1.03,
+ "grad_norm": 2.2309489250183105,
+ "learning_rate": 4.7595983461311283e-05,
+ "loss": 0.7047,
+ "step": 3500
+ },
+ {
+ "epoch": 1.18,
+ "grad_norm": 2.665053129196167,
+ "learning_rate": 4.582398109864147e-05,
+ "loss": 0.6764,
+ "step": 4000
+ },
+ {
+ "epoch": 1.33,
+ "grad_norm": 2.590282678604126,
+ "learning_rate": 4.4051978735971645e-05,
+ "loss": 0.6733,
+ "step": 4500
+ },
+ {
+ "epoch": 1.48,
+ "grad_norm": 2.890204429626465,
+ "learning_rate": 4.227997637330183e-05,
+ "loss": 0.6712,
+ "step": 5000
+ },
+ {
+ "epoch": 1.62,
+ "grad_norm": 2.0544567108154297,
+ "learning_rate": 4.050797401063202e-05,
+ "loss": 0.6692,
+ "step": 5500
+ },
+ {
+ "epoch": 1.77,
+ "grad_norm": 1.7416001558303833,
+ "learning_rate": 3.87359716479622e-05,
+ "loss": 0.6693,
+ "step": 6000
+ },
+ {
+ "epoch": 1.92,
+ "grad_norm": 2.1172029972076416,
+ "learning_rate": 3.696396928529238e-05,
+ "loss": 0.6641,
+ "step": 6500
+ },
+ {
+ "epoch": 2.0,
+ "eval_accuracy": 0.69884,
+ "eval_loss": 0.6872273683547974,
+ "eval_runtime": 141.4186,
+ "eval_samples_per_second": 353.56,
+ "eval_steps_per_second": 1.846,
+ "step": 6772
+ },
+ {
+ "epoch": 2.07,
+ "grad_norm": 2.3501951694488525,
+ "learning_rate": 3.5191966922622565e-05,
+ "loss": 0.6458,
+ "step": 7000
+ },
+ {
+ "epoch": 2.22,
+ "grad_norm": 1.6436865329742432,
+ "learning_rate": 3.341996455995275e-05,
+ "loss": 0.624,
+ "step": 7500
+ },
+ {
+ "epoch": 2.36,
+ "grad_norm": 1.550398588180542,
+ "learning_rate": 3.164796219728293e-05,
+ "loss": 0.6243,
+ "step": 8000
+ },
+ {
+ "epoch": 2.51,
+ "grad_norm": 3.080960512161255,
+ "learning_rate": 2.9875959834613114e-05,
+ "loss": 0.6246,
+ "step": 8500
+ },
+ {
+ "epoch": 2.66,
+ "grad_norm": 2.6615045070648193,
+ "learning_rate": 2.81039574719433e-05,
+ "loss": 0.6246,
+ "step": 9000
+ },
+ {
+ "epoch": 2.81,
+ "grad_norm": 1.667477011680603,
+ "learning_rate": 2.633195510927348e-05,
+ "loss": 0.622,
+ "step": 9500
+ },
+ {
+ "epoch": 2.95,
+ "grad_norm": 2.1684648990631104,
+ "learning_rate": 2.4559952746603663e-05,
+ "loss": 0.62,
+ "step": 10000
+ },
+ {
+ "epoch": 3.0,
+ "eval_accuracy": 0.70262,
+ "eval_loss": 0.6810389161109924,
+ "eval_runtime": 141.0454,
+ "eval_samples_per_second": 354.496,
+ "eval_steps_per_second": 1.85,
+ "step": 10158
+ },
+ {
+ "epoch": 3.1,
+ "grad_norm": 2.221029043197632,
+ "learning_rate": 2.2787950383933847e-05,
+ "loss": 0.5931,
+ "step": 10500
+ },
+ {
+ "epoch": 3.25,
+ "grad_norm": 2.1362805366516113,
+ "learning_rate": 2.1015948021264028e-05,
+ "loss": 0.5843,
+ "step": 11000
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.233245849609375,
+ "learning_rate": 1.9243945658594212e-05,
+ "loss": 0.5826,
+ "step": 11500
+ },
+ {
+ "epoch": 3.54,
+ "grad_norm": 2.0913615226745605,
+ "learning_rate": 1.7471943295924396e-05,
+ "loss": 0.5824,
+ "step": 12000
+ },
+ {
+ "epoch": 3.69,
+ "grad_norm": 2.6865012645721436,
+ "learning_rate": 1.5699940933254577e-05,
+ "loss": 0.5758,
+ "step": 12500
+ },
+ {
+ "epoch": 3.84,
+ "grad_norm": 2.226073741912842,
+ "learning_rate": 1.3927938570584761e-05,
+ "loss": 0.5822,
+ "step": 13000
+ },
+ {
+ "epoch": 3.99,
+ "grad_norm": 3.137810468673706,
+ "learning_rate": 1.2155936207914945e-05,
+ "loss": 0.5852,
+ "step": 13500
+ },
+ {
+ "epoch": 4.0,
+ "eval_accuracy": 0.70036,
+ "eval_loss": 0.6996423602104187,
+ "eval_runtime": 141.3356,
+ "eval_samples_per_second": 353.768,
+ "eval_steps_per_second": 1.847,
+ "step": 13544
+ },
+ {
+ "epoch": 4.13,
+ "grad_norm": 3.2649760246276855,
+ "learning_rate": 1.0383933845245126e-05,
+ "loss": 0.5559,
+ "step": 14000
+ },
+ {
+ "epoch": 4.28,
+ "grad_norm": 2.007054328918457,
+ "learning_rate": 8.61193148257531e-06,
+ "loss": 0.5501,
+ "step": 14500
+ },
+ {
+ "epoch": 4.43,
+ "grad_norm": 2.0773346424102783,
+ "learning_rate": 6.839929119905493e-06,
+ "loss": 0.5481,
+ "step": 15000
+ },
+ {
+ "epoch": 4.58,
+ "grad_norm": 3.139374256134033,
+ "learning_rate": 5.0679267572356765e-06,
+ "loss": 0.5484,
+ "step": 15500
+ },
+ {
+ "epoch": 4.73,
+ "grad_norm": 2.532846450805664,
+ "learning_rate": 3.2959243945658593e-06,
+ "loss": 0.5506,
+ "step": 16000
+ },
+ {
+ "epoch": 4.87,
+ "grad_norm": 1.989895224571228,
+ "learning_rate": 1.5239220318960424e-06,
+ "loss": 0.5467,
+ "step": 16500
+ },
+ {
+ "epoch": 5.0,
+ "eval_accuracy": 0.69984,
+ "eval_loss": 0.7139469385147095,
+ "eval_runtime": 141.3715,
+ "eval_samples_per_second": 353.678,
+ "eval_steps_per_second": 1.846,
+ "step": 16930
+ }
+ ],
+ "logging_steps": 500,
+ "max_steps": 16930,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 500,
+ "total_flos": 1.559227030315131e+18,
+ "train_batch_size": 192,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-16930/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:202a5542d25d5172e8161c0589eaf5ec460cc8e907a908b0e1388d20c67b502b
+ size 4539
checkpoint-16930/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3386/config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "_name_or_path": "openai-community/gpt2",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2ForSequenceClassification"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2",
+ "3": "LABEL_3",
+ "4": "LABEL_4"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2,
+ "LABEL_3": 3,
+ "LABEL_4": 4
+ },
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "pad_token_id": 50256,
+ "problem_type": "single_label_classification",
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "torch_dtype": "float32",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
checkpoint-3386/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3386/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b1fd630afaa81f9cb6fba2852daf870f9b3ccce2cd90c7f5f92e68adbe35254
+ size 497789648
checkpoint-3386/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:540034f0d4d4ac78e8bfdde2e8cf904c131de29731a146d8cd468542e5fccbaf
+ size 995673413
checkpoint-3386/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2eb55b19a91c95a375d480df8d96670803e2083c5af7194a10a108016693962
+ size 14575
checkpoint-3386/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7f6a993440071310de9019e6c5ce5237633ab076ff69ee362e2520e9677ff67
+ size 627
checkpoint-3386/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<|endoftext|>",
+ "eos_token": "<|endoftext|>",
+ "pad_token": "<|endoftext|>",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-3386/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3386/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1024,
+ "pad_token": "<|endoftext|>",
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
checkpoint-3386/trainer_state.json ADDED
@@ -0,0 +1,72 @@
+ {
+ "best_metric": 0.6984841227531433,
+ "best_model_checkpoint": "YELP_full/checkpoint-3386",
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 3386,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.15,
+ "grad_norm": 8.398588180541992,
+ "learning_rate": 5.8227997637330186e-05,
+ "loss": 0.9245,
+ "step": 500
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 3.5803122520446777,
+ "learning_rate": 5.645599527466036e-05,
+ "loss": 0.7913,
+ "step": 1000
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 3.3800671100616455,
+ "learning_rate": 5.468399291199055e-05,
+ "loss": 0.7613,
+ "step": 1500
+ },
+ {
+ "epoch": 0.59,
+ "grad_norm": 1.9862334728240967,
+ "learning_rate": 5.291199054932074e-05,
+ "loss": 0.7362,
+ "step": 2000
+ },
+ {
+ "epoch": 0.74,
+ "grad_norm": 2.043750524520874,
+ "learning_rate": 5.113998818665092e-05,
+ "loss": 0.7262,
+ "step": 2500
+ },
+ {
+ "epoch": 0.89,
+ "grad_norm": 2.8348655700683594,
+ "learning_rate": 4.93679858239811e-05,
+ "loss": 0.7165,
+ "step": 3000
+ },
+ {
+ "epoch": 1.0,
+ "eval_accuracy": 0.69042,
+ "eval_loss": 0.6984841227531433,
+ "eval_runtime": 141.1206,
+ "eval_samples_per_second": 354.307,
+ "eval_steps_per_second": 1.849,
+ "step": 3386
+ }
+ ],
+ "logging_steps": 500,
+ "max_steps": 16930,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 500,
+ "total_flos": 3.124146123583488e+17,
+ "train_batch_size": 192,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-3386/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:202a5542d25d5172e8161c0589eaf5ec460cc8e907a908b0e1388d20c67b502b
+ size 4539
checkpoint-3386/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-6772/config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "_name_or_path": "openai-community/gpt2",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2ForSequenceClassification"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2",
+ "3": "LABEL_3",
+ "4": "LABEL_4"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2,
+ "LABEL_3": 3,
+ "LABEL_4": 4
+ },
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "pad_token_id": 50256,
+ "problem_type": "single_label_classification",
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "torch_dtype": "float32",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
checkpoint-6772/merges.txt ADDED
The diff for this file is too large to render. See raw diff