Andyrasika committed on
Commit
02aed86
1 Parent(s): c7efe3e

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,3 +1,61 @@
1
  ---
2
- license: creativeml-openrail-m
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: apache-2.0
3
+ base_model: distilgpt2
4
+ tags:
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: gpt2_dolly_lite
8
+ results: []
9
  ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ # gpt2_dolly_lite
15
+
16
+ This model is a fine-tuned version of [distilgpt2](https://huggingface.co/distilgpt2) on an unknown dataset.
17
+ It achieves the following results on the evaluation set:
18
+ - Loss: 2.4067
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 0.001
38
+ - train_batch_size: 8
39
+ - eval_batch_size: 32
40
+ - seed: 42
41
+ - gradient_accumulation_steps: 4
42
+ - total_train_batch_size: 32
43
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
+ - lr_scheduler_type: linear
45
+ - num_epochs: 3
46
+
47
+ ### Training results
48
+
49
+ | Training Loss | Epoch | Step | Validation Loss |
50
+ |:-------------:|:-----:|:----:|:---------------:|
51
+ | 2.708 | 1.0 | 1300 | 2.5611 |
52
+ | 2.1768 | 2.0 | 2600 | 2.4149 |
53
+ | 1.7189 | 3.0 | 3900 | 2.4067 |
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - Transformers 4.32.1
59
+ - PyTorch 2.0.1+cu118
60
+ - Datasets 2.14.4
61
+ - Tokenizers 0.13.3
checkpoint-1300/config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilgpt2",
3
+ "_num_labels": 1,
4
+ "activation_function": "gelu_new",
5
+ "architectures": [
6
+ "GPT2LMHeadModel"
7
+ ],
8
+ "attn_pdrop": 0.1,
9
+ "bos_token_id": 50256,
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "LABEL_0": 0
18
+ },
19
+ "layer_norm_epsilon": 1e-05,
20
+ "model_type": "gpt2",
21
+ "n_ctx": 1024,
22
+ "n_embd": 768,
23
+ "n_head": 12,
24
+ "n_inner": null,
25
+ "n_layer": 6,
26
+ "n_positions": 1024,
27
+ "reorder_and_upcast_attn": false,
28
+ "resid_pdrop": 0.1,
29
+ "scale_attn_by_inverse_layer_idx": false,
30
+ "scale_attn_weights": true,
31
+ "summary_activation": null,
32
+ "summary_first_dropout": 0.1,
33
+ "summary_proj_to_labels": true,
34
+ "summary_type": "cls_index",
35
+ "summary_use_proj": true,
36
+ "task_specific_params": {
37
+ "text-generation": {
38
+ "do_sample": true,
39
+ "max_length": 50
40
+ }
41
+ },
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.32.1",
44
+ "use_cache": true,
45
+ "vocab_size": 50257
46
+ }
checkpoint-1300/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.32.1"
6
+ }
checkpoint-1300/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f44c8f6c514ad8e349d345a46a159fc6ed4166c32d4626aa0a2f9d0ad011355
3
+ size 655364037
checkpoint-1300/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b5190e105a8a7362be16f16eaf1b51bfca3508ebfbe376e764a4a3f06b8bef7
3
+ size 327674773
checkpoint-1300/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65ebbc7268a7952f4f6c6443d3fc774fecab4069169a85def8a285e8a02168ec
3
+ size 14575
checkpoint-1300/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:212e58901c3687afbfbec22f9809a97294d8c570380f7ab9130924780eef0624
3
+ size 627
checkpoint-1300/trainer_state.json ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.561117649078369,
3
+ "best_model_checkpoint": "./gpt2_dolly_lite/checkpoint-1300",
4
+ "epoch": 0.9998077292828302,
5
+ "eval_steps": 500,
6
+ "global_step": 1300,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 0.0009871794871794872,
14
+ "loss": 3.3078,
15
+ "step": 50
16
+ },
17
+ {
18
+ "epoch": 0.08,
19
+ "learning_rate": 0.0009743589743589744,
20
+ "loss": 3.1249,
21
+ "step": 100
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 0.0009615384615384616,
26
+ "loss": 3.1026,
27
+ "step": 150
28
+ },
29
+ {
30
+ "epoch": 0.15,
31
+ "learning_rate": 0.0009487179487179487,
32
+ "loss": 3.0496,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 0.19,
37
+ "learning_rate": 0.000935897435897436,
38
+ "loss": 2.9706,
39
+ "step": 250
40
+ },
41
+ {
42
+ "epoch": 0.23,
43
+ "learning_rate": 0.0009230769230769232,
44
+ "loss": 3.0195,
45
+ "step": 300
46
+ },
47
+ {
48
+ "epoch": 0.27,
49
+ "learning_rate": 0.0009102564102564102,
50
+ "loss": 2.9747,
51
+ "step": 350
52
+ },
53
+ {
54
+ "epoch": 0.31,
55
+ "learning_rate": 0.0008974358974358974,
56
+ "loss": 2.9205,
57
+ "step": 400
58
+ },
59
+ {
60
+ "epoch": 0.35,
61
+ "learning_rate": 0.0008846153846153846,
62
+ "loss": 2.9495,
63
+ "step": 450
64
+ },
65
+ {
66
+ "epoch": 0.38,
67
+ "learning_rate": 0.0008717948717948718,
68
+ "loss": 2.8866,
69
+ "step": 500
70
+ },
71
+ {
72
+ "epoch": 0.42,
73
+ "learning_rate": 0.0008589743589743589,
74
+ "loss": 2.892,
75
+ "step": 550
76
+ },
77
+ {
78
+ "epoch": 0.46,
79
+ "learning_rate": 0.0008461538461538462,
80
+ "loss": 2.8849,
81
+ "step": 600
82
+ },
83
+ {
84
+ "epoch": 0.5,
85
+ "learning_rate": 0.0008333333333333334,
86
+ "loss": 2.8819,
87
+ "step": 650
88
+ },
89
+ {
90
+ "epoch": 0.54,
91
+ "learning_rate": 0.0008205128205128205,
92
+ "loss": 2.8561,
93
+ "step": 700
94
+ },
95
+ {
96
+ "epoch": 0.58,
97
+ "learning_rate": 0.0008076923076923078,
98
+ "loss": 2.8311,
99
+ "step": 750
100
+ },
101
+ {
102
+ "epoch": 0.62,
103
+ "learning_rate": 0.0007948717948717948,
104
+ "loss": 2.8176,
105
+ "step": 800
106
+ },
107
+ {
108
+ "epoch": 0.65,
109
+ "learning_rate": 0.000782051282051282,
110
+ "loss": 2.7782,
111
+ "step": 850
112
+ },
113
+ {
114
+ "epoch": 0.69,
115
+ "learning_rate": 0.0007692307692307693,
116
+ "loss": 2.7273,
117
+ "step": 900
118
+ },
119
+ {
120
+ "epoch": 0.73,
121
+ "learning_rate": 0.0007564102564102564,
122
+ "loss": 2.7686,
123
+ "step": 950
124
+ },
125
+ {
126
+ "epoch": 0.77,
127
+ "learning_rate": 0.0007435897435897436,
128
+ "loss": 2.7274,
129
+ "step": 1000
130
+ },
131
+ {
132
+ "epoch": 0.81,
133
+ "learning_rate": 0.0007307692307692307,
134
+ "loss": 2.7514,
135
+ "step": 1050
136
+ },
137
+ {
138
+ "epoch": 0.85,
139
+ "learning_rate": 0.000717948717948718,
140
+ "loss": 2.7232,
141
+ "step": 1100
142
+ },
143
+ {
144
+ "epoch": 0.88,
145
+ "learning_rate": 0.0007051282051282052,
146
+ "loss": 2.6937,
147
+ "step": 1150
148
+ },
149
+ {
150
+ "epoch": 0.92,
151
+ "learning_rate": 0.0006923076923076923,
152
+ "loss": 2.7209,
153
+ "step": 1200
154
+ },
155
+ {
156
+ "epoch": 0.96,
157
+ "learning_rate": 0.0006794871794871796,
158
+ "loss": 2.6832,
159
+ "step": 1250
160
+ },
161
+ {
162
+ "epoch": 1.0,
163
+ "learning_rate": 0.0006666666666666666,
164
+ "loss": 2.708,
165
+ "step": 1300
166
+ },
167
+ {
168
+ "epoch": 1.0,
169
+ "eval_loss": 2.561117649078369,
170
+ "eval_runtime": 41.0726,
171
+ "eval_samples_per_second": 253.235,
172
+ "eval_steps_per_second": 7.937,
173
+ "step": 1300
174
+ }
175
+ ],
176
+ "logging_steps": 50,
177
+ "max_steps": 3900,
178
+ "num_train_epochs": 3,
179
+ "save_steps": 500,
180
+ "total_flos": 2217745708572672.0,
181
+ "trial_name": null,
182
+ "trial_params": null
183
+ }
checkpoint-1300/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5863f7260e5f2acd6ed9097371440ce6f89c0fdc0fbea3c0b7bd599e0244316f
3
+ size 4027
checkpoint-2600/config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilgpt2",
3
+ "_num_labels": 1,
4
+ "activation_function": "gelu_new",
5
+ "architectures": [
6
+ "GPT2LMHeadModel"
7
+ ],
8
+ "attn_pdrop": 0.1,
9
+ "bos_token_id": 50256,
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "LABEL_0": 0
18
+ },
19
+ "layer_norm_epsilon": 1e-05,
20
+ "model_type": "gpt2",
21
+ "n_ctx": 1024,
22
+ "n_embd": 768,
23
+ "n_head": 12,
24
+ "n_inner": null,
25
+ "n_layer": 6,
26
+ "n_positions": 1024,
27
+ "reorder_and_upcast_attn": false,
28
+ "resid_pdrop": 0.1,
29
+ "scale_attn_by_inverse_layer_idx": false,
30
+ "scale_attn_weights": true,
31
+ "summary_activation": null,
32
+ "summary_first_dropout": 0.1,
33
+ "summary_proj_to_labels": true,
34
+ "summary_type": "cls_index",
35
+ "summary_use_proj": true,
36
+ "task_specific_params": {
37
+ "text-generation": {
38
+ "do_sample": true,
39
+ "max_length": 50
40
+ }
41
+ },
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.32.1",
44
+ "use_cache": true,
45
+ "vocab_size": 50257
46
+ }
checkpoint-2600/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.32.1"
6
+ }
checkpoint-2600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2f5f1a4c772f7ed8b56cd9b26fadd5b86d5a775421136ecefc1e753bf2cb96d
3
+ size 655364037
checkpoint-2600/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcab8e0ae7f44c5de6401e22a75acebe3915057a32af2cae6bb57a2745f7a34d
3
+ size 327674773
checkpoint-2600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:950a675deac0bd82676ee97ba5d2d1126640847e7cfd72fc6877d8dfc8e26002
3
+ size 14575
checkpoint-2600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89dbab01816d59b4bbc89d82d176e913e76e05dbd72b4fa6cac4ee1fc89a3eaa
3
+ size 627
checkpoint-2600/trainer_state.json ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.41491436958313,
3
+ "best_model_checkpoint": "./gpt2_dolly_lite/checkpoint-2600",
4
+ "epoch": 1.9996154585656605,
5
+ "eval_steps": 500,
6
+ "global_step": 2600,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 0.0009871794871794872,
14
+ "loss": 3.3078,
15
+ "step": 50
16
+ },
17
+ {
18
+ "epoch": 0.08,
19
+ "learning_rate": 0.0009743589743589744,
20
+ "loss": 3.1249,
21
+ "step": 100
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 0.0009615384615384616,
26
+ "loss": 3.1026,
27
+ "step": 150
28
+ },
29
+ {
30
+ "epoch": 0.15,
31
+ "learning_rate": 0.0009487179487179487,
32
+ "loss": 3.0496,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 0.19,
37
+ "learning_rate": 0.000935897435897436,
38
+ "loss": 2.9706,
39
+ "step": 250
40
+ },
41
+ {
42
+ "epoch": 0.23,
43
+ "learning_rate": 0.0009230769230769232,
44
+ "loss": 3.0195,
45
+ "step": 300
46
+ },
47
+ {
48
+ "epoch": 0.27,
49
+ "learning_rate": 0.0009102564102564102,
50
+ "loss": 2.9747,
51
+ "step": 350
52
+ },
53
+ {
54
+ "epoch": 0.31,
55
+ "learning_rate": 0.0008974358974358974,
56
+ "loss": 2.9205,
57
+ "step": 400
58
+ },
59
+ {
60
+ "epoch": 0.35,
61
+ "learning_rate": 0.0008846153846153846,
62
+ "loss": 2.9495,
63
+ "step": 450
64
+ },
65
+ {
66
+ "epoch": 0.38,
67
+ "learning_rate": 0.0008717948717948718,
68
+ "loss": 2.8866,
69
+ "step": 500
70
+ },
71
+ {
72
+ "epoch": 0.42,
73
+ "learning_rate": 0.0008589743589743589,
74
+ "loss": 2.892,
75
+ "step": 550
76
+ },
77
+ {
78
+ "epoch": 0.46,
79
+ "learning_rate": 0.0008461538461538462,
80
+ "loss": 2.8849,
81
+ "step": 600
82
+ },
83
+ {
84
+ "epoch": 0.5,
85
+ "learning_rate": 0.0008333333333333334,
86
+ "loss": 2.8819,
87
+ "step": 650
88
+ },
89
+ {
90
+ "epoch": 0.54,
91
+ "learning_rate": 0.0008205128205128205,
92
+ "loss": 2.8561,
93
+ "step": 700
94
+ },
95
+ {
96
+ "epoch": 0.58,
97
+ "learning_rate": 0.0008076923076923078,
98
+ "loss": 2.8311,
99
+ "step": 750
100
+ },
101
+ {
102
+ "epoch": 0.62,
103
+ "learning_rate": 0.0007948717948717948,
104
+ "loss": 2.8176,
105
+ "step": 800
106
+ },
107
+ {
108
+ "epoch": 0.65,
109
+ "learning_rate": 0.000782051282051282,
110
+ "loss": 2.7782,
111
+ "step": 850
112
+ },
113
+ {
114
+ "epoch": 0.69,
115
+ "learning_rate": 0.0007692307692307693,
116
+ "loss": 2.7273,
117
+ "step": 900
118
+ },
119
+ {
120
+ "epoch": 0.73,
121
+ "learning_rate": 0.0007564102564102564,
122
+ "loss": 2.7686,
123
+ "step": 950
124
+ },
125
+ {
126
+ "epoch": 0.77,
127
+ "learning_rate": 0.0007435897435897436,
128
+ "loss": 2.7274,
129
+ "step": 1000
130
+ },
131
+ {
132
+ "epoch": 0.81,
133
+ "learning_rate": 0.0007307692307692307,
134
+ "loss": 2.7514,
135
+ "step": 1050
136
+ },
137
+ {
138
+ "epoch": 0.85,
139
+ "learning_rate": 0.000717948717948718,
140
+ "loss": 2.7232,
141
+ "step": 1100
142
+ },
143
+ {
144
+ "epoch": 0.88,
145
+ "learning_rate": 0.0007051282051282052,
146
+ "loss": 2.6937,
147
+ "step": 1150
148
+ },
149
+ {
150
+ "epoch": 0.92,
151
+ "learning_rate": 0.0006923076923076923,
152
+ "loss": 2.7209,
153
+ "step": 1200
154
+ },
155
+ {
156
+ "epoch": 0.96,
157
+ "learning_rate": 0.0006794871794871796,
158
+ "loss": 2.6832,
159
+ "step": 1250
160
+ },
161
+ {
162
+ "epoch": 1.0,
163
+ "learning_rate": 0.0006666666666666666,
164
+ "loss": 2.708,
165
+ "step": 1300
166
+ },
167
+ {
168
+ "epoch": 1.0,
169
+ "eval_loss": 2.561117649078369,
170
+ "eval_runtime": 41.0726,
171
+ "eval_samples_per_second": 253.235,
172
+ "eval_steps_per_second": 7.937,
173
+ "step": 1300
174
+ },
175
+ {
176
+ "epoch": 1.04,
177
+ "learning_rate": 0.0006538461538461538,
178
+ "loss": 2.2119,
179
+ "step": 1350
180
+ },
181
+ {
182
+ "epoch": 1.08,
183
+ "learning_rate": 0.0006410256410256411,
184
+ "loss": 2.2279,
185
+ "step": 1400
186
+ },
187
+ {
188
+ "epoch": 1.12,
189
+ "learning_rate": 0.0006282051282051282,
190
+ "loss": 2.1996,
191
+ "step": 1450
192
+ },
193
+ {
194
+ "epoch": 1.15,
195
+ "learning_rate": 0.0006153846153846154,
196
+ "loss": 2.1886,
197
+ "step": 1500
198
+ },
199
+ {
200
+ "epoch": 1.19,
201
+ "learning_rate": 0.0006025641025641026,
202
+ "loss": 2.268,
203
+ "step": 1550
204
+ },
205
+ {
206
+ "epoch": 1.23,
207
+ "learning_rate": 0.0005897435897435898,
208
+ "loss": 2.2118,
209
+ "step": 1600
210
+ },
211
+ {
212
+ "epoch": 1.27,
213
+ "learning_rate": 0.0005769230769230769,
214
+ "loss": 2.2831,
215
+ "step": 1650
216
+ },
217
+ {
218
+ "epoch": 1.31,
219
+ "learning_rate": 0.0005641025641025641,
220
+ "loss": 2.2317,
221
+ "step": 1700
222
+ },
223
+ {
224
+ "epoch": 1.35,
225
+ "learning_rate": 0.0005512820512820514,
226
+ "loss": 2.2504,
227
+ "step": 1750
228
+ },
229
+ {
230
+ "epoch": 1.38,
231
+ "learning_rate": 0.0005384615384615384,
232
+ "loss": 2.241,
233
+ "step": 1800
234
+ },
235
+ {
236
+ "epoch": 1.42,
237
+ "learning_rate": 0.0005256410256410256,
238
+ "loss": 2.2381,
239
+ "step": 1850
240
+ },
241
+ {
242
+ "epoch": 1.46,
243
+ "learning_rate": 0.0005128205128205128,
244
+ "loss": 2.2313,
245
+ "step": 1900
246
+ },
247
+ {
248
+ "epoch": 1.5,
249
+ "learning_rate": 0.0005,
250
+ "loss": 2.2493,
251
+ "step": 1950
252
+ },
253
+ {
254
+ "epoch": 1.54,
255
+ "learning_rate": 0.0004871794871794872,
256
+ "loss": 2.2056,
257
+ "step": 2000
258
+ },
259
+ {
260
+ "epoch": 1.58,
261
+ "learning_rate": 0.00047435897435897434,
262
+ "loss": 2.2178,
263
+ "step": 2050
264
+ },
265
+ {
266
+ "epoch": 1.62,
267
+ "learning_rate": 0.0004615384615384616,
268
+ "loss": 2.2518,
269
+ "step": 2100
270
+ },
271
+ {
272
+ "epoch": 1.65,
273
+ "learning_rate": 0.0004487179487179487,
274
+ "loss": 2.2354,
275
+ "step": 2150
276
+ },
277
+ {
278
+ "epoch": 1.69,
279
+ "learning_rate": 0.0004358974358974359,
280
+ "loss": 2.2144,
281
+ "step": 2200
282
+ },
283
+ {
284
+ "epoch": 1.73,
285
+ "learning_rate": 0.0004230769230769231,
286
+ "loss": 2.2197,
287
+ "step": 2250
288
+ },
289
+ {
290
+ "epoch": 1.77,
291
+ "learning_rate": 0.00041025641025641023,
292
+ "loss": 2.2006,
293
+ "step": 2300
294
+ },
295
+ {
296
+ "epoch": 1.81,
297
+ "learning_rate": 0.0003974358974358974,
298
+ "loss": 2.1802,
299
+ "step": 2350
300
+ },
301
+ {
302
+ "epoch": 1.85,
303
+ "learning_rate": 0.00038461538461538467,
304
+ "loss": 2.1656,
305
+ "step": 2400
306
+ },
307
+ {
308
+ "epoch": 1.88,
309
+ "learning_rate": 0.0003717948717948718,
310
+ "loss": 2.1592,
311
+ "step": 2450
312
+ },
313
+ {
314
+ "epoch": 1.92,
315
+ "learning_rate": 0.000358974358974359,
316
+ "loss": 2.1847,
317
+ "step": 2500
318
+ },
319
+ {
320
+ "epoch": 1.96,
321
+ "learning_rate": 0.00034615384615384613,
322
+ "loss": 2.1981,
323
+ "step": 2550
324
+ },
325
+ {
326
+ "epoch": 2.0,
327
+ "learning_rate": 0.0003333333333333333,
328
+ "loss": 2.1768,
329
+ "step": 2600
330
+ },
331
+ {
332
+ "epoch": 2.0,
333
+ "eval_loss": 2.41491436958313,
334
+ "eval_runtime": 39.8227,
335
+ "eval_samples_per_second": 261.183,
336
+ "eval_steps_per_second": 8.186,
337
+ "step": 2600
338
+ }
339
+ ],
340
+ "logging_steps": 50,
341
+ "max_steps": 3900,
342
+ "num_train_epochs": 3,
343
+ "save_steps": 500,
344
+ "total_flos": 4433294636163072.0,
345
+ "trial_name": null,
346
+ "trial_params": null
347
+ }
checkpoint-2600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5863f7260e5f2acd6ed9097371440ce6f89c0fdc0fbea3c0b7bd599e0244316f
3
+ size 4027
checkpoint-3900/config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilgpt2",
3
+ "_num_labels": 1,
4
+ "activation_function": "gelu_new",
5
+ "architectures": [
6
+ "GPT2LMHeadModel"
7
+ ],
8
+ "attn_pdrop": 0.1,
9
+ "bos_token_id": 50256,
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "LABEL_0": 0
18
+ },
19
+ "layer_norm_epsilon": 1e-05,
20
+ "model_type": "gpt2",
21
+ "n_ctx": 1024,
22
+ "n_embd": 768,
23
+ "n_head": 12,
24
+ "n_inner": null,
25
+ "n_layer": 6,
26
+ "n_positions": 1024,
27
+ "reorder_and_upcast_attn": false,
28
+ "resid_pdrop": 0.1,
29
+ "scale_attn_by_inverse_layer_idx": false,
30
+ "scale_attn_weights": true,
31
+ "summary_activation": null,
32
+ "summary_first_dropout": 0.1,
33
+ "summary_proj_to_labels": true,
34
+ "summary_type": "cls_index",
35
+ "summary_use_proj": true,
36
+ "task_specific_params": {
37
+ "text-generation": {
38
+ "do_sample": true,
39
+ "max_length": 50
40
+ }
41
+ },
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.32.1",
44
+ "use_cache": true,
45
+ "vocab_size": 50257
46
+ }
checkpoint-3900/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.32.1"
6
+ }
checkpoint-3900/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a2cdf22127a37d9cbfec1697d33a6dd20c5bed3db9c6597819288615554f655
3
+ size 655364037
checkpoint-3900/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d254aeaf5388d54112768c1e776c979981bbe1ff7708fe3e83c9c20870b05fa
3
+ size 327674773
checkpoint-3900/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e64853923e7e2bdc8669bfd7cdf72aa2ac6db3c7b3f443ec85518ea3ca067872
3
+ size 14575
checkpoint-3900/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddb3e76f913ba655b35f1677dd9ce4b515a60ebabae02f9cef6d01938028d265
3
+ size 627
checkpoint-3900/trainer_state.json ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.406733751296997,
3
+ "best_model_checkpoint": "./gpt2_dolly_lite/checkpoint-3900",
4
+ "epoch": 2.9994231878484907,
5
+ "eval_steps": 500,
6
+ "global_step": 3900,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 0.0009871794871794872,
14
+ "loss": 3.3078,
15
+ "step": 50
16
+ },
17
+ {
18
+ "epoch": 0.08,
19
+ "learning_rate": 0.0009743589743589744,
20
+ "loss": 3.1249,
21
+ "step": 100
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 0.0009615384615384616,
26
+ "loss": 3.1026,
27
+ "step": 150
28
+ },
29
+ {
30
+ "epoch": 0.15,
31
+ "learning_rate": 0.0009487179487179487,
32
+ "loss": 3.0496,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 0.19,
37
+ "learning_rate": 0.000935897435897436,
38
+ "loss": 2.9706,
39
+ "step": 250
40
+ },
41
+ {
42
+ "epoch": 0.23,
43
+ "learning_rate": 0.0009230769230769232,
44
+ "loss": 3.0195,
45
+ "step": 300
46
+ },
47
+ {
48
+ "epoch": 0.27,
49
+ "learning_rate": 0.0009102564102564102,
50
+ "loss": 2.9747,
51
+ "step": 350
52
+ },
53
+ {
54
+ "epoch": 0.31,
55
+ "learning_rate": 0.0008974358974358974,
56
+ "loss": 2.9205,
57
+ "step": 400
58
+ },
59
+ {
60
+ "epoch": 0.35,
61
+ "learning_rate": 0.0008846153846153846,
62
+ "loss": 2.9495,
63
+ "step": 450
64
+ },
65
+ {
66
+ "epoch": 0.38,
67
+ "learning_rate": 0.0008717948717948718,
68
+ "loss": 2.8866,
69
+ "step": 500
70
+ },
71
+ {
72
+ "epoch": 0.42,
73
+ "learning_rate": 0.0008589743589743589,
74
+ "loss": 2.892,
75
+ "step": 550
76
+ },
77
+ {
78
+ "epoch": 0.46,
79
+ "learning_rate": 0.0008461538461538462,
80
+ "loss": 2.8849,
81
+ "step": 600
82
+ },
83
+ {
84
+ "epoch": 0.5,
85
+ "learning_rate": 0.0008333333333333334,
86
+ "loss": 2.8819,
87
+ "step": 650
88
+ },
89
+ {
90
+ "epoch": 0.54,
91
+ "learning_rate": 0.0008205128205128205,
92
+ "loss": 2.8561,
93
+ "step": 700
94
+ },
95
+ {
96
+ "epoch": 0.58,
97
+ "learning_rate": 0.0008076923076923078,
98
+ "loss": 2.8311,
99
+ "step": 750
100
+ },
101
+ {
102
+ "epoch": 0.62,
103
+ "learning_rate": 0.0007948717948717948,
104
+ "loss": 2.8176,
105
+ "step": 800
106
+ },
107
+ {
108
+ "epoch": 0.65,
109
+ "learning_rate": 0.000782051282051282,
110
+ "loss": 2.7782,
111
+ "step": 850
112
+ },
113
+ {
114
+ "epoch": 0.69,
115
+ "learning_rate": 0.0007692307692307693,
116
+ "loss": 2.7273,
117
+ "step": 900
118
+ },
119
+ {
120
+ "epoch": 0.73,
121
+ "learning_rate": 0.0007564102564102564,
122
+ "loss": 2.7686,
123
+ "step": 950
124
+ },
125
+ {
126
+ "epoch": 0.77,
127
+ "learning_rate": 0.0007435897435897436,
128
+ "loss": 2.7274,
129
+ "step": 1000
130
+ },
131
+ {
132
+ "epoch": 0.81,
133
+ "learning_rate": 0.0007307692307692307,
134
+ "loss": 2.7514,
135
+ "step": 1050
136
+ },
137
+ {
138
+ "epoch": 0.85,
139
+ "learning_rate": 0.000717948717948718,
140
+ "loss": 2.7232,
141
+ "step": 1100
142
+ },
143
+ {
144
+ "epoch": 0.88,
145
+ "learning_rate": 0.0007051282051282052,
146
+ "loss": 2.6937,
147
+ "step": 1150
148
+ },
149
+ {
150
+ "epoch": 0.92,
151
+ "learning_rate": 0.0006923076923076923,
152
+ "loss": 2.7209,
153
+ "step": 1200
154
+ },
155
+ {
156
+ "epoch": 0.96,
157
+ "learning_rate": 0.0006794871794871796,
158
+ "loss": 2.6832,
159
+ "step": 1250
160
+ },
161
+ {
162
+ "epoch": 1.0,
163
+ "learning_rate": 0.0006666666666666666,
164
+ "loss": 2.708,
165
+ "step": 1300
166
+ },
167
+ {
168
+ "epoch": 1.0,
169
+ "eval_loss": 2.561117649078369,
170
+ "eval_runtime": 41.0726,
171
+ "eval_samples_per_second": 253.235,
172
+ "eval_steps_per_second": 7.937,
173
+ "step": 1300
174
+ },
175
+ {
176
+ "epoch": 1.04,
177
+ "learning_rate": 0.0006538461538461538,
178
+ "loss": 2.2119,
179
+ "step": 1350
180
+ },
181
+ {
182
+ "epoch": 1.08,
183
+ "learning_rate": 0.0006410256410256411,
184
+ "loss": 2.2279,
185
+ "step": 1400
186
+ },
187
+ {
188
+ "epoch": 1.12,
189
+ "learning_rate": 0.0006282051282051282,
190
+ "loss": 2.1996,
191
+ "step": 1450
192
+ },
193
+ {
194
+ "epoch": 1.15,
195
+ "learning_rate": 0.0006153846153846154,
196
+ "loss": 2.1886,
197
+ "step": 1500
198
+ },
199
+ {
200
+ "epoch": 1.19,
201
+ "learning_rate": 0.0006025641025641026,
202
+ "loss": 2.268,
203
+ "step": 1550
204
+ },
205
+ {
206
+ "epoch": 1.23,
207
+ "learning_rate": 0.0005897435897435898,
208
+ "loss": 2.2118,
209
+ "step": 1600
210
+ },
211
+ {
212
+ "epoch": 1.27,
213
+ "learning_rate": 0.0005769230769230769,
214
+ "loss": 2.2831,
215
+ "step": 1650
216
+ },
217
+ {
218
+ "epoch": 1.31,
219
+ "learning_rate": 0.0005641025641025641,
220
+ "loss": 2.2317,
221
+ "step": 1700
222
+ },
223
+ {
224
+ "epoch": 1.35,
225
+ "learning_rate": 0.0005512820512820514,
226
+ "loss": 2.2504,
227
+ "step": 1750
228
+ },
229
+ {
230
+ "epoch": 1.38,
231
+ "learning_rate": 0.0005384615384615384,
232
+ "loss": 2.241,
233
+ "step": 1800
234
+ },
235
+ {
236
+ "epoch": 1.42,
237
+ "learning_rate": 0.0005256410256410256,
238
+ "loss": 2.2381,
239
+ "step": 1850
240
+ },
241
+ {
242
+ "epoch": 1.46,
243
+ "learning_rate": 0.0005128205128205128,
244
+ "loss": 2.2313,
245
+ "step": 1900
246
+ },
247
+ {
248
+ "epoch": 1.5,
249
+ "learning_rate": 0.0005,
250
+ "loss": 2.2493,
251
+ "step": 1950
252
+ },
253
+ {
254
+ "epoch": 1.54,
255
+ "learning_rate": 0.0004871794871794872,
256
+ "loss": 2.2056,
257
+ "step": 2000
258
+ },
259
+ {
260
+ "epoch": 1.58,
261
+ "learning_rate": 0.00047435897435897434,
262
+ "loss": 2.2178,
263
+ "step": 2050
264
+ },
265
+ {
266
+ "epoch": 1.62,
267
+ "learning_rate": 0.0004615384615384616,
268
+ "loss": 2.2518,
269
+ "step": 2100
270
+ },
271
+ {
272
+ "epoch": 1.65,
273
+ "learning_rate": 0.0004487179487179487,
274
+ "loss": 2.2354,
275
+ "step": 2150
276
+ },
277
+ {
278
+ "epoch": 1.69,
279
+ "learning_rate": 0.0004358974358974359,
280
+ "loss": 2.2144,
281
+ "step": 2200
282
+ },
283
+ {
284
+ "epoch": 1.73,
285
+ "learning_rate": 0.0004230769230769231,
286
+ "loss": 2.2197,
287
+ "step": 2250
288
+ },
289
+ {
290
+ "epoch": 1.77,
291
+ "learning_rate": 0.00041025641025641023,
292
+ "loss": 2.2006,
293
+ "step": 2300
294
+ },
295
+ {
296
+ "epoch": 1.81,
297
+ "learning_rate": 0.0003974358974358974,
298
+ "loss": 2.1802,
299
+ "step": 2350
300
+ },
301
+ {
302
+ "epoch": 1.85,
303
+ "learning_rate": 0.00038461538461538467,
304
+ "loss": 2.1656,
305
+ "step": 2400
306
+ },
307
+ {
308
+ "epoch": 1.88,
309
+ "learning_rate": 0.0003717948717948718,
310
+ "loss": 2.1592,
311
+ "step": 2450
312
+ },
313
+ {
314
+ "epoch": 1.92,
315
+ "learning_rate": 0.000358974358974359,
316
+ "loss": 2.1847,
317
+ "step": 2500
318
+ },
319
+ {
320
+ "epoch": 1.96,
321
+ "learning_rate": 0.00034615384615384613,
322
+ "loss": 2.1981,
323
+ "step": 2550
324
+ },
325
+ {
326
+ "epoch": 2.0,
327
+ "learning_rate": 0.0003333333333333333,
328
+ "loss": 2.1768,
329
+ "step": 2600
330
+ },
331
+ {
332
+ "epoch": 2.0,
333
+ "eval_loss": 2.41491436958313,
334
+ "eval_runtime": 39.8227,
335
+ "eval_samples_per_second": 261.183,
336
+ "eval_steps_per_second": 8.186,
337
+ "step": 2600
338
+ },
339
+ {
340
+ "epoch": 2.04,
341
+ "learning_rate": 0.00032051282051282057,
342
+ "loss": 1.7593,
343
+ "step": 2650
344
+ },
345
+ {
346
+ "epoch": 2.08,
347
+ "learning_rate": 0.0003076923076923077,
348
+ "loss": 1.7591,
349
+ "step": 2700
350
+ },
351
+ {
352
+ "epoch": 2.11,
353
+ "learning_rate": 0.0002948717948717949,
354
+ "loss": 1.7473,
355
+ "step": 2750
356
+ },
357
+ {
358
+ "epoch": 2.15,
359
+ "learning_rate": 0.00028205128205128203,
360
+ "loss": 1.7836,
361
+ "step": 2800
362
+ },
363
+ {
364
+ "epoch": 2.19,
365
+ "learning_rate": 0.0002692307692307692,
366
+ "loss": 1.7636,
367
+ "step": 2850
368
+ },
369
+ {
370
+ "epoch": 2.23,
371
+ "learning_rate": 0.0002564102564102564,
372
+ "loss": 1.763,
373
+ "step": 2900
374
+ },
375
+ {
376
+ "epoch": 2.27,
377
+ "learning_rate": 0.0002435897435897436,
378
+ "loss": 1.7639,
379
+ "step": 2950
380
+ },
381
+ {
382
+ "epoch": 2.31,
383
+ "learning_rate": 0.0002307692307692308,
384
+ "loss": 1.791,
385
+ "step": 3000
386
+ },
387
+ {
388
+ "epoch": 2.35,
389
+ "learning_rate": 0.00021794871794871795,
390
+ "loss": 1.7802,
391
+ "step": 3050
392
+ },
393
+ {
394
+ "epoch": 2.38,
395
+ "learning_rate": 0.00020512820512820512,
396
+ "loss": 1.7899,
397
+ "step": 3100
398
+ },
399
+ {
400
+ "epoch": 2.42,
401
+ "learning_rate": 0.00019230769230769233,
402
+ "loss": 1.7601,
403
+ "step": 3150
404
+ },
405
+ {
406
+ "epoch": 2.46,
407
+ "learning_rate": 0.0001794871794871795,
408
+ "loss": 1.7696,
409
+ "step": 3200
410
+ },
411
+ {
412
+ "epoch": 2.5,
413
+ "learning_rate": 0.00016666666666666666,
414
+ "loss": 1.7586,
415
+ "step": 3250
416
+ },
417
+ {
418
+ "epoch": 2.54,
419
+ "learning_rate": 0.00015384615384615385,
420
+ "loss": 1.7312,
421
+ "step": 3300
422
+ },
423
+ {
424
+ "epoch": 2.58,
425
+ "learning_rate": 0.00014102564102564101,
426
+ "loss": 1.7353,
427
+ "step": 3350
428
+ },
429
+ {
430
+ "epoch": 2.61,
431
+ "learning_rate": 0.0001282051282051282,
432
+ "loss": 1.7453,
433
+ "step": 3400
434
+ },
435
+ {
436
+ "epoch": 2.65,
437
+ "learning_rate": 0.0001153846153846154,
438
+ "loss": 1.7397,
439
+ "step": 3450
440
+ },
441
+ {
442
+ "epoch": 2.69,
443
+ "learning_rate": 0.00010256410256410256,
444
+ "loss": 1.7529,
445
+ "step": 3500
446
+ },
447
+ {
448
+ "epoch": 2.73,
449
+ "learning_rate": 8.974358974358975e-05,
450
+ "loss": 1.7464,
451
+ "step": 3550
452
+ },
453
+ {
454
+ "epoch": 2.77,
455
+ "learning_rate": 7.692307692307693e-05,
456
+ "loss": 1.7271,
457
+ "step": 3600
458
+ },
459
+ {
460
+ "epoch": 2.81,
461
+ "learning_rate": 6.41025641025641e-05,
462
+ "loss": 1.7631,
463
+ "step": 3650
464
+ },
465
+ {
466
+ "epoch": 2.85,
467
+ "learning_rate": 5.128205128205128e-05,
468
+ "loss": 1.7462,
469
+ "step": 3700
470
+ },
471
+ {
472
+ "epoch": 2.88,
473
+ "learning_rate": 3.846153846153846e-05,
474
+ "loss": 1.7318,
475
+ "step": 3750
476
+ },
477
+ {
478
+ "epoch": 2.92,
479
+ "learning_rate": 2.564102564102564e-05,
480
+ "loss": 1.724,
481
+ "step": 3800
482
+ },
483
+ {
484
+ "epoch": 2.96,
485
+ "learning_rate": 1.282051282051282e-05,
486
+ "loss": 1.7065,
487
+ "step": 3850
488
+ },
489
+ {
490
+ "epoch": 3.0,
491
+ "learning_rate": 0.0,
492
+ "loss": 1.7189,
493
+ "step": 3900
494
+ },
495
+ {
496
+ "epoch": 3.0,
497
+ "eval_loss": 2.406733751296997,
498
+ "eval_runtime": 39.7653,
499
+ "eval_samples_per_second": 261.56,
500
+ "eval_steps_per_second": 8.198,
501
+ "step": 3900
502
+ }
503
+ ],
504
+ "logging_steps": 50,
505
+ "max_steps": 3900,
506
+ "num_train_epochs": 3,
507
+ "save_steps": 500,
508
+ "total_flos": 6647135438315520.0,
509
+ "trial_name": null,
510
+ "trial_params": null
511
+ }
checkpoint-3900/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5863f7260e5f2acd6ed9097371440ce6f89c0fdc0fbea3c0b7bd599e0244316f
3
+ size 4027
config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilgpt2",
3
+ "_num_labels": 1,
4
+ "activation_function": "gelu_new",
5
+ "architectures": [
6
+ "GPT2LMHeadModel"
7
+ ],
8
+ "attn_pdrop": 0.1,
9
+ "bos_token_id": 50256,
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "LABEL_0": 0
18
+ },
19
+ "layer_norm_epsilon": 1e-05,
20
+ "model_type": "gpt2",
21
+ "n_ctx": 1024,
22
+ "n_embd": 768,
23
+ "n_head": 12,
24
+ "n_inner": null,
25
+ "n_layer": 6,
26
+ "n_positions": 1024,
27
+ "reorder_and_upcast_attn": false,
28
+ "resid_pdrop": 0.1,
29
+ "scale_attn_by_inverse_layer_idx": false,
30
+ "scale_attn_weights": true,
31
+ "summary_activation": null,
32
+ "summary_first_dropout": 0.1,
33
+ "summary_proj_to_labels": true,
34
+ "summary_type": "cls_index",
35
+ "summary_use_proj": true,
36
+ "task_specific_params": {
37
+ "text-generation": {
38
+ "do_sample": true,
39
+ "max_length": 50
40
+ }
41
+ },
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.32.1",
44
+ "use_cache": true,
45
+ "vocab_size": 50257
46
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.32.1"
6
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d254aeaf5388d54112768c1e776c979981bbe1ff7708fe3e83c9c20870b05fa
3
+ size 327674773
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5863f7260e5f2acd6ed9097371440ce6f89c0fdc0fbea3c0b7bd599e0244316f
3
+ size 4027