AlekseyKorshuk committed on
Commit
22cf94e
1 Parent(s): 4dede35

huggingartists

Browse files
README.md CHANGED
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/bill-wurtz")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/1y01b0sy/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Bill Wurtz's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1l9hr5w2) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1l9hr5w2/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
 
45
  dataset = load_dataset("huggingartists/bill-wurtz")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/27ysbe74/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Bill Wurtz's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/2f8oa51l) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/2f8oa51l/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "huggingartists/bill-wurtz",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
@@ -18,7 +18,9 @@
18
  "n_inner": null,
19
  "n_layer": 12,
20
  "n_positions": 1024,
 
21
  "resid_pdrop": 0.1,
 
22
  "scale_attn_weights": true,
23
  "summary_activation": null,
24
  "summary_first_dropout": 0.1,
@@ -35,7 +37,7 @@
35
  }
36
  },
37
  "torch_dtype": "float32",
38
- "transformers_version": "4.9.2",
39
  "use_cache": true,
40
  "vocab_size": 50257
41
  }
 
1
  {
2
+ "_name_or_path": "bill-wurtz",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
 
18
  "n_inner": null,
19
  "n_layer": 12,
20
  "n_positions": 1024,
21
+ "reorder_and_upcast_attn": false,
22
  "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
  "scale_attn_weights": true,
25
  "summary_activation": null,
26
  "summary_first_dropout": 0.1,
 
37
  }
38
  },
39
  "torch_dtype": "float32",
40
+ "transformers_version": "4.16.2",
41
  "use_cache": true,
42
  "vocab_size": 50257
43
  }
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 4.039820671081543, "eval_runtime": 0.5683, "eval_samples_per_second": 36.954, "eval_steps_per_second": 5.279, "epoch": 22.0}
 
1
+ {"eval_loss": 2.747051954269409, "eval_runtime": 1.0203, "eval_samples_per_second": 19.602, "eval_steps_per_second": 2.94, "epoch": 6.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5eea9a74d99aea650690be8e5916b60d87d02d8e31dc45b59fa4bd1904a4b58
3
  size 497764120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:060e7080e916a0cfca49c43ca58ab8d2e2597dde7fe50e0b1d8ddb0095bf8b92
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa1422428e38213576ff235f2001350e546a4c4643f2b2f472d00667df459b33
3
  size 995603825
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:982717a4b3eb11fd324c00becfc94519651828fb480697cf1ef1e21682a7a1bc
3
  size 995603825
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63382be8feee6fe2b303cf897efe67e158593589b9171b749dab61667b6c81f2
3
  size 510403817
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d1d3bf79540744461b44cf0dadab89cabcbce43cfed8f94046bf29deb00e002
3
  size 510403817
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1d50a042836fb7244e3a57318b3664fd2cb3c84d1ecf895aeecf11cd286b9d1
3
  size 14567
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12e38e4c0903310b46906cc923622a2da9fcbd366af1ad3842402f38f5c0b222
3
  size 14567
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1aee7c44e1d1ea5ba4c4593abd0231fd0c216d063ddd82618d0f2d2c0e4a5dc
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed9457e8f7ccd2a262fcd727a62c61efaa77aaf95187fa3bb6da72e39564d99e
3
  size 623
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
trainer_state.json CHANGED
@@ -1,232 +1,54 @@
1
  {
2
- "best_metric": 2.6812992095947266,
3
- "best_model_checkpoint": "output/bill-wurtz/checkpoint-140",
4
- "epoch": 5.0,
5
- "global_step": 140,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.19,
12
- "learning_rate": 0.00012591446386292745,
13
- "loss": 3.5789,
14
  "step": 5
15
  },
16
  {
17
- "epoch": 0.37,
18
- "learning_rate": 9.577107195028616e-05,
19
- "loss": 3.3728,
20
  "step": 10
21
  },
22
  {
23
- "epoch": 0.56,
24
- "learning_rate": 5.668773501204858e-05,
25
- "loss": 3.2429,
26
  "step": 15
27
  },
28
  {
29
- "epoch": 0.74,
30
- "learning_rate": 2.152382364220488e-05,
31
- "loss": 3.2491,
32
  "step": 20
33
  },
34
  {
35
- "epoch": 0.93,
36
- "learning_rate": 1.849121878224087e-06,
37
- "loss": 3.1415,
38
  "step": 25
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_loss": 3.1433706283569336,
43
- "eval_runtime": 0.6351,
44
- "eval_samples_per_second": 42.515,
45
- "eval_steps_per_second": 6.299,
46
- "step": 27
47
- },
48
- {
49
- "epoch": 1.11,
50
- "learning_rate": 4.137086214086682e-06,
51
- "loss": 2.9586,
52
- "step": 30
53
- },
54
- {
55
- "epoch": 1.3,
56
- "learning_rate": 2.7634920609188867e-05,
57
- "loss": 3.0624,
58
- "step": 35
59
- },
60
- {
61
- "epoch": 1.48,
62
- "learning_rate": 6.461126473674133e-05,
63
- "loss": 2.9663,
64
- "step": 40
65
- },
66
- {
67
- "epoch": 1.67,
68
- "learning_rate": 0.00010290000000000001,
69
- "loss": 2.8978,
70
- "step": 45
71
- },
72
- {
73
- "epoch": 1.85,
74
- "learning_rate": 0.0001299031991261861,
75
- "loss": 2.9441,
76
- "step": 50
77
- },
78
- {
79
- "epoch": 2.0,
80
- "eval_loss": 3.056016445159912,
81
- "eval_runtime": 0.6316,
82
- "eval_samples_per_second": 42.746,
83
- "eval_steps_per_second": 6.333,
84
- "step": 54
85
- },
86
- {
87
- "epoch": 1.96,
88
- "learning_rate": 0.00013676865759867644,
89
- "loss": 2.5878,
90
- "step": 55
91
- },
92
- {
93
- "epoch": 2.0,
94
- "eval_loss": 2.762599468231201,
95
- "eval_runtime": 0.5151,
96
- "eval_samples_per_second": 40.769,
97
- "eval_steps_per_second": 5.824,
98
- "step": 56
99
- },
100
- {
101
- "epoch": 2.14,
102
- "learning_rate": 0.00013040646433810595,
103
- "loss": 2.8115,
104
- "step": 60
105
- },
106
- {
107
- "epoch": 2.32,
108
- "learning_rate": 0.00010509740044895205,
109
- "loss": 2.5949,
110
- "step": 65
111
- },
112
- {
113
- "epoch": 2.5,
114
- "learning_rate": 6.860000000000001e-05,
115
- "loss": 2.6643,
116
- "step": 70
117
- },
118
- {
119
- "epoch": 2.68,
120
- "learning_rate": 3.210259955104798e-05,
121
- "loss": 2.5837,
122
- "step": 75
123
- },
124
- {
125
- "epoch": 2.86,
126
- "learning_rate": 6.793535661894062e-06,
127
- "loss": 2.7461,
128
- "step": 80
129
- },
130
- {
131
- "epoch": 3.0,
132
- "eval_loss": 2.6894850730895996,
133
- "eval_runtime": 0.5306,
134
- "eval_samples_per_second": 39.579,
135
- "eval_steps_per_second": 5.654,
136
- "step": 84
137
- },
138
- {
139
- "epoch": 3.04,
140
- "learning_rate": 4.313424013235498e-07,
141
- "loss": 2.4693,
142
- "step": 85
143
- },
144
- {
145
- "epoch": 3.21,
146
- "learning_rate": 1.496636030269314e-05,
147
- "loss": 2.4734,
148
- "step": 90
149
- },
150
- {
151
- "epoch": 3.39,
152
- "learning_rate": 4.594285634987545e-05,
153
- "loss": 2.5555,
154
- "step": 95
155
- },
156
- {
157
- "epoch": 3.57,
158
- "learning_rate": 8.386493606940314e-05,
159
- "loss": 2.498,
160
- "step": 100
161
- },
162
- {
163
- "epoch": 3.75,
164
- "learning_rate": 0.0001171075251893971,
165
- "loss": 2.3626,
166
- "step": 105
167
- },
168
- {
169
- "epoch": 3.93,
170
- "learning_rate": 0.0001354800547756731,
171
- "loss": 2.4603,
172
- "step": 110
173
- },
174
- {
175
- "epoch": 4.0,
176
- "eval_loss": 2.6920154094696045,
177
- "eval_runtime": 0.5456,
178
- "eval_samples_per_second": 38.49,
179
- "eval_steps_per_second": 5.499,
180
- "step": 112
181
- },
182
- {
183
- "epoch": 4.11,
184
- "learning_rate": 0.00013335039645915404,
185
- "loss": 2.3237,
186
- "step": 115
187
- },
188
- {
189
- "epoch": 4.29,
190
- "learning_rate": 0.00011137140040750914,
191
- "loss": 2.2052,
192
- "step": 120
193
- },
194
- {
195
- "epoch": 4.46,
196
- "learning_rate": 7.62807630606869e-05,
197
- "loss": 2.2121,
198
- "step": 125
199
- },
200
- {
201
- "epoch": 4.64,
202
- "learning_rate": 3.883557549653544e-05,
203
- "loss": 2.2551,
204
- "step": 130
205
- },
206
- {
207
- "epoch": 4.82,
208
- "learning_rate": 1.0514719932939762e-05,
209
- "loss": 2.2849,
210
- "step": 135
211
- },
212
- {
213
- "epoch": 5.0,
214
- "learning_rate": 0.0,
215
- "loss": 2.2654,
216
- "step": 140
217
- },
218
- {
219
- "epoch": 5.0,
220
- "eval_loss": 2.6812992095947266,
221
- "eval_runtime": 0.5264,
222
- "eval_samples_per_second": 39.891,
223
- "eval_steps_per_second": 5.699,
224
- "step": 140
225
  }
226
  ],
227
- "max_steps": 616,
228
- "num_train_epochs": 22,
229
- "total_flos": 145278369792000.0,
230
  "trial_name": null,
231
  "trial_params": null
232
  }
 
1
  {
2
+ "best_metric": 2.747051954269409,
3
+ "best_model_checkpoint": "output/bill-wurtz/checkpoint-28",
4
+ "epoch": 1.0,
5
+ "global_step": 28,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.18,
12
+ "learning_rate": 0.00012668528006706036,
13
+ "loss": 2.3906,
14
  "step": 5
15
  },
16
  {
17
+ "epoch": 0.36,
18
+ "learning_rate": 9.836442450346464e-05,
19
+ "loss": 2.3509,
20
  "step": 10
21
  },
22
  {
23
+ "epoch": 0.54,
24
+ "learning_rate": 6.091923693931308e-05,
25
+ "loss": 2.3424,
26
  "step": 15
27
  },
28
  {
29
+ "epoch": 0.71,
30
+ "learning_rate": 2.582859959249092e-05,
31
+ "loss": 2.4842,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 0.89,
36
+ "learning_rate": 3.849603540845946e-06,
37
+ "loss": 2.52,
38
  "step": 25
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_loss": 2.747051954269409,
43
+ "eval_runtime": 0.9153,
44
+ "eval_samples_per_second": 21.851,
45
+ "eval_steps_per_second": 3.278,
46
+ "step": 28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  }
48
  ],
49
+ "max_steps": 168,
50
+ "num_train_epochs": 6,
51
+ "total_flos": 29264707584000.0,
52
  "trial_name": null,
53
  "trial_params": null
54
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41eee99e83d6fdcd4ae4efff4b1f5e17120c0670785282724daf91004a53de18
3
- size 2671
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1417a02d3efa067f099a494a6c3d4dbe9db44852a2687418d8cd162ecbd5d91a
3
+ size 3055