AlekseyKorshuk commited on
Commit
1caa528
1 Parent(s): 59ed122

huggingartists

Browse files
README.md CHANGED
@@ -14,11 +14,11 @@ widget:
14
  <div class="inline-flex flex-col" style="line-height: 1.5;">
15
  <div class="flex">
16
  <div
17
- style="display:DISPLAY_1; margin-left: auto; margin-right: auto; width: 92px; height:92px; border-radius: 50%; background-size: cover; background-image: url(&#39;https://images.genius.com/10795217955d95e2543993f8e83fe5c8.960x960x1.jpg&#39;)">
18
  </div>
19
  </div>
20
  <div style="text-align: center; margin-top: 3px; font-size: 16px; font-weight: 800">🤖 HuggingArtists Model 🤖</div>
21
- <div style="text-align: center; font-size: 16px; font-weight: 800">MiyaGi</div>
22
  <a href="https://genius.com/artists/miyagi">
23
  <div style="text-align: center; font-size: 14px;">@miyagi</div>
24
  </a>
@@ -34,7 +34,7 @@ To understand how the model was developed, check the [W&B report](https://wandb.
34
 
35
  ## Training data
36
 
37
- The model was trained on lyrics from MiyaGi.
38
 
39
  Dataset is available [here](https://huggingface.co/datasets/huggingartists/miyagi).
40
  And can be used with:
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/miyagi")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/1ai9l9x0/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
- The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on MiyaGi's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1jowduev) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1jowduev/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
14
  <div class="inline-flex flex-col" style="line-height: 1.5;">
15
  <div class="flex">
16
  <div
17
+ style="display:DISPLAY_1; margin-left: auto; margin-right: auto; width: 92px; height:92px; border-radius: 50%; background-size: cover; background-image: url(&#39;https://images.genius.com/b6e783ce8d8c51516715e291dbc87535.1000x1000x1.jpg&#39;)">
18
  </div>
19
  </div>
20
  <div style="text-align: center; margin-top: 3px; font-size: 16px; font-weight: 800">🤖 HuggingArtists Model 🤖</div>
21
+ <div style="text-align: center; font-size: 16px; font-weight: 800">Miyagi</div>
22
  <a href="https://genius.com/artists/miyagi">
23
  <div style="text-align: center; font-size: 14px;">@miyagi</div>
24
  </a>
34
 
35
  ## Training data
36
 
37
+ The model was trained on lyrics from Miyagi.
38
 
39
  Dataset is available [here](https://huggingface.co/datasets/huggingartists/miyagi).
40
  And can be used with:
45
  dataset = load_dataset("huggingartists/miyagi")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/1c4sny4a/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
+ The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Miyagi's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1v51pw0u) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1v51pw0u/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "huggingartists/miyagi",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
@@ -18,7 +18,9 @@
18
  "n_inner": null,
19
  "n_layer": 12,
20
  "n_positions": 1024,
 
21
  "resid_pdrop": 0.1,
 
22
  "scale_attn_weights": true,
23
  "summary_activation": null,
24
  "summary_first_dropout": 0.1,
@@ -35,7 +37,7 @@
35
  }
36
  },
37
  "torch_dtype": "float32",
38
- "transformers_version": "4.10.2",
39
  "use_cache": true,
40
  "vocab_size": 50257
41
  }
1
  {
2
+ "_name_or_path": "miyagi",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
18
  "n_inner": null,
19
  "n_layer": 12,
20
  "n_positions": 1024,
21
+ "reorder_and_upcast_attn": false,
22
  "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
  "scale_attn_weights": true,
25
  "summary_activation": null,
26
  "summary_first_dropout": 0.1,
37
  }
38
  },
39
  "torch_dtype": "float32",
40
+ "transformers_version": "4.20.1",
41
  "use_cache": true,
42
  "vocab_size": 50257
43
  }
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 1.8749154806137085, "eval_runtime": 7.3205, "eval_samples_per_second": 21.993, "eval_steps_per_second": 2.869, "epoch": 2.0}
1
+ {"eval_loss": 1.5914676189422607, "eval_runtime": 4.1139, "eval_samples_per_second": 45.699, "eval_steps_per_second": 5.834, "epoch": 7.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc6c27a566d394fbdfff2c02e91b8c4f7dcbbda449ff9001dad2a4e5e9c49d15
3
  size 497764120
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a88522fd7fb4c4615bae3c3d07e5ded91b0bf21a5dfd66ca3f85be5209139b8d
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b786b373617192574138bda4ac4b3640a92f885392072876141d13ed205ff93f
3
- size 995603825
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:265731eee23bd3d189fe5369a563077fb3d0384be204f28434a5d57192049a13
3
+ size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7efb14a7343abd44b17d9c21ac3a67b41d36a9948a87d8429abc40754990793b
3
- size 510403817
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6add507eb308c14d70e52641a1608fcffee1c65638446eb45d1430a306b21d1
3
+ size 510396521
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c04cc128afd4b444801d7a22ed3a8e15a0e9e121067a7a197e91e35268473f22
3
  size 14567
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3980c79e66b16ab3e203128b955ee2dda43c76a8327ed0967d73839d2be5a4cc
3
  size 14567
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34ccf3854b39bd52f430cb63b3a54f30c543980a0b5913372cb9a1e99f761d9b
3
  size 623
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11c7b09b01cedaa5d1e45d7238ddd1cb693755cdfd02e047b278bc47fef3bc66
3
  size 623
special_tokens_map.json CHANGED
@@ -1 +1,5 @@
1
- {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
5
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
tokenizer_config.json CHANGED
@@ -1 +1,10 @@
1
- {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "huggingartists/miyagi", "tokenizer_class": "GPT2Tokenizer"}
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "huggingartists/miyagi",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 1.8749154806137085,
3
- "best_model_checkpoint": "output/miyagi/checkpoint-242",
4
- "epoch": 2.0,
5
- "global_step": 242,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -318,11 +318,753 @@
318
  "eval_samples_per_second": 22.555,
319
  "eval_steps_per_second": 2.942,
320
  "step": 242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  }
322
  ],
323
- "max_steps": 242,
324
- "num_train_epochs": 2,
325
- "total_flos": 252146810880000.0,
326
  "trial_name": null,
327
  "trial_params": null
328
  }
1
  {
2
+ "best_metric": 1.5914676189422607,
3
+ "best_model_checkpoint": "output/miyagi/checkpoint-826",
4
+ "epoch": 7.0,
5
+ "global_step": 826,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
318
  "eval_samples_per_second": 22.555,
319
  "eval_steps_per_second": 2.942,
320
  "step": 242
321
+ },
322
+ {
323
+ "epoch": 2.08,
324
+ "learning_rate": 0.00013524009067795913,
325
+ "loss": 1.7074,
326
+ "step": 245
327
+ },
328
+ {
329
+ "epoch": 2.12,
330
+ "learning_rate": 0.00013248966177323044,
331
+ "loss": 1.7453,
332
+ "step": 250
333
+ },
334
+ {
335
+ "epoch": 2.16,
336
+ "learning_rate": 0.00012860874759889254,
337
+ "loss": 1.8948,
338
+ "step": 255
339
+ },
340
+ {
341
+ "epoch": 2.2,
342
+ "learning_rate": 0.00012366601836206413,
343
+ "loss": 1.9676,
344
+ "step": 260
345
+ },
346
+ {
347
+ "epoch": 2.25,
348
+ "learning_rate": 0.00011774893238446447,
349
+ "loss": 1.7939,
350
+ "step": 265
351
+ },
352
+ {
353
+ "epoch": 2.29,
354
+ "learning_rate": 0.00011096218858530879,
355
+ "loss": 1.8352,
356
+ "step": 270
357
+ },
358
+ {
359
+ "epoch": 2.33,
360
+ "learning_rate": 0.00010342587390324441,
361
+ "loss": 2.0245,
362
+ "step": 275
363
+ },
364
+ {
365
+ "epoch": 2.37,
366
+ "learning_rate": 9.527333843746984e-05,
367
+ "loss": 1.9197,
368
+ "step": 280
369
+ },
370
+ {
371
+ "epoch": 2.42,
372
+ "learning_rate": 8.664883590600801e-05,
373
+ "loss": 1.9335,
374
+ "step": 285
375
+ },
376
+ {
377
+ "epoch": 2.46,
378
+ "learning_rate": 7.77049711716633e-05,
379
+ "loss": 1.875,
380
+ "step": 290
381
+ },
382
+ {
383
+ "epoch": 2.5,
384
+ "learning_rate": 6.860000000000001e-05,
385
+ "loss": 1.8544,
386
+ "step": 295
387
+ },
388
+ {
389
+ "epoch": 2.54,
390
+ "learning_rate": 5.949502882833675e-05,
391
+ "loss": 1.9954,
392
+ "step": 300
393
+ },
394
+ {
395
+ "epoch": 2.58,
396
+ "learning_rate": 5.055116409399204e-05,
397
+ "loss": 1.8779,
398
+ "step": 305
399
+ },
400
+ {
401
+ "epoch": 2.63,
402
+ "learning_rate": 4.192666156253025e-05,
403
+ "loss": 1.8122,
404
+ "step": 310
405
+ },
406
+ {
407
+ "epoch": 2.67,
408
+ "learning_rate": 3.377412609675556e-05,
409
+ "loss": 1.8168,
410
+ "step": 315
411
+ },
412
+ {
413
+ "epoch": 2.71,
414
+ "learning_rate": 2.6237811414691256e-05,
415
+ "loss": 1.7168,
416
+ "step": 320
417
+ },
418
+ {
419
+ "epoch": 2.75,
420
+ "learning_rate": 1.9451067615535547e-05,
421
+ "loss": 1.7654,
422
+ "step": 325
423
+ },
424
+ {
425
+ "epoch": 2.8,
426
+ "learning_rate": 1.3533981637935892e-05,
427
+ "loss": 1.8255,
428
+ "step": 330
429
+ },
430
+ {
431
+ "epoch": 2.84,
432
+ "learning_rate": 8.591252401107479e-06,
433
+ "loss": 1.8593,
434
+ "step": 335
435
+ },
436
+ {
437
+ "epoch": 2.88,
438
+ "learning_rate": 4.710338226769622e-06,
439
+ "loss": 1.7707,
440
+ "step": 340
441
+ },
442
+ {
443
+ "epoch": 2.92,
444
+ "learning_rate": 1.959909322040904e-06,
445
+ "loss": 1.7329,
446
+ "step": 345
447
+ },
448
+ {
449
+ "epoch": 2.97,
450
+ "learning_rate": 3.886327055845878e-07,
451
+ "loss": 1.8968,
452
+ "step": 350
453
+ },
454
+ {
455
+ "epoch": 3.0,
456
+ "eval_loss": 1.6329461336135864,
457
+ "eval_runtime": 4.0979,
458
+ "eval_samples_per_second": 45.877,
459
+ "eval_steps_per_second": 5.857,
460
+ "step": 354
461
+ },
462
+ {
463
+ "epoch": 3.01,
464
+ "learning_rate": 2.4311076931149823e-08,
465
+ "loss": 1.7177,
466
+ "step": 355
467
+ },
468
+ {
469
+ "epoch": 3.05,
470
+ "learning_rate": 8.733908661157559e-07,
471
+ "loss": 1.8309,
472
+ "step": 360
473
+ },
474
+ {
475
+ "epoch": 3.09,
476
+ "learning_rate": 2.920848168366426e-06,
477
+ "loss": 1.6587,
478
+ "step": 365
479
+ },
480
+ {
481
+ "epoch": 3.14,
482
+ "learning_rate": 6.130454582152937e-06,
483
+ "loss": 1.8153,
484
+ "step": 370
485
+ },
486
+ {
487
+ "epoch": 3.18,
488
+ "learning_rate": 1.044541824676853e-05,
489
+ "loss": 1.6569,
490
+ "step": 375
491
+ },
492
+ {
493
+ "epoch": 3.22,
494
+ "learning_rate": 1.5789388736708423e-05,
495
+ "loss": 1.7716,
496
+ "step": 380
497
+ },
498
+ {
499
+ "epoch": 3.26,
500
+ "learning_rate": 2.206780803190438e-05,
501
+ "loss": 1.7591,
502
+ "step": 385
503
+ },
504
+ {
505
+ "epoch": 3.31,
506
+ "learning_rate": 2.9169583659291692e-05,
507
+ "loss": 1.8322,
508
+ "step": 390
509
+ },
510
+ {
511
+ "epoch": 3.35,
512
+ "learning_rate": 3.696905440057621e-05,
513
+ "loss": 1.5984,
514
+ "step": 395
515
+ },
516
+ {
517
+ "epoch": 3.39,
518
+ "learning_rate": 4.5328213784303035e-05,
519
+ "loss": 1.7572,
520
+ "step": 400
521
+ },
522
+ {
523
+ "epoch": 3.43,
524
+ "learning_rate": 5.4099152019007745e-05,
525
+ "loss": 1.8058,
526
+ "step": 405
527
+ },
528
+ {
529
+ "epoch": 3.47,
530
+ "learning_rate": 6.312667315905842e-05,
531
+ "loss": 1.9387,
532
+ "step": 410
533
+ },
534
+ {
535
+ "epoch": 3.52,
536
+ "learning_rate": 7.225104119417345e-05,
537
+ "loss": 1.9436,
538
+ "step": 415
539
+ },
540
+ {
541
+ "epoch": 3.56,
542
+ "learning_rate": 8.1310806472376e-05,
543
+ "loss": 1.7717,
544
+ "step": 420
545
+ },
546
+ {
547
+ "epoch": 3.6,
548
+ "learning_rate": 9.01456624447057e-05,
549
+ "loss": 1.6776,
550
+ "step": 425
551
+ },
552
+ {
553
+ "epoch": 3.64,
554
+ "learning_rate": 9.859928218347747e-05,
555
+ "loss": 1.7468,
556
+ "step": 430
557
+ },
558
+ {
559
+ "epoch": 3.69,
560
+ "learning_rate": 0.0001065220844837778,
561
+ "loss": 1.7603,
562
+ "step": 435
563
+ },
564
+ {
565
+ "epoch": 3.73,
566
+ "learning_rate": 0.00011377388060386165,
567
+ "loss": 1.7576,
568
+ "step": 440
569
+ },
570
+ {
571
+ "epoch": 3.77,
572
+ "learning_rate": 0.00012022635481213106,
573
+ "loss": 1.8352,
574
+ "step": 445
575
+ },
576
+ {
577
+ "epoch": 3.81,
578
+ "learning_rate": 0.00012576533484906052,
579
+ "loss": 1.748,
580
+ "step": 450
581
+ },
582
+ {
583
+ "epoch": 3.86,
584
+ "learning_rate": 0.00013029281212974562,
585
+ "loss": 1.6407,
586
+ "step": 455
587
+ },
588
+ {
589
+ "epoch": 3.9,
590
+ "learning_rate": 0.00013372867594093092,
591
+ "loss": 1.7799,
592
+ "step": 460
593
+ },
594
+ {
595
+ "epoch": 3.94,
596
+ "learning_rate": 0.00013601213094704693,
597
+ "loss": 1.8188,
598
+ "step": 465
599
+ },
600
+ {
601
+ "epoch": 3.98,
602
+ "learning_rate": 0.00013710277292342587,
603
+ "loss": 1.7514,
604
+ "step": 470
605
+ },
606
+ {
607
+ "epoch": 4.0,
608
+ "eval_loss": 1.6437361240386963,
609
+ "eval_runtime": 4.1027,
610
+ "eval_samples_per_second": 45.824,
611
+ "eval_steps_per_second": 5.85,
612
+ "step": 472
613
+ },
614
+ {
615
+ "epoch": 4.03,
616
+ "learning_rate": 0.00013698130368230946,
617
+ "loss": 1.8553,
618
+ "step": 475
619
+ },
620
+ {
621
+ "epoch": 4.07,
622
+ "learning_rate": 0.00013564987254150566,
623
+ "loss": 1.677,
624
+ "step": 480
625
+ },
626
+ {
627
+ "epoch": 4.11,
628
+ "learning_rate": 0.00013313203829363288,
629
+ "loss": 1.6993,
630
+ "step": 485
631
+ },
632
+ {
633
+ "epoch": 4.15,
634
+ "learning_rate": 0.00012947235234888086,
635
+ "loss": 1.7383,
636
+ "step": 490
637
+ },
638
+ {
639
+ "epoch": 4.19,
640
+ "learning_rate": 0.00012473557042730042,
641
+ "loss": 1.7513,
642
+ "step": 495
643
+ },
644
+ {
645
+ "epoch": 4.24,
646
+ "learning_rate": 0.00011900550674920642,
647
+ "loss": 1.6364,
648
+ "step": 500
649
+ },
650
+ {
651
+ "epoch": 4.28,
652
+ "learning_rate": 0.00011238355099803469,
653
+ "loss": 1.6798,
654
+ "step": 505
655
+ },
656
+ {
657
+ "epoch": 4.32,
658
+ "learning_rate": 0.00010498687429701432,
659
+ "loss": 1.7396,
660
+ "step": 510
661
+ },
662
+ {
663
+ "epoch": 4.36,
664
+ "learning_rate": 9.694635594371065e-05,
665
+ "loss": 1.5501,
666
+ "step": 515
667
+ },
668
+ {
669
+ "epoch": 4.41,
670
+ "learning_rate": 8.840426758749807e-05,
671
+ "loss": 1.6678,
672
+ "step": 520
673
+ },
674
+ {
675
+ "epoch": 4.45,
676
+ "learning_rate": 7.951175582690834e-05,
677
+ "loss": 1.703,
678
+ "step": 525
679
+ },
680
+ {
681
+ "epoch": 4.49,
682
+ "learning_rate": 7.042616777063153e-05,
683
+ "loss": 1.7374,
684
+ "step": 530
685
+ },
686
+ {
687
+ "epoch": 4.53,
688
+ "learning_rate": 6.130826688459083e-05,
689
+ "loss": 1.7254,
690
+ "step": 535
691
+ },
692
+ {
693
+ "epoch": 4.58,
694
+ "learning_rate": 5.231938838884156e-05,
695
+ "loss": 1.6928,
696
+ "step": 540
697
+ },
698
+ {
699
+ "epoch": 4.62,
700
+ "learning_rate": 4.361858453765647e-05,
701
+ "loss": 1.742,
702
+ "step": 545
703
+ },
704
+ {
705
+ "epoch": 4.66,
706
+ "learning_rate": 3.535981029518021e-05,
707
+ "loss": 1.6946,
708
+ "step": 550
709
+ },
710
+ {
711
+ "epoch": 4.7,
712
+ "learning_rate": 2.768919920425875e-05,
713
+ "loss": 1.6874,
714
+ "step": 555
715
+ },
716
+ {
717
+ "epoch": 4.75,
718
+ "learning_rate": 2.0742477650140126e-05,
719
+ "loss": 1.7176,
720
+ "step": 560
721
+ },
722
+ {
723
+ "epoch": 4.79,
724
+ "learning_rate": 1.464256327193875e-05,
725
+ "loss": 1.6007,
726
+ "step": 565
727
+ },
728
+ {
729
+ "epoch": 4.83,
730
+ "learning_rate": 9.497390016384942e-06,
731
+ "loss": 1.7331,
732
+ "step": 570
733
+ },
734
+ {
735
+ "epoch": 4.87,
736
+ "learning_rate": 5.397998318089678e-06,
737
+ "loss": 1.4962,
738
+ "step": 575
739
+ },
740
+ {
741
+ "epoch": 4.92,
742
+ "learning_rate": 2.416924199324192e-06,
743
+ "loss": 1.6863,
744
+ "step": 580
745
+ },
746
+ {
747
+ "epoch": 4.96,
748
+ "learning_rate": 6.069157931251217e-07,
749
+ "loss": 1.7319,
750
+ "step": 585
751
+ },
752
+ {
753
+ "epoch": 5.0,
754
+ "learning_rate": 0.0,
755
+ "loss": 1.6369,
756
+ "step": 590
757
+ },
758
+ {
759
+ "epoch": 5.0,
760
+ "eval_loss": 1.601211667060852,
761
+ "eval_runtime": 4.0987,
762
+ "eval_samples_per_second": 45.868,
763
+ "eval_steps_per_second": 5.856,
764
+ "step": 590
765
+ },
766
+ {
767
+ "epoch": 5.04,
768
+ "learning_rate": 6.06915793125114e-07,
769
+ "loss": 1.6055,
770
+ "step": 595
771
+ },
772
+ {
773
+ "epoch": 5.08,
774
+ "learning_rate": 2.416924199324169e-06,
775
+ "loss": 1.5901,
776
+ "step": 600
777
+ },
778
+ {
779
+ "epoch": 5.13,
780
+ "learning_rate": 5.39799831808964e-06,
781
+ "loss": 1.6282,
782
+ "step": 605
783
+ },
784
+ {
785
+ "epoch": 5.17,
786
+ "learning_rate": 9.497390016384903e-06,
787
+ "loss": 1.6198,
788
+ "step": 610
789
+ },
790
+ {
791
+ "epoch": 5.21,
792
+ "learning_rate": 1.464256327193862e-05,
793
+ "loss": 1.6603,
794
+ "step": 615
795
+ },
796
+ {
797
+ "epoch": 5.25,
798
+ "learning_rate": 2.074247765013998e-05,
799
+ "loss": 1.6531,
800
+ "step": 620
801
+ },
802
+ {
803
+ "epoch": 5.3,
804
+ "learning_rate": 2.768919920425878e-05,
805
+ "loss": 1.5269,
806
+ "step": 625
807
+ },
808
+ {
809
+ "epoch": 5.34,
810
+ "learning_rate": 3.535981029518024e-05,
811
+ "loss": 1.6306,
812
+ "step": 630
813
+ },
814
+ {
815
+ "epoch": 5.38,
816
+ "learning_rate": 4.3618584537656514e-05,
817
+ "loss": 1.4954,
818
+ "step": 635
819
+ },
820
+ {
821
+ "epoch": 5.42,
822
+ "learning_rate": 5.231938838884147e-05,
823
+ "loss": 1.6128,
824
+ "step": 640
825
+ },
826
+ {
827
+ "epoch": 5.47,
828
+ "learning_rate": 6.130826688459075e-05,
829
+ "loss": 1.7703,
830
+ "step": 645
831
+ },
832
+ {
833
+ "epoch": 5.51,
834
+ "learning_rate": 7.042616777063145e-05,
835
+ "loss": 1.6367,
836
+ "step": 650
837
+ },
838
+ {
839
+ "epoch": 5.55,
840
+ "learning_rate": 7.951175582690827e-05,
841
+ "loss": 1.6559,
842
+ "step": 655
843
+ },
844
+ {
845
+ "epoch": 5.59,
846
+ "learning_rate": 8.8404267587498e-05,
847
+ "loss": 1.646,
848
+ "step": 660
849
+ },
850
+ {
851
+ "epoch": 5.64,
852
+ "learning_rate": 9.694635594371057e-05,
853
+ "loss": 1.6984,
854
+ "step": 665
855
+ },
856
+ {
857
+ "epoch": 5.68,
858
+ "learning_rate": 0.00010498687429701424,
859
+ "loss": 1.6156,
860
+ "step": 670
861
+ },
862
+ {
863
+ "epoch": 5.72,
864
+ "learning_rate": 0.00011238355099803463,
865
+ "loss": 1.6668,
866
+ "step": 675
867
+ },
868
+ {
869
+ "epoch": 5.76,
870
+ "learning_rate": 0.00011900550674920627,
871
+ "loss": 1.7146,
872
+ "step": 680
873
+ },
874
+ {
875
+ "epoch": 5.81,
876
+ "learning_rate": 0.00012473557042730032,
877
+ "loss": 1.5721,
878
+ "step": 685
879
+ },
880
+ {
881
+ "epoch": 5.85,
882
+ "learning_rate": 0.00012947235234888078,
883
+ "loss": 1.6113,
884
+ "step": 690
885
+ },
886
+ {
887
+ "epoch": 5.89,
888
+ "learning_rate": 0.00013313203829363288,
889
+ "loss": 1.6012,
890
+ "step": 695
891
+ },
892
+ {
893
+ "epoch": 5.93,
894
+ "learning_rate": 0.00013564987254150568,
895
+ "loss": 1.54,
896
+ "step": 700
897
+ },
898
+ {
899
+ "epoch": 5.97,
900
+ "learning_rate": 0.00013698130368230946,
901
+ "loss": 1.5771,
902
+ "step": 705
903
+ },
904
+ {
905
+ "epoch": 6.0,
906
+ "eval_loss": 1.6223803758621216,
907
+ "eval_runtime": 4.1028,
908
+ "eval_samples_per_second": 45.822,
909
+ "eval_steps_per_second": 5.85,
910
+ "step": 708
911
+ },
912
+ {
913
+ "epoch": 6.02,
914
+ "learning_rate": 0.00013710277292342587,
915
+ "loss": 1.6545,
916
+ "step": 710
917
+ },
918
+ {
919
+ "epoch": 6.06,
920
+ "learning_rate": 0.00013601213094704693,
921
+ "loss": 1.6159,
922
+ "step": 715
923
+ },
924
+ {
925
+ "epoch": 6.1,
926
+ "learning_rate": 0.000133728675940931,
927
+ "loss": 1.5379,
928
+ "step": 720
929
+ },
930
+ {
931
+ "epoch": 6.14,
932
+ "learning_rate": 0.00013029281212974567,
933
+ "loss": 1.5675,
934
+ "step": 725
935
+ },
936
+ {
937
+ "epoch": 6.19,
938
+ "learning_rate": 0.00012576533484906052,
939
+ "loss": 1.5401,
940
+ "step": 730
941
+ },
942
+ {
943
+ "epoch": 6.23,
944
+ "learning_rate": 0.0001202263548121312,
945
+ "loss": 1.7157,
946
+ "step": 735
947
+ },
948
+ {
949
+ "epoch": 6.27,
950
+ "learning_rate": 0.00011377388060386172,
951
+ "loss": 1.6472,
952
+ "step": 740
953
+ },
954
+ {
955
+ "epoch": 6.31,
956
+ "learning_rate": 0.00010652208448377808,
957
+ "loss": 1.6329,
958
+ "step": 745
959
+ },
960
+ {
961
+ "epoch": 6.36,
962
+ "learning_rate": 9.859928218347764e-05,
963
+ "loss": 1.6732,
964
+ "step": 750
965
+ },
966
+ {
967
+ "epoch": 6.4,
968
+ "learning_rate": 9.014566244470579e-05,
969
+ "loss": 1.5562,
970
+ "step": 755
971
+ },
972
+ {
973
+ "epoch": 6.44,
974
+ "learning_rate": 8.131080647237608e-05,
975
+ "loss": 1.669,
976
+ "step": 760
977
+ },
978
+ {
979
+ "epoch": 6.48,
980
+ "learning_rate": 7.225104119417342e-05,
981
+ "loss": 1.5005,
982
+ "step": 765
983
+ },
984
+ {
985
+ "epoch": 6.53,
986
+ "learning_rate": 6.31266731590584e-05,
987
+ "loss": 1.5282,
988
+ "step": 770
989
+ },
990
+ {
991
+ "epoch": 6.57,
992
+ "learning_rate": 5.4099152019007833e-05,
993
+ "loss": 1.5426,
994
+ "step": 775
995
+ },
996
+ {
997
+ "epoch": 6.61,
998
+ "learning_rate": 4.532821378430311e-05,
999
+ "loss": 1.5988,
1000
+ "step": 780
1001
+ },
1002
+ {
1003
+ "epoch": 6.65,
1004
+ "learning_rate": 3.696905440057639e-05,
1005
+ "loss": 1.4735,
1006
+ "step": 785
1007
+ },
1008
+ {
1009
+ "epoch": 6.69,
1010
+ "learning_rate": 2.916958365929176e-05,
1011
+ "loss": 1.561,
1012
+ "step": 790
1013
+ },
1014
+ {
1015
+ "epoch": 6.74,
1016
+ "learning_rate": 2.206780803190435e-05,
1017
+ "loss": 1.5727,
1018
+ "step": 795
1019
+ },
1020
+ {
1021
+ "epoch": 6.78,
1022
+ "learning_rate": 1.578938873670855e-05,
1023
+ "loss": 1.5714,
1024
+ "step": 800
1025
+ },
1026
+ {
1027
+ "epoch": 6.82,
1028
+ "learning_rate": 1.0445418246768637e-05,
1029
+ "loss": 1.3599,
1030
+ "step": 805
1031
+ },
1032
+ {
1033
+ "epoch": 6.86,
1034
+ "learning_rate": 6.130454582152975e-06,
1035
+ "loss": 1.4926,
1036
+ "step": 810
1037
+ },
1038
+ {
1039
+ "epoch": 6.91,
1040
+ "learning_rate": 2.9208481683664865e-06,
1041
+ "loss": 1.4652,
1042
+ "step": 815
1043
+ },
1044
+ {
1045
+ "epoch": 6.95,
1046
+ "learning_rate": 8.733908661157864e-07,
1047
+ "loss": 1.6516,
1048
+ "step": 820
1049
+ },
1050
+ {
1051
+ "epoch": 6.99,
1052
+ "learning_rate": 2.4311076931157437e-08,
1053
+ "loss": 1.6115,
1054
+ "step": 825
1055
+ },
1056
+ {
1057
+ "epoch": 7.0,
1058
+ "eval_loss": 1.5914676189422607,
1059
+ "eval_runtime": 4.1088,
1060
+ "eval_samples_per_second": 45.755,
1061
+ "eval_steps_per_second": 5.841,
1062
+ "step": 826
1063
  }
1064
  ],
1065
+ "max_steps": 826,
1066
+ "num_train_epochs": 7,
1067
+ "total_flos": 859258847232000.0,
1068
  "trial_name": null,
1069
  "trial_params": null
1070
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fea27806c0a8b58c971861b19a8e6a16537326283f0b22c46854ba1150e10ed
3
- size 2671
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd1782f3f75c74631b3cd9920d85b1f04eebc2f2a023905946a75841206980fa
3
+ size 3311