AlekseyKorshuk committed on
Commit
0f9a51e
1 Parent(s): 3062cd9

huggingartists

Browse files
README.md CHANGED
@@ -14,7 +14,7 @@ widget:
14
  <div class="inline-flex flex-col" style="line-height: 1.5;">
15
  <div class="flex">
16
  <div
17
- style="display:DISPLAY_1; margin-left: auto; margin-right: auto; width: 92px; height:92px; border-radius: 50%; background-size: cover; background-image: url(&#39;https://images.genius.com/a23400b7447be0fb91c3c4c839c6efe7.442x442x1.jpg&#39;)">
18
  </div>
19
  </div>
20
  <div style="text-align: center; margin-top: 3px; font-size: 16px; font-weight: 800">🤖 HuggingArtists Model 🤖</div>
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/rihanna")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/3g4a2qa9/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Rihanna's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/3du2fhxz) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/3du2fhxz/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
14
  <div class="inline-flex flex-col" style="line-height: 1.5;">
15
  <div class="flex">
16
  <div
17
+ style="display:DISPLAY_1; margin-left: auto; margin-right: auto; width: 92px; height:92px; border-radius: 50%; background-size: cover; background-image: url(&#39;https://images.genius.com/f83548d76e427d0a4fdcafdf2f62b647.1000x1000x1.png&#39;)">
18
  </div>
19
  </div>
20
  <div style="text-align: center; margin-top: 3px; font-size: 16px; font-weight: 800">🤖 HuggingArtists Model 🤖</div>
45
  dataset = load_dataset("huggingartists/rihanna")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/3c5muzh8/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Rihanna's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/uywap06b) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/uywap06b/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "huggingartists/rihanna",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
@@ -18,7 +18,9 @@
18
  "n_inner": null,
19
  "n_layer": 12,
20
  "n_positions": 1024,
 
21
  "resid_pdrop": 0.1,
 
22
  "scale_attn_weights": true,
23
  "summary_activation": null,
24
  "summary_first_dropout": 0.1,
@@ -35,7 +37,7 @@
35
  }
36
  },
37
  "torch_dtype": "float32",
38
- "transformers_version": "4.9.2",
39
  "use_cache": true,
40
  "vocab_size": 50257
41
  }
1
  {
2
+ "_name_or_path": "rihanna",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
18
  "n_inner": null,
19
  "n_layer": 12,
20
  "n_positions": 1024,
21
+ "reorder_and_upcast_attn": false,
22
  "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
  "scale_attn_weights": true,
25
  "summary_activation": null,
26
  "summary_first_dropout": 0.1,
37
  }
38
  },
39
  "torch_dtype": "float32",
40
+ "transformers_version": "4.20.0",
41
  "use_cache": true,
42
  "vocab_size": 50257
43
  }
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 2.237405776977539, "eval_runtime": 8.4305, "eval_samples_per_second": 22.181, "eval_steps_per_second": 2.847, "epoch": 2.0}
1
+ {"eval_loss": 1.752521276473999, "eval_runtime": 4.2607, "eval_samples_per_second": 45.532, "eval_steps_per_second": 5.868, "epoch": 7.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd6be841db360da1a00ad23a76226c5b19e42247bc72575b241a6f446840b95d
3
  size 497764120
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b41a408893cb0ef0914b7c69fe63100edbcfd9827c5b8d83eff838506cbe6e73
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a71909dbe3a3827eb2d96903ec0cd4f6475fbd143d3adcebee21e68eb480324
3
  size 995604017
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3006a0c685ddebea1136f939b292ab67e24da1f0edd23faadf7a85985f46b34
3
  size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5c96ee233c3281090b842a4a11ff662f1751f775bd8bd50db7037dbc4bdf1d8
3
- size 510403817
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e5b73d16cb2c0edc96673fe23be7a5214f61a1c9d00aaf44aacd9f96d1ea618
3
+ size 510396521
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26ebd8f09a6c1db84aa3aaeb1ea1345bcb3c2ae6c124b74ff19c3f608acd9e3c
3
  size 14503
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26e82f1222285d031b8e21d934b74558389b98a783bb42db1010d0f55af3bc16
3
  size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b2179c852b9a3c2fc23aeed095aeb8cf7d63d1da7d9c6eca4435ee27bc43703
3
  size 623
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc3e0c06b55e5f6cc34f1c4c7ab6e7fc251827bfa5f1b3ed68bfe476fa6cc0c
3
  size 623
special_tokens_map.json CHANGED
@@ -1 +1,5 @@
1
- {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
5
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
tokenizer_config.json CHANGED
@@ -1 +1,10 @@
1
- {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "huggingartists/rihanna", "tokenizer_class": "GPT2Tokenizer"}
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "huggingartists/rihanna",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 2.237405776977539,
3
- "best_model_checkpoint": "output/rihanna/checkpoint-262",
4
- "epoch": 2.0,
5
- "global_step": 262,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -334,11 +334,667 @@
334
  "eval_samples_per_second": 22.521,
335
  "eval_steps_per_second": 2.89,
336
  "step": 262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  }
338
  ],
339
- "max_steps": 262,
340
- "num_train_epochs": 2,
341
- "total_flos": 272919527424000.0,
342
  "trial_name": null,
343
  "trial_params": null
344
  }
1
  {
2
+ "best_metric": 1.752521276473999,
3
+ "best_model_checkpoint": "output/rihanna/checkpoint-780",
4
+ "epoch": 6.0,
5
+ "global_step": 780,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
334
  "eval_samples_per_second": 22.521,
335
  "eval_steps_per_second": 2.89,
336
  "step": 262
337
+ },
338
+ {
339
+ "epoch": 2.04,
340
+ "learning_rate": 0.0001366998287631265,
341
+ "loss": 2.0176,
342
+ "step": 265
343
+ },
344
+ {
345
+ "epoch": 2.08,
346
+ "learning_rate": 0.00013520660867542716,
347
+ "loss": 2.0043,
348
+ "step": 270
349
+ },
350
+ {
351
+ "epoch": 2.12,
352
+ "learning_rate": 0.00013274211424821946,
353
+ "loss": 1.743,
354
+ "step": 275
355
+ },
356
+ {
357
+ "epoch": 2.15,
358
+ "learning_rate": 0.00012934228335981023,
359
+ "loss": 2.0598,
360
+ "step": 280
361
+ },
362
+ {
363
+ "epoch": 2.19,
364
+ "learning_rate": 0.00012505669320030482,
365
+ "loss": 1.9087,
366
+ "step": 285
367
+ },
368
+ {
369
+ "epoch": 2.23,
370
+ "learning_rate": 0.00011994783732453754,
371
+ "loss": 1.6869,
372
+ "step": 290
373
+ },
374
+ {
375
+ "epoch": 2.27,
376
+ "learning_rate": 0.00011409021435531858,
377
+ "loss": 1.726,
378
+ "step": 295
379
+ },
380
+ {
381
+ "epoch": 2.31,
382
+ "learning_rate": 0.00010756924162575734,
383
+ "loss": 1.966,
384
+ "step": 300
385
+ },
386
+ {
387
+ "epoch": 2.35,
388
+ "learning_rate": 0.00010048000960220251,
389
+ "loss": 2.0242,
390
+ "step": 305
391
+ },
392
+ {
393
+ "epoch": 2.38,
394
+ "learning_rate": 9.292589525111797e-05,
395
+ "loss": 1.804,
396
+ "step": 310
397
+ },
398
+ {
399
+ "epoch": 2.42,
400
+ "learning_rate": 8.501705457012652e-05,
401
+ "loss": 1.7316,
402
+ "step": 315
403
+ },
404
+ {
405
+ "epoch": 2.46,
406
+ "learning_rate": 7.686881626551514e-05,
407
+ "loss": 2.1338,
408
+ "step": 320
409
+ },
410
+ {
411
+ "epoch": 2.5,
412
+ "learning_rate": 6.860000000000001e-05,
413
+ "loss": 2.067,
414
+ "step": 325
415
+ },
416
+ {
417
+ "epoch": 2.54,
418
+ "learning_rate": 6.03311837344849e-05,
419
+ "loss": 1.993,
420
+ "step": 330
421
+ },
422
+ {
423
+ "epoch": 2.58,
424
+ "learning_rate": 5.218294542987351e-05,
425
+ "loss": 1.8933,
426
+ "step": 335
427
+ },
428
+ {
429
+ "epoch": 2.62,
430
+ "learning_rate": 4.427410474888207e-05,
431
+ "loss": 1.7142,
432
+ "step": 340
433
+ },
434
+ {
435
+ "epoch": 2.65,
436
+ "learning_rate": 3.6719990397797524e-05,
437
+ "loss": 1.9927,
438
+ "step": 345
439
+ },
440
+ {
441
+ "epoch": 2.69,
442
+ "learning_rate": 2.9630758374242683e-05,
443
+ "loss": 1.8104,
444
+ "step": 350
445
+ },
446
+ {
447
+ "epoch": 2.73,
448
+ "learning_rate": 2.310978564468145e-05,
449
+ "loss": 1.8292,
450
+ "step": 355
451
+ },
452
+ {
453
+ "epoch": 2.77,
454
+ "learning_rate": 1.7252162675462497e-05,
455
+ "loss": 2.0388,
456
+ "step": 360
457
+ },
458
+ {
459
+ "epoch": 2.81,
460
+ "learning_rate": 1.214330679969522e-05,
461
+ "loss": 1.8121,
462
+ "step": 365
463
+ },
464
+ {
465
+ "epoch": 2.85,
466
+ "learning_rate": 7.8577166401898e-06,
467
+ "loss": 1.8632,
468
+ "step": 370
469
+ },
470
+ {
471
+ "epoch": 2.88,
472
+ "learning_rate": 4.457885751780558e-06,
473
+ "loss": 2.0386,
474
+ "step": 375
475
+ },
476
+ {
477
+ "epoch": 2.92,
478
+ "learning_rate": 1.9933913245728472e-06,
479
+ "loss": 1.7312,
480
+ "step": 380
481
+ },
482
+ {
483
+ "epoch": 2.96,
484
+ "learning_rate": 5.001712368734975e-07,
485
+ "loss": 2.0118,
486
+ "step": 385
487
+ },
488
+ {
489
+ "epoch": 3.0,
490
+ "learning_rate": 0.0,
491
+ "loss": 1.7726,
492
+ "step": 390
493
+ },
494
+ {
495
+ "epoch": 3.0,
496
+ "eval_loss": 1.8738644123077393,
497
+ "eval_runtime": 4.2323,
498
+ "eval_samples_per_second": 45.838,
499
+ "eval_steps_per_second": 5.907,
500
+ "step": 390
501
+ },
502
+ {
503
+ "epoch": 3.04,
504
+ "learning_rate": 5.001712368734899e-07,
505
+ "loss": 1.519,
506
+ "step": 395
507
+ },
508
+ {
509
+ "epoch": 3.08,
510
+ "learning_rate": 1.9933913245728396e-06,
511
+ "loss": 1.6714,
512
+ "step": 400
513
+ },
514
+ {
515
+ "epoch": 3.12,
516
+ "learning_rate": 4.457885751780535e-06,
517
+ "loss": 1.6067,
518
+ "step": 405
519
+ },
520
+ {
521
+ "epoch": 3.15,
522
+ "learning_rate": 7.857716640189778e-06,
523
+ "loss": 1.4835,
524
+ "step": 410
525
+ },
526
+ {
527
+ "epoch": 3.19,
528
+ "learning_rate": 1.2143306799695189e-05,
529
+ "loss": 1.9692,
530
+ "step": 415
531
+ },
532
+ {
533
+ "epoch": 3.23,
534
+ "learning_rate": 1.725216267546246e-05,
535
+ "loss": 1.5852,
536
+ "step": 420
537
+ },
538
+ {
539
+ "epoch": 3.27,
540
+ "learning_rate": 2.310978564468141e-05,
541
+ "loss": 1.766,
542
+ "step": 425
543
+ },
544
+ {
545
+ "epoch": 3.31,
546
+ "learning_rate": 2.9630758374242642e-05,
547
+ "loss": 1.7481,
548
+ "step": 430
549
+ },
550
+ {
551
+ "epoch": 3.35,
552
+ "learning_rate": 3.671999039779748e-05,
553
+ "loss": 1.9411,
554
+ "step": 435
555
+ },
556
+ {
557
+ "epoch": 3.38,
558
+ "learning_rate": 4.427410474888202e-05,
559
+ "loss": 1.5399,
560
+ "step": 440
561
+ },
562
+ {
563
+ "epoch": 3.42,
564
+ "learning_rate": 5.218294542987346e-05,
565
+ "loss": 1.9887,
566
+ "step": 445
567
+ },
568
+ {
569
+ "epoch": 3.46,
570
+ "learning_rate": 6.033118373448485e-05,
571
+ "loss": 1.6023,
572
+ "step": 450
573
+ },
574
+ {
575
+ "epoch": 3.5,
576
+ "learning_rate": 6.859999999999997e-05,
577
+ "loss": 1.665,
578
+ "step": 455
579
+ },
580
+ {
581
+ "epoch": 3.54,
582
+ "learning_rate": 7.68688162655151e-05,
583
+ "loss": 1.6708,
584
+ "step": 460
585
+ },
586
+ {
587
+ "epoch": 3.58,
588
+ "learning_rate": 8.501705457012648e-05,
589
+ "loss": 1.7596,
590
+ "step": 465
591
+ },
592
+ {
593
+ "epoch": 3.62,
594
+ "learning_rate": 9.292589525111793e-05,
595
+ "loss": 2.0391,
596
+ "step": 470
597
+ },
598
+ {
599
+ "epoch": 3.65,
600
+ "learning_rate": 0.00010048000960220248,
601
+ "loss": 1.6346,
602
+ "step": 475
603
+ },
604
+ {
605
+ "epoch": 3.69,
606
+ "learning_rate": 0.00010756924162575731,
607
+ "loss": 1.5059,
608
+ "step": 480
609
+ },
610
+ {
611
+ "epoch": 3.73,
612
+ "learning_rate": 0.00011409021435531856,
613
+ "loss": 1.7107,
614
+ "step": 485
615
+ },
616
+ {
617
+ "epoch": 3.77,
618
+ "learning_rate": 0.0001199478373245375,
619
+ "loss": 1.6263,
620
+ "step": 490
621
+ },
622
+ {
623
+ "epoch": 3.81,
624
+ "learning_rate": 0.0001250566932003048,
625
+ "loss": 1.7098,
626
+ "step": 495
627
+ },
628
+ {
629
+ "epoch": 3.85,
630
+ "learning_rate": 0.00012934228335981018,
631
+ "loss": 1.5807,
632
+ "step": 500
633
+ },
634
+ {
635
+ "epoch": 3.88,
636
+ "learning_rate": 0.00013274211424821943,
637
+ "loss": 1.9171,
638
+ "step": 505
639
+ },
640
+ {
641
+ "epoch": 3.92,
642
+ "learning_rate": 0.00013520660867542716,
643
+ "loss": 1.6038,
644
+ "step": 510
645
+ },
646
+ {
647
+ "epoch": 3.96,
648
+ "learning_rate": 0.00013669982876312649,
649
+ "loss": 1.548,
650
+ "step": 515
651
+ },
652
+ {
653
+ "epoch": 4.0,
654
+ "learning_rate": 0.0001372,
655
+ "loss": 1.8101,
656
+ "step": 520
657
+ },
658
+ {
659
+ "epoch": 4.0,
660
+ "eval_loss": 1.8380614519119263,
661
+ "eval_runtime": 4.2339,
662
+ "eval_samples_per_second": 45.821,
663
+ "eval_steps_per_second": 5.905,
664
+ "step": 520
665
+ },
666
+ {
667
+ "epoch": 4.04,
668
+ "learning_rate": 0.0001366998287631265,
669
+ "loss": 1.7809,
670
+ "step": 525
671
+ },
672
+ {
673
+ "epoch": 4.08,
674
+ "learning_rate": 0.0001352066086754272,
675
+ "loss": 1.71,
676
+ "step": 530
677
+ },
678
+ {
679
+ "epoch": 4.12,
680
+ "learning_rate": 0.0001327421142482195,
681
+ "loss": 1.3283,
682
+ "step": 535
683
+ },
684
+ {
685
+ "epoch": 4.15,
686
+ "learning_rate": 0.00012934228335981015,
687
+ "loss": 1.4905,
688
+ "step": 540
689
+ },
690
+ {
691
+ "epoch": 4.19,
692
+ "learning_rate": 0.00012505669320030482,
693
+ "loss": 1.2511,
694
+ "step": 545
695
+ },
696
+ {
697
+ "epoch": 4.23,
698
+ "learning_rate": 0.00011994783732453755,
699
+ "loss": 1.6209,
700
+ "step": 550
701
+ },
702
+ {
703
+ "epoch": 4.27,
704
+ "learning_rate": 0.00011409021435531858,
705
+ "loss": 1.6988,
706
+ "step": 555
707
+ },
708
+ {
709
+ "epoch": 4.31,
710
+ "learning_rate": 0.00010756924162575738,
711
+ "loss": 1.2228,
712
+ "step": 560
713
+ },
714
+ {
715
+ "epoch": 4.35,
716
+ "learning_rate": 0.00010048000960220263,
717
+ "loss": 1.6827,
718
+ "step": 565
719
+ },
720
+ {
721
+ "epoch": 4.38,
722
+ "learning_rate": 9.292589525111788e-05,
723
+ "loss": 1.6977,
724
+ "step": 570
725
+ },
726
+ {
727
+ "epoch": 4.42,
728
+ "learning_rate": 8.501705457012643e-05,
729
+ "loss": 1.4269,
730
+ "step": 575
731
+ },
732
+ {
733
+ "epoch": 4.46,
734
+ "learning_rate": 7.686881626551516e-05,
735
+ "loss": 1.6831,
736
+ "step": 580
737
+ },
738
+ {
739
+ "epoch": 4.5,
740
+ "learning_rate": 6.860000000000003e-05,
741
+ "loss": 1.0505,
742
+ "step": 585
743
+ },
744
+ {
745
+ "epoch": 4.54,
746
+ "learning_rate": 6.033118373448492e-05,
747
+ "loss": 1.4459,
748
+ "step": 590
749
+ },
750
+ {
751
+ "epoch": 4.58,
752
+ "learning_rate": 5.218294542987365e-05,
753
+ "loss": 1.4365,
754
+ "step": 595
755
+ },
756
+ {
757
+ "epoch": 4.62,
758
+ "learning_rate": 4.42741047488822e-05,
759
+ "loss": 1.6545,
760
+ "step": 600
761
+ },
762
+ {
763
+ "epoch": 4.65,
764
+ "learning_rate": 3.671999039779743e-05,
765
+ "loss": 1.6826,
766
+ "step": 605
767
+ },
768
+ {
769
+ "epoch": 4.69,
770
+ "learning_rate": 2.9630758374242696e-05,
771
+ "loss": 1.6816,
772
+ "step": 610
773
+ },
774
+ {
775
+ "epoch": 4.73,
776
+ "learning_rate": 2.3109785644681465e-05,
777
+ "loss": 1.43,
778
+ "step": 615
779
+ },
780
+ {
781
+ "epoch": 4.77,
782
+ "learning_rate": 1.7252162675462504e-05,
783
+ "loss": 1.1238,
784
+ "step": 620
785
+ },
786
+ {
787
+ "epoch": 4.81,
788
+ "learning_rate": 1.2143306799695228e-05,
789
+ "loss": 1.1441,
790
+ "step": 625
791
+ },
792
+ {
793
+ "epoch": 4.85,
794
+ "learning_rate": 7.857716640189861e-06,
795
+ "loss": 1.5854,
796
+ "step": 630
797
+ },
798
+ {
799
+ "epoch": 4.88,
800
+ "learning_rate": 4.4578857517805195e-06,
801
+ "loss": 1.2825,
802
+ "step": 635
803
+ },
804
+ {
805
+ "epoch": 4.92,
806
+ "learning_rate": 1.9933913245728244e-06,
807
+ "loss": 1.3848,
808
+ "step": 640
809
+ },
810
+ {
811
+ "epoch": 4.96,
812
+ "learning_rate": 5.001712368734975e-07,
813
+ "loss": 1.4917,
814
+ "step": 645
815
+ },
816
+ {
817
+ "epoch": 5.0,
818
+ "learning_rate": 0.0,
819
+ "loss": 1.0341,
820
+ "step": 650
821
+ },
822
+ {
823
+ "epoch": 5.0,
824
+ "eval_loss": 1.7726789712905884,
825
+ "eval_runtime": 4.2405,
826
+ "eval_samples_per_second": 45.75,
827
+ "eval_steps_per_second": 5.896,
828
+ "step": 650
829
+ },
830
+ {
831
+ "epoch": 5.04,
832
+ "learning_rate": 5.001712368734899e-07,
833
+ "loss": 1.3946,
834
+ "step": 655
835
+ },
836
+ {
837
+ "epoch": 5.08,
838
+ "learning_rate": 1.9933913245728015e-06,
839
+ "loss": 1.1575,
840
+ "step": 660
841
+ },
842
+ {
843
+ "epoch": 5.12,
844
+ "learning_rate": 4.457885751780535e-06,
845
+ "loss": 1.2407,
846
+ "step": 665
847
+ },
848
+ {
849
+ "epoch": 5.15,
850
+ "learning_rate": 7.857716640189824e-06,
851
+ "loss": 1.299,
852
+ "step": 670
853
+ },
854
+ {
855
+ "epoch": 5.19,
856
+ "learning_rate": 1.2143306799695106e-05,
857
+ "loss": 1.4759,
858
+ "step": 675
859
+ },
860
+ {
861
+ "epoch": 5.23,
862
+ "learning_rate": 1.725216267546245e-05,
863
+ "loss": 1.2347,
864
+ "step": 680
865
+ },
866
+ {
867
+ "epoch": 5.27,
868
+ "learning_rate": 2.3109785644681495e-05,
869
+ "loss": 1.4233,
870
+ "step": 685
871
+ },
872
+ {
873
+ "epoch": 5.31,
874
+ "learning_rate": 2.963075837424263e-05,
875
+ "loss": 1.4062,
876
+ "step": 690
877
+ },
878
+ {
879
+ "epoch": 5.35,
880
+ "learning_rate": 3.6719990397797463e-05,
881
+ "loss": 1.2485,
882
+ "step": 695
883
+ },
884
+ {
885
+ "epoch": 5.38,
886
+ "learning_rate": 4.4274104748882125e-05,
887
+ "loss": 1.4536,
888
+ "step": 700
889
+ },
890
+ {
891
+ "epoch": 5.42,
892
+ "learning_rate": 5.2182945429873444e-05,
893
+ "loss": 1.2116,
894
+ "step": 705
895
+ },
896
+ {
897
+ "epoch": 5.46,
898
+ "learning_rate": 6.033118373448483e-05,
899
+ "loss": 1.1743,
900
+ "step": 710
901
+ },
902
+ {
903
+ "epoch": 5.5,
904
+ "learning_rate": 6.859999999999984e-05,
905
+ "loss": 1.2798,
906
+ "step": 715
907
+ },
908
+ {
909
+ "epoch": 5.54,
910
+ "learning_rate": 7.686881626551508e-05,
911
+ "loss": 1.1637,
912
+ "step": 720
913
+ },
914
+ {
915
+ "epoch": 5.58,
916
+ "learning_rate": 8.501705457012647e-05,
917
+ "loss": 1.4529,
918
+ "step": 725
919
+ },
920
+ {
921
+ "epoch": 5.62,
922
+ "learning_rate": 9.292589525111778e-05,
923
+ "loss": 1.1575,
924
+ "step": 730
925
+ },
926
+ {
927
+ "epoch": 5.65,
928
+ "learning_rate": 0.00010048000960220244,
929
+ "loss": 1.4035,
930
+ "step": 735
931
+ },
932
+ {
933
+ "epoch": 5.69,
934
+ "learning_rate": 0.0001075692416257573,
935
+ "loss": 1.2311,
936
+ "step": 740
937
+ },
938
+ {
939
+ "epoch": 5.73,
940
+ "learning_rate": 0.00011409021435531843,
941
+ "loss": 1.3514,
942
+ "step": 745
943
+ },
944
+ {
945
+ "epoch": 5.77,
946
+ "learning_rate": 0.00011994783732453749,
947
+ "loss": 1.0519,
948
+ "step": 750
949
+ },
950
+ {
951
+ "epoch": 5.81,
952
+ "learning_rate": 0.00012505669320030485,
953
+ "loss": 1.39,
954
+ "step": 755
955
+ },
956
+ {
957
+ "epoch": 5.85,
958
+ "learning_rate": 0.00012934228335981013,
959
+ "loss": 1.2267,
960
+ "step": 760
961
+ },
962
+ {
963
+ "epoch": 5.88,
964
+ "learning_rate": 0.00013274211424821943,
965
+ "loss": 1.431,
966
+ "step": 765
967
+ },
968
+ {
969
+ "epoch": 5.92,
970
+ "learning_rate": 0.00013520660867542716,
971
+ "loss": 1.5557,
972
+ "step": 770
973
+ },
974
+ {
975
+ "epoch": 5.96,
976
+ "learning_rate": 0.00013669982876312649,
977
+ "loss": 1.2241,
978
+ "step": 775
979
+ },
980
+ {
981
+ "epoch": 6.0,
982
+ "learning_rate": 0.0001372,
983
+ "loss": 1.1435,
984
+ "step": 780
985
+ },
986
+ {
987
+ "epoch": 6.0,
988
+ "eval_loss": 1.752521276473999,
989
+ "eval_runtime": 4.2466,
990
+ "eval_samples_per_second": 45.684,
991
+ "eval_steps_per_second": 5.887,
992
+ "step": 780
993
  }
994
  ],
995
+ "max_steps": 910,
996
+ "num_train_epochs": 7,
997
+ "total_flos": 811181113344000.0,
998
  "trial_name": null,
999
  "trial_params": null
1000
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74c710dd5a8d4b67cb50ad0843ddfa8eb650173d3491cfb493f1dda199d69582
3
- size 2671
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bf17e5c0319daac4eb8decc923d42c574d61cb10c1cc3267a038b27c7c37d44
3
+ size 3311