MohamedAhmedAE commited on
Commit
e444d8f
1 Parent(s): c6d6a26

Training in progress, step 18800, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "down_proj",
24
- "gate_proj",
25
  "o_proj",
26
  "k_proj",
 
 
27
  "up_proj",
28
- "v_proj",
29
- "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
 
24
  "o_proj",
25
  "k_proj",
26
+ "gate_proj",
27
+ "down_proj",
28
  "up_proj",
29
+ "v_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b74082b50533e4333b88e29acd5ec8eecf39804ba0295840da3c866df5050cc
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a9c62f875c287b7c4d9167adebf029a6ce30f517d94dbbe27ecde3a226f0357
3
  size 167832240
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4f867fe76c879c3168a89d511c76e1fa1b26ec6a18b25ec83cf06f006a135e0
3
- size 84581014
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06b6897debb3d46ba4387737336e6f97292f866a07ef61439e53f84fd4505d62
3
+ size 85736914
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68341ef163cc0109a8713173add8dbcfd98bf67468e1a64939a8b03523666bf9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9989851a017fcf0ea72ad3948880d1a8db6c3206bc2c0667a86129b6301b196
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6fa196d6a1215f8cb5411f481e0c89a67b0e9876e4efe3f1ada6c1ba44ba0a3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821633e9583ccb6e3bb6cf440d7524519270ce3ff11ad9b4c39204df191e39fc
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.008626491509226999,
5
  "eval_steps": 2000,
6
- "global_step": 11600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -413,14 +413,266 @@
413
  "learning_rate": 1.9999853411898932e-05,
414
  "loss": 1.5097,
415
  "step": 11600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  }
417
  ],
418
  "logging_steps": 200,
419
- "max_steps": 6723475,
420
  "num_input_tokens_seen": 0,
421
  "num_train_epochs": 5,
422
  "save_steps": 200,
423
- "total_flos": 1.5179994612574618e+17,
424
  "train_batch_size": 1,
425
  "trial_name": null,
426
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.027961731098873722,
5
  "eval_steps": 2000,
6
+ "global_step": 18800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
413
  "learning_rate": 1.9999853411898932e-05,
414
  "loss": 1.5097,
415
  "step": 11600
416
+ },
417
+ {
418
+ "epoch": 0.02,
419
+ "grad_norm": 3.229198455810547,
420
+ "learning_rate": 1.999939333873553e-05,
421
+ "loss": 1.5721,
422
+ "step": 11800
423
+ },
424
+ {
425
+ "epoch": 0.02,
426
+ "grad_norm": 1.2855397462844849,
427
+ "learning_rate": 1.9999372576820398e-05,
428
+ "loss": 1.5382,
429
+ "step": 12000
430
+ },
431
+ {
432
+ "epoch": 0.02,
433
+ "grad_norm": 1.536872148513794,
434
+ "learning_rate": 1.9999351465598642e-05,
435
+ "loss": 1.5964,
436
+ "step": 12200
437
+ },
438
+ {
439
+ "epoch": 0.02,
440
+ "grad_norm": 2.0981087684631348,
441
+ "learning_rate": 1.9999330005070992e-05,
442
+ "loss": 1.5269,
443
+ "step": 12400
444
+ },
445
+ {
446
+ "epoch": 0.02,
447
+ "grad_norm": 2.213561773300171,
448
+ "learning_rate": 1.99993081952382e-05,
449
+ "loss": 1.488,
450
+ "step": 12600
451
+ },
452
+ {
453
+ "epoch": 0.02,
454
+ "grad_norm": 2.3960020542144775,
455
+ "learning_rate": 1.999928603610103e-05,
456
+ "loss": 1.5714,
457
+ "step": 12800
458
+ },
459
+ {
460
+ "epoch": 0.02,
461
+ "grad_norm": 2.198500394821167,
462
+ "learning_rate": 1.9999263641071352e-05,
463
+ "loss": 1.5587,
464
+ "step": 13000
465
+ },
466
+ {
467
+ "epoch": 0.02,
468
+ "grad_norm": 2.4841859340667725,
469
+ "learning_rate": 1.9999240785074275e-05,
470
+ "loss": 1.5417,
471
+ "step": 13200
472
+ },
473
+ {
474
+ "epoch": 0.02,
475
+ "grad_norm": 2.9682819843292236,
476
+ "learning_rate": 1.999921757977517e-05,
477
+ "loss": 1.578,
478
+ "step": 13400
479
+ },
480
+ {
481
+ "epoch": 0.02,
482
+ "grad_norm": 2.8368330001831055,
483
+ "learning_rate": 1.999919402517485e-05,
484
+ "loss": 1.5703,
485
+ "step": 13600
486
+ },
487
+ {
488
+ "epoch": 0.02,
489
+ "grad_norm": 3.0925166606903076,
490
+ "learning_rate": 1.9999170121274143e-05,
491
+ "loss": 1.5163,
492
+ "step": 13800
493
+ },
494
+ {
495
+ "epoch": 0.02,
496
+ "grad_norm": 2.2362563610076904,
497
+ "learning_rate": 1.999914586807388e-05,
498
+ "loss": 1.6078,
499
+ "step": 14000
500
+ },
501
+ {
502
+ "epoch": 0.02,
503
+ "grad_norm": 3.019454002380371,
504
+ "learning_rate": 1.9999121265574902e-05,
505
+ "loss": 1.5317,
506
+ "step": 14200
507
+ },
508
+ {
509
+ "epoch": 0.02,
510
+ "grad_norm": 2.67069411277771,
511
+ "learning_rate": 1.9999096313778082e-05,
512
+ "loss": 1.529,
513
+ "step": 14400
514
+ },
515
+ {
516
+ "epoch": 0.02,
517
+ "grad_norm": 2.8095571994781494,
518
+ "learning_rate": 1.9999071012684285e-05,
519
+ "loss": 1.557,
520
+ "step": 14600
521
+ },
522
+ {
523
+ "epoch": 0.02,
524
+ "grad_norm": 2.3300442695617676,
525
+ "learning_rate": 1.9999045362294388e-05,
526
+ "loss": 1.5554,
527
+ "step": 14800
528
+ },
529
+ {
530
+ "epoch": 0.02,
531
+ "grad_norm": 2.160933256149292,
532
+ "learning_rate": 1.9999019362609297e-05,
533
+ "loss": 1.528,
534
+ "step": 15000
535
+ },
536
+ {
537
+ "epoch": 0.02,
538
+ "grad_norm": 1.6309542655944824,
539
+ "learning_rate": 1.999899301362992e-05,
540
+ "loss": 1.5344,
541
+ "step": 15200
542
+ },
543
+ {
544
+ "epoch": 0.02,
545
+ "grad_norm": 3.1774258613586426,
546
+ "learning_rate": 1.9998966315357173e-05,
547
+ "loss": 1.5661,
548
+ "step": 15400
549
+ },
550
+ {
551
+ "epoch": 0.02,
552
+ "grad_norm": 2.8362374305725098,
553
+ "learning_rate": 1.9998939267791986e-05,
554
+ "loss": 1.5404,
555
+ "step": 15600
556
+ },
557
+ {
558
+ "epoch": 0.02,
559
+ "grad_norm": 1.6643764972686768,
560
+ "learning_rate": 1.999891187093531e-05,
561
+ "loss": 1.562,
562
+ "step": 15800
563
+ },
564
+ {
565
+ "epoch": 0.02,
566
+ "grad_norm": 2.519455671310425,
567
+ "learning_rate": 1.99988841247881e-05,
568
+ "loss": 1.5468,
569
+ "step": 16000
570
+ },
571
+ {
572
+ "epoch": 0.02,
573
+ "grad_norm": 1.8681560754776,
574
+ "learning_rate": 1.9998856029351327e-05,
575
+ "loss": 1.501,
576
+ "step": 16200
577
+ },
578
+ {
579
+ "epoch": 0.02,
580
+ "grad_norm": 1.5082764625549316,
581
+ "learning_rate": 1.999882758462597e-05,
582
+ "loss": 1.5632,
583
+ "step": 16400
584
+ },
585
+ {
586
+ "epoch": 0.02,
587
+ "grad_norm": 1.8632557392120361,
588
+ "learning_rate": 1.9998798790613018e-05,
589
+ "loss": 1.5509,
590
+ "step": 16600
591
+ },
592
+ {
593
+ "epoch": 0.02,
594
+ "grad_norm": 3.0881147384643555,
595
+ "learning_rate": 1.999876964731349e-05,
596
+ "loss": 1.5277,
597
+ "step": 16800
598
+ },
599
+ {
600
+ "epoch": 0.03,
601
+ "grad_norm": 1.9005630016326904,
602
+ "learning_rate": 1.9998740303060157e-05,
603
+ "loss": 1.5542,
604
+ "step": 17000
605
+ },
606
+ {
607
+ "epoch": 0.03,
608
+ "grad_norm": 1.4960647821426392,
609
+ "learning_rate": 1.9998710462936946e-05,
610
+ "loss": 1.5781,
611
+ "step": 17200
612
+ },
613
+ {
614
+ "epoch": 0.03,
615
+ "grad_norm": 2.5842814445495605,
616
+ "learning_rate": 1.9998680273530233e-05,
617
+ "loss": 1.5535,
618
+ "step": 17400
619
+ },
620
+ {
621
+ "epoch": 0.03,
622
+ "grad_norm": 2.9667937755584717,
623
+ "learning_rate": 1.9998649734841075e-05,
624
+ "loss": 1.5764,
625
+ "step": 17600
626
+ },
627
+ {
628
+ "epoch": 0.03,
629
+ "grad_norm": 2.2704834938049316,
630
+ "learning_rate": 1.9998618846870542e-05,
631
+ "loss": 1.55,
632
+ "step": 17800
633
+ },
634
+ {
635
+ "epoch": 0.03,
636
+ "grad_norm": 2.67142391204834,
637
+ "learning_rate": 1.9998587609619712e-05,
638
+ "loss": 1.5648,
639
+ "step": 18000
640
+ },
641
+ {
642
+ "epoch": 0.03,
643
+ "grad_norm": 2.281129837036133,
644
+ "learning_rate": 1.9998556023089672e-05,
645
+ "loss": 1.5405,
646
+ "step": 18200
647
+ },
648
+ {
649
+ "epoch": 0.03,
650
+ "grad_norm": 2.508354425430298,
651
+ "learning_rate": 1.999852408728153e-05,
652
+ "loss": 1.5574,
653
+ "step": 18400
654
+ },
655
+ {
656
+ "epoch": 0.03,
657
+ "grad_norm": 2.8000833988189697,
658
+ "learning_rate": 1.99984918021964e-05,
659
+ "loss": 1.5638,
660
+ "step": 18600
661
+ },
662
+ {
663
+ "epoch": 0.03,
664
+ "grad_norm": 3.3880839347839355,
665
+ "learning_rate": 1.999845916783541e-05,
666
+ "loss": 1.553,
667
+ "step": 18800
668
  }
669
  ],
670
  "logging_steps": 200,
671
+ "max_steps": 3361735,
672
  "num_input_tokens_seen": 0,
673
  "num_train_epochs": 5,
674
  "save_steps": 200,
675
+ "total_flos": 3.385280035214623e+17,
676
  "train_batch_size": 1,
677
  "trial_name": null,
678
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45d4c3e11daa354fcf86b7301a1fc0e8bc31167e5dec2140d52b76922a0af4ca
3
  size 4920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc9e75826f834526adf57daa4ee7a58f88bf2ec9679f7599af2037d01589eb4f
3
  size 4920