bayuela committed
Commit 001b831
1 Parent(s): 31195bb

New model Version

Files changed (3)
  1. rng_state.pth +3 -0
  2. scheduler.pt +3 -0
  3. trainer_state.json +143 -0
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6991cc6552c5b76dbe1f0de75531c3a899fb22204ca73ad87b7212dd9296b6da
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a415030a49c53672ccac7723cf7f1e258eb6d2a6e1da2d25ddca5348e51f0041
+ size 1064
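
Note: both binary files above are added as Git LFS pointer stubs (version / oid / size), not the payloads themselves; the actual tensors are fetched by LFS at checkout. A minimal sketch of how such files can be inspected or restored with plain PyTorch, assuming local copies of the files and a scheduler rebuilt with the same configuration as the original run (paths and variable names are placeholders, and the exact layout of the RNG snapshot is an assumption):

import torch

# Hypothetical local paths; the real files are resolved through Git LFS.
rng_state = torch.load("rng_state.pth")    # RNG snapshot saved alongside the checkpoint
sched_state = torch.load("scheduler.pt")   # LR-scheduler state_dict

print(type(rng_state))   # commonly a dict of Python/NumPy/Torch RNG states (assumption)
print(sched_state)       # e.g. last_epoch / base_lrs of the saved scheduler

# To resume with the same LR schedule, load the state into a freshly
# constructed scheduler of the same type (assumes an identical optimizer setup):
# scheduler.load_state_dict(sched_state)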
trainer_state.json ADDED
@@ -0,0 +1,143 @@
+ {
+ "best_metric": 0.2674808204174042,
+ "best_model_checkpoint": "./results/checkpoint-2416",
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 7248,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.20695364238410596,
+ "grad_norm": 0.06496760994195938,
+ "learning_rate": 1.862030905077263e-05,
+ "loss": 0.1879,
+ "step": 500
+ },
+ {
+ "epoch": 0.4139072847682119,
+ "grad_norm": 0.2046840637922287,
+ "learning_rate": 1.7240618101545256e-05,
+ "loss": 0.1544,
+ "step": 1000
+ },
+ {
+ "epoch": 0.6208609271523179,
+ "grad_norm": 78.30413818359375,
+ "learning_rate": 1.5860927152317882e-05,
+ "loss": 0.1443,
+ "step": 1500
+ },
+ {
+ "epoch": 0.8278145695364238,
+ "grad_norm": 13.444316864013672,
+ "learning_rate": 1.448123620309051e-05,
+ "loss": 0.1214,
+ "step": 2000
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.2674808204174042,
+ "eval_runtime": 68.0862,
+ "eval_samples_per_second": 31.548,
+ "eval_steps_per_second": 3.951,
+ "step": 2416
+ },
+ {
+ "epoch": 1.0347682119205297,
+ "grad_norm": 11.859747886657715,
+ "learning_rate": 1.3101545253863135e-05,
+ "loss": 0.1299,
+ "step": 2500
+ },
+ {
+ "epoch": 1.2417218543046358,
+ "grad_norm": 7.801605701446533,
+ "learning_rate": 1.1721854304635763e-05,
+ "loss": 0.1358,
+ "step": 3000
+ },
+ {
+ "epoch": 1.4486754966887418,
+ "grad_norm": 9.417176246643066,
+ "learning_rate": 1.034216335540839e-05,
+ "loss": 0.1331,
+ "step": 3500
+ },
+ {
+ "epoch": 1.6556291390728477,
+ "grad_norm": 0.017003627493977547,
+ "learning_rate": 8.962472406181017e-06,
+ "loss": 0.1325,
+ "step": 4000
+ },
+ {
+ "epoch": 1.8625827814569536,
+ "grad_norm": 7.609455585479736,
+ "learning_rate": 7.582781456953643e-06,
+ "loss": 0.1273,
+ "step": 4500
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 0.2778768837451935,
+ "eval_runtime": 67.8572,
+ "eval_samples_per_second": 31.655,
+ "eval_steps_per_second": 3.964,
+ "step": 4832
+ },
+ {
+ "epoch": 2.0695364238410594,
+ "grad_norm": 0.0077050491236150265,
+ "learning_rate": 6.203090507726269e-06,
+ "loss": 0.125,
+ "step": 5000
+ },
+ {
+ "epoch": 2.2764900662251657,
+ "grad_norm": 9.524530410766602,
+ "learning_rate": 4.823399558498897e-06,
+ "loss": 0.0672,
+ "step": 5500
+ },
+ {
+ "epoch": 2.4834437086092715,
+ "grad_norm": 3.2392263412475586,
+ "learning_rate": 3.443708609271523e-06,
+ "loss": 0.0731,
+ "step": 6000
+ },
+ {
+ "epoch": 2.6903973509933774,
+ "grad_norm": 0.4321236312389374,
+ "learning_rate": 2.06401766004415e-06,
+ "loss": 0.0818,
+ "step": 6500
+ },
+ {
+ "epoch": 2.8973509933774837,
+ "grad_norm": 0.006676756776869297,
+ "learning_rate": 6.843267108167771e-07,
+ "loss": 0.0691,
+ "step": 7000
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 0.30005592107772827,
+ "eval_runtime": 67.9967,
+ "eval_samples_per_second": 31.59,
+ "eval_steps_per_second": 3.956,
+ "step": 7248
+ }
+ ],
+ "logging_steps": 500,
+ "max_steps": 7248,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
+ "save_steps": 500,
+ "total_flos": 1.52529694887168e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
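
The trainer_state.json added here is the standard Hugging Face Trainer training log: a loss/learning-rate entry every 500 steps plus an evaluation record at the end of each epoch. A minimal sketch for pulling out the best-checkpoint summary and the evaluation curve, assuming a local copy of the file at a placeholder path:

import json

with open("trainer_state.json") as f:   # local copy of the file added in this commit
    state = json.load(f)

print("best checkpoint:", state["best_model_checkpoint"],
      "| best eval_loss:", state["best_metric"])

for record in state["log_history"]:
    if "eval_loss" in record:            # per-epoch evaluation entries
        print(f"epoch {record['epoch']:.0f}: eval_loss = {record['eval_loss']:.4f}")
    else:                                # step-wise training entries
        print(f"step {record['step']:>4}: loss = {record['loss']:.4f}, "
              f"lr = {record['learning_rate']:.3e}")

Reading the log this way shows eval_loss rising from 0.2675 (epoch 1) to 0.3001 (epoch 3), which is why best_model_checkpoint points at ./results/checkpoint-2416 from the end of epoch 1; the logged learning rates match a linear decay from a 2e-05 peak to zero over the 7248 training steps (an inference from the logged values, not a stated config).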