innovation-hacking2 commited on
Commit
ad5fb8e
·
verified ·
1 Parent(s): e99eb9d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. custom-tokenizer/full-finetuning/LLaMmlein_120M/config.json +30 -0
  2. custom-tokenizer/full-finetuning/LLaMmlein_120M/generation_config.json +6 -0
  3. custom-tokenizer/full-finetuning/LLaMmlein_120M/model.safetensors +3 -0
  4. custom-tokenizer/full-finetuning/LLaMmlein_120M/optimizer.pt +3 -0
  5. custom-tokenizer/full-finetuning/LLaMmlein_120M/rng_state.pth +3 -0
  6. custom-tokenizer/full-finetuning/LLaMmlein_120M/scheduler.pt +3 -0
  7. custom-tokenizer/full-finetuning/LLaMmlein_120M/trainer_state.json +1338 -0
  8. custom-tokenizer/full-finetuning/LLaMmlein_120M/training_args.bin +3 -0
  9. custom-tokenizer/full-finetuning/german-gpt2/config.json +40 -0
  10. custom-tokenizer/full-finetuning/german-gpt2/generation_config.json +6 -0
  11. custom-tokenizer/full-finetuning/german-gpt2/model.safetensors +3 -0
  12. custom-tokenizer/full-finetuning/german-gpt2/optimizer.pt +3 -0
  13. custom-tokenizer/full-finetuning/german-gpt2/rng_state.pth +3 -0
  14. custom-tokenizer/full-finetuning/german-gpt2/scheduler.pt +3 -0
  15. custom-tokenizer/full-finetuning/german-gpt2/trainer_state.json +1338 -0
  16. custom-tokenizer/full-finetuning/german-gpt2/training_args.bin +3 -0
  17. full-finetuning/LLaMmlein_120M/config.json +30 -0
  18. full-finetuning/LLaMmlein_120M/generation_config.json +6 -0
  19. full-finetuning/LLaMmlein_120M/model.safetensors +3 -0
  20. full-finetuning/LLaMmlein_120M/optimizer.pt +3 -0
  21. full-finetuning/LLaMmlein_120M/rng_state.pth +3 -0
  22. full-finetuning/LLaMmlein_120M/scheduler.pt +3 -0
  23. full-finetuning/LLaMmlein_120M/trainer_state.json +1338 -0
  24. full-finetuning/LLaMmlein_120M/training_args.bin +3 -0
  25. full-finetuning/german-gpt2/config.json +40 -0
  26. full-finetuning/german-gpt2/generation_config.json +6 -0
  27. full-finetuning/german-gpt2/model.safetensors +3 -0
  28. full-finetuning/german-gpt2/optimizer.pt +3 -0
  29. full-finetuning/german-gpt2/rng_state.pth +3 -0
  30. full-finetuning/german-gpt2/scheduler.pt +3 -0
  31. full-finetuning/german-gpt2/trainer_state.json +1338 -0
  32. full-finetuning/german-gpt2/training_args.bin +3 -0
  33. instruct-finetuning/base/LLaMmlein_120M/README.md +202 -0
  34. instruct-finetuning/base/LLaMmlein_120M/adapter_config.json +35 -0
  35. instruct-finetuning/base/LLaMmlein_120M/adapter_model.safetensors +3 -0
  36. instruct-finetuning/base/LLaMmlein_120M/optimizer.pt +3 -0
  37. instruct-finetuning/base/LLaMmlein_120M/rng_state.pth +3 -0
  38. instruct-finetuning/base/LLaMmlein_120M/scheduler.pt +3 -0
  39. instruct-finetuning/base/LLaMmlein_120M/special_tokens_map.json +24 -0
  40. instruct-finetuning/base/LLaMmlein_120M/tokenizer.json +0 -0
  41. instruct-finetuning/base/LLaMmlein_120M/tokenizer_config.json +43 -0
  42. instruct-finetuning/base/LLaMmlein_120M/trainer_state.json +1593 -0
  43. instruct-finetuning/base/LLaMmlein_120M/training_args.bin +3 -0
  44. instruct-finetuning/base/german-gpt2/README.md +202 -0
  45. instruct-finetuning/base/german-gpt2/adapter_config.json +35 -0
  46. instruct-finetuning/base/german-gpt2/adapter_model.safetensors +3 -0
  47. instruct-finetuning/base/german-gpt2/added_tokens.json +3 -0
  48. instruct-finetuning/base/german-gpt2/merges.txt +0 -0
  49. instruct-finetuning/base/german-gpt2/optimizer.pt +3 -0
  50. instruct-finetuning/base/german-gpt2/rng_state.pth +3 -0
custom-tokenizer/full-finetuning/LLaMmlein_120M/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "LSX-UniWue/LLaMmlein_120M",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.47.1",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
custom-tokenizer/full-finetuning/LLaMmlein_120M/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.47.1"
6
+ }
custom-tokenizer/full-finetuning/LLaMmlein_120M/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0b3044bcc64ac95bdd2658b8d0f9cabeb741f51fc1d99410a052b7d1c7182cb
3
+ size 498687008
custom-tokenizer/full-finetuning/LLaMmlein_120M/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08037fda9d57107296c2d1153683af7fc2effcc6947bfbe698b27c6809f44476
3
+ size 997443194
custom-tokenizer/full-finetuning/LLaMmlein_120M/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3bc1ac846859ad13fd356d3ed41fd20834205a7e8764e4198636863fca3f64c
3
+ size 14244
custom-tokenizer/full-finetuning/LLaMmlein_120M/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a4bab2c9813a0e45b4057cd5fbd9a5b26b99af9e15f94c8344ed33c80a85b8c
3
+ size 1064
custom-tokenizer/full-finetuning/LLaMmlein_120M/trainer_state.json ADDED
@@ -0,0 +1,1338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.157134532928467,
3
+ "best_model_checkpoint": "./models/custom-tokenizer/full-finetuning/LLaMmlein_120M/checkpoint-59000",
4
+ "epoch": 1.0,
5
+ "eval_steps": 1000,
6
+ "global_step": 59835,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.008356313194618534,
13
+ "grad_norm": 35.479248046875,
14
+ "learning_rate": 4.96e-05,
15
+ "loss": 4.5691,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.016712626389237067,
20
+ "grad_norm": 21.039249420166016,
21
+ "learning_rate": 4.958203421252212e-05,
22
+ "loss": 4.1783,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.016712626389237067,
27
+ "eval_loss": 4.322763442993164,
28
+ "eval_runtime": 22.3501,
29
+ "eval_samples_per_second": 199.641,
30
+ "eval_steps_per_second": 24.966,
31
+ "step": 1000
32
+ },
33
+ {
34
+ "epoch": 0.025068939583855605,
35
+ "grad_norm": 18.14427947998047,
36
+ "learning_rate": 4.916069773320974e-05,
37
+ "loss": 4.0526,
38
+ "step": 1500
39
+ },
40
+ {
41
+ "epoch": 0.033425252778474135,
42
+ "grad_norm": 16.241540908813477,
43
+ "learning_rate": 4.874020392685599e-05,
44
+ "loss": 3.9836,
45
+ "step": 2000
46
+ },
47
+ {
48
+ "epoch": 0.033425252778474135,
49
+ "eval_loss": 4.129116058349609,
50
+ "eval_runtime": 22.3714,
51
+ "eval_samples_per_second": 199.451,
52
+ "eval_steps_per_second": 24.943,
53
+ "step": 2000
54
+ },
55
+ {
56
+ "epoch": 0.04178156597309267,
57
+ "grad_norm": 18.163301467895508,
58
+ "learning_rate": 4.8318867447543606e-05,
59
+ "loss": 3.8827,
60
+ "step": 2500
61
+ },
62
+ {
63
+ "epoch": 0.05013787916771121,
64
+ "grad_norm": 24.604848861694336,
65
+ "learning_rate": 4.789753096823123e-05,
66
+ "loss": 3.7892,
67
+ "step": 3000
68
+ },
69
+ {
70
+ "epoch": 0.05013787916771121,
71
+ "eval_loss": 4.018996238708496,
72
+ "eval_runtime": 22.3789,
73
+ "eval_samples_per_second": 199.384,
74
+ "eval_steps_per_second": 24.934,
75
+ "step": 3000
76
+ },
77
+ {
78
+ "epoch": 0.05849419236232974,
79
+ "grad_norm": 11.950243949890137,
80
+ "learning_rate": 4.747619448891885e-05,
81
+ "loss": 3.8354,
82
+ "step": 3500
83
+ },
84
+ {
85
+ "epoch": 0.06685050555694827,
86
+ "grad_norm": 15.779219627380371,
87
+ "learning_rate": 4.705485800960647e-05,
88
+ "loss": 3.7895,
89
+ "step": 4000
90
+ },
91
+ {
92
+ "epoch": 0.06685050555694827,
93
+ "eval_loss": 3.9454355239868164,
94
+ "eval_runtime": 22.2538,
95
+ "eval_samples_per_second": 200.505,
96
+ "eval_steps_per_second": 25.074,
97
+ "step": 4000
98
+ },
99
+ {
100
+ "epoch": 0.0752068187515668,
101
+ "grad_norm": 18.874004364013672,
102
+ "learning_rate": 4.6633521530294095e-05,
103
+ "loss": 3.7398,
104
+ "step": 4500
105
+ },
106
+ {
107
+ "epoch": 0.08356313194618534,
108
+ "grad_norm": 15.567193984985352,
109
+ "learning_rate": 4.621218505098172e-05,
110
+ "loss": 3.6895,
111
+ "step": 5000
112
+ },
113
+ {
114
+ "epoch": 0.08356313194618534,
115
+ "eval_loss": 3.897683620452881,
116
+ "eval_runtime": 22.3197,
117
+ "eval_samples_per_second": 199.913,
118
+ "eval_steps_per_second": 25.0,
119
+ "step": 5000
120
+ },
121
+ {
122
+ "epoch": 0.09191944514080387,
123
+ "grad_norm": 16.107986450195312,
124
+ "learning_rate": 4.579084857166934e-05,
125
+ "loss": 3.6716,
126
+ "step": 5500
127
+ },
128
+ {
129
+ "epoch": 0.10027575833542242,
130
+ "grad_norm": 13.824947357177734,
131
+ "learning_rate": 4.5369512092356955e-05,
132
+ "loss": 3.6242,
133
+ "step": 6000
134
+ },
135
+ {
136
+ "epoch": 0.10027575833542242,
137
+ "eval_loss": 3.839233636856079,
138
+ "eval_runtime": 22.2807,
139
+ "eval_samples_per_second": 200.263,
140
+ "eval_steps_per_second": 25.044,
141
+ "step": 6000
142
+ },
143
+ {
144
+ "epoch": 0.10863207153004095,
145
+ "grad_norm": 16.347686767578125,
146
+ "learning_rate": 4.494817561304458e-05,
147
+ "loss": 3.5884,
148
+ "step": 6500
149
+ },
150
+ {
151
+ "epoch": 0.11698838472465949,
152
+ "grad_norm": 12.54311466217041,
153
+ "learning_rate": 4.4527681806690827e-05,
154
+ "loss": 3.6266,
155
+ "step": 7000
156
+ },
157
+ {
158
+ "epoch": 0.11698838472465949,
159
+ "eval_loss": 3.802494525909424,
160
+ "eval_runtime": 22.3755,
161
+ "eval_samples_per_second": 199.415,
162
+ "eval_steps_per_second": 24.938,
163
+ "step": 7000
164
+ },
165
+ {
166
+ "epoch": 0.12534469791927802,
167
+ "grad_norm": 14.918440818786621,
168
+ "learning_rate": 4.410634532737845e-05,
169
+ "loss": 3.6035,
170
+ "step": 7500
171
+ },
172
+ {
173
+ "epoch": 0.13370101111389654,
174
+ "grad_norm": 14.448460578918457,
175
+ "learning_rate": 4.3685008848066064e-05,
176
+ "loss": 3.5471,
177
+ "step": 8000
178
+ },
179
+ {
180
+ "epoch": 0.13370101111389654,
181
+ "eval_loss": 3.7752952575683594,
182
+ "eval_runtime": 22.2456,
183
+ "eval_samples_per_second": 200.579,
184
+ "eval_steps_per_second": 25.084,
185
+ "step": 8000
186
+ },
187
+ {
188
+ "epoch": 0.1420573243085151,
189
+ "grad_norm": 16.727405548095703,
190
+ "learning_rate": 4.326367236875369e-05,
191
+ "loss": 3.5496,
192
+ "step": 8500
193
+ },
194
+ {
195
+ "epoch": 0.1504136375031336,
196
+ "grad_norm": 12.798069953918457,
197
+ "learning_rate": 4.284233588944131e-05,
198
+ "loss": 3.5484,
199
+ "step": 9000
200
+ },
201
+ {
202
+ "epoch": 0.1504136375031336,
203
+ "eval_loss": 3.7489852905273438,
204
+ "eval_runtime": 22.3471,
205
+ "eval_samples_per_second": 199.668,
206
+ "eval_steps_per_second": 24.97,
207
+ "step": 9000
208
+ },
209
+ {
210
+ "epoch": 0.15876995069775215,
211
+ "grad_norm": 11.953953742980957,
212
+ "learning_rate": 4.242099941012893e-05,
213
+ "loss": 3.5066,
214
+ "step": 9500
215
+ },
216
+ {
217
+ "epoch": 0.16712626389237067,
218
+ "grad_norm": 13.229384422302246,
219
+ "learning_rate": 4.1999662930816554e-05,
220
+ "loss": 3.496,
221
+ "step": 10000
222
+ },
223
+ {
224
+ "epoch": 0.16712626389237067,
225
+ "eval_loss": 3.716583251953125,
226
+ "eval_runtime": 22.3577,
227
+ "eval_samples_per_second": 199.573,
228
+ "eval_steps_per_second": 24.958,
229
+ "step": 10000
230
+ },
231
+ {
232
+ "epoch": 0.17548257708698922,
233
+ "grad_norm": 11.95522689819336,
234
+ "learning_rate": 4.1578326451504176e-05,
235
+ "loss": 3.462,
236
+ "step": 10500
237
+ },
238
+ {
239
+ "epoch": 0.18383889028160774,
240
+ "grad_norm": 23.668384552001953,
241
+ "learning_rate": 4.11569899721918e-05,
242
+ "loss": 3.4485,
243
+ "step": 11000
244
+ },
245
+ {
246
+ "epoch": 0.18383889028160774,
247
+ "eval_loss": 3.6793529987335205,
248
+ "eval_runtime": 22.3814,
249
+ "eval_samples_per_second": 199.362,
250
+ "eval_steps_per_second": 24.931,
251
+ "step": 11000
252
+ },
253
+ {
254
+ "epoch": 0.1921952034762263,
255
+ "grad_norm": 17.76200294494629,
256
+ "learning_rate": 4.0735653492879414e-05,
257
+ "loss": 3.4631,
258
+ "step": 11500
259
+ },
260
+ {
261
+ "epoch": 0.20055151667084484,
262
+ "grad_norm": 10.677286148071289,
263
+ "learning_rate": 4.0314317013567036e-05,
264
+ "loss": 3.4498,
265
+ "step": 12000
266
+ },
267
+ {
268
+ "epoch": 0.20055151667084484,
269
+ "eval_loss": 3.659281015396118,
270
+ "eval_runtime": 22.257,
271
+ "eval_samples_per_second": 200.476,
272
+ "eval_steps_per_second": 25.071,
273
+ "step": 12000
274
+ },
275
+ {
276
+ "epoch": 0.20890782986546336,
277
+ "grad_norm": 15.71968936920166,
278
+ "learning_rate": 3.989298053425466e-05,
279
+ "loss": 3.4596,
280
+ "step": 12500
281
+ },
282
+ {
283
+ "epoch": 0.2172641430600819,
284
+ "grad_norm": 10.1958589553833,
285
+ "learning_rate": 3.947164405494228e-05,
286
+ "loss": 3.3978,
287
+ "step": 13000
288
+ },
289
+ {
290
+ "epoch": 0.2172641430600819,
291
+ "eval_loss": 3.649770736694336,
292
+ "eval_runtime": 22.2606,
293
+ "eval_samples_per_second": 200.444,
294
+ "eval_steps_per_second": 25.067,
295
+ "step": 13000
296
+ },
297
+ {
298
+ "epoch": 0.22562045625470042,
299
+ "grad_norm": 10.865114212036133,
300
+ "learning_rate": 3.90503075756299e-05,
301
+ "loss": 3.3866,
302
+ "step": 13500
303
+ },
304
+ {
305
+ "epoch": 0.23397676944931897,
306
+ "grad_norm": 13.456110954284668,
307
+ "learning_rate": 3.8628971096317526e-05,
308
+ "loss": 3.3983,
309
+ "step": 14000
310
+ },
311
+ {
312
+ "epoch": 0.23397676944931897,
313
+ "eval_loss": 3.6266236305236816,
314
+ "eval_runtime": 22.2738,
315
+ "eval_samples_per_second": 200.325,
316
+ "eval_steps_per_second": 25.052,
317
+ "step": 14000
318
+ },
319
+ {
320
+ "epoch": 0.2423330826439375,
321
+ "grad_norm": 10.894895553588867,
322
+ "learning_rate": 3.820847728996377e-05,
323
+ "loss": 3.396,
324
+ "step": 14500
325
+ },
326
+ {
327
+ "epoch": 0.25068939583855604,
328
+ "grad_norm": 15.169280052185059,
329
+ "learning_rate": 3.778714081065139e-05,
330
+ "loss": 3.3957,
331
+ "step": 15000
332
+ },
333
+ {
334
+ "epoch": 0.25068939583855604,
335
+ "eval_loss": 3.598552703857422,
336
+ "eval_runtime": 22.3613,
337
+ "eval_samples_per_second": 199.541,
338
+ "eval_steps_per_second": 24.954,
339
+ "step": 15000
340
+ },
341
+ {
342
+ "epoch": 0.2590457090331746,
343
+ "grad_norm": 11.281633377075195,
344
+ "learning_rate": 3.736580433133901e-05,
345
+ "loss": 3.3797,
346
+ "step": 15500
347
+ },
348
+ {
349
+ "epoch": 0.2674020222277931,
350
+ "grad_norm": 14.883914947509766,
351
+ "learning_rate": 3.6944467852026635e-05,
352
+ "loss": 3.3148,
353
+ "step": 16000
354
+ },
355
+ {
356
+ "epoch": 0.2674020222277931,
357
+ "eval_loss": 3.595505475997925,
358
+ "eval_runtime": 22.34,
359
+ "eval_samples_per_second": 199.731,
360
+ "eval_steps_per_second": 24.978,
361
+ "step": 16000
362
+ },
363
+ {
364
+ "epoch": 0.2757583354224116,
365
+ "grad_norm": 18.707447052001953,
366
+ "learning_rate": 3.652313137271425e-05,
367
+ "loss": 3.3563,
368
+ "step": 16500
369
+ },
370
+ {
371
+ "epoch": 0.2841146486170302,
372
+ "grad_norm": 17.429594039916992,
373
+ "learning_rate": 3.610179489340187e-05,
374
+ "loss": 3.3366,
375
+ "step": 17000
376
+ },
377
+ {
378
+ "epoch": 0.2841146486170302,
379
+ "eval_loss": 3.5599985122680664,
380
+ "eval_runtime": 22.4605,
381
+ "eval_samples_per_second": 198.66,
382
+ "eval_steps_per_second": 24.844,
383
+ "step": 17000
384
+ },
385
+ {
386
+ "epoch": 0.2924709618116487,
387
+ "grad_norm": 13.94611644744873,
388
+ "learning_rate": 3.568130108704812e-05,
389
+ "loss": 3.287,
390
+ "step": 17500
391
+ },
392
+ {
393
+ "epoch": 0.3008272750062672,
394
+ "grad_norm": 14.198286056518555,
395
+ "learning_rate": 3.5260807280694364e-05,
396
+ "loss": 3.2916,
397
+ "step": 18000
398
+ },
399
+ {
400
+ "epoch": 0.3008272750062672,
401
+ "eval_loss": 3.549407720565796,
402
+ "eval_runtime": 22.3088,
403
+ "eval_samples_per_second": 200.011,
404
+ "eval_steps_per_second": 25.013,
405
+ "step": 18000
406
+ },
407
+ {
408
+ "epoch": 0.30918358820088576,
409
+ "grad_norm": 14.446388244628906,
410
+ "learning_rate": 3.483947080138199e-05,
411
+ "loss": 3.2898,
412
+ "step": 18500
413
+ },
414
+ {
415
+ "epoch": 0.3175399013955043,
416
+ "grad_norm": 11.602550506591797,
417
+ "learning_rate": 3.441813432206961e-05,
418
+ "loss": 3.2933,
419
+ "step": 19000
420
+ },
421
+ {
422
+ "epoch": 0.3175399013955043,
423
+ "eval_loss": 3.531162977218628,
424
+ "eval_runtime": 22.2383,
425
+ "eval_samples_per_second": 200.645,
426
+ "eval_steps_per_second": 25.092,
427
+ "step": 19000
428
+ },
429
+ {
430
+ "epoch": 0.32589621459012286,
431
+ "grad_norm": 12.805480003356934,
432
+ "learning_rate": 3.3996797842757225e-05,
433
+ "loss": 3.3027,
434
+ "step": 19500
435
+ },
436
+ {
437
+ "epoch": 0.33425252778474135,
438
+ "grad_norm": 15.020812034606934,
439
+ "learning_rate": 3.357546136344485e-05,
440
+ "loss": 3.2619,
441
+ "step": 20000
442
+ },
443
+ {
444
+ "epoch": 0.33425252778474135,
445
+ "eval_loss": 3.5211801528930664,
446
+ "eval_runtime": 22.2737,
447
+ "eval_samples_per_second": 200.326,
448
+ "eval_steps_per_second": 25.052,
449
+ "step": 20000
450
+ },
451
+ {
452
+ "epoch": 0.3426088409793599,
453
+ "grad_norm": 11.382076263427734,
454
+ "learning_rate": 3.3154967557091096e-05,
455
+ "loss": 3.293,
456
+ "step": 20500
457
+ },
458
+ {
459
+ "epoch": 0.35096515417397844,
460
+ "grad_norm": 8.293545722961426,
461
+ "learning_rate": 3.273363107777871e-05,
462
+ "loss": 3.2406,
463
+ "step": 21000
464
+ },
465
+ {
466
+ "epoch": 0.35096515417397844,
467
+ "eval_loss": 3.5136260986328125,
468
+ "eval_runtime": 22.2375,
469
+ "eval_samples_per_second": 200.652,
470
+ "eval_steps_per_second": 25.093,
471
+ "step": 21000
472
+ },
473
+ {
474
+ "epoch": 0.359321467368597,
475
+ "grad_norm": 9.583738327026367,
476
+ "learning_rate": 3.2312294598466334e-05,
477
+ "loss": 3.2416,
478
+ "step": 21500
479
+ },
480
+ {
481
+ "epoch": 0.3676777805632155,
482
+ "grad_norm": 10.723130226135254,
483
+ "learning_rate": 3.1890958119153956e-05,
484
+ "loss": 3.2568,
485
+ "step": 22000
486
+ },
487
+ {
488
+ "epoch": 0.3676777805632155,
489
+ "eval_loss": 3.491396427154541,
490
+ "eval_runtime": 22.2389,
491
+ "eval_samples_per_second": 200.639,
492
+ "eval_steps_per_second": 25.091,
493
+ "step": 22000
494
+ },
495
+ {
496
+ "epoch": 0.37603409375783403,
497
+ "grad_norm": 12.601049423217773,
498
+ "learning_rate": 3.14704643128002e-05,
499
+ "loss": 3.316,
500
+ "step": 22500
501
+ },
502
+ {
503
+ "epoch": 0.3843904069524526,
504
+ "grad_norm": 12.495054244995117,
505
+ "learning_rate": 3.104912783348782e-05,
506
+ "loss": 3.2242,
507
+ "step": 23000
508
+ },
509
+ {
510
+ "epoch": 0.3843904069524526,
511
+ "eval_loss": 3.4799916744232178,
512
+ "eval_runtime": 22.2275,
513
+ "eval_samples_per_second": 200.742,
514
+ "eval_steps_per_second": 25.104,
515
+ "step": 23000
516
+ },
517
+ {
518
+ "epoch": 0.3927467201470711,
519
+ "grad_norm": 10.878073692321777,
520
+ "learning_rate": 3.062779135417544e-05,
521
+ "loss": 3.2398,
522
+ "step": 23500
523
+ },
524
+ {
525
+ "epoch": 0.4011030333416897,
526
+ "grad_norm": 11.56193733215332,
527
+ "learning_rate": 3.0206454874863066e-05,
528
+ "loss": 3.209,
529
+ "step": 24000
530
+ },
531
+ {
532
+ "epoch": 0.4011030333416897,
533
+ "eval_loss": 3.461331367492676,
534
+ "eval_runtime": 22.2718,
535
+ "eval_samples_per_second": 200.343,
536
+ "eval_steps_per_second": 25.054,
537
+ "step": 24000
538
+ },
539
+ {
540
+ "epoch": 0.40945934653630817,
541
+ "grad_norm": 14.65617561340332,
542
+ "learning_rate": 2.9785118395550688e-05,
543
+ "loss": 3.2087,
544
+ "step": 24500
545
+ },
546
+ {
547
+ "epoch": 0.4178156597309267,
548
+ "grad_norm": 10.846756935119629,
549
+ "learning_rate": 2.936378191623831e-05,
550
+ "loss": 3.1671,
551
+ "step": 25000
552
+ },
553
+ {
554
+ "epoch": 0.4178156597309267,
555
+ "eval_loss": 3.447679042816162,
556
+ "eval_runtime": 22.2558,
557
+ "eval_samples_per_second": 200.487,
558
+ "eval_steps_per_second": 25.072,
559
+ "step": 25000
560
+ },
561
+ {
562
+ "epoch": 0.42617197292554526,
563
+ "grad_norm": 11.97311019897461,
564
+ "learning_rate": 2.894244543692593e-05,
565
+ "loss": 3.1925,
566
+ "step": 25500
567
+ },
568
+ {
569
+ "epoch": 0.4345282861201638,
570
+ "grad_norm": 12.073387145996094,
571
+ "learning_rate": 2.852110895761355e-05,
572
+ "loss": 3.2222,
573
+ "step": 26000
574
+ },
575
+ {
576
+ "epoch": 0.4345282861201638,
577
+ "eval_loss": 3.4386138916015625,
578
+ "eval_runtime": 22.247,
579
+ "eval_samples_per_second": 200.567,
580
+ "eval_steps_per_second": 25.082,
581
+ "step": 26000
582
+ },
583
+ {
584
+ "epoch": 0.4428845993147823,
585
+ "grad_norm": 10.205495834350586,
586
+ "learning_rate": 2.8099772478301174e-05,
587
+ "loss": 3.1603,
588
+ "step": 26500
589
+ },
590
+ {
591
+ "epoch": 0.45124091250940085,
592
+ "grad_norm": 11.937155723571777,
593
+ "learning_rate": 2.7678435998988793e-05,
594
+ "loss": 3.1898,
595
+ "step": 27000
596
+ },
597
+ {
598
+ "epoch": 0.45124091250940085,
599
+ "eval_loss": 3.4313693046569824,
600
+ "eval_runtime": 22.2782,
601
+ "eval_samples_per_second": 200.286,
602
+ "eval_steps_per_second": 25.047,
603
+ "step": 27000
604
+ },
605
+ {
606
+ "epoch": 0.4595972257040194,
607
+ "grad_norm": 9.137228965759277,
608
+ "learning_rate": 2.725794219263504e-05,
609
+ "loss": 3.1701,
610
+ "step": 27500
611
+ },
612
+ {
613
+ "epoch": 0.46795353889863794,
614
+ "grad_norm": 16.823511123657227,
615
+ "learning_rate": 2.683660571332266e-05,
616
+ "loss": 3.142,
617
+ "step": 28000
618
+ },
619
+ {
620
+ "epoch": 0.46795353889863794,
621
+ "eval_loss": 3.4176623821258545,
622
+ "eval_runtime": 22.2488,
623
+ "eval_samples_per_second": 200.55,
624
+ "eval_steps_per_second": 25.08,
625
+ "step": 28000
626
+ },
627
+ {
628
+ "epoch": 0.47630985209325644,
629
+ "grad_norm": 8.703514099121094,
630
+ "learning_rate": 2.641526923401028e-05,
631
+ "loss": 3.167,
632
+ "step": 28500
633
+ },
634
+ {
635
+ "epoch": 0.484666165287875,
636
+ "grad_norm": 10.721179962158203,
637
+ "learning_rate": 2.5993932754697902e-05,
638
+ "loss": 3.1781,
639
+ "step": 29000
640
+ },
641
+ {
642
+ "epoch": 0.484666165287875,
643
+ "eval_loss": 3.410005569458008,
644
+ "eval_runtime": 22.2563,
645
+ "eval_samples_per_second": 200.482,
646
+ "eval_steps_per_second": 25.072,
647
+ "step": 29000
648
+ },
649
+ {
650
+ "epoch": 0.49302247848249353,
651
+ "grad_norm": 12.79716968536377,
652
+ "learning_rate": 2.5572596275385524e-05,
653
+ "loss": 3.1427,
654
+ "step": 29500
655
+ },
656
+ {
657
+ "epoch": 0.5013787916771121,
658
+ "grad_norm": 10.93630599975586,
659
+ "learning_rate": 2.5151259796073147e-05,
660
+ "loss": 3.1848,
661
+ "step": 30000
662
+ },
663
+ {
664
+ "epoch": 0.5013787916771121,
665
+ "eval_loss": 3.393404483795166,
666
+ "eval_runtime": 22.2492,
667
+ "eval_samples_per_second": 200.547,
668
+ "eval_steps_per_second": 25.08,
669
+ "step": 30000
670
+ },
671
+ {
672
+ "epoch": 0.5097351048717306,
673
+ "grad_norm": 13.065569877624512,
674
+ "learning_rate": 2.4729923316760766e-05,
675
+ "loss": 3.1423,
676
+ "step": 30500
677
+ },
678
+ {
679
+ "epoch": 0.5180914180663492,
680
+ "grad_norm": 13.116445541381836,
681
+ "learning_rate": 2.4308586837448388e-05,
682
+ "loss": 3.124,
683
+ "step": 31000
684
+ },
685
+ {
686
+ "epoch": 0.5180914180663492,
687
+ "eval_loss": 3.3824470043182373,
688
+ "eval_runtime": 22.2069,
689
+ "eval_samples_per_second": 200.929,
690
+ "eval_steps_per_second": 25.127,
691
+ "step": 31000
692
+ },
693
+ {
694
+ "epoch": 0.5264477312609677,
695
+ "grad_norm": 13.007023811340332,
696
+ "learning_rate": 2.3888093031094634e-05,
697
+ "loss": 3.1545,
698
+ "step": 31500
699
+ },
700
+ {
701
+ "epoch": 0.5348040444555862,
702
+ "grad_norm": 11.246246337890625,
703
+ "learning_rate": 2.3466756551782253e-05,
704
+ "loss": 3.1299,
705
+ "step": 32000
706
+ },
707
+ {
708
+ "epoch": 0.5348040444555862,
709
+ "eval_loss": 3.3686602115631104,
710
+ "eval_runtime": 22.2456,
711
+ "eval_samples_per_second": 200.579,
712
+ "eval_steps_per_second": 25.084,
713
+ "step": 32000
714
+ },
715
+ {
716
+ "epoch": 0.5431603576502048,
717
+ "grad_norm": 10.350507736206055,
718
+ "learning_rate": 2.3045420072469875e-05,
719
+ "loss": 3.1309,
720
+ "step": 32500
721
+ },
722
+ {
723
+ "epoch": 0.5515166708448233,
724
+ "grad_norm": 11.70686149597168,
725
+ "learning_rate": 2.2624083593157497e-05,
726
+ "loss": 3.1039,
727
+ "step": 33000
728
+ },
729
+ {
730
+ "epoch": 0.5515166708448233,
731
+ "eval_loss": 3.361196756362915,
732
+ "eval_runtime": 22.2263,
733
+ "eval_samples_per_second": 200.753,
734
+ "eval_steps_per_second": 25.105,
735
+ "step": 33000
736
+ },
737
+ {
738
+ "epoch": 0.5598729840394417,
739
+ "grad_norm": 14.55528736114502,
740
+ "learning_rate": 2.220358978680374e-05,
741
+ "loss": 3.1191,
742
+ "step": 33500
743
+ },
744
+ {
745
+ "epoch": 0.5682292972340603,
746
+ "grad_norm": 10.029934883117676,
747
+ "learning_rate": 2.178309598044999e-05,
748
+ "loss": 3.1253,
749
+ "step": 34000
750
+ },
751
+ {
752
+ "epoch": 0.5682292972340603,
753
+ "eval_loss": 3.3478872776031494,
754
+ "eval_runtime": 25.1339,
755
+ "eval_samples_per_second": 177.529,
756
+ "eval_steps_per_second": 22.201,
757
+ "step": 34000
758
+ },
759
+ {
760
+ "epoch": 0.5765856104286788,
761
+ "grad_norm": 12.41011905670166,
762
+ "learning_rate": 2.136175950113761e-05,
763
+ "loss": 3.1118,
764
+ "step": 34500
765
+ },
766
+ {
767
+ "epoch": 0.5849419236232974,
768
+ "grad_norm": 7.660207748413086,
769
+ "learning_rate": 2.094042302182523e-05,
770
+ "loss": 3.0562,
771
+ "step": 35000
772
+ },
773
+ {
774
+ "epoch": 0.5849419236232974,
775
+ "eval_loss": 3.343930244445801,
776
+ "eval_runtime": 22.2454,
777
+ "eval_samples_per_second": 200.581,
778
+ "eval_steps_per_second": 25.084,
779
+ "step": 35000
780
+ },
781
+ {
782
+ "epoch": 0.5932982368179159,
783
+ "grad_norm": 14.761736869812012,
784
+ "learning_rate": 2.0519086542512852e-05,
785
+ "loss": 3.1209,
786
+ "step": 35500
787
+ },
788
+ {
789
+ "epoch": 0.6016545500125344,
790
+ "grad_norm": 13.07590103149414,
791
+ "learning_rate": 2.0097750063200475e-05,
792
+ "loss": 3.0823,
793
+ "step": 36000
794
+ },
795
+ {
796
+ "epoch": 0.6016545500125344,
797
+ "eval_loss": 3.3252499103546143,
798
+ "eval_runtime": 22.2801,
799
+ "eval_samples_per_second": 200.268,
800
+ "eval_steps_per_second": 25.045,
801
+ "step": 36000
802
+ },
803
+ {
804
+ "epoch": 0.610010863207153,
805
+ "grad_norm": 11.221796989440918,
806
+ "learning_rate": 1.9677256256846717e-05,
807
+ "loss": 3.0989,
808
+ "step": 36500
809
+ },
810
+ {
811
+ "epoch": 0.6183671764017715,
812
+ "grad_norm": 12.109387397766113,
813
+ "learning_rate": 1.925591977753434e-05,
814
+ "loss": 3.0864,
815
+ "step": 37000
816
+ },
817
+ {
818
+ "epoch": 0.6183671764017715,
819
+ "eval_loss": 3.313296318054199,
820
+ "eval_runtime": 22.2819,
821
+ "eval_samples_per_second": 200.252,
822
+ "eval_steps_per_second": 25.043,
823
+ "step": 37000
824
+ },
825
+ {
826
+ "epoch": 0.6267234895963901,
827
+ "grad_norm": 10.61958122253418,
828
+ "learning_rate": 1.8834583298221962e-05,
829
+ "loss": 3.0931,
830
+ "step": 37500
831
+ },
832
+ {
833
+ "epoch": 0.6350798027910086,
834
+ "grad_norm": 12.216238021850586,
835
+ "learning_rate": 1.8413246818909584e-05,
836
+ "loss": 3.1024,
837
+ "step": 38000
838
+ },
839
+ {
840
+ "epoch": 0.6350798027910086,
841
+ "eval_loss": 3.300213098526001,
842
+ "eval_runtime": 22.3151,
843
+ "eval_samples_per_second": 199.955,
844
+ "eval_steps_per_second": 25.006,
845
+ "step": 38000
846
+ },
847
+ {
848
+ "epoch": 0.6434361159856271,
849
+ "grad_norm": 11.379255294799805,
850
+ "learning_rate": 1.7991910339597203e-05,
851
+ "loss": 3.021,
852
+ "step": 38500
853
+ },
854
+ {
855
+ "epoch": 0.6517924291802457,
856
+ "grad_norm": 12.111197471618652,
857
+ "learning_rate": 1.7570573860284825e-05,
858
+ "loss": 3.0478,
859
+ "step": 39000
860
+ },
861
+ {
862
+ "epoch": 0.6517924291802457,
863
+ "eval_loss": 3.294560432434082,
864
+ "eval_runtime": 22.2566,
865
+ "eval_samples_per_second": 200.479,
866
+ "eval_steps_per_second": 25.071,
867
+ "step": 39000
868
+ },
869
+ {
870
+ "epoch": 0.6601487423748642,
871
+ "grad_norm": 14.204049110412598,
872
+ "learning_rate": 1.7149237380972448e-05,
873
+ "loss": 3.0032,
874
+ "step": 39500
875
+ },
876
+ {
877
+ "epoch": 0.6685050555694827,
878
+ "grad_norm": 11.631662368774414,
879
+ "learning_rate": 1.6727900901660067e-05,
880
+ "loss": 3.0655,
881
+ "step": 40000
882
+ },
883
+ {
884
+ "epoch": 0.6685050555694827,
885
+ "eval_loss": 3.285043239593506,
886
+ "eval_runtime": 22.2429,
887
+ "eval_samples_per_second": 200.603,
888
+ "eval_steps_per_second": 25.087,
889
+ "step": 40000
890
+ },
891
+ {
892
+ "epoch": 0.6768613687641013,
893
+ "grad_norm": 13.394169807434082,
894
+ "learning_rate": 1.6307407095306312e-05,
895
+ "loss": 3.0097,
896
+ "step": 40500
897
+ },
898
+ {
899
+ "epoch": 0.6852176819587198,
900
+ "grad_norm": 14.214789390563965,
901
+ "learning_rate": 1.5886070615993935e-05,
902
+ "loss": 3.0313,
903
+ "step": 41000
904
+ },
905
+ {
906
+ "epoch": 0.6852176819587198,
907
+ "eval_loss": 3.277559280395508,
908
+ "eval_runtime": 22.242,
909
+ "eval_samples_per_second": 200.611,
910
+ "eval_steps_per_second": 25.088,
911
+ "step": 41000
912
+ },
913
+ {
914
+ "epoch": 0.6935739951533384,
915
+ "grad_norm": 8.208776473999023,
916
+ "learning_rate": 1.5464734136681554e-05,
917
+ "loss": 2.9826,
918
+ "step": 41500
919
+ },
920
+ {
921
+ "epoch": 0.7019303083479569,
922
+ "grad_norm": 10.259654998779297,
923
+ "learning_rate": 1.50442403303278e-05,
924
+ "loss": 3.0034,
925
+ "step": 42000
926
+ },
927
+ {
928
+ "epoch": 0.7019303083479569,
929
+ "eval_loss": 3.2744297981262207,
930
+ "eval_runtime": 22.2597,
931
+ "eval_samples_per_second": 200.452,
932
+ "eval_steps_per_second": 25.068,
933
+ "step": 42000
934
+ },
935
+ {
936
+ "epoch": 0.7102866215425754,
937
+ "grad_norm": 14.066951751708984,
938
+ "learning_rate": 1.462290385101542e-05,
939
+ "loss": 2.9843,
940
+ "step": 42500
941
+ },
942
+ {
943
+ "epoch": 0.718642934737194,
944
+ "grad_norm": 16.31036376953125,
945
+ "learning_rate": 1.4201567371703042e-05,
946
+ "loss": 3.0232,
947
+ "step": 43000
948
+ },
949
+ {
950
+ "epoch": 0.718642934737194,
951
+ "eval_loss": 3.2568132877349854,
952
+ "eval_runtime": 22.2183,
953
+ "eval_samples_per_second": 200.825,
954
+ "eval_steps_per_second": 25.114,
955
+ "step": 43000
956
+ },
957
+ {
958
+ "epoch": 0.7269992479318125,
959
+ "grad_norm": 12.592016220092773,
960
+ "learning_rate": 1.3780230892390663e-05,
961
+ "loss": 2.9779,
962
+ "step": 43500
963
+ },
964
+ {
965
+ "epoch": 0.735355561126431,
966
+ "grad_norm": 11.640332221984863,
967
+ "learning_rate": 1.3358894413078285e-05,
968
+ "loss": 2.9875,
969
+ "step": 44000
970
+ },
971
+ {
972
+ "epoch": 0.735355561126431,
973
+ "eval_loss": 3.2479593753814697,
974
+ "eval_runtime": 22.2574,
975
+ "eval_samples_per_second": 200.473,
976
+ "eval_steps_per_second": 25.07,
977
+ "step": 44000
978
+ },
979
+ {
980
+ "epoch": 0.7437118743210496,
981
+ "grad_norm": 18.809364318847656,
982
+ "learning_rate": 1.2937557933765906e-05,
983
+ "loss": 3.014,
984
+ "step": 44500
985
+ },
986
+ {
987
+ "epoch": 0.7520681875156681,
988
+ "grad_norm": 10.02562141418457,
989
+ "learning_rate": 1.2516221454453528e-05,
990
+ "loss": 2.9864,
991
+ "step": 45000
992
+ },
993
+ {
994
+ "epoch": 0.7520681875156681,
995
+ "eval_loss": 3.23964262008667,
996
+ "eval_runtime": 22.2529,
997
+ "eval_samples_per_second": 200.513,
998
+ "eval_steps_per_second": 25.075,
999
+ "step": 45000
1000
+ },
1001
+ {
1002
+ "epoch": 0.7604245007102867,
1003
+ "grad_norm": 14.984621047973633,
1004
+ "learning_rate": 1.2095727648099774e-05,
1005
+ "loss": 3.0169,
1006
+ "step": 45500
1007
+ },
1008
+ {
1009
+ "epoch": 0.7687808139049052,
1010
+ "grad_norm": 14.823599815368652,
1011
+ "learning_rate": 1.1674391168787393e-05,
1012
+ "loss": 2.9889,
1013
+ "step": 46000
1014
+ },
1015
+ {
1016
+ "epoch": 0.7687808139049052,
1017
+ "eval_loss": 3.2315359115600586,
1018
+ "eval_runtime": 22.2439,
1019
+ "eval_samples_per_second": 200.594,
1020
+ "eval_steps_per_second": 25.086,
1021
+ "step": 46000
1022
+ },
1023
+ {
1024
+ "epoch": 0.7771371270995237,
1025
+ "grad_norm": 15.036163330078125,
1026
+ "learning_rate": 1.1253054689475015e-05,
1027
+ "loss": 2.9498,
1028
+ "step": 46500
1029
+ },
1030
+ {
1031
+ "epoch": 0.7854934402941423,
1032
+ "grad_norm": 11.967373847961426,
1033
+ "learning_rate": 1.0831718210162636e-05,
1034
+ "loss": 2.9473,
1035
+ "step": 47000
1036
+ },
1037
+ {
1038
+ "epoch": 0.7854934402941423,
1039
+ "eval_loss": 3.2260513305664062,
1040
+ "eval_runtime": 22.2171,
1041
+ "eval_samples_per_second": 200.836,
1042
+ "eval_steps_per_second": 25.116,
1043
+ "step": 47000
1044
+ },
1045
+ {
1046
+ "epoch": 0.7938497534887607,
1047
+ "grad_norm": 16.133859634399414,
1048
+ "learning_rate": 1.0410381730850258e-05,
1049
+ "loss": 2.9733,
1050
+ "step": 47500
1051
+ },
1052
+ {
1053
+ "epoch": 0.8022060666833793,
1054
+ "grad_norm": 9.699966430664062,
1055
+ "learning_rate": 9.989045251537879e-06,
1056
+ "loss": 2.9981,
1057
+ "step": 48000
1058
+ },
1059
+ {
1060
+ "epoch": 0.8022060666833793,
1061
+ "eval_loss": 3.219076156616211,
1062
+ "eval_runtime": 22.2412,
1063
+ "eval_samples_per_second": 200.618,
1064
+ "eval_steps_per_second": 25.089,
1065
+ "step": 48000
1066
+ },
1067
+ {
1068
+ "epoch": 0.8105623798779978,
1069
+ "grad_norm": 12.452173233032227,
1070
+ "learning_rate": 9.5677087722255e-06,
1071
+ "loss": 2.9572,
1072
+ "step": 48500
1073
+ },
1074
+ {
1075
+ "epoch": 0.8189186930726163,
1076
+ "grad_norm": 10.909358024597168,
1077
+ "learning_rate": 9.14637229291312e-06,
1078
+ "loss": 2.9086,
1079
+ "step": 49000
1080
+ },
1081
+ {
1082
+ "epoch": 0.8189186930726163,
1083
+ "eval_loss": 3.2129132747650146,
1084
+ "eval_runtime": 22.2484,
1085
+ "eval_samples_per_second": 200.554,
1086
+ "eval_steps_per_second": 25.08,
1087
+ "step": 49000
1088
+ },
1089
+ {
1090
+ "epoch": 0.8272750062672349,
1091
+ "grad_norm": 15.663894653320312,
1092
+ "learning_rate": 8.725878486559366e-06,
1093
+ "loss": 2.9755,
1094
+ "step": 49500
1095
+ },
1096
+ {
1097
+ "epoch": 0.8356313194618534,
1098
+ "grad_norm": 9.793340682983398,
1099
+ "learning_rate": 8.304542007246988e-06,
1100
+ "loss": 2.9533,
1101
+ "step": 50000
1102
+ },
1103
+ {
1104
+ "epoch": 0.8356313194618534,
1105
+ "eval_loss": 3.204308271408081,
1106
+ "eval_runtime": 22.2781,
1107
+ "eval_samples_per_second": 200.286,
1108
+ "eval_steps_per_second": 25.047,
1109
+ "step": 50000
1110
+ },
1111
+ {
1112
+ "epoch": 0.8439876326564719,
1113
+ "grad_norm": 13.584871292114258,
1114
+ "learning_rate": 7.883205527934609e-06,
1115
+ "loss": 2.9323,
1116
+ "step": 50500
1117
+ },
1118
+ {
1119
+ "epoch": 0.8523439458510905,
1120
+ "grad_norm": 17.01344871520996,
1121
+ "learning_rate": 7.4618690486222304e-06,
1122
+ "loss": 2.9298,
1123
+ "step": 51000
1124
+ },
1125
+ {
1126
+ "epoch": 0.8523439458510905,
1127
+ "eval_loss": 3.1974070072174072,
1128
+ "eval_runtime": 22.2555,
1129
+ "eval_samples_per_second": 200.489,
1130
+ "eval_steps_per_second": 25.072,
1131
+ "step": 51000
1132
+ },
1133
+ {
1134
+ "epoch": 0.860700259045709,
1135
+ "grad_norm": 11.150009155273438,
1136
+ "learning_rate": 7.041375242268476e-06,
1137
+ "loss": 2.9175,
1138
+ "step": 51500
1139
+ },
1140
+ {
1141
+ "epoch": 0.8690565722403276,
1142
+ "grad_norm": 10.69240951538086,
1143
+ "learning_rate": 6.620038762956098e-06,
1144
+ "loss": 2.9156,
1145
+ "step": 52000
1146
+ },
1147
+ {
1148
+ "epoch": 0.8690565722403276,
1149
+ "eval_loss": 3.192389488220215,
1150
+ "eval_runtime": 22.2526,
1151
+ "eval_samples_per_second": 200.516,
1152
+ "eval_steps_per_second": 25.076,
1153
+ "step": 52000
1154
+ },
1155
+ {
1156
+ "epoch": 0.8774128854349461,
1157
+ "grad_norm": 12.42056655883789,
1158
+ "learning_rate": 6.198702283643718e-06,
1159
+ "loss": 2.8909,
1160
+ "step": 52500
1161
+ },
1162
+ {
1163
+ "epoch": 0.8857691986295646,
1164
+ "grad_norm": 11.83171558380127,
1165
+ "learning_rate": 5.77736580433134e-06,
1166
+ "loss": 2.9452,
1167
+ "step": 53000
1168
+ },
1169
+ {
1170
+ "epoch": 0.8857691986295646,
1171
+ "eval_loss": 3.186549425125122,
1172
+ "eval_runtime": 22.2935,
1173
+ "eval_samples_per_second": 200.148,
1174
+ "eval_steps_per_second": 25.03,
1175
+ "step": 53000
1176
+ },
1177
+ {
1178
+ "epoch": 0.8941255118241832,
1179
+ "grad_norm": 17.534683227539062,
1180
+ "learning_rate": 5.3568719979775856e-06,
1181
+ "loss": 2.9173,
1182
+ "step": 53500
1183
+ },
1184
+ {
1185
+ "epoch": 0.9024818250188017,
1186
+ "grad_norm": 13.407784461975098,
1187
+ "learning_rate": 4.935535518665206e-06,
1188
+ "loss": 2.959,
1189
+ "step": 54000
1190
+ },
1191
+ {
1192
+ "epoch": 0.9024818250188017,
1193
+ "eval_loss": 3.1784276962280273,
1194
+ "eval_runtime": 22.2696,
1195
+ "eval_samples_per_second": 200.363,
1196
+ "eval_steps_per_second": 25.057,
1197
+ "step": 54000
1198
+ },
1199
+ {
1200
+ "epoch": 0.9108381382134202,
1201
+ "grad_norm": 14.451789855957031,
1202
+ "learning_rate": 4.514199039352828e-06,
1203
+ "loss": 2.9441,
1204
+ "step": 54500
1205
+ },
1206
+ {
1207
+ "epoch": 0.9191944514080388,
1208
+ "grad_norm": 11.849891662597656,
1209
+ "learning_rate": 4.092862560040449e-06,
1210
+ "loss": 2.8955,
1211
+ "step": 55000
1212
+ },
1213
+ {
1214
+ "epoch": 0.9191944514080388,
1215
+ "eval_loss": 3.1731808185577393,
1216
+ "eval_runtime": 22.2282,
1217
+ "eval_samples_per_second": 200.736,
1218
+ "eval_steps_per_second": 25.103,
1219
+ "step": 55000
1220
+ },
1221
+ {
1222
+ "epoch": 0.9275507646026573,
1223
+ "grad_norm": 13.188152313232422,
1224
+ "learning_rate": 3.6715260807280694e-06,
1225
+ "loss": 2.889,
1226
+ "step": 55500
1227
+ },
1228
+ {
1229
+ "epoch": 0.9359070777972759,
1230
+ "grad_norm": 16.554080963134766,
1231
+ "learning_rate": 3.250189601415691e-06,
1232
+ "loss": 2.9453,
1233
+ "step": 56000
1234
+ },
1235
+ {
1236
+ "epoch": 0.9359070777972759,
1237
+ "eval_loss": 3.167375087738037,
1238
+ "eval_runtime": 22.2342,
1239
+ "eval_samples_per_second": 200.682,
1240
+ "eval_steps_per_second": 25.096,
1241
+ "step": 56000
1242
+ },
1243
+ {
1244
+ "epoch": 0.9442633909918944,
1245
+ "grad_norm": 11.261225700378418,
1246
+ "learning_rate": 2.828853122103312e-06,
1247
+ "loss": 2.9546,
1248
+ "step": 56500
1249
+ },
1250
+ {
1251
+ "epoch": 0.9526197041865129,
1252
+ "grad_norm": 9.682096481323242,
1253
+ "learning_rate": 2.407516642790933e-06,
1254
+ "loss": 2.8689,
1255
+ "step": 57000
1256
+ },
1257
+ {
1258
+ "epoch": 0.9526197041865129,
1259
+ "eval_loss": 3.1633212566375732,
1260
+ "eval_runtime": 22.2444,
1261
+ "eval_samples_per_second": 200.59,
1262
+ "eval_steps_per_second": 25.085,
1263
+ "step": 57000
1264
+ },
1265
+ {
1266
+ "epoch": 0.9609760173811315,
1267
+ "grad_norm": 10.658622741699219,
1268
+ "learning_rate": 1.9870228364371787e-06,
1269
+ "loss": 2.8708,
1270
+ "step": 57500
1271
+ },
1272
+ {
1273
+ "epoch": 0.96933233057575,
1274
+ "grad_norm": 11.864119529724121,
1275
+ "learning_rate": 1.5656863571248002e-06,
1276
+ "loss": 2.8988,
1277
+ "step": 58000
1278
+ },
1279
+ {
1280
+ "epoch": 0.96933233057575,
1281
+ "eval_loss": 3.1584391593933105,
1282
+ "eval_runtime": 22.2736,
1283
+ "eval_samples_per_second": 200.327,
1284
+ "eval_steps_per_second": 25.052,
1285
+ "step": 58000
1286
+ },
1287
+ {
1288
+ "epoch": 0.9776886437703685,
1289
+ "grad_norm": 9.938982009887695,
1290
+ "learning_rate": 1.144349877812421e-06,
1291
+ "loss": 2.9186,
1292
+ "step": 58500
1293
+ },
1294
+ {
1295
+ "epoch": 0.9860449569649871,
1296
+ "grad_norm": 12.801502227783203,
1297
+ "learning_rate": 7.230133985000422e-07,
1298
+ "loss": 2.8888,
1299
+ "step": 59000
1300
+ },
1301
+ {
1302
+ "epoch": 0.9860449569649871,
1303
+ "eval_loss": 3.157134532928467,
1304
+ "eval_runtime": 22.2821,
1305
+ "eval_samples_per_second": 200.251,
1306
+ "eval_steps_per_second": 25.043,
1307
+ "step": 59000
1308
+ },
1309
+ {
1310
+ "epoch": 0.9944012701596056,
1311
+ "grad_norm": 16.9565372467041,
1312
+ "learning_rate": 3.025195921462881e-07,
1313
+ "loss": 2.8857,
1314
+ "step": 59500
1315
+ }
1316
+ ],
1317
+ "logging_steps": 500,
1318
+ "max_steps": 59835,
1319
+ "num_input_tokens_seen": 0,
1320
+ "num_train_epochs": 1,
1321
+ "save_steps": 2000,
1322
+ "stateful_callbacks": {
1323
+ "TrainerControl": {
1324
+ "args": {
1325
+ "should_epoch_stop": false,
1326
+ "should_evaluate": false,
1327
+ "should_log": false,
1328
+ "should_save": true,
1329
+ "should_training_stop": true
1330
+ },
1331
+ "attributes": {}
1332
+ }
1333
+ },
1334
+ "total_flos": 2.943723113325527e+17,
1335
+ "train_batch_size": 8,
1336
+ "trial_name": null,
1337
+ "trial_params": null
1338
+ }
custom-tokenizer/full-finetuning/LLaMmlein_120M/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f88c9a573a75529b8b6e9fd0abb8f8716c5a97badf4da729f6e39e4f77539b11
3
+ size 5368
custom-tokenizer/full-finetuning/german-gpt2/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "dbmdz/german-gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.0,
10
+ "eos_token_id": 50256,
11
+ "gradient_checkpointing": false,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.47.1",
38
+ "use_cache": true,
39
+ "vocab_size": 50266
40
+ }
custom-tokenizer/full-finetuning/german-gpt2/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.47.1"
6
+ }
custom-tokenizer/full-finetuning/german-gpt2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eb02a452b7bcd4d795b4683e55cc8658721a3944d1ab17594f817ff07d3c06d
3
+ size 497801856
custom-tokenizer/full-finetuning/german-gpt2/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3962fa444f5a31e9641be7ed55d6a2cc3604aa41ef2757f52f75bd66f0075a6
3
+ size 995697594
custom-tokenizer/full-finetuning/german-gpt2/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3bc1ac846859ad13fd356d3ed41fd20834205a7e8764e4198636863fca3f64c
3
+ size 14244
custom-tokenizer/full-finetuning/german-gpt2/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23f99f08d1d78974aae2a3a91c7471d7ba832f92d524504de2b7948fa62af5d8
3
+ size 1064
custom-tokenizer/full-finetuning/german-gpt2/trainer_state.json ADDED
@@ -0,0 +1,1338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.311403751373291,
3
+ "best_model_checkpoint": "./models/custom-tokenizer/full-finetuning/german-gpt2/checkpoint-59000",
4
+ "epoch": 1.0,
5
+ "eval_steps": 1000,
6
+ "global_step": 59835,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.008356313194618534,
13
+ "grad_norm": 7.739098072052002,
14
+ "learning_rate": 4.97e-05,
15
+ "loss": 5.7212,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.016712626389237067,
20
+ "grad_norm": 6.011812686920166,
21
+ "learning_rate": 4.95811915395635e-05,
22
+ "loss": 4.8525,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.016712626389237067,
27
+ "eval_loss": 4.5701584815979,
28
+ "eval_runtime": 34.6716,
29
+ "eval_samples_per_second": 128.693,
30
+ "eval_steps_per_second": 16.094,
31
+ "step": 1000
32
+ },
33
+ {
34
+ "epoch": 0.025068939583855605,
35
+ "grad_norm": 5.455048561096191,
36
+ "learning_rate": 4.915985506025112e-05,
37
+ "loss": 4.6135,
38
+ "step": 1500
39
+ },
40
+ {
41
+ "epoch": 0.033425252778474135,
42
+ "grad_norm": 7.002511024475098,
43
+ "learning_rate": 4.873851858093874e-05,
44
+ "loss": 4.4939,
45
+ "step": 2000
46
+ },
47
+ {
48
+ "epoch": 0.033425252778474135,
49
+ "eval_loss": 4.25942325592041,
50
+ "eval_runtime": 34.6626,
51
+ "eval_samples_per_second": 128.727,
52
+ "eval_steps_per_second": 16.098,
53
+ "step": 2000
54
+ },
55
+ {
56
+ "epoch": 0.04178156597309267,
57
+ "grad_norm": 6.586827754974365,
58
+ "learning_rate": 4.8317182101626365e-05,
59
+ "loss": 4.3682,
60
+ "step": 2500
61
+ },
62
+ {
63
+ "epoch": 0.05013787916771121,
64
+ "grad_norm": 5.179999351501465,
65
+ "learning_rate": 4.789584562231398e-05,
66
+ "loss": 4.2088,
67
+ "step": 3000
68
+ },
69
+ {
70
+ "epoch": 0.05013787916771121,
71
+ "eval_loss": 4.1048455238342285,
72
+ "eval_runtime": 34.6948,
73
+ "eval_samples_per_second": 128.607,
74
+ "eval_steps_per_second": 16.083,
75
+ "step": 3000
76
+ },
77
+ {
78
+ "epoch": 0.05849419236232974,
79
+ "grad_norm": 4.009952545166016,
80
+ "learning_rate": 4.74745091430016e-05,
81
+ "loss": 4.2534,
82
+ "step": 3500
83
+ },
84
+ {
85
+ "epoch": 0.06685050555694827,
86
+ "grad_norm": 5.067552089691162,
87
+ "learning_rate": 4.7053172663689226e-05,
88
+ "loss": 4.2047,
89
+ "step": 4000
90
+ },
91
+ {
92
+ "epoch": 0.06685050555694827,
93
+ "eval_loss": 4.001619815826416,
94
+ "eval_runtime": 34.7045,
95
+ "eval_samples_per_second": 128.571,
96
+ "eval_steps_per_second": 16.079,
97
+ "step": 4000
98
+ },
99
+ {
100
+ "epoch": 0.0752068187515668,
101
+ "grad_norm": 8.432042121887207,
102
+ "learning_rate": 4.663183618437685e-05,
103
+ "loss": 4.1514,
104
+ "step": 4500
105
+ },
106
+ {
107
+ "epoch": 0.08356313194618534,
108
+ "grad_norm": 5.663743019104004,
109
+ "learning_rate": 4.621049970506447e-05,
110
+ "loss": 4.0911,
111
+ "step": 5000
112
+ },
113
+ {
114
+ "epoch": 0.08356313194618534,
115
+ "eval_loss": 3.927936553955078,
116
+ "eval_runtime": 34.7705,
117
+ "eval_samples_per_second": 128.327,
118
+ "eval_steps_per_second": 16.048,
119
+ "step": 5000
120
+ },
121
+ {
122
+ "epoch": 0.09191944514080387,
123
+ "grad_norm": 6.105470180511475,
124
+ "learning_rate": 4.5789163225752086e-05,
125
+ "loss": 4.0603,
126
+ "step": 5500
127
+ },
128
+ {
129
+ "epoch": 0.10027575833542242,
130
+ "grad_norm": 4.524149417877197,
131
+ "learning_rate": 4.536782674643971e-05,
132
+ "loss": 4.0241,
133
+ "step": 6000
134
+ },
135
+ {
136
+ "epoch": 0.10027575833542242,
137
+ "eval_loss": 3.868943691253662,
138
+ "eval_runtime": 34.7126,
139
+ "eval_samples_per_second": 128.541,
140
+ "eval_steps_per_second": 16.075,
141
+ "step": 6000
142
+ },
143
+ {
144
+ "epoch": 0.10863207153004095,
145
+ "grad_norm": 5.74066686630249,
146
+ "learning_rate": 4.494649026712733e-05,
147
+ "loss": 3.9523,
148
+ "step": 6500
149
+ },
150
+ {
151
+ "epoch": 0.11698838472465949,
152
+ "grad_norm": 5.0315046310424805,
153
+ "learning_rate": 4.452599646077358e-05,
154
+ "loss": 3.9988,
155
+ "step": 7000
156
+ },
157
+ {
158
+ "epoch": 0.11698838472465949,
159
+ "eval_loss": 3.817981719970703,
160
+ "eval_runtime": 34.7005,
161
+ "eval_samples_per_second": 128.586,
162
+ "eval_steps_per_second": 16.08,
163
+ "step": 7000
164
+ },
165
+ {
166
+ "epoch": 0.12534469791927802,
167
+ "grad_norm": 6.331228733062744,
168
+ "learning_rate": 4.41046599814612e-05,
169
+ "loss": 3.9683,
170
+ "step": 7500
171
+ },
172
+ {
173
+ "epoch": 0.13370101111389654,
174
+ "grad_norm": 5.36767578125,
175
+ "learning_rate": 4.368332350214882e-05,
176
+ "loss": 3.91,
177
+ "step": 8000
178
+ },
179
+ {
180
+ "epoch": 0.13370101111389654,
181
+ "eval_loss": 3.774301528930664,
182
+ "eval_runtime": 34.7154,
183
+ "eval_samples_per_second": 128.531,
184
+ "eval_steps_per_second": 16.074,
185
+ "step": 8000
186
+ },
187
+ {
188
+ "epoch": 0.1420573243085151,
189
+ "grad_norm": 6.756371021270752,
190
+ "learning_rate": 4.326198702283644e-05,
191
+ "loss": 3.9031,
192
+ "step": 8500
193
+ },
194
+ {
195
+ "epoch": 0.1504136375031336,
196
+ "grad_norm": 5.863716125488281,
197
+ "learning_rate": 4.2840650543524055e-05,
198
+ "loss": 3.8962,
199
+ "step": 9000
200
+ },
201
+ {
202
+ "epoch": 0.1504136375031336,
203
+ "eval_loss": 3.7521703243255615,
204
+ "eval_runtime": 34.7163,
205
+ "eval_samples_per_second": 128.527,
206
+ "eval_steps_per_second": 16.073,
207
+ "step": 9000
208
+ },
209
+ {
210
+ "epoch": 0.15876995069775215,
211
+ "grad_norm": 4.68450403213501,
212
+ "learning_rate": 4.241931406421168e-05,
213
+ "loss": 3.853,
214
+ "step": 9500
215
+ },
216
+ {
217
+ "epoch": 0.16712626389237067,
218
+ "grad_norm": 5.654048442840576,
219
+ "learning_rate": 4.199882025785793e-05,
220
+ "loss": 3.8244,
221
+ "step": 10000
222
+ },
223
+ {
224
+ "epoch": 0.16712626389237067,
225
+ "eval_loss": 3.7112536430358887,
226
+ "eval_runtime": 34.6707,
227
+ "eval_samples_per_second": 128.697,
228
+ "eval_steps_per_second": 16.094,
229
+ "step": 10000
230
+ },
231
+ {
232
+ "epoch": 0.17548257708698922,
233
+ "grad_norm": 4.9623284339904785,
234
+ "learning_rate": 4.157748377854555e-05,
235
+ "loss": 3.8098,
236
+ "step": 10500
237
+ },
238
+ {
239
+ "epoch": 0.18383889028160774,
240
+ "grad_norm": 9.91655158996582,
241
+ "learning_rate": 4.115614729923317e-05,
242
+ "loss": 3.7687,
243
+ "step": 11000
244
+ },
245
+ {
246
+ "epoch": 0.18383889028160774,
247
+ "eval_loss": 3.6787209510803223,
248
+ "eval_runtime": 34.653,
249
+ "eval_samples_per_second": 128.762,
250
+ "eval_steps_per_second": 16.103,
251
+ "step": 11000
252
+ },
253
+ {
254
+ "epoch": 0.1921952034762263,
255
+ "grad_norm": 7.150275707244873,
256
+ "learning_rate": 4.0734810819920794e-05,
257
+ "loss": 3.8006,
258
+ "step": 11500
259
+ },
260
+ {
261
+ "epoch": 0.20055151667084484,
262
+ "grad_norm": 3.9780406951904297,
263
+ "learning_rate": 4.031347434060841e-05,
264
+ "loss": 3.7853,
265
+ "step": 12000
266
+ },
267
+ {
268
+ "epoch": 0.20055151667084484,
269
+ "eval_loss": 3.655853748321533,
270
+ "eval_runtime": 34.6805,
271
+ "eval_samples_per_second": 128.66,
272
+ "eval_steps_per_second": 16.09,
273
+ "step": 12000
274
+ },
275
+ {
276
+ "epoch": 0.20890782986546336,
277
+ "grad_norm": 7.393148422241211,
278
+ "learning_rate": 3.989298053425466e-05,
279
+ "loss": 3.8083,
280
+ "step": 12500
281
+ },
282
+ {
283
+ "epoch": 0.2172641430600819,
284
+ "grad_norm": 4.079512119293213,
285
+ "learning_rate": 3.947164405494228e-05,
286
+ "loss": 3.7215,
287
+ "step": 13000
288
+ },
289
+ {
290
+ "epoch": 0.2172641430600819,
291
+ "eval_loss": 3.6416282653808594,
292
+ "eval_runtime": 34.6971,
293
+ "eval_samples_per_second": 128.599,
294
+ "eval_steps_per_second": 16.082,
295
+ "step": 13000
296
+ },
297
+ {
298
+ "epoch": 0.22562045625470042,
299
+ "grad_norm": 3.7765800952911377,
300
+ "learning_rate": 3.90503075756299e-05,
301
+ "loss": 3.7131,
302
+ "step": 13500
303
+ },
304
+ {
305
+ "epoch": 0.23397676944931897,
306
+ "grad_norm": 5.007580280303955,
307
+ "learning_rate": 3.8628971096317526e-05,
308
+ "loss": 3.7509,
309
+ "step": 14000
310
+ },
311
+ {
312
+ "epoch": 0.23397676944931897,
313
+ "eval_loss": 3.6096222400665283,
314
+ "eval_runtime": 34.7053,
315
+ "eval_samples_per_second": 128.568,
316
+ "eval_steps_per_second": 16.078,
317
+ "step": 14000
318
+ },
319
+ {
320
+ "epoch": 0.2423330826439375,
321
+ "grad_norm": 4.108855724334717,
322
+ "learning_rate": 3.820763461700515e-05,
323
+ "loss": 3.7315,
324
+ "step": 14500
325
+ },
326
+ {
327
+ "epoch": 0.25068939583855604,
328
+ "grad_norm": 5.825819492340088,
329
+ "learning_rate": 3.7786298137692763e-05,
330
+ "loss": 3.7185,
331
+ "step": 15000
332
+ },
333
+ {
334
+ "epoch": 0.25068939583855604,
335
+ "eval_loss": 3.5938196182250977,
336
+ "eval_runtime": 34.6849,
337
+ "eval_samples_per_second": 128.644,
338
+ "eval_steps_per_second": 16.088,
339
+ "step": 15000
340
+ },
341
+ {
342
+ "epoch": 0.2590457090331746,
343
+ "grad_norm": 5.564838409423828,
344
+ "learning_rate": 3.736496165838038e-05,
345
+ "loss": 3.7059,
346
+ "step": 15500
347
+ },
348
+ {
349
+ "epoch": 0.2674020222277931,
350
+ "grad_norm": 5.582629203796387,
351
+ "learning_rate": 3.6943625179068e-05,
352
+ "loss": 3.6318,
353
+ "step": 16000
354
+ },
355
+ {
356
+ "epoch": 0.2674020222277931,
357
+ "eval_loss": 3.5812368392944336,
358
+ "eval_runtime": 34.698,
359
+ "eval_samples_per_second": 128.595,
360
+ "eval_steps_per_second": 16.082,
361
+ "step": 16000
362
+ },
363
+ {
364
+ "epoch": 0.2757583354224116,
365
+ "grad_norm": 7.805865287780762,
366
+ "learning_rate": 3.652313137271425e-05,
367
+ "loss": 3.6836,
368
+ "step": 16500
369
+ },
370
+ {
371
+ "epoch": 0.2841146486170302,
372
+ "grad_norm": 7.230532169342041,
373
+ "learning_rate": 3.61026375663605e-05,
374
+ "loss": 3.6965,
375
+ "step": 17000
376
+ },
377
+ {
378
+ "epoch": 0.2841146486170302,
379
+ "eval_loss": 3.5589852333068848,
380
+ "eval_runtime": 34.7222,
381
+ "eval_samples_per_second": 128.506,
382
+ "eval_steps_per_second": 16.07,
383
+ "step": 17000
384
+ },
385
+ {
386
+ "epoch": 0.2924709618116487,
387
+ "grad_norm": 5.060893535614014,
388
+ "learning_rate": 3.568130108704812e-05,
389
+ "loss": 3.6219,
390
+ "step": 17500
391
+ },
392
+ {
393
+ "epoch": 0.3008272750062672,
394
+ "grad_norm": 6.640148639678955,
395
+ "learning_rate": 3.525996460773574e-05,
396
+ "loss": 3.6204,
397
+ "step": 18000
398
+ },
399
+ {
400
+ "epoch": 0.3008272750062672,
401
+ "eval_loss": 3.5444483757019043,
402
+ "eval_runtime": 34.8334,
403
+ "eval_samples_per_second": 128.095,
404
+ "eval_steps_per_second": 16.019,
405
+ "step": 18000
406
+ },
407
+ {
408
+ "epoch": 0.30918358820088576,
409
+ "grad_norm": 6.7097930908203125,
410
+ "learning_rate": 3.483862812842336e-05,
411
+ "loss": 3.6134,
412
+ "step": 18500
413
+ },
414
+ {
415
+ "epoch": 0.3175399013955043,
416
+ "grad_norm": 5.851566314697266,
417
+ "learning_rate": 3.441729164911098e-05,
418
+ "loss": 3.6138,
419
+ "step": 19000
420
+ },
421
+ {
422
+ "epoch": 0.3175399013955043,
423
+ "eval_loss": 3.529630184173584,
424
+ "eval_runtime": 34.7073,
425
+ "eval_samples_per_second": 128.561,
426
+ "eval_steps_per_second": 16.077,
427
+ "step": 19000
428
+ },
429
+ {
430
+ "epoch": 0.32589621459012286,
431
+ "grad_norm": 4.4097981452941895,
432
+ "learning_rate": 3.3995955169798604e-05,
433
+ "loss": 3.6202,
434
+ "step": 19500
435
+ },
436
+ {
437
+ "epoch": 0.33425252778474135,
438
+ "grad_norm": 6.881157398223877,
439
+ "learning_rate": 3.357461869048623e-05,
440
+ "loss": 3.5862,
441
+ "step": 20000
442
+ },
443
+ {
444
+ "epoch": 0.33425252778474135,
445
+ "eval_loss": 3.5189144611358643,
446
+ "eval_runtime": 34.7046,
447
+ "eval_samples_per_second": 128.571,
448
+ "eval_steps_per_second": 16.079,
449
+ "step": 20000
450
+ },
451
+ {
452
+ "epoch": 0.3426088409793599,
453
+ "grad_norm": 4.771751880645752,
454
+ "learning_rate": 3.315328221117385e-05,
455
+ "loss": 3.613,
456
+ "step": 20500
457
+ },
458
+ {
459
+ "epoch": 0.35096515417397844,
460
+ "grad_norm": 3.5520098209381104,
461
+ "learning_rate": 3.273278840482009e-05,
462
+ "loss": 3.5767,
463
+ "step": 21000
464
+ },
465
+ {
466
+ "epoch": 0.35096515417397844,
467
+ "eval_loss": 3.506736993789673,
468
+ "eval_runtime": 35.4581,
469
+ "eval_samples_per_second": 125.839,
470
+ "eval_steps_per_second": 15.737,
471
+ "step": 21000
472
+ },
473
+ {
474
+ "epoch": 0.359321467368597,
475
+ "grad_norm": 3.511641263961792,
476
+ "learning_rate": 3.2311451925507714e-05,
477
+ "loss": 3.5692,
478
+ "step": 21500
479
+ },
480
+ {
481
+ "epoch": 0.3676777805632155,
482
+ "grad_norm": 4.536217212677002,
483
+ "learning_rate": 3.1890115446195336e-05,
484
+ "loss": 3.5913,
485
+ "step": 22000
486
+ },
487
+ {
488
+ "epoch": 0.3676777805632155,
489
+ "eval_loss": 3.490816354751587,
490
+ "eval_runtime": 34.731,
491
+ "eval_samples_per_second": 128.473,
492
+ "eval_steps_per_second": 16.066,
493
+ "step": 22000
494
+ },
495
+ {
496
+ "epoch": 0.37603409375783403,
497
+ "grad_norm": 5.8858184814453125,
498
+ "learning_rate": 3.146877896688296e-05,
499
+ "loss": 3.6495,
500
+ "step": 22500
501
+ },
502
+ {
503
+ "epoch": 0.3843904069524526,
504
+ "grad_norm": 5.291027069091797,
505
+ "learning_rate": 3.10482851605292e-05,
506
+ "loss": 3.5796,
507
+ "step": 23000
508
+ },
509
+ {
510
+ "epoch": 0.3843904069524526,
511
+ "eval_loss": 3.4832894802093506,
512
+ "eval_runtime": 34.6998,
513
+ "eval_samples_per_second": 128.589,
514
+ "eval_steps_per_second": 16.081,
515
+ "step": 23000
516
+ },
517
+ {
518
+ "epoch": 0.3927467201470711,
519
+ "grad_norm": 4.761454105377197,
520
+ "learning_rate": 3.062694868121682e-05,
521
+ "loss": 3.5603,
522
+ "step": 23500
523
+ },
524
+ {
525
+ "epoch": 0.4011030333416897,
526
+ "grad_norm": 4.90023946762085,
527
+ "learning_rate": 3.0205612201904442e-05,
528
+ "loss": 3.5428,
529
+ "step": 24000
530
+ },
531
+ {
532
+ "epoch": 0.4011030333416897,
533
+ "eval_loss": 3.4762682914733887,
534
+ "eval_runtime": 34.6953,
535
+ "eval_samples_per_second": 128.605,
536
+ "eval_steps_per_second": 16.083,
537
+ "step": 24000
538
+ },
539
+ {
540
+ "epoch": 0.40945934653630817,
541
+ "grad_norm": 6.675970554351807,
542
+ "learning_rate": 2.9784275722592064e-05,
543
+ "loss": 3.5559,
544
+ "step": 24500
545
+ },
546
+ {
547
+ "epoch": 0.4178156597309267,
548
+ "grad_norm": 4.5930914878845215,
549
+ "learning_rate": 2.936378191623831e-05,
550
+ "loss": 3.5087,
551
+ "step": 25000
552
+ },
553
+ {
554
+ "epoch": 0.4178156597309267,
555
+ "eval_loss": 3.465832233428955,
556
+ "eval_runtime": 34.6863,
557
+ "eval_samples_per_second": 128.639,
558
+ "eval_steps_per_second": 16.087,
559
+ "step": 25000
560
+ },
561
+ {
562
+ "epoch": 0.42617197292554526,
563
+ "grad_norm": 4.835921287536621,
564
+ "learning_rate": 2.894244543692593e-05,
565
+ "loss": 3.5474,
566
+ "step": 25500
567
+ },
568
+ {
569
+ "epoch": 0.4345282861201638,
570
+ "grad_norm": 5.828157424926758,
571
+ "learning_rate": 2.852110895761355e-05,
572
+ "loss": 3.5724,
573
+ "step": 26000
574
+ },
575
+ {
576
+ "epoch": 0.4345282861201638,
577
+ "eval_loss": 3.458848714828491,
578
+ "eval_runtime": 34.6689,
579
+ "eval_samples_per_second": 128.703,
580
+ "eval_steps_per_second": 16.095,
581
+ "step": 26000
582
+ },
583
+ {
584
+ "epoch": 0.4428845993147823,
585
+ "grad_norm": 3.7706058025360107,
586
+ "learning_rate": 2.8099772478301174e-05,
587
+ "loss": 3.49,
588
+ "step": 26500
589
+ },
590
+ {
591
+ "epoch": 0.45124091250940085,
592
+ "grad_norm": 4.761475563049316,
593
+ "learning_rate": 2.7679278671947416e-05,
594
+ "loss": 3.5493,
595
+ "step": 27000
596
+ },
597
+ {
598
+ "epoch": 0.45124091250940085,
599
+ "eval_loss": 3.447903633117676,
600
+ "eval_runtime": 34.6802,
601
+ "eval_samples_per_second": 128.661,
602
+ "eval_steps_per_second": 16.09,
603
+ "step": 27000
604
+ },
605
+ {
606
+ "epoch": 0.4595972257040194,
607
+ "grad_norm": 4.8344292640686035,
608
+ "learning_rate": 2.7258784865593662e-05,
609
+ "loss": 3.5161,
610
+ "step": 27500
611
+ },
612
+ {
613
+ "epoch": 0.46795353889863794,
614
+ "grad_norm": 8.372416496276855,
615
+ "learning_rate": 2.6837448386281284e-05,
616
+ "loss": 3.4814,
617
+ "step": 28000
618
+ },
619
+ {
620
+ "epoch": 0.46795353889863794,
621
+ "eval_loss": 3.4423773288726807,
622
+ "eval_runtime": 34.6998,
623
+ "eval_samples_per_second": 128.588,
624
+ "eval_steps_per_second": 16.081,
625
+ "step": 28000
626
+ },
627
+ {
628
+ "epoch": 0.47630985209325644,
629
+ "grad_norm": 3.9577817916870117,
630
+ "learning_rate": 2.6416111906968903e-05,
631
+ "loss": 3.5169,
632
+ "step": 28500
633
+ },
634
+ {
635
+ "epoch": 0.484666165287875,
636
+ "grad_norm": 4.661665439605713,
637
+ "learning_rate": 2.5994775427656526e-05,
638
+ "loss": 3.5444,
639
+ "step": 29000
640
+ },
641
+ {
642
+ "epoch": 0.484666165287875,
643
+ "eval_loss": 3.4300851821899414,
644
+ "eval_runtime": 34.6929,
645
+ "eval_samples_per_second": 128.614,
646
+ "eval_steps_per_second": 16.084,
647
+ "step": 29000
648
+ },
649
+ {
650
+ "epoch": 0.49302247848249353,
651
+ "grad_norm": 5.853386402130127,
652
+ "learning_rate": 2.5573438948344148e-05,
653
+ "loss": 3.5049,
654
+ "step": 29500
655
+ },
656
+ {
657
+ "epoch": 0.5013787916771121,
658
+ "grad_norm": 3.900343894958496,
659
+ "learning_rate": 2.515210246903177e-05,
660
+ "loss": 3.5451,
661
+ "step": 30000
662
+ },
663
+ {
664
+ "epoch": 0.5013787916771121,
665
+ "eval_loss": 3.421967029571533,
666
+ "eval_runtime": 34.6635,
667
+ "eval_samples_per_second": 128.723,
668
+ "eval_steps_per_second": 16.098,
669
+ "step": 30000
670
+ },
671
+ {
672
+ "epoch": 0.5097351048717306,
673
+ "grad_norm": 5.265810966491699,
674
+ "learning_rate": 2.473076598971939e-05,
675
+ "loss": 3.4952,
676
+ "step": 30500
677
+ },
678
+ {
679
+ "epoch": 0.5180914180663492,
680
+ "grad_norm": 6.181280612945557,
681
+ "learning_rate": 2.430942951040701e-05,
682
+ "loss": 3.4879,
683
+ "step": 31000
684
+ },
685
+ {
686
+ "epoch": 0.5180914180663492,
687
+ "eval_loss": 3.4192793369293213,
688
+ "eval_runtime": 34.6959,
689
+ "eval_samples_per_second": 128.603,
690
+ "eval_steps_per_second": 16.083,
691
+ "step": 31000
692
+ },
693
+ {
694
+ "epoch": 0.5264477312609677,
695
+ "grad_norm": 7.2022294998168945,
696
+ "learning_rate": 2.3888093031094634e-05,
697
+ "loss": 3.5258,
698
+ "step": 31500
699
+ },
700
+ {
701
+ "epoch": 0.5348040444555862,
702
+ "grad_norm": 4.311310291290283,
703
+ "learning_rate": 2.3468441897699503e-05,
704
+ "loss": 3.4957,
705
+ "step": 32000
706
+ },
707
+ {
708
+ "epoch": 0.5348040444555862,
709
+ "eval_loss": 3.406111240386963,
710
+ "eval_runtime": 34.7236,
711
+ "eval_samples_per_second": 128.501,
712
+ "eval_steps_per_second": 16.07,
713
+ "step": 32000
714
+ },
715
+ {
716
+ "epoch": 0.5431603576502048,
717
+ "grad_norm": 4.1522722244262695,
718
+ "learning_rate": 2.3047105418387125e-05,
719
+ "loss": 3.5138,
720
+ "step": 32500
721
+ },
722
+ {
723
+ "epoch": 0.5515166708448233,
724
+ "grad_norm": 4.656116962432861,
725
+ "learning_rate": 2.2625768939074744e-05,
726
+ "loss": 3.4637,
727
+ "step": 33000
728
+ },
729
+ {
730
+ "epoch": 0.5515166708448233,
731
+ "eval_loss": 3.3979318141937256,
732
+ "eval_runtime": 34.7041,
733
+ "eval_samples_per_second": 128.573,
734
+ "eval_steps_per_second": 16.079,
735
+ "step": 33000
736
+ },
737
+ {
738
+ "epoch": 0.5598729840394417,
739
+ "grad_norm": 7.333806037902832,
740
+ "learning_rate": 2.2204432459762367e-05,
741
+ "loss": 3.5071,
742
+ "step": 33500
743
+ },
744
+ {
745
+ "epoch": 0.5682292972340603,
746
+ "grad_norm": 4.071677207946777,
747
+ "learning_rate": 2.178309598044999e-05,
748
+ "loss": 3.4836,
749
+ "step": 34000
750
+ },
751
+ {
752
+ "epoch": 0.5682292972340603,
753
+ "eval_loss": 3.396083354949951,
754
+ "eval_runtime": 34.7068,
755
+ "eval_samples_per_second": 128.563,
756
+ "eval_steps_per_second": 16.078,
757
+ "step": 34000
758
+ },
759
+ {
760
+ "epoch": 0.5765856104286788,
761
+ "grad_norm": 4.475882053375244,
762
+ "learning_rate": 2.136175950113761e-05,
763
+ "loss": 3.4764,
764
+ "step": 34500
765
+ },
766
+ {
767
+ "epoch": 0.5849419236232974,
768
+ "grad_norm": 3.323747158050537,
769
+ "learning_rate": 2.094042302182523e-05,
770
+ "loss": 3.4379,
771
+ "step": 35000
772
+ },
773
+ {
774
+ "epoch": 0.5849419236232974,
775
+ "eval_loss": 3.389554977416992,
776
+ "eval_runtime": 34.7078,
777
+ "eval_samples_per_second": 128.559,
778
+ "eval_steps_per_second": 16.077,
779
+ "step": 35000
780
+ },
781
+ {
782
+ "epoch": 0.5932982368179159,
783
+ "grad_norm": 6.1050543785095215,
784
+ "learning_rate": 2.0519086542512852e-05,
785
+ "loss": 3.4959,
786
+ "step": 35500
787
+ },
788
+ {
789
+ "epoch": 0.6016545500125344,
790
+ "grad_norm": 5.739749431610107,
791
+ "learning_rate": 2.0097750063200475e-05,
792
+ "loss": 3.4672,
793
+ "step": 36000
794
+ },
795
+ {
796
+ "epoch": 0.6016545500125344,
797
+ "eval_loss": 3.3847856521606445,
798
+ "eval_runtime": 34.7286,
799
+ "eval_samples_per_second": 128.482,
800
+ "eval_steps_per_second": 16.067,
801
+ "step": 36000
802
+ },
803
+ {
804
+ "epoch": 0.610010863207153,
805
+ "grad_norm": 4.3374924659729,
806
+ "learning_rate": 1.9677256256846717e-05,
807
+ "loss": 3.4903,
808
+ "step": 36500
809
+ },
810
+ {
811
+ "epoch": 0.6183671764017715,
812
+ "grad_norm": 5.808351516723633,
813
+ "learning_rate": 1.925591977753434e-05,
814
+ "loss": 3.4723,
815
+ "step": 37000
816
+ },
817
+ {
818
+ "epoch": 0.6183671764017715,
819
+ "eval_loss": 3.379159450531006,
820
+ "eval_runtime": 34.676,
821
+ "eval_samples_per_second": 128.677,
822
+ "eval_steps_per_second": 16.092,
823
+ "step": 37000
824
+ },
825
+ {
826
+ "epoch": 0.6267234895963901,
827
+ "grad_norm": 4.286294460296631,
828
+ "learning_rate": 1.8834583298221962e-05,
829
+ "loss": 3.4922,
830
+ "step": 37500
831
+ },
832
+ {
833
+ "epoch": 0.6350798027910086,
834
+ "grad_norm": 4.990993976593018,
835
+ "learning_rate": 1.8413246818909584e-05,
836
+ "loss": 3.4815,
837
+ "step": 38000
838
+ },
839
+ {
840
+ "epoch": 0.6350798027910086,
841
+ "eval_loss": 3.376216411590576,
842
+ "eval_runtime": 34.7129,
843
+ "eval_samples_per_second": 128.54,
844
+ "eval_steps_per_second": 16.075,
845
+ "step": 38000
846
+ },
847
+ {
848
+ "epoch": 0.6434361159856271,
849
+ "grad_norm": 4.532817840576172,
850
+ "learning_rate": 1.7992753012555827e-05,
851
+ "loss": 3.4146,
852
+ "step": 38500
853
+ },
854
+ {
855
+ "epoch": 0.6517924291802457,
856
+ "grad_norm": 5.179249286651611,
857
+ "learning_rate": 1.757141653324345e-05,
858
+ "loss": 3.4357,
859
+ "step": 39000
860
+ },
861
+ {
862
+ "epoch": 0.6517924291802457,
863
+ "eval_loss": 3.3684535026550293,
864
+ "eval_runtime": 34.5425,
865
+ "eval_samples_per_second": 129.174,
866
+ "eval_steps_per_second": 16.154,
867
+ "step": 39000
868
+ },
869
+ {
870
+ "epoch": 0.6601487423748642,
871
+ "grad_norm": 5.6737542152404785,
872
+ "learning_rate": 1.715008005393107e-05,
873
+ "loss": 3.3936,
874
+ "step": 39500
875
+ },
876
+ {
877
+ "epoch": 0.6685050555694827,
878
+ "grad_norm": 4.707485675811768,
879
+ "learning_rate": 1.672874357461869e-05,
880
+ "loss": 3.4711,
881
+ "step": 40000
882
+ },
883
+ {
884
+ "epoch": 0.6685050555694827,
885
+ "eval_loss": 3.3646059036254883,
886
+ "eval_runtime": 34.5756,
887
+ "eval_samples_per_second": 129.051,
888
+ "eval_steps_per_second": 16.139,
889
+ "step": 40000
890
+ },
891
+ {
892
+ "epoch": 0.6768613687641013,
893
+ "grad_norm": 4.501412868499756,
894
+ "learning_rate": 1.630824976826494e-05,
895
+ "loss": 3.4073,
896
+ "step": 40500
897
+ },
898
+ {
899
+ "epoch": 0.6852176819587198,
900
+ "grad_norm": 9.65688419342041,
901
+ "learning_rate": 1.5886913288952558e-05,
902
+ "loss": 3.4437,
903
+ "step": 41000
904
+ },
905
+ {
906
+ "epoch": 0.6852176819587198,
907
+ "eval_loss": 3.3600456714630127,
908
+ "eval_runtime": 34.9587,
909
+ "eval_samples_per_second": 127.636,
910
+ "eval_steps_per_second": 15.962,
911
+ "step": 41000
912
+ },
913
+ {
914
+ "epoch": 0.6935739951533384,
915
+ "grad_norm": 3.365447998046875,
916
+ "learning_rate": 1.5465576809640177e-05,
917
+ "loss": 3.3704,
918
+ "step": 41500
919
+ },
920
+ {
921
+ "epoch": 0.7019303083479569,
922
+ "grad_norm": 4.373350143432617,
923
+ "learning_rate": 1.50442403303278e-05,
924
+ "loss": 3.4171,
925
+ "step": 42000
926
+ },
927
+ {
928
+ "epoch": 0.7019303083479569,
929
+ "eval_loss": 3.3585731983184814,
930
+ "eval_runtime": 34.556,
931
+ "eval_samples_per_second": 129.124,
932
+ "eval_steps_per_second": 16.148,
933
+ "step": 42000
934
+ },
935
+ {
936
+ "epoch": 0.7102866215425754,
937
+ "grad_norm": 6.150047779083252,
938
+ "learning_rate": 1.4623746523974047e-05,
939
+ "loss": 3.3822,
940
+ "step": 42500
941
+ },
942
+ {
943
+ "epoch": 0.718642934737194,
944
+ "grad_norm": 5.976407527923584,
945
+ "learning_rate": 1.4202410044661668e-05,
946
+ "loss": 3.442,
947
+ "step": 43000
948
+ },
949
+ {
950
+ "epoch": 0.718642934737194,
951
+ "eval_loss": 3.3521482944488525,
952
+ "eval_runtime": 34.7345,
953
+ "eval_samples_per_second": 128.46,
954
+ "eval_steps_per_second": 16.065,
955
+ "step": 43000
956
+ },
957
+ {
958
+ "epoch": 0.7269992479318125,
959
+ "grad_norm": 5.317568778991699,
960
+ "learning_rate": 1.378107356534929e-05,
961
+ "loss": 3.3837,
962
+ "step": 43500
963
+ },
964
+ {
965
+ "epoch": 0.735355561126431,
966
+ "grad_norm": 4.642706871032715,
967
+ "learning_rate": 1.335973708603691e-05,
968
+ "loss": 3.3847,
969
+ "step": 44000
970
+ },
971
+ {
972
+ "epoch": 0.735355561126431,
973
+ "eval_loss": 3.3473587036132812,
974
+ "eval_runtime": 34.8742,
975
+ "eval_samples_per_second": 127.946,
976
+ "eval_steps_per_second": 16.0,
977
+ "step": 44000
978
+ },
979
+ {
980
+ "epoch": 0.7437118743210496,
981
+ "grad_norm": 7.499514102935791,
982
+ "learning_rate": 1.293840060672453e-05,
983
+ "loss": 3.423,
984
+ "step": 44500
985
+ },
986
+ {
987
+ "epoch": 0.7520681875156681,
988
+ "grad_norm": 4.7019548416137695,
989
+ "learning_rate": 1.2517906800370777e-05,
990
+ "loss": 3.4064,
991
+ "step": 45000
992
+ },
993
+ {
994
+ "epoch": 0.7520681875156681,
995
+ "eval_loss": 3.347837209701538,
996
+ "eval_runtime": 34.5258,
997
+ "eval_samples_per_second": 129.237,
998
+ "eval_steps_per_second": 16.162,
999
+ "step": 45000
1000
+ },
1001
+ {
1002
+ "epoch": 0.7604245007102867,
1003
+ "grad_norm": 6.7862725257873535,
1004
+ "learning_rate": 1.2096570321058398e-05,
1005
+ "loss": 3.4665,
1006
+ "step": 45500
1007
+ },
1008
+ {
1009
+ "epoch": 0.7687808139049052,
1010
+ "grad_norm": 7.120584964752197,
1011
+ "learning_rate": 1.167523384174602e-05,
1012
+ "loss": 3.4181,
1013
+ "step": 46000
1014
+ },
1015
+ {
1016
+ "epoch": 0.7687808139049052,
1017
+ "eval_loss": 3.3389999866485596,
1018
+ "eval_runtime": 34.5968,
1019
+ "eval_samples_per_second": 128.972,
1020
+ "eval_steps_per_second": 16.129,
1021
+ "step": 46000
1022
+ },
1023
+ {
1024
+ "epoch": 0.7771371270995237,
1025
+ "grad_norm": 6.571496963500977,
1026
+ "learning_rate": 1.1253897362433639e-05,
1027
+ "loss": 3.3691,
1028
+ "step": 46500
1029
+ },
1030
+ {
1031
+ "epoch": 0.7854934402941423,
1032
+ "grad_norm": 4.248281955718994,
1033
+ "learning_rate": 1.0833403556079886e-05,
1034
+ "loss": 3.3822,
1035
+ "step": 47000
1036
+ },
1037
+ {
1038
+ "epoch": 0.7854934402941423,
1039
+ "eval_loss": 3.334426164627075,
1040
+ "eval_runtime": 34.5427,
1041
+ "eval_samples_per_second": 129.173,
1042
+ "eval_steps_per_second": 16.154,
1043
+ "step": 47000
1044
+ },
1045
+ {
1046
+ "epoch": 0.7938497534887607,
1047
+ "grad_norm": 8.167221069335938,
1048
+ "learning_rate": 1.0412067076767507e-05,
1049
+ "loss": 3.4073,
1050
+ "step": 47500
1051
+ },
1052
+ {
1053
+ "epoch": 0.8022060666833793,
1054
+ "grad_norm": 4.705054759979248,
1055
+ "learning_rate": 9.990730597455128e-06,
1056
+ "loss": 3.4277,
1057
+ "step": 48000
1058
+ },
1059
+ {
1060
+ "epoch": 0.8022060666833793,
1061
+ "eval_loss": 3.3343007564544678,
1062
+ "eval_runtime": 34.5199,
1063
+ "eval_samples_per_second": 129.259,
1064
+ "eval_steps_per_second": 16.165,
1065
+ "step": 48000
1066
+ },
1067
+ {
1068
+ "epoch": 0.8105623798779978,
1069
+ "grad_norm": 5.092607498168945,
1070
+ "learning_rate": 9.56939411814275e-06,
1071
+ "loss": 3.4014,
1072
+ "step": 48500
1073
+ },
1074
+ {
1075
+ "epoch": 0.8189186930726163,
1076
+ "grad_norm": 4.517687797546387,
1077
+ "learning_rate": 9.148900311788996e-06,
1078
+ "loss": 3.331,
1079
+ "step": 49000
1080
+ },
1081
+ {
1082
+ "epoch": 0.8189186930726163,
1083
+ "eval_loss": 3.332448720932007,
1084
+ "eval_runtime": 34.966,
1085
+ "eval_samples_per_second": 127.61,
1086
+ "eval_steps_per_second": 15.958,
1087
+ "step": 49000
1088
+ },
1089
+ {
1090
+ "epoch": 0.8272750062672349,
1091
+ "grad_norm": 5.909774303436279,
1092
+ "learning_rate": 8.727563832476616e-06,
1093
+ "loss": 3.4098,
1094
+ "step": 49500
1095
+ },
1096
+ {
1097
+ "epoch": 0.8356313194618534,
1098
+ "grad_norm": 4.237793922424316,
1099
+ "learning_rate": 8.306227353164239e-06,
1100
+ "loss": 3.4073,
1101
+ "step": 50000
1102
+ },
1103
+ {
1104
+ "epoch": 0.8356313194618534,
1105
+ "eval_loss": 3.328007459640503,
1106
+ "eval_runtime": 35.0133,
1107
+ "eval_samples_per_second": 127.437,
1108
+ "eval_steps_per_second": 15.937,
1109
+ "step": 50000
1110
+ },
1111
+ {
1112
+ "epoch": 0.8439876326564719,
1113
+ "grad_norm": 6.143158912658691,
1114
+ "learning_rate": 7.884890873851858e-06,
1115
+ "loss": 3.3904,
1116
+ "step": 50500
1117
+ },
1118
+ {
1119
+ "epoch": 0.8523439458510905,
1120
+ "grad_norm": 7.914546966552734,
1121
+ "learning_rate": 7.464397067498103e-06,
1122
+ "loss": 3.3741,
1123
+ "step": 51000
1124
+ },
1125
+ {
1126
+ "epoch": 0.8523439458510905,
1127
+ "eval_loss": 3.327684164047241,
1128
+ "eval_runtime": 34.6263,
1129
+ "eval_samples_per_second": 128.862,
1130
+ "eval_steps_per_second": 16.115,
1131
+ "step": 51000
1132
+ },
1133
+ {
1134
+ "epoch": 0.860700259045709,
1135
+ "grad_norm": 4.859989643096924,
1136
+ "learning_rate": 7.043060588185725e-06,
1137
+ "loss": 3.3614,
1138
+ "step": 51500
1139
+ },
1140
+ {
1141
+ "epoch": 0.8690565722403276,
1142
+ "grad_norm": 4.473916053771973,
1143
+ "learning_rate": 6.621724108873346e-06,
1144
+ "loss": 3.3467,
1145
+ "step": 52000
1146
+ },
1147
+ {
1148
+ "epoch": 0.8690565722403276,
1149
+ "eval_loss": 3.3206727504730225,
1150
+ "eval_runtime": 34.5416,
1151
+ "eval_samples_per_second": 129.178,
1152
+ "eval_steps_per_second": 16.154,
1153
+ "step": 52000
1154
+ },
1155
+ {
1156
+ "epoch": 0.8774128854349461,
1157
+ "grad_norm": 5.406485080718994,
1158
+ "learning_rate": 6.200387629560968e-06,
1159
+ "loss": 3.3221,
1160
+ "step": 52500
1161
+ },
1162
+ {
1163
+ "epoch": 0.8857691986295646,
1164
+ "grad_norm": 5.721664905548096,
1165
+ "learning_rate": 5.7798938232072135e-06,
1166
+ "loss": 3.418,
1167
+ "step": 53000
1168
+ },
1169
+ {
1170
+ "epoch": 0.8857691986295646,
1171
+ "eval_loss": 3.320549488067627,
1172
+ "eval_runtime": 34.5125,
1173
+ "eval_samples_per_second": 129.287,
1174
+ "eval_steps_per_second": 16.168,
1175
+ "step": 53000
1176
+ },
1177
+ {
1178
+ "epoch": 0.8941255118241832,
1179
+ "grad_norm": 8.336995124816895,
1180
+ "learning_rate": 5.358557343894835e-06,
1181
+ "loss": 3.3746,
1182
+ "step": 53500
1183
+ },
1184
+ {
1185
+ "epoch": 0.9024818250188017,
1186
+ "grad_norm": 6.922939777374268,
1187
+ "learning_rate": 4.937220864582456e-06,
1188
+ "loss": 3.416,
1189
+ "step": 54000
1190
+ },
1191
+ {
1192
+ "epoch": 0.9024818250188017,
1193
+ "eval_loss": 3.3170385360717773,
1194
+ "eval_runtime": 34.5291,
1195
+ "eval_samples_per_second": 129.224,
1196
+ "eval_steps_per_second": 16.16,
1197
+ "step": 54000
1198
+ },
1199
+ {
1200
+ "epoch": 0.9108381382134202,
1201
+ "grad_norm": 6.568256855010986,
1202
+ "learning_rate": 4.515884385270077e-06,
1203
+ "loss": 3.3983,
1204
+ "step": 54500
1205
+ },
1206
+ {
1207
+ "epoch": 0.9191944514080388,
1208
+ "grad_norm": 4.785107612609863,
1209
+ "learning_rate": 4.095390578916323e-06,
1210
+ "loss": 3.3607,
1211
+ "step": 55000
1212
+ },
1213
+ {
1214
+ "epoch": 0.9191944514080388,
1215
+ "eval_loss": 3.3168444633483887,
1216
+ "eval_runtime": 34.6371,
1217
+ "eval_samples_per_second": 128.821,
1218
+ "eval_steps_per_second": 16.11,
1219
+ "step": 55000
1220
+ },
1221
+ {
1222
+ "epoch": 0.9275507646026573,
1223
+ "grad_norm": 5.468000888824463,
1224
+ "learning_rate": 3.6740540996039444e-06,
1225
+ "loss": 3.3587,
1226
+ "step": 55500
1227
+ },
1228
+ {
1229
+ "epoch": 0.9359070777972759,
1230
+ "grad_norm": 8.031869888305664,
1231
+ "learning_rate": 3.252717620291565e-06,
1232
+ "loss": 3.4075,
1233
+ "step": 56000
1234
+ },
1235
+ {
1236
+ "epoch": 0.9359070777972759,
1237
+ "eval_loss": 3.3138468265533447,
1238
+ "eval_runtime": 34.5147,
1239
+ "eval_samples_per_second": 129.278,
1240
+ "eval_steps_per_second": 16.167,
1241
+ "step": 56000
1242
+ },
1243
+ {
1244
+ "epoch": 0.9442633909918944,
1245
+ "grad_norm": 5.647308349609375,
1246
+ "learning_rate": 2.831381140979186e-06,
1247
+ "loss": 3.4308,
1248
+ "step": 56500
1249
+ },
1250
+ {
1251
+ "epoch": 0.9526197041865129,
1252
+ "grad_norm": 3.991074562072754,
1253
+ "learning_rate": 2.410044661666807e-06,
1254
+ "loss": 3.3257,
1255
+ "step": 57000
1256
+ },
1257
+ {
1258
+ "epoch": 0.9526197041865129,
1259
+ "eval_loss": 3.3135740756988525,
1260
+ "eval_runtime": 34.9614,
1261
+ "eval_samples_per_second": 127.627,
1262
+ "eval_steps_per_second": 15.96,
1263
+ "step": 57000
1264
+ },
1265
+ {
1266
+ "epoch": 0.9609760173811315,
1267
+ "grad_norm": 4.273293972015381,
1268
+ "learning_rate": 1.9895508553130533e-06,
1269
+ "loss": 3.3362,
1270
+ "step": 57500
1271
+ },
1272
+ {
1273
+ "epoch": 0.96933233057575,
1274
+ "grad_norm": 4.560577869415283,
1275
+ "learning_rate": 1.5682143760006742e-06,
1276
+ "loss": 3.3657,
1277
+ "step": 58000
1278
+ },
1279
+ {
1280
+ "epoch": 0.96933233057575,
1281
+ "eval_loss": 3.312079668045044,
1282
+ "eval_runtime": 34.557,
1283
+ "eval_samples_per_second": 129.12,
1284
+ "eval_steps_per_second": 16.147,
1285
+ "step": 58000
1286
+ },
1287
+ {
1288
+ "epoch": 0.9776886437703685,
1289
+ "grad_norm": 3.613813877105713,
1290
+ "learning_rate": 1.1468778966882954e-06,
1291
+ "loss": 3.3847,
1292
+ "step": 58500
1293
+ },
1294
+ {
1295
+ "epoch": 0.9860449569649871,
1296
+ "grad_norm": 5.713568687438965,
1297
+ "learning_rate": 7.255414173759165e-07,
1298
+ "loss": 3.3508,
1299
+ "step": 59000
1300
+ },
1301
+ {
1302
+ "epoch": 0.9860449569649871,
1303
+ "eval_loss": 3.311403751373291,
1304
+ "eval_runtime": 34.5443,
1305
+ "eval_samples_per_second": 129.167,
1306
+ "eval_steps_per_second": 16.153,
1307
+ "step": 59000
1308
+ },
1309
+ {
1310
+ "epoch": 0.9944012701596056,
1311
+ "grad_norm": 8.699734687805176,
1312
+ "learning_rate": 3.050476110221623e-07,
1313
+ "loss": 3.3511,
1314
+ "step": 59500
1315
+ }
1316
+ ],
1317
+ "logging_steps": 500,
1318
+ "max_steps": 59835,
1319
+ "num_input_tokens_seen": 0,
1320
+ "num_train_epochs": 1,
1321
+ "save_steps": 2000,
1322
+ "stateful_callbacks": {
1323
+ "TrainerControl": {
1324
+ "args": {
1325
+ "should_epoch_stop": false,
1326
+ "should_evaluate": false,
1327
+ "should_log": false,
1328
+ "should_save": true,
1329
+ "should_training_stop": true
1330
+ },
1331
+ "attributes": {}
1332
+ }
1333
+ },
1334
+ "total_flos": 2.50149494587392e+17,
1335
+ "train_batch_size": 8,
1336
+ "trial_name": null,
1337
+ "trial_params": null
1338
+ }
custom-tokenizer/full-finetuning/german-gpt2/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c53c27a55ef1914ee358ac472e9b6d5dfa998d6ba639f176cdb77cc70749b1b
3
+ size 5368
full-finetuning/LLaMmlein_120M/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "LSX-UniWue/LLaMmlein_120M",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.46.0",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
full-finetuning/LLaMmlein_120M/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.46.0"
6
+ }
full-finetuning/LLaMmlein_120M/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0399b28b6f631f75d922b415a46df1e1af6909b40028ed196413fb8b030e4fbe
3
+ size 498687008
full-finetuning/LLaMmlein_120M/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0896cf93d0e1bc98805dfc37179adcf8fe7d846d23741e64d13ac475dbc4626c
3
+ size 997443194
full-finetuning/LLaMmlein_120M/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3bc1ac846859ad13fd356d3ed41fd20834205a7e8764e4198636863fca3f64c
3
+ size 14244
full-finetuning/LLaMmlein_120M/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3844cc778b6d10aae1d91adae178f842d6cf04cca5898f583c90f0d4ec34a05
3
+ size 1064
full-finetuning/LLaMmlein_120M/trainer_state.json ADDED
@@ -0,0 +1,1338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.1603705883026123,
3
+ "best_model_checkpoint": "./models/full-finetuning/LLaMmlein_120M/checkpoint-58000",
4
+ "epoch": 1.0,
5
+ "eval_steps": 1000,
6
+ "global_step": 59835,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.008356313194618534,
13
+ "grad_norm": 40.14425277709961,
14
+ "learning_rate": 4.96e-05,
15
+ "loss": 4.8477,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.016712626389237067,
20
+ "grad_norm": 22.107275009155273,
21
+ "learning_rate": 4.958203421252212e-05,
22
+ "loss": 4.4466,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.016712626389237067,
27
+ "eval_loss": 4.3118462562561035,
28
+ "eval_runtime": 22.0404,
29
+ "eval_samples_per_second": 202.446,
30
+ "eval_steps_per_second": 25.317,
31
+ "step": 1000
32
+ },
33
+ {
34
+ "epoch": 0.025068939583855605,
35
+ "grad_norm": 19.609399795532227,
36
+ "learning_rate": 4.916069773320974e-05,
37
+ "loss": 4.2986,
38
+ "step": 1500
39
+ },
40
+ {
41
+ "epoch": 0.033425252778474135,
42
+ "grad_norm": 18.64438247680664,
43
+ "learning_rate": 4.873936125389736e-05,
44
+ "loss": 4.2295,
45
+ "step": 2000
46
+ },
47
+ {
48
+ "epoch": 0.033425252778474135,
49
+ "eval_loss": 4.121812343597412,
50
+ "eval_runtime": 22.1428,
51
+ "eval_samples_per_second": 201.51,
52
+ "eval_steps_per_second": 25.2,
53
+ "step": 2000
54
+ },
55
+ {
56
+ "epoch": 0.04178156597309267,
57
+ "grad_norm": 19.098369598388672,
58
+ "learning_rate": 4.8318024774584986e-05,
59
+ "loss": 4.1165,
60
+ "step": 2500
61
+ },
62
+ {
63
+ "epoch": 0.05013787916771121,
64
+ "grad_norm": 17.83785629272461,
65
+ "learning_rate": 4.789668829527261e-05,
66
+ "loss": 4.029,
67
+ "step": 3000
68
+ },
69
+ {
70
+ "epoch": 0.05013787916771121,
71
+ "eval_loss": 4.040452480316162,
72
+ "eval_runtime": 22.088,
73
+ "eval_samples_per_second": 202.01,
74
+ "eval_steps_per_second": 25.263,
75
+ "step": 3000
76
+ },
77
+ {
78
+ "epoch": 0.05849419236232974,
79
+ "grad_norm": 14.394288063049316,
80
+ "learning_rate": 4.747535181596023e-05,
81
+ "loss": 4.0701,
82
+ "step": 3500
83
+ },
84
+ {
85
+ "epoch": 0.06685050555694827,
86
+ "grad_norm": 17.005945205688477,
87
+ "learning_rate": 4.705401533664785e-05,
88
+ "loss": 4.0239,
89
+ "step": 4000
90
+ },
91
+ {
92
+ "epoch": 0.06685050555694827,
93
+ "eval_loss": 3.948943853378296,
94
+ "eval_runtime": 22.0828,
95
+ "eval_samples_per_second": 202.058,
96
+ "eval_steps_per_second": 25.269,
97
+ "step": 4000
98
+ },
99
+ {
100
+ "epoch": 0.0752068187515668,
101
+ "grad_norm": 20.297489166259766,
102
+ "learning_rate": 4.663267885733547e-05,
103
+ "loss": 3.9704,
104
+ "step": 4500
105
+ },
106
+ {
107
+ "epoch": 0.08356313194618534,
108
+ "grad_norm": 17.280521392822266,
109
+ "learning_rate": 4.621134237802309e-05,
110
+ "loss": 3.9118,
111
+ "step": 5000
112
+ },
113
+ {
114
+ "epoch": 0.08356313194618534,
115
+ "eval_loss": 3.891711950302124,
116
+ "eval_runtime": 22.1454,
117
+ "eval_samples_per_second": 201.487,
118
+ "eval_steps_per_second": 25.197,
119
+ "step": 5000
120
+ },
121
+ {
122
+ "epoch": 0.09191944514080387,
123
+ "grad_norm": 18.847597122192383,
124
+ "learning_rate": 4.579000589871071e-05,
125
+ "loss": 3.9024,
126
+ "step": 5500
127
+ },
128
+ {
129
+ "epoch": 0.10027575833542242,
130
+ "grad_norm": 14.654472351074219,
131
+ "learning_rate": 4.5368669419398335e-05,
132
+ "loss": 3.8479,
133
+ "step": 6000
134
+ },
135
+ {
136
+ "epoch": 0.10027575833542242,
137
+ "eval_loss": 3.8378305435180664,
138
+ "eval_runtime": 22.1051,
139
+ "eval_samples_per_second": 201.854,
140
+ "eval_steps_per_second": 25.243,
141
+ "step": 6000
142
+ },
143
+ {
144
+ "epoch": 0.10863207153004095,
145
+ "grad_norm": 18.267274856567383,
146
+ "learning_rate": 4.494817561304458e-05,
147
+ "loss": 3.8116,
148
+ "step": 6500
149
+ },
150
+ {
151
+ "epoch": 0.11698838472465949,
152
+ "grad_norm": 13.313653945922852,
153
+ "learning_rate": 4.45268391337322e-05,
154
+ "loss": 3.8592,
155
+ "step": 7000
156
+ },
157
+ {
158
+ "epoch": 0.11698838472465949,
159
+ "eval_loss": 3.8072171211242676,
160
+ "eval_runtime": 22.2351,
161
+ "eval_samples_per_second": 200.674,
162
+ "eval_steps_per_second": 25.095,
163
+ "step": 7000
164
+ },
165
+ {
166
+ "epoch": 0.12534469791927802,
167
+ "grad_norm": 16.960010528564453,
168
+ "learning_rate": 4.410550265441982e-05,
169
+ "loss": 3.8291,
170
+ "step": 7500
171
+ },
172
+ {
173
+ "epoch": 0.13370101111389654,
174
+ "grad_norm": 15.129623413085938,
175
+ "learning_rate": 4.3684166175107444e-05,
176
+ "loss": 3.7697,
177
+ "step": 8000
178
+ },
179
+ {
180
+ "epoch": 0.13370101111389654,
181
+ "eval_loss": 3.777130126953125,
182
+ "eval_runtime": 22.2765,
183
+ "eval_samples_per_second": 200.3,
184
+ "eval_steps_per_second": 25.049,
185
+ "step": 8000
186
+ },
187
+ {
188
+ "epoch": 0.1420573243085151,
189
+ "grad_norm": 18.825756072998047,
190
+ "learning_rate": 4.326535771467094e-05,
191
+ "loss": 3.7714,
192
+ "step": 8500
193
+ },
194
+ {
195
+ "epoch": 0.1504136375031336,
196
+ "grad_norm": 13.93099308013916,
197
+ "learning_rate": 4.284402123535856e-05,
198
+ "loss": 3.7719,
199
+ "step": 9000
200
+ },
201
+ {
202
+ "epoch": 0.1504136375031336,
203
+ "eval_loss": 3.747012138366699,
204
+ "eval_runtime": 22.2951,
205
+ "eval_samples_per_second": 200.134,
206
+ "eval_steps_per_second": 25.028,
207
+ "step": 9000
208
+ },
209
+ {
210
+ "epoch": 0.15876995069775215,
211
+ "grad_norm": 12.885889053344727,
212
+ "learning_rate": 4.2422684756046185e-05,
213
+ "loss": 3.7127,
214
+ "step": 9500
215
+ },
216
+ {
217
+ "epoch": 0.16712626389237067,
218
+ "grad_norm": 15.362942695617676,
219
+ "learning_rate": 4.200134827673381e-05,
220
+ "loss": 3.7135,
221
+ "step": 10000
222
+ },
223
+ {
224
+ "epoch": 0.16712626389237067,
225
+ "eval_loss": 3.7190206050872803,
226
+ "eval_runtime": 22.2344,
227
+ "eval_samples_per_second": 200.68,
228
+ "eval_steps_per_second": 25.096,
229
+ "step": 10000
230
+ },
231
+ {
232
+ "epoch": 0.17548257708698922,
233
+ "grad_norm": 12.432964324951172,
234
+ "learning_rate": 4.158001179742142e-05,
235
+ "loss": 3.6715,
236
+ "step": 10500
237
+ },
238
+ {
239
+ "epoch": 0.18383889028160774,
240
+ "grad_norm": 25.97146987915039,
241
+ "learning_rate": 4.115867531810904e-05,
242
+ "loss": 3.6599,
243
+ "step": 11000
244
+ },
245
+ {
246
+ "epoch": 0.18383889028160774,
247
+ "eval_loss": 3.6818652153015137,
248
+ "eval_runtime": 22.2972,
249
+ "eval_samples_per_second": 200.115,
250
+ "eval_steps_per_second": 25.026,
251
+ "step": 11000
252
+ },
253
+ {
254
+ "epoch": 0.1921952034762263,
255
+ "grad_norm": 21.378082275390625,
256
+ "learning_rate": 4.073733883879666e-05,
257
+ "loss": 3.6754,
258
+ "step": 11500
259
+ },
260
+ {
261
+ "epoch": 0.20055151667084484,
262
+ "grad_norm": 11.59192943572998,
263
+ "learning_rate": 4.031600235948428e-05,
264
+ "loss": 3.669,
265
+ "step": 12000
266
+ },
267
+ {
268
+ "epoch": 0.20055151667084484,
269
+ "eval_loss": 3.661188840866089,
270
+ "eval_runtime": 22.2553,
271
+ "eval_samples_per_second": 200.492,
272
+ "eval_steps_per_second": 25.073,
273
+ "step": 12000
274
+ },
275
+ {
276
+ "epoch": 0.20890782986546336,
277
+ "grad_norm": 17.75707244873047,
278
+ "learning_rate": 3.9894665880171905e-05,
279
+ "loss": 3.6794,
280
+ "step": 12500
281
+ },
282
+ {
283
+ "epoch": 0.2172641430600819,
284
+ "grad_norm": 10.825678825378418,
285
+ "learning_rate": 3.947332940085953e-05,
286
+ "loss": 3.6113,
287
+ "step": 13000
288
+ },
289
+ {
290
+ "epoch": 0.2172641430600819,
291
+ "eval_loss": 3.6508119106292725,
292
+ "eval_runtime": 22.2982,
293
+ "eval_samples_per_second": 200.106,
294
+ "eval_steps_per_second": 25.024,
295
+ "step": 13000
296
+ },
297
+ {
298
+ "epoch": 0.22562045625470042,
299
+ "grad_norm": 10.04261302947998,
300
+ "learning_rate": 3.905199292154715e-05,
301
+ "loss": 3.5963,
302
+ "step": 13500
303
+ },
304
+ {
305
+ "epoch": 0.23397676944931897,
306
+ "grad_norm": 13.926618576049805,
307
+ "learning_rate": 3.863065644223477e-05,
308
+ "loss": 3.5997,
309
+ "step": 14000
310
+ },
311
+ {
312
+ "epoch": 0.23397676944931897,
313
+ "eval_loss": 3.6223905086517334,
314
+ "eval_runtime": 22.2717,
315
+ "eval_samples_per_second": 200.344,
316
+ "eval_steps_per_second": 25.054,
317
+ "step": 14000
318
+ },
319
+ {
320
+ "epoch": 0.2423330826439375,
321
+ "grad_norm": 11.00304889678955,
322
+ "learning_rate": 3.820931996292239e-05,
323
+ "loss": 3.5991,
324
+ "step": 14500
325
+ },
326
+ {
327
+ "epoch": 0.25068939583855604,
328
+ "grad_norm": 16.099769592285156,
329
+ "learning_rate": 3.778798348361001e-05,
330
+ "loss": 3.6042,
331
+ "step": 15000
332
+ },
333
+ {
334
+ "epoch": 0.25068939583855604,
335
+ "eval_loss": 3.5953731536865234,
336
+ "eval_runtime": 22.2814,
337
+ "eval_samples_per_second": 200.257,
338
+ "eval_steps_per_second": 25.043,
339
+ "step": 15000
340
+ },
341
+ {
342
+ "epoch": 0.2590457090331746,
343
+ "grad_norm": 12.459487915039062,
344
+ "learning_rate": 3.736664700429763e-05,
345
+ "loss": 3.5871,
346
+ "step": 15500
347
+ },
348
+ {
349
+ "epoch": 0.2674020222277931,
350
+ "grad_norm": 16.979909896850586,
351
+ "learning_rate": 3.6946153197943875e-05,
352
+ "loss": 3.5238,
353
+ "step": 16000
354
+ },
355
+ {
356
+ "epoch": 0.2674020222277931,
357
+ "eval_loss": 3.590113401412964,
358
+ "eval_runtime": 22.2293,
359
+ "eval_samples_per_second": 200.726,
360
+ "eval_steps_per_second": 25.102,
361
+ "step": 16000
362
+ },
363
+ {
364
+ "epoch": 0.2757583354224116,
365
+ "grad_norm": 23.20758056640625,
366
+ "learning_rate": 3.65248167186315e-05,
367
+ "loss": 3.5646,
368
+ "step": 16500
369
+ },
370
+ {
371
+ "epoch": 0.2841146486170302,
372
+ "grad_norm": 18.35931396484375,
373
+ "learning_rate": 3.610348023931912e-05,
374
+ "loss": 3.5445,
375
+ "step": 17000
376
+ },
377
+ {
378
+ "epoch": 0.2841146486170302,
379
+ "eval_loss": 3.563676595687866,
380
+ "eval_runtime": 22.2571,
381
+ "eval_samples_per_second": 200.475,
382
+ "eval_steps_per_second": 25.071,
383
+ "step": 17000
384
+ },
385
+ {
386
+ "epoch": 0.2924709618116487,
387
+ "grad_norm": 17.187950134277344,
388
+ "learning_rate": 3.568214376000674e-05,
389
+ "loss": 3.494,
390
+ "step": 17500
391
+ },
392
+ {
393
+ "epoch": 0.3008272750062672,
394
+ "grad_norm": 15.331987380981445,
395
+ "learning_rate": 3.5261649953652984e-05,
396
+ "loss": 3.4913,
397
+ "step": 18000
398
+ },
399
+ {
400
+ "epoch": 0.3008272750062672,
401
+ "eval_loss": 3.541306495666504,
402
+ "eval_runtime": 22.2598,
403
+ "eval_samples_per_second": 200.451,
404
+ "eval_steps_per_second": 25.068,
405
+ "step": 18000
406
+ },
407
+ {
408
+ "epoch": 0.30918358820088576,
409
+ "grad_norm": 16.340852737426758,
410
+ "learning_rate": 3.484031347434061e-05,
411
+ "loss": 3.4969,
412
+ "step": 18500
413
+ },
414
+ {
415
+ "epoch": 0.3175399013955043,
416
+ "grad_norm": 12.265207290649414,
417
+ "learning_rate": 3.441897699502823e-05,
418
+ "loss": 3.4934,
419
+ "step": 19000
420
+ },
421
+ {
422
+ "epoch": 0.3175399013955043,
423
+ "eval_loss": 3.520357847213745,
424
+ "eval_runtime": 22.2447,
425
+ "eval_samples_per_second": 200.587,
426
+ "eval_steps_per_second": 25.085,
427
+ "step": 19000
428
+ },
429
+ {
430
+ "epoch": 0.32589621459012286,
431
+ "grad_norm": 15.456232070922852,
432
+ "learning_rate": 3.399764051571585e-05,
433
+ "loss": 3.5013,
434
+ "step": 19500
435
+ },
436
+ {
437
+ "epoch": 0.33425252778474135,
438
+ "grad_norm": 15.721699714660645,
439
+ "learning_rate": 3.3576304036403474e-05,
440
+ "loss": 3.4627,
441
+ "step": 20000
442
+ },
443
+ {
444
+ "epoch": 0.33425252778474135,
445
+ "eval_loss": 3.5179378986358643,
446
+ "eval_runtime": 22.2594,
447
+ "eval_samples_per_second": 200.455,
448
+ "eval_steps_per_second": 25.068,
449
+ "step": 20000
450
+ },
451
+ {
452
+ "epoch": 0.3426088409793599,
453
+ "grad_norm": 12.118553161621094,
454
+ "learning_rate": 3.3154967557091096e-05,
455
+ "loss": 3.5006,
456
+ "step": 20500
457
+ },
458
+ {
459
+ "epoch": 0.35096515417397844,
460
+ "grad_norm": 8.990864753723145,
461
+ "learning_rate": 3.273447375073734e-05,
462
+ "loss": 3.4367,
463
+ "step": 21000
464
+ },
465
+ {
466
+ "epoch": 0.35096515417397844,
467
+ "eval_loss": 3.5118658542633057,
468
+ "eval_runtime": 22.2415,
469
+ "eval_samples_per_second": 200.616,
470
+ "eval_steps_per_second": 25.088,
471
+ "step": 21000
472
+ },
473
+ {
474
+ "epoch": 0.359321467368597,
475
+ "grad_norm": 9.9972562789917,
476
+ "learning_rate": 3.231313727142496e-05,
477
+ "loss": 3.4498,
478
+ "step": 21500
479
+ },
480
+ {
481
+ "epoch": 0.3676777805632155,
482
+ "grad_norm": 10.996673583984375,
483
+ "learning_rate": 3.189180079211258e-05,
484
+ "loss": 3.4643,
485
+ "step": 22000
486
+ },
487
+ {
488
+ "epoch": 0.3676777805632155,
489
+ "eval_loss": 3.483738899230957,
490
+ "eval_runtime": 22.2582,
491
+ "eval_samples_per_second": 200.465,
492
+ "eval_steps_per_second": 25.069,
493
+ "step": 22000
494
+ },
495
+ {
496
+ "epoch": 0.37603409375783403,
497
+ "grad_norm": 14.55636978149414,
498
+ "learning_rate": 3.14704643128002e-05,
499
+ "loss": 3.5215,
500
+ "step": 22500
501
+ },
502
+ {
503
+ "epoch": 0.3843904069524526,
504
+ "grad_norm": 13.585105895996094,
505
+ "learning_rate": 3.104912783348782e-05,
506
+ "loss": 3.419,
507
+ "step": 23000
508
+ },
509
+ {
510
+ "epoch": 0.3843904069524526,
511
+ "eval_loss": 3.47660231590271,
512
+ "eval_runtime": 22.226,
513
+ "eval_samples_per_second": 200.756,
514
+ "eval_steps_per_second": 25.106,
515
+ "step": 23000
516
+ },
517
+ {
518
+ "epoch": 0.3927467201470711,
519
+ "grad_norm": 11.853238105773926,
520
+ "learning_rate": 3.062779135417544e-05,
521
+ "loss": 3.4438,
522
+ "step": 23500
523
+ },
524
+ {
525
+ "epoch": 0.4011030333416897,
526
+ "grad_norm": 13.06174373626709,
527
+ "learning_rate": 3.020729754782169e-05,
528
+ "loss": 3.4029,
529
+ "step": 24000
530
+ },
531
+ {
532
+ "epoch": 0.4011030333416897,
533
+ "eval_loss": 3.4587268829345703,
534
+ "eval_runtime": 22.2726,
535
+ "eval_samples_per_second": 200.336,
536
+ "eval_steps_per_second": 25.053,
537
+ "step": 24000
538
+ },
539
+ {
540
+ "epoch": 0.40945934653630817,
541
+ "grad_norm": 16.874757766723633,
542
+ "learning_rate": 2.9786803741467938e-05,
543
+ "loss": 3.3971,
544
+ "step": 24500
545
+ },
546
+ {
547
+ "epoch": 0.4178156597309267,
548
+ "grad_norm": 11.108474731445312,
549
+ "learning_rate": 2.936546726215556e-05,
550
+ "loss": 3.3574,
551
+ "step": 25000
552
+ },
553
+ {
554
+ "epoch": 0.4178156597309267,
555
+ "eval_loss": 3.446179151535034,
556
+ "eval_runtime": 22.3522,
557
+ "eval_samples_per_second": 199.622,
558
+ "eval_steps_per_second": 24.964,
559
+ "step": 25000
560
+ },
561
+ {
562
+ "epoch": 0.42617197292554526,
563
+ "grad_norm": 12.936110496520996,
564
+ "learning_rate": 2.8944130782843183e-05,
565
+ "loss": 3.3829,
566
+ "step": 25500
567
+ },
568
+ {
569
+ "epoch": 0.4345282861201638,
570
+ "grad_norm": 12.90854549407959,
571
+ "learning_rate": 2.8522794303530802e-05,
572
+ "loss": 3.4156,
573
+ "step": 26000
574
+ },
575
+ {
576
+ "epoch": 0.4345282861201638,
577
+ "eval_loss": 3.44026517868042,
578
+ "eval_runtime": 22.2661,
579
+ "eval_samples_per_second": 200.394,
580
+ "eval_steps_per_second": 25.061,
581
+ "step": 26000
582
+ },
583
+ {
584
+ "epoch": 0.4428845993147823,
585
+ "grad_norm": 10.326555252075195,
586
+ "learning_rate": 2.8101457824218424e-05,
587
+ "loss": 3.3607,
588
+ "step": 26500
589
+ },
590
+ {
591
+ "epoch": 0.45124091250940085,
592
+ "grad_norm": 12.372066497802734,
593
+ "learning_rate": 2.7681806690823293e-05,
594
+ "loss": 3.3836,
595
+ "step": 27000
596
+ },
597
+ {
598
+ "epoch": 0.45124091250940085,
599
+ "eval_loss": 3.4253649711608887,
600
+ "eval_runtime": 22.2507,
601
+ "eval_samples_per_second": 200.533,
602
+ "eval_steps_per_second": 25.078,
603
+ "step": 27000
604
+ },
605
+ {
606
+ "epoch": 0.4595972257040194,
607
+ "grad_norm": 9.778299331665039,
608
+ "learning_rate": 2.7260470211510912e-05,
609
+ "loss": 3.3671,
610
+ "step": 27500
611
+ },
612
+ {
613
+ "epoch": 0.46795353889863794,
614
+ "grad_norm": 20.047178268432617,
615
+ "learning_rate": 2.6839133732198535e-05,
616
+ "loss": 3.3395,
617
+ "step": 28000
618
+ },
619
+ {
620
+ "epoch": 0.46795353889863794,
621
+ "eval_loss": 3.41679048538208,
622
+ "eval_runtime": 22.2707,
623
+ "eval_samples_per_second": 200.353,
624
+ "eval_steps_per_second": 25.055,
625
+ "step": 28000
626
+ },
627
+ {
628
+ "epoch": 0.47630985209325644,
629
+ "grad_norm": 9.312335968017578,
630
+ "learning_rate": 2.6417797252886157e-05,
631
+ "loss": 3.3616,
632
+ "step": 28500
633
+ },
634
+ {
635
+ "epoch": 0.484666165287875,
636
+ "grad_norm": 10.994682312011719,
637
+ "learning_rate": 2.5996460773573776e-05,
638
+ "loss": 3.3719,
639
+ "step": 29000
640
+ },
641
+ {
642
+ "epoch": 0.484666165287875,
643
+ "eval_loss": 3.4018924236297607,
644
+ "eval_runtime": 22.2565,
645
+ "eval_samples_per_second": 200.481,
646
+ "eval_steps_per_second": 25.071,
647
+ "step": 29000
648
+ },
649
+ {
650
+ "epoch": 0.49302247848249353,
651
+ "grad_norm": 13.464505195617676,
652
+ "learning_rate": 2.5575124294261398e-05,
653
+ "loss": 3.3312,
654
+ "step": 29500
655
+ },
656
+ {
657
+ "epoch": 0.5013787916771121,
658
+ "grad_norm": 12.18619441986084,
659
+ "learning_rate": 2.515378781494902e-05,
660
+ "loss": 3.386,
661
+ "step": 30000
662
+ },
663
+ {
664
+ "epoch": 0.5013787916771121,
665
+ "eval_loss": 3.3899354934692383,
666
+ "eval_runtime": 22.2658,
667
+ "eval_samples_per_second": 200.397,
668
+ "eval_steps_per_second": 25.061,
669
+ "step": 30000
670
+ },
671
+ {
672
+ "epoch": 0.5097351048717306,
673
+ "grad_norm": 14.552848815917969,
674
+ "learning_rate": 2.4732451335636643e-05,
675
+ "loss": 3.3377,
676
+ "step": 30500
677
+ },
678
+ {
679
+ "epoch": 0.5180914180663492,
680
+ "grad_norm": 15.032088279724121,
681
+ "learning_rate": 2.4311114856324262e-05,
682
+ "loss": 3.3131,
683
+ "step": 31000
684
+ },
685
+ {
686
+ "epoch": 0.5180914180663492,
687
+ "eval_loss": 3.378127336502075,
688
+ "eval_runtime": 22.2423,
689
+ "eval_samples_per_second": 200.609,
690
+ "eval_steps_per_second": 25.087,
691
+ "step": 31000
692
+ },
693
+ {
694
+ "epoch": 0.5264477312609677,
695
+ "grad_norm": 14.666757583618164,
696
+ "learning_rate": 2.388977837701188e-05,
697
+ "loss": 3.3457,
698
+ "step": 31500
699
+ },
700
+ {
701
+ "epoch": 0.5348040444555862,
702
+ "grad_norm": 11.800482749938965,
703
+ "learning_rate": 2.3468441897699503e-05,
704
+ "loss": 3.3192,
705
+ "step": 32000
706
+ },
707
+ {
708
+ "epoch": 0.5348040444555862,
709
+ "eval_loss": 3.3670458793640137,
710
+ "eval_runtime": 22.256,
711
+ "eval_samples_per_second": 200.485,
712
+ "eval_steps_per_second": 25.072,
713
+ "step": 32000
714
+ },
715
+ {
716
+ "epoch": 0.5431603576502048,
717
+ "grad_norm": 10.835103034973145,
718
+ "learning_rate": 2.3047105418387125e-05,
719
+ "loss": 3.3235,
720
+ "step": 32500
721
+ },
722
+ {
723
+ "epoch": 0.5515166708448233,
724
+ "grad_norm": 12.06092357635498,
725
+ "learning_rate": 2.2625768939074744e-05,
726
+ "loss": 3.2969,
727
+ "step": 33000
728
+ },
729
+ {
730
+ "epoch": 0.5515166708448233,
731
+ "eval_loss": 3.356658935546875,
732
+ "eval_runtime": 22.2404,
733
+ "eval_samples_per_second": 200.626,
734
+ "eval_steps_per_second": 25.089,
735
+ "step": 33000
736
+ },
737
+ {
738
+ "epoch": 0.5598729840394417,
739
+ "grad_norm": 15.398877143859863,
740
+ "learning_rate": 2.2204432459762367e-05,
741
+ "loss": 3.3181,
742
+ "step": 33500
743
+ },
744
+ {
745
+ "epoch": 0.5682292972340603,
746
+ "grad_norm": 10.425477027893066,
747
+ "learning_rate": 2.178309598044999e-05,
748
+ "loss": 3.3202,
749
+ "step": 34000
750
+ },
751
+ {
752
+ "epoch": 0.5682292972340603,
753
+ "eval_loss": 3.34324312210083,
754
+ "eval_runtime": 22.2237,
755
+ "eval_samples_per_second": 200.777,
756
+ "eval_steps_per_second": 25.108,
757
+ "step": 34000
758
+ },
759
+ {
760
+ "epoch": 0.5765856104286788,
761
+ "grad_norm": 13.118115425109863,
762
+ "learning_rate": 2.136175950113761e-05,
763
+ "loss": 3.3028,
764
+ "step": 34500
765
+ },
766
+ {
767
+ "epoch": 0.5849419236232974,
768
+ "grad_norm": 8.235157012939453,
769
+ "learning_rate": 2.0941265694783854e-05,
770
+ "loss": 3.2403,
771
+ "step": 35000
772
+ },
773
+ {
774
+ "epoch": 0.5849419236232974,
775
+ "eval_loss": 3.3430681228637695,
776
+ "eval_runtime": 22.2974,
777
+ "eval_samples_per_second": 200.113,
778
+ "eval_steps_per_second": 25.025,
779
+ "step": 35000
780
+ },
781
+ {
782
+ "epoch": 0.5932982368179159,
783
+ "grad_norm": 15.389208793640137,
784
+ "learning_rate": 2.0519929215471476e-05,
785
+ "loss": 3.3105,
786
+ "step": 35500
787
+ },
788
+ {
789
+ "epoch": 0.6016545500125344,
790
+ "grad_norm": 12.708732604980469,
791
+ "learning_rate": 2.0098592736159098e-05,
792
+ "loss": 3.2775,
793
+ "step": 36000
794
+ },
795
+ {
796
+ "epoch": 0.6016545500125344,
797
+ "eval_loss": 3.3276991844177246,
798
+ "eval_runtime": 22.2643,
799
+ "eval_samples_per_second": 200.411,
800
+ "eval_steps_per_second": 25.063,
801
+ "step": 36000
802
+ },
803
+ {
804
+ "epoch": 0.610010863207153,
805
+ "grad_norm": 13.642451286315918,
806
+ "learning_rate": 1.9677256256846717e-05,
807
+ "loss": 3.2902,
808
+ "step": 36500
809
+ },
810
+ {
811
+ "epoch": 0.6183671764017715,
812
+ "grad_norm": 12.606600761413574,
813
+ "learning_rate": 1.9256762450492966e-05,
814
+ "loss": 3.271,
815
+ "step": 37000
816
+ },
817
+ {
818
+ "epoch": 0.6183671764017715,
819
+ "eval_loss": 3.3122496604919434,
820
+ "eval_runtime": 22.2804,
821
+ "eval_samples_per_second": 200.266,
822
+ "eval_steps_per_second": 25.044,
823
+ "step": 37000
824
+ },
825
+ {
826
+ "epoch": 0.6267234895963901,
827
+ "grad_norm": 11.484159469604492,
828
+ "learning_rate": 1.8835425971180585e-05,
829
+ "loss": 3.2833,
830
+ "step": 37500
831
+ },
832
+ {
833
+ "epoch": 0.6350798027910086,
834
+ "grad_norm": 12.317131996154785,
835
+ "learning_rate": 1.8414089491868204e-05,
836
+ "loss": 3.2848,
837
+ "step": 38000
838
+ },
839
+ {
840
+ "epoch": 0.6350798027910086,
841
+ "eval_loss": 3.3035213947296143,
842
+ "eval_runtime": 22.2937,
843
+ "eval_samples_per_second": 200.147,
844
+ "eval_steps_per_second": 25.03,
845
+ "step": 38000
846
+ },
847
+ {
848
+ "epoch": 0.6434361159856271,
849
+ "grad_norm": 11.45077896118164,
850
+ "learning_rate": 1.7992753012555827e-05,
851
+ "loss": 3.202,
852
+ "step": 38500
853
+ },
854
+ {
855
+ "epoch": 0.6517924291802457,
856
+ "grad_norm": 12.859657287597656,
857
+ "learning_rate": 1.7572259206202076e-05,
858
+ "loss": 3.2376,
859
+ "step": 39000
860
+ },
861
+ {
862
+ "epoch": 0.6517924291802457,
863
+ "eval_loss": 3.2956559658050537,
864
+ "eval_runtime": 22.3804,
865
+ "eval_samples_per_second": 199.371,
866
+ "eval_steps_per_second": 24.933,
867
+ "step": 39000
868
+ },
869
+ {
870
+ "epoch": 0.6601487423748642,
871
+ "grad_norm": 14.472012519836426,
872
+ "learning_rate": 1.7150922726889695e-05,
873
+ "loss": 3.1924,
874
+ "step": 39500
875
+ },
876
+ {
877
+ "epoch": 0.6685050555694827,
878
+ "grad_norm": 13.051079750061035,
879
+ "learning_rate": 1.673042892053594e-05,
880
+ "loss": 3.2598,
881
+ "step": 40000
882
+ },
883
+ {
884
+ "epoch": 0.6685050555694827,
885
+ "eval_loss": 3.2878499031066895,
886
+ "eval_runtime": 22.2464,
887
+ "eval_samples_per_second": 200.572,
888
+ "eval_steps_per_second": 25.083,
889
+ "step": 40000
890
+ },
891
+ {
892
+ "epoch": 0.6768613687641013,
893
+ "grad_norm": 15.44560718536377,
894
+ "learning_rate": 1.6309092441223563e-05,
895
+ "loss": 3.1978,
896
+ "step": 40500
897
+ },
898
+ {
899
+ "epoch": 0.6852176819587198,
900
+ "grad_norm": 16.988996505737305,
901
+ "learning_rate": 1.588775596191118e-05,
902
+ "loss": 3.2247,
903
+ "step": 41000
904
+ },
905
+ {
906
+ "epoch": 0.6852176819587198,
907
+ "eval_loss": 3.279550313949585,
908
+ "eval_runtime": 22.2386,
909
+ "eval_samples_per_second": 200.642,
910
+ "eval_steps_per_second": 25.091,
911
+ "step": 41000
912
+ },
913
+ {
914
+ "epoch": 0.6935739951533384,
915
+ "grad_norm": 8.293917655944824,
916
+ "learning_rate": 1.5466419482598804e-05,
917
+ "loss": 3.1682,
918
+ "step": 41500
919
+ },
920
+ {
921
+ "epoch": 0.7019303083479569,
922
+ "grad_norm": 10.755880355834961,
923
+ "learning_rate": 1.5045925676245051e-05,
924
+ "loss": 3.1849,
925
+ "step": 42000
926
+ },
927
+ {
928
+ "epoch": 0.7019303083479569,
929
+ "eval_loss": 3.2791192531585693,
930
+ "eval_runtime": 22.2554,
931
+ "eval_samples_per_second": 200.491,
932
+ "eval_steps_per_second": 25.073,
933
+ "step": 42000
934
+ },
935
+ {
936
+ "epoch": 0.7102866215425754,
937
+ "grad_norm": 17.822643280029297,
938
+ "learning_rate": 1.462458919693267e-05,
939
+ "loss": 3.1714,
940
+ "step": 42500
941
+ },
942
+ {
943
+ "epoch": 0.718642934737194,
944
+ "grad_norm": 18.230485916137695,
945
+ "learning_rate": 1.4203252717620291e-05,
946
+ "loss": 3.2112,
947
+ "step": 43000
948
+ },
949
+ {
950
+ "epoch": 0.718642934737194,
951
+ "eval_loss": 3.260193109512329,
952
+ "eval_runtime": 22.2518,
953
+ "eval_samples_per_second": 200.523,
954
+ "eval_steps_per_second": 25.077,
955
+ "step": 43000
956
+ },
957
+ {
958
+ "epoch": 0.7269992479318125,
959
+ "grad_norm": 13.363430976867676,
960
+ "learning_rate": 1.3781916238307913e-05,
961
+ "loss": 3.1655,
962
+ "step": 43500
963
+ },
964
+ {
965
+ "epoch": 0.735355561126431,
966
+ "grad_norm": 11.570181846618652,
967
+ "learning_rate": 1.3360579758995534e-05,
968
+ "loss": 3.174,
969
+ "step": 44000
970
+ },
971
+ {
972
+ "epoch": 0.735355561126431,
973
+ "eval_loss": 3.2490386962890625,
974
+ "eval_runtime": 22.2683,
975
+ "eval_samples_per_second": 200.374,
976
+ "eval_steps_per_second": 25.058,
977
+ "step": 44000
978
+ },
979
+ {
980
+ "epoch": 0.7437118743210496,
981
+ "grad_norm": 19.80602264404297,
982
+ "learning_rate": 1.2939243279683155e-05,
983
+ "loss": 3.1987,
984
+ "step": 44500
985
+ },
986
+ {
987
+ "epoch": 0.7520681875156681,
988
+ "grad_norm": 10.821731567382812,
989
+ "learning_rate": 1.2518749473329402e-05,
990
+ "loss": 3.1799,
991
+ "step": 45000
992
+ },
993
+ {
994
+ "epoch": 0.7520681875156681,
995
+ "eval_loss": 3.240847587585449,
996
+ "eval_runtime": 22.2794,
997
+ "eval_samples_per_second": 200.275,
998
+ "eval_steps_per_second": 25.046,
999
+ "step": 45000
1000
+ },
1001
+ {
1002
+ "epoch": 0.7604245007102867,
1003
+ "grad_norm": 16.301612854003906,
1004
+ "learning_rate": 1.2097412994017023e-05,
1005
+ "loss": 3.2029,
1006
+ "step": 45500
1007
+ },
1008
+ {
1009
+ "epoch": 0.7687808139049052,
1010
+ "grad_norm": 14.699359893798828,
1011
+ "learning_rate": 1.1676076514704643e-05,
1012
+ "loss": 3.1752,
1013
+ "step": 46000
1014
+ },
1015
+ {
1016
+ "epoch": 0.7687808139049052,
1017
+ "eval_loss": 3.233914852142334,
1018
+ "eval_runtime": 22.269,
1019
+ "eval_samples_per_second": 200.369,
1020
+ "eval_steps_per_second": 25.057,
1021
+ "step": 46000
1022
+ },
1023
+ {
1024
+ "epoch": 0.7771371270995237,
1025
+ "grad_norm": 15.696563720703125,
1026
+ "learning_rate": 1.1254740035392266e-05,
1027
+ "loss": 3.132,
1028
+ "step": 46500
1029
+ },
1030
+ {
1031
+ "epoch": 0.7854934402941423,
1032
+ "grad_norm": 13.062487602233887,
1033
+ "learning_rate": 1.0833403556079886e-05,
1034
+ "loss": 3.131,
1035
+ "step": 47000
1036
+ },
1037
+ {
1038
+ "epoch": 0.7854934402941423,
1039
+ "eval_loss": 3.2280752658843994,
1040
+ "eval_runtime": 22.2955,
1041
+ "eval_samples_per_second": 200.13,
1042
+ "eval_steps_per_second": 25.027,
1043
+ "step": 47000
1044
+ },
1045
+ {
1046
+ "epoch": 0.7938497534887607,
1047
+ "grad_norm": 18.67305564880371,
1048
+ "learning_rate": 1.0412909749726132e-05,
1049
+ "loss": 3.1571,
1050
+ "step": 47500
1051
+ },
1052
+ {
1053
+ "epoch": 0.8022060666833793,
1054
+ "grad_norm": 10.377827644348145,
1055
+ "learning_rate": 9.992415943372378e-06,
1056
+ "loss": 3.181,
1057
+ "step": 48000
1058
+ },
1059
+ {
1060
+ "epoch": 0.8022060666833793,
1061
+ "eval_loss": 3.2206084728240967,
1062
+ "eval_runtime": 22.2943,
1063
+ "eval_samples_per_second": 200.141,
1064
+ "eval_steps_per_second": 25.029,
1065
+ "step": 48000
1066
+ },
1067
+ {
1068
+ "epoch": 0.8105623798779978,
1069
+ "grad_norm": 12.836233139038086,
1070
+ "learning_rate": 9.571922137018624e-06,
1071
+ "loss": 3.139,
1072
+ "step": 48500
1073
+ },
1074
+ {
1075
+ "epoch": 0.8189186930726163,
1076
+ "grad_norm": 11.736408233642578,
1077
+ "learning_rate": 9.150585657706244e-06,
1078
+ "loss": 3.0932,
1079
+ "step": 49000
1080
+ },
1081
+ {
1082
+ "epoch": 0.8189186930726163,
1083
+ "eval_loss": 3.2135069370269775,
1084
+ "eval_runtime": 22.2506,
1085
+ "eval_samples_per_second": 200.534,
1086
+ "eval_steps_per_second": 25.078,
1087
+ "step": 49000
1088
+ },
1089
+ {
1090
+ "epoch": 0.8272750062672349,
1091
+ "grad_norm": 16.016298294067383,
1092
+ "learning_rate": 8.729249178393865e-06,
1093
+ "loss": 3.1634,
1094
+ "step": 49500
1095
+ },
1096
+ {
1097
+ "epoch": 0.8356313194618534,
1098
+ "grad_norm": 10.488819122314453,
1099
+ "learning_rate": 8.307912699081487e-06,
1100
+ "loss": 3.1376,
1101
+ "step": 50000
1102
+ },
1103
+ {
1104
+ "epoch": 0.8356313194618534,
1105
+ "eval_loss": 3.2051162719726562,
1106
+ "eval_runtime": 22.294,
1107
+ "eval_samples_per_second": 200.144,
1108
+ "eval_steps_per_second": 25.029,
1109
+ "step": 50000
1110
+ },
1111
+ {
1112
+ "epoch": 0.8439876326564719,
1113
+ "grad_norm": 16.168071746826172,
1114
+ "learning_rate": 7.886576219769108e-06,
1115
+ "loss": 3.1121,
1116
+ "step": 50500
1117
+ },
1118
+ {
1119
+ "epoch": 0.8523439458510905,
1120
+ "grad_norm": 19.903099060058594,
1121
+ "learning_rate": 7.465239740456729e-06,
1122
+ "loss": 3.1084,
1123
+ "step": 51000
1124
+ },
1125
+ {
1126
+ "epoch": 0.8523439458510905,
1127
+ "eval_loss": 3.198310375213623,
1128
+ "eval_runtime": 22.3049,
1129
+ "eval_samples_per_second": 200.046,
1130
+ "eval_steps_per_second": 25.017,
1131
+ "step": 51000
1132
+ },
1133
+ {
1134
+ "epoch": 0.860700259045709,
1135
+ "grad_norm": 12.082676887512207,
1136
+ "learning_rate": 7.043903261144351e-06,
1137
+ "loss": 3.0957,
1138
+ "step": 51500
1139
+ },
1140
+ {
1141
+ "epoch": 0.8690565722403276,
1142
+ "grad_norm": 11.764552116394043,
1143
+ "learning_rate": 6.622566781831971e-06,
1144
+ "loss": 3.099,
1145
+ "step": 52000
1146
+ },
1147
+ {
1148
+ "epoch": 0.8690565722403276,
1149
+ "eval_loss": 3.193253993988037,
1150
+ "eval_runtime": 22.2469,
1151
+ "eval_samples_per_second": 200.567,
1152
+ "eval_steps_per_second": 25.082,
1153
+ "step": 52000
1154
+ },
1155
+ {
1156
+ "epoch": 0.8774128854349461,
1157
+ "grad_norm": 12.482972145080566,
1158
+ "learning_rate": 6.201230302519592e-06,
1159
+ "loss": 3.0779,
1160
+ "step": 52500
1161
+ },
1162
+ {
1163
+ "epoch": 0.8857691986295646,
1164
+ "grad_norm": 14.11436939239502,
1165
+ "learning_rate": 5.7798938232072135e-06,
1166
+ "loss": 3.1278,
1167
+ "step": 53000
1168
+ },
1169
+ {
1170
+ "epoch": 0.8857691986295646,
1171
+ "eval_loss": 3.1867904663085938,
1172
+ "eval_runtime": 22.2664,
1173
+ "eval_samples_per_second": 200.392,
1174
+ "eval_steps_per_second": 25.06,
1175
+ "step": 53000
1176
+ },
1177
+ {
1178
+ "epoch": 0.8941255118241832,
1179
+ "grad_norm": 19.69700813293457,
1180
+ "learning_rate": 5.358557343894835e-06,
1181
+ "loss": 3.0968,
1182
+ "step": 53500
1183
+ },
1184
+ {
1185
+ "epoch": 0.9024818250188017,
1186
+ "grad_norm": 14.537339210510254,
1187
+ "learning_rate": 4.937220864582456e-06,
1188
+ "loss": 3.1436,
1189
+ "step": 54000
1190
+ },
1191
+ {
1192
+ "epoch": 0.9024818250188017,
1193
+ "eval_loss": 3.180774688720703,
1194
+ "eval_runtime": 22.293,
1195
+ "eval_samples_per_second": 200.152,
1196
+ "eval_steps_per_second": 25.03,
1197
+ "step": 54000
1198
+ },
1199
+ {
1200
+ "epoch": 0.9108381382134202,
1201
+ "grad_norm": 16.117996215820312,
1202
+ "learning_rate": 4.515884385270077e-06,
1203
+ "loss": 3.1288,
1204
+ "step": 54500
1205
+ },
1206
+ {
1207
+ "epoch": 0.9191944514080388,
1208
+ "grad_norm": 12.458276748657227,
1209
+ "learning_rate": 4.094547905957698e-06,
1210
+ "loss": 3.0763,
1211
+ "step": 55000
1212
+ },
1213
+ {
1214
+ "epoch": 0.9191944514080388,
1215
+ "eval_loss": 3.175370216369629,
1216
+ "eval_runtime": 22.253,
1217
+ "eval_samples_per_second": 200.513,
1218
+ "eval_steps_per_second": 25.075,
1219
+ "step": 55000
1220
+ },
1221
+ {
1222
+ "epoch": 0.9275507646026573,
1223
+ "grad_norm": 14.115385055541992,
1224
+ "learning_rate": 3.6732114266453192e-06,
1225
+ "loss": 3.0642,
1226
+ "step": 55500
1227
+ },
1228
+ {
1229
+ "epoch": 0.9359070777972759,
1230
+ "grad_norm": 19.65464210510254,
1231
+ "learning_rate": 3.2518749473329403e-06,
1232
+ "loss": 3.1248,
1233
+ "step": 56000
1234
+ },
1235
+ {
1236
+ "epoch": 0.9359070777972759,
1237
+ "eval_loss": 3.1690962314605713,
1238
+ "eval_runtime": 22.3258,
1239
+ "eval_samples_per_second": 199.858,
1240
+ "eval_steps_per_second": 24.993,
1241
+ "step": 56000
1242
+ },
1243
+ {
1244
+ "epoch": 0.9442633909918944,
1245
+ "grad_norm": 11.953753471374512,
1246
+ "learning_rate": 2.831381140979186e-06,
1247
+ "loss": 3.1361,
1248
+ "step": 56500
1249
+ },
1250
+ {
1251
+ "epoch": 0.9526197041865129,
1252
+ "grad_norm": 10.821110725402832,
1253
+ "learning_rate": 2.4108873346254323e-06,
1254
+ "loss": 3.0418,
1255
+ "step": 57000
1256
+ },
1257
+ {
1258
+ "epoch": 0.9526197041865129,
1259
+ "eval_loss": 3.164776563644409,
1260
+ "eval_runtime": 22.2464,
1261
+ "eval_samples_per_second": 200.572,
1262
+ "eval_steps_per_second": 25.083,
1263
+ "step": 57000
1264
+ },
1265
+ {
1266
+ "epoch": 0.9609760173811315,
1267
+ "grad_norm": 11.476717948913574,
1268
+ "learning_rate": 1.9895508553130533e-06,
1269
+ "loss": 3.0504,
1270
+ "step": 57500
1271
+ },
1272
+ {
1273
+ "epoch": 0.96933233057575,
1274
+ "grad_norm": 10.973363876342773,
1275
+ "learning_rate": 1.5682143760006742e-06,
1276
+ "loss": 3.0755,
1277
+ "step": 58000
1278
+ },
1279
+ {
1280
+ "epoch": 0.96933233057575,
1281
+ "eval_loss": 3.1603705883026123,
1282
+ "eval_runtime": 22.272,
1283
+ "eval_samples_per_second": 200.341,
1284
+ "eval_steps_per_second": 25.054,
1285
+ "step": 58000
1286
+ },
1287
+ {
1288
+ "epoch": 0.9776886437703685,
1289
+ "grad_norm": 10.836787223815918,
1290
+ "learning_rate": 1.1468778966882954e-06,
1291
+ "loss": 3.1001,
1292
+ "step": 58500
1293
+ },
1294
+ {
1295
+ "epoch": 0.9860449569649871,
1296
+ "grad_norm": 13.901703834533691,
1297
+ "learning_rate": 7.255414173759165e-07,
1298
+ "loss": 3.0633,
1299
+ "step": 59000
1300
+ },
1301
+ {
1302
+ "epoch": 0.9860449569649871,
1303
+ "eval_loss": 3.1587648391723633,
1304
+ "eval_runtime": 22.3994,
1305
+ "eval_samples_per_second": 199.202,
1306
+ "eval_steps_per_second": 24.911,
1307
+ "step": 59000
1308
+ },
1309
+ {
1310
+ "epoch": 0.9944012701596056,
1311
+ "grad_norm": 19.022567749023438,
1312
+ "learning_rate": 3.0420493806353753e-07,
1313
+ "loss": 3.0751,
1314
+ "step": 59500
1315
+ }
1316
+ ],
1317
+ "logging_steps": 500,
1318
+ "max_steps": 59835,
1319
+ "num_input_tokens_seen": 0,
1320
+ "num_train_epochs": 1,
1321
+ "save_steps": 2000,
1322
+ "stateful_callbacks": {
1323
+ "TrainerControl": {
1324
+ "args": {
1325
+ "should_epoch_stop": false,
1326
+ "should_evaluate": false,
1327
+ "should_log": false,
1328
+ "should_save": true,
1329
+ "should_training_stop": true
1330
+ },
1331
+ "attributes": {}
1332
+ }
1333
+ },
1334
+ "total_flos": 2.943723113325527e+17,
1335
+ "train_batch_size": 8,
1336
+ "trial_name": null,
1337
+ "trial_params": null
1338
+ }
full-finetuning/LLaMmlein_120M/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3489adac5f0635d0dfab0d3adcddd340b685c77de41ab94351691f318fbda79
3
+ size 5240
full-finetuning/german-gpt2/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "dbmdz/german-gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.0,
10
+ "eos_token_id": 50256,
11
+ "gradient_checkpointing": false,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.46.0",
38
+ "use_cache": true,
39
+ "vocab_size": 50266
40
+ }
full-finetuning/german-gpt2/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.46.0"
6
+ }
full-finetuning/german-gpt2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8e15667a18c4b3497fb23b0fce496d6ff93c6c29494c95bc5eb396ac42a1146
3
+ size 497801856
full-finetuning/german-gpt2/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec5e28abe83c3faea24cdf4c5d768e92115d8432ee555a8db92897d89837a00c
3
+ size 995697594
full-finetuning/german-gpt2/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3bc1ac846859ad13fd356d3ed41fd20834205a7e8764e4198636863fca3f64c
3
+ size 14244
full-finetuning/german-gpt2/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a27b31e56458d82a793cfa18a3b4ed45acf96eb2c39bb55be03b452e972acf4
3
+ size 1064
full-finetuning/german-gpt2/trainer_state.json ADDED
@@ -0,0 +1,1338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.3120391368865967,
3
+ "best_model_checkpoint": "./models/full-finetuning/german-gpt2/checkpoint-58000",
4
+ "epoch": 1.0,
5
+ "eval_steps": 1000,
6
+ "global_step": 59835,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.008356313194618534,
13
+ "grad_norm": 7.7337164878845215,
14
+ "learning_rate": 4.97e-05,
15
+ "loss": 5.7213,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.016712626389237067,
20
+ "grad_norm": 6.002913951873779,
21
+ "learning_rate": 4.95811915395635e-05,
22
+ "loss": 4.8528,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.016712626389237067,
27
+ "eval_loss": 4.570379734039307,
28
+ "eval_runtime": 33.8087,
29
+ "eval_samples_per_second": 131.978,
30
+ "eval_steps_per_second": 16.505,
31
+ "step": 1000
32
+ },
33
+ {
34
+ "epoch": 0.025068939583855605,
35
+ "grad_norm": 5.462852478027344,
36
+ "learning_rate": 4.915985506025112e-05,
37
+ "loss": 4.6135,
38
+ "step": 1500
39
+ },
40
+ {
41
+ "epoch": 0.033425252778474135,
42
+ "grad_norm": 6.940282821655273,
43
+ "learning_rate": 4.873851858093874e-05,
44
+ "loss": 4.4938,
45
+ "step": 2000
46
+ },
47
+ {
48
+ "epoch": 0.033425252778474135,
49
+ "eval_loss": 4.2595720291137695,
50
+ "eval_runtime": 36.1247,
51
+ "eval_samples_per_second": 123.516,
52
+ "eval_steps_per_second": 15.446,
53
+ "step": 2000
54
+ },
55
+ {
56
+ "epoch": 0.04178156597309267,
57
+ "grad_norm": 6.612778663635254,
58
+ "learning_rate": 4.8317182101626365e-05,
59
+ "loss": 4.3683,
60
+ "step": 2500
61
+ },
62
+ {
63
+ "epoch": 0.05013787916771121,
64
+ "grad_norm": 5.179454803466797,
65
+ "learning_rate": 4.789584562231398e-05,
66
+ "loss": 4.2089,
67
+ "step": 3000
68
+ },
69
+ {
70
+ "epoch": 0.05013787916771121,
71
+ "eval_loss": 4.104772567749023,
72
+ "eval_runtime": 33.8319,
73
+ "eval_samples_per_second": 131.887,
74
+ "eval_steps_per_second": 16.493,
75
+ "step": 3000
76
+ },
77
+ {
78
+ "epoch": 0.05849419236232974,
79
+ "grad_norm": 4.007822036743164,
80
+ "learning_rate": 4.74745091430016e-05,
81
+ "loss": 4.2532,
82
+ "step": 3500
83
+ },
84
+ {
85
+ "epoch": 0.06685050555694827,
86
+ "grad_norm": 5.069295883178711,
87
+ "learning_rate": 4.7053172663689226e-05,
88
+ "loss": 4.2046,
89
+ "step": 4000
90
+ },
91
+ {
92
+ "epoch": 0.06685050555694827,
93
+ "eval_loss": 4.0018157958984375,
94
+ "eval_runtime": 33.841,
95
+ "eval_samples_per_second": 131.852,
96
+ "eval_steps_per_second": 16.489,
97
+ "step": 4000
98
+ },
99
+ {
100
+ "epoch": 0.0752068187515668,
101
+ "grad_norm": 8.424234390258789,
102
+ "learning_rate": 4.663183618437685e-05,
103
+ "loss": 4.1513,
104
+ "step": 4500
105
+ },
106
+ {
107
+ "epoch": 0.08356313194618534,
108
+ "grad_norm": 5.6522016525268555,
109
+ "learning_rate": 4.621049970506447e-05,
110
+ "loss": 4.0912,
111
+ "step": 5000
112
+ },
113
+ {
114
+ "epoch": 0.08356313194618534,
115
+ "eval_loss": 3.927964687347412,
116
+ "eval_runtime": 33.8173,
117
+ "eval_samples_per_second": 131.944,
118
+ "eval_steps_per_second": 16.5,
119
+ "step": 5000
120
+ },
121
+ {
122
+ "epoch": 0.09191944514080387,
123
+ "grad_norm": 6.062402248382568,
124
+ "learning_rate": 4.5789163225752086e-05,
125
+ "loss": 4.0602,
126
+ "step": 5500
127
+ },
128
+ {
129
+ "epoch": 0.10027575833542242,
130
+ "grad_norm": 4.522783279418945,
131
+ "learning_rate": 4.536782674643971e-05,
132
+ "loss": 4.0243,
133
+ "step": 6000
134
+ },
135
+ {
136
+ "epoch": 0.10027575833542242,
137
+ "eval_loss": 3.8684792518615723,
138
+ "eval_runtime": 33.8061,
139
+ "eval_samples_per_second": 131.988,
140
+ "eval_steps_per_second": 16.506,
141
+ "step": 6000
142
+ },
143
+ {
144
+ "epoch": 0.10863207153004095,
145
+ "grad_norm": 5.732029914855957,
146
+ "learning_rate": 4.494649026712733e-05,
147
+ "loss": 3.9523,
148
+ "step": 6500
149
+ },
150
+ {
151
+ "epoch": 0.11698838472465949,
152
+ "grad_norm": 5.034173011779785,
153
+ "learning_rate": 4.452599646077358e-05,
154
+ "loss": 3.9991,
155
+ "step": 7000
156
+ },
157
+ {
158
+ "epoch": 0.11698838472465949,
159
+ "eval_loss": 3.818037748336792,
160
+ "eval_runtime": 33.7911,
161
+ "eval_samples_per_second": 132.047,
162
+ "eval_steps_per_second": 16.513,
163
+ "step": 7000
164
+ },
165
+ {
166
+ "epoch": 0.12534469791927802,
167
+ "grad_norm": 6.326226711273193,
168
+ "learning_rate": 4.41046599814612e-05,
169
+ "loss": 3.9681,
170
+ "step": 7500
171
+ },
172
+ {
173
+ "epoch": 0.13370101111389654,
174
+ "grad_norm": 5.3602495193481445,
175
+ "learning_rate": 4.368332350214882e-05,
176
+ "loss": 3.9101,
177
+ "step": 8000
178
+ },
179
+ {
180
+ "epoch": 0.13370101111389654,
181
+ "eval_loss": 3.7742578983306885,
182
+ "eval_runtime": 33.7987,
183
+ "eval_samples_per_second": 132.017,
184
+ "eval_steps_per_second": 16.51,
185
+ "step": 8000
186
+ },
187
+ {
188
+ "epoch": 0.1420573243085151,
189
+ "grad_norm": 6.7508111000061035,
190
+ "learning_rate": 4.326198702283644e-05,
191
+ "loss": 3.9032,
192
+ "step": 8500
193
+ },
194
+ {
195
+ "epoch": 0.1504136375031336,
196
+ "grad_norm": 5.866630554199219,
197
+ "learning_rate": 4.2840650543524055e-05,
198
+ "loss": 3.8962,
199
+ "step": 9000
200
+ },
201
+ {
202
+ "epoch": 0.1504136375031336,
203
+ "eval_loss": 3.7523410320281982,
204
+ "eval_runtime": 33.8046,
205
+ "eval_samples_per_second": 131.994,
206
+ "eval_steps_per_second": 16.507,
207
+ "step": 9000
208
+ },
209
+ {
210
+ "epoch": 0.15876995069775215,
211
+ "grad_norm": 4.693000316619873,
212
+ "learning_rate": 4.241931406421168e-05,
213
+ "loss": 3.853,
214
+ "step": 9500
215
+ },
216
+ {
217
+ "epoch": 0.16712626389237067,
218
+ "grad_norm": 5.653314113616943,
219
+ "learning_rate": 4.199882025785793e-05,
220
+ "loss": 3.8244,
221
+ "step": 10000
222
+ },
223
+ {
224
+ "epoch": 0.16712626389237067,
225
+ "eval_loss": 3.711071014404297,
226
+ "eval_runtime": 33.8163,
227
+ "eval_samples_per_second": 131.948,
228
+ "eval_steps_per_second": 16.501,
229
+ "step": 10000
230
+ },
231
+ {
232
+ "epoch": 0.17548257708698922,
233
+ "grad_norm": 4.951623439788818,
234
+ "learning_rate": 4.157748377854555e-05,
235
+ "loss": 3.8097,
236
+ "step": 10500
237
+ },
238
+ {
239
+ "epoch": 0.18383889028160774,
240
+ "grad_norm": 9.94375228881836,
241
+ "learning_rate": 4.115614729923317e-05,
242
+ "loss": 3.7687,
243
+ "step": 11000
244
+ },
245
+ {
246
+ "epoch": 0.18383889028160774,
247
+ "eval_loss": 3.6785600185394287,
248
+ "eval_runtime": 33.809,
249
+ "eval_samples_per_second": 131.977,
250
+ "eval_steps_per_second": 16.504,
251
+ "step": 11000
252
+ },
253
+ {
254
+ "epoch": 0.1921952034762263,
255
+ "grad_norm": 7.155759334564209,
256
+ "learning_rate": 4.0734810819920794e-05,
257
+ "loss": 3.8008,
258
+ "step": 11500
259
+ },
260
+ {
261
+ "epoch": 0.20055151667084484,
262
+ "grad_norm": 3.9872701168060303,
263
+ "learning_rate": 4.031347434060841e-05,
264
+ "loss": 3.7853,
265
+ "step": 12000
266
+ },
267
+ {
268
+ "epoch": 0.20055151667084484,
269
+ "eval_loss": 3.655622959136963,
270
+ "eval_runtime": 33.807,
271
+ "eval_samples_per_second": 131.985,
272
+ "eval_steps_per_second": 16.505,
273
+ "step": 12000
274
+ },
275
+ {
276
+ "epoch": 0.20890782986546336,
277
+ "grad_norm": 7.3907647132873535,
278
+ "learning_rate": 3.989298053425466e-05,
279
+ "loss": 3.8082,
280
+ "step": 12500
281
+ },
282
+ {
283
+ "epoch": 0.2172641430600819,
284
+ "grad_norm": 4.089804172515869,
285
+ "learning_rate": 3.947164405494228e-05,
286
+ "loss": 3.7216,
287
+ "step": 13000
288
+ },
289
+ {
290
+ "epoch": 0.2172641430600819,
291
+ "eval_loss": 3.641737937927246,
292
+ "eval_runtime": 33.7976,
293
+ "eval_samples_per_second": 132.021,
294
+ "eval_steps_per_second": 16.51,
295
+ "step": 13000
296
+ },
297
+ {
298
+ "epoch": 0.22562045625470042,
299
+ "grad_norm": 3.7814671993255615,
300
+ "learning_rate": 3.90503075756299e-05,
301
+ "loss": 3.7132,
302
+ "step": 13500
303
+ },
304
+ {
305
+ "epoch": 0.23397676944931897,
306
+ "grad_norm": 5.00490665435791,
307
+ "learning_rate": 3.8628971096317526e-05,
308
+ "loss": 3.7508,
309
+ "step": 14000
310
+ },
311
+ {
312
+ "epoch": 0.23397676944931897,
313
+ "eval_loss": 3.6097123622894287,
314
+ "eval_runtime": 33.8097,
315
+ "eval_samples_per_second": 131.974,
316
+ "eval_steps_per_second": 16.504,
317
+ "step": 14000
318
+ },
319
+ {
320
+ "epoch": 0.2423330826439375,
321
+ "grad_norm": 4.1177659034729,
322
+ "learning_rate": 3.820763461700515e-05,
323
+ "loss": 3.7317,
324
+ "step": 14500
325
+ },
326
+ {
327
+ "epoch": 0.25068939583855604,
328
+ "grad_norm": 5.807891845703125,
329
+ "learning_rate": 3.7786298137692763e-05,
330
+ "loss": 3.7186,
331
+ "step": 15000
332
+ },
333
+ {
334
+ "epoch": 0.25068939583855604,
335
+ "eval_loss": 3.5940768718719482,
336
+ "eval_runtime": 33.8398,
337
+ "eval_samples_per_second": 131.856,
338
+ "eval_steps_per_second": 16.489,
339
+ "step": 15000
340
+ },
341
+ {
342
+ "epoch": 0.2590457090331746,
343
+ "grad_norm": 5.580984592437744,
344
+ "learning_rate": 3.736496165838038e-05,
345
+ "loss": 3.7059,
346
+ "step": 15500
347
+ },
348
+ {
349
+ "epoch": 0.2674020222277931,
350
+ "grad_norm": 5.553014278411865,
351
+ "learning_rate": 3.6943625179068e-05,
352
+ "loss": 3.6317,
353
+ "step": 16000
354
+ },
355
+ {
356
+ "epoch": 0.2674020222277931,
357
+ "eval_loss": 3.5815696716308594,
358
+ "eval_runtime": 33.7945,
359
+ "eval_samples_per_second": 132.033,
360
+ "eval_steps_per_second": 16.512,
361
+ "step": 16000
362
+ },
363
+ {
364
+ "epoch": 0.2757583354224116,
365
+ "grad_norm": 7.818964958190918,
366
+ "learning_rate": 3.652313137271425e-05,
367
+ "loss": 3.6837,
368
+ "step": 16500
369
+ },
370
+ {
371
+ "epoch": 0.2841146486170302,
372
+ "grad_norm": 7.232082843780518,
373
+ "learning_rate": 3.610179489340187e-05,
374
+ "loss": 3.6965,
375
+ "step": 17000
376
+ },
377
+ {
378
+ "epoch": 0.2841146486170302,
379
+ "eval_loss": 3.559220552444458,
380
+ "eval_runtime": 33.8032,
381
+ "eval_samples_per_second": 131.999,
382
+ "eval_steps_per_second": 16.507,
383
+ "step": 17000
384
+ },
385
+ {
386
+ "epoch": 0.2924709618116487,
387
+ "grad_norm": 5.058260917663574,
388
+ "learning_rate": 3.5680458414089495e-05,
389
+ "loss": 3.6218,
390
+ "step": 17500
391
+ },
392
+ {
393
+ "epoch": 0.3008272750062672,
394
+ "grad_norm": 6.645322322845459,
395
+ "learning_rate": 3.525912193477712e-05,
396
+ "loss": 3.6203,
397
+ "step": 18000
398
+ },
399
+ {
400
+ "epoch": 0.3008272750062672,
401
+ "eval_loss": 3.5445735454559326,
402
+ "eval_runtime": 33.826,
403
+ "eval_samples_per_second": 131.911,
404
+ "eval_steps_per_second": 16.496,
405
+ "step": 18000
406
+ },
407
+ {
408
+ "epoch": 0.30918358820088576,
409
+ "grad_norm": 6.7204084396362305,
410
+ "learning_rate": 3.483862812842336e-05,
411
+ "loss": 3.6135,
412
+ "step": 18500
413
+ },
414
+ {
415
+ "epoch": 0.3175399013955043,
416
+ "grad_norm": 5.846601486206055,
417
+ "learning_rate": 3.441729164911098e-05,
418
+ "loss": 3.6139,
419
+ "step": 19000
420
+ },
421
+ {
422
+ "epoch": 0.3175399013955043,
423
+ "eval_loss": 3.529482364654541,
424
+ "eval_runtime": 33.8084,
425
+ "eval_samples_per_second": 131.979,
426
+ "eval_steps_per_second": 16.505,
427
+ "step": 19000
428
+ },
429
+ {
430
+ "epoch": 0.32589621459012286,
431
+ "grad_norm": 4.412099838256836,
432
+ "learning_rate": 3.3995955169798604e-05,
433
+ "loss": 3.6202,
434
+ "step": 19500
435
+ },
436
+ {
437
+ "epoch": 0.33425252778474135,
438
+ "grad_norm": 6.884310722351074,
439
+ "learning_rate": 3.357461869048623e-05,
440
+ "loss": 3.5862,
441
+ "step": 20000
442
+ },
443
+ {
444
+ "epoch": 0.33425252778474135,
445
+ "eval_loss": 3.518963098526001,
446
+ "eval_runtime": 33.8217,
447
+ "eval_samples_per_second": 131.927,
448
+ "eval_steps_per_second": 16.498,
449
+ "step": 20000
450
+ },
451
+ {
452
+ "epoch": 0.3426088409793599,
453
+ "grad_norm": 4.762236595153809,
454
+ "learning_rate": 3.315328221117385e-05,
455
+ "loss": 3.613,
456
+ "step": 20500
457
+ },
458
+ {
459
+ "epoch": 0.35096515417397844,
460
+ "grad_norm": 3.5604190826416016,
461
+ "learning_rate": 3.273278840482009e-05,
462
+ "loss": 3.5767,
463
+ "step": 21000
464
+ },
465
+ {
466
+ "epoch": 0.35096515417397844,
467
+ "eval_loss": 3.5067789554595947,
468
+ "eval_runtime": 33.8036,
469
+ "eval_samples_per_second": 131.998,
470
+ "eval_steps_per_second": 16.507,
471
+ "step": 21000
472
+ },
473
+ {
474
+ "epoch": 0.359321467368597,
475
+ "grad_norm": 3.5096914768218994,
476
+ "learning_rate": 3.2311451925507714e-05,
477
+ "loss": 3.5692,
478
+ "step": 21500
479
+ },
480
+ {
481
+ "epoch": 0.3676777805632155,
482
+ "grad_norm": 4.54230260848999,
483
+ "learning_rate": 3.1890115446195336e-05,
484
+ "loss": 3.5914,
485
+ "step": 22000
486
+ },
487
+ {
488
+ "epoch": 0.3676777805632155,
489
+ "eval_loss": 3.491032361984253,
490
+ "eval_runtime": 33.8108,
491
+ "eval_samples_per_second": 131.97,
492
+ "eval_steps_per_second": 16.504,
493
+ "step": 22000
494
+ },
495
+ {
496
+ "epoch": 0.37603409375783403,
497
+ "grad_norm": 5.874061584472656,
498
+ "learning_rate": 3.146877896688296e-05,
499
+ "loss": 3.6494,
500
+ "step": 22500
501
+ },
502
+ {
503
+ "epoch": 0.3843904069524526,
504
+ "grad_norm": 5.265365123748779,
505
+ "learning_rate": 3.10482851605292e-05,
506
+ "loss": 3.5796,
507
+ "step": 23000
508
+ },
509
+ {
510
+ "epoch": 0.3843904069524526,
511
+ "eval_loss": 3.4834258556365967,
512
+ "eval_runtime": 33.7961,
513
+ "eval_samples_per_second": 132.027,
514
+ "eval_steps_per_second": 16.511,
515
+ "step": 23000
516
+ },
517
+ {
518
+ "epoch": 0.3927467201470711,
519
+ "grad_norm": 4.7764129638671875,
520
+ "learning_rate": 3.062694868121682e-05,
521
+ "loss": 3.5603,
522
+ "step": 23500
523
+ },
524
+ {
525
+ "epoch": 0.4011030333416897,
526
+ "grad_norm": 4.89961576461792,
527
+ "learning_rate": 3.0205612201904442e-05,
528
+ "loss": 3.5427,
529
+ "step": 24000
530
+ },
531
+ {
532
+ "epoch": 0.4011030333416897,
533
+ "eval_loss": 3.476203203201294,
534
+ "eval_runtime": 33.8616,
535
+ "eval_samples_per_second": 131.772,
536
+ "eval_steps_per_second": 16.479,
537
+ "step": 24000
538
+ },
539
+ {
540
+ "epoch": 0.40945934653630817,
541
+ "grad_norm": 6.659942150115967,
542
+ "learning_rate": 2.9784275722592064e-05,
543
+ "loss": 3.5559,
544
+ "step": 24500
545
+ },
546
+ {
547
+ "epoch": 0.4178156597309267,
548
+ "grad_norm": 4.591799259185791,
549
+ "learning_rate": 2.936378191623831e-05,
550
+ "loss": 3.5087,
551
+ "step": 25000
552
+ },
553
+ {
554
+ "epoch": 0.4178156597309267,
555
+ "eval_loss": 3.465503454208374,
556
+ "eval_runtime": 33.8191,
557
+ "eval_samples_per_second": 131.937,
558
+ "eval_steps_per_second": 16.5,
559
+ "step": 25000
560
+ },
561
+ {
562
+ "epoch": 0.42617197292554526,
563
+ "grad_norm": 4.84548282623291,
564
+ "learning_rate": 2.894244543692593e-05,
565
+ "loss": 3.5474,
566
+ "step": 25500
567
+ },
568
+ {
569
+ "epoch": 0.4345282861201638,
570
+ "grad_norm": 5.839838027954102,
571
+ "learning_rate": 2.852110895761355e-05,
572
+ "loss": 3.5723,
573
+ "step": 26000
574
+ },
575
+ {
576
+ "epoch": 0.4345282861201638,
577
+ "eval_loss": 3.458843946456909,
578
+ "eval_runtime": 33.8846,
579
+ "eval_samples_per_second": 131.682,
580
+ "eval_steps_per_second": 16.468,
581
+ "step": 26000
582
+ },
583
+ {
584
+ "epoch": 0.4428845993147823,
585
+ "grad_norm": 3.7624571323394775,
586
+ "learning_rate": 2.8099772478301174e-05,
587
+ "loss": 3.4898,
588
+ "step": 26500
589
+ },
590
+ {
591
+ "epoch": 0.45124091250940085,
592
+ "grad_norm": 4.763157844543457,
593
+ "learning_rate": 2.7678435998988793e-05,
594
+ "loss": 3.5493,
595
+ "step": 27000
596
+ },
597
+ {
598
+ "epoch": 0.45124091250940085,
599
+ "eval_loss": 3.4483096599578857,
600
+ "eval_runtime": 33.7859,
601
+ "eval_samples_per_second": 132.067,
602
+ "eval_steps_per_second": 16.516,
603
+ "step": 27000
604
+ },
605
+ {
606
+ "epoch": 0.4595972257040194,
607
+ "grad_norm": 4.820297718048096,
608
+ "learning_rate": 2.725794219263504e-05,
609
+ "loss": 3.5162,
610
+ "step": 27500
611
+ },
612
+ {
613
+ "epoch": 0.46795353889863794,
614
+ "grad_norm": 8.342415809631348,
615
+ "learning_rate": 2.683660571332266e-05,
616
+ "loss": 3.4815,
617
+ "step": 28000
618
+ },
619
+ {
620
+ "epoch": 0.46795353889863794,
621
+ "eval_loss": 3.4425435066223145,
622
+ "eval_runtime": 33.8093,
623
+ "eval_samples_per_second": 131.976,
624
+ "eval_steps_per_second": 16.504,
625
+ "step": 28000
626
+ },
627
+ {
628
+ "epoch": 0.47630985209325644,
629
+ "grad_norm": 3.9816768169403076,
630
+ "learning_rate": 2.641526923401028e-05,
631
+ "loss": 3.5168,
632
+ "step": 28500
633
+ },
634
+ {
635
+ "epoch": 0.484666165287875,
636
+ "grad_norm": 4.662781715393066,
637
+ "learning_rate": 2.5993932754697902e-05,
638
+ "loss": 3.5442,
639
+ "step": 29000
640
+ },
641
+ {
642
+ "epoch": 0.484666165287875,
643
+ "eval_loss": 3.4301340579986572,
644
+ "eval_runtime": 33.7983,
645
+ "eval_samples_per_second": 132.018,
646
+ "eval_steps_per_second": 16.51,
647
+ "step": 29000
648
+ },
649
+ {
650
+ "epoch": 0.49302247848249353,
651
+ "grad_norm": 5.855069160461426,
652
+ "learning_rate": 2.5573438948344148e-05,
653
+ "loss": 3.5048,
654
+ "step": 29500
655
+ },
656
+ {
657
+ "epoch": 0.5013787916771121,
658
+ "grad_norm": 3.8957674503326416,
659
+ "learning_rate": 2.515210246903177e-05,
660
+ "loss": 3.5452,
661
+ "step": 30000
662
+ },
663
+ {
664
+ "epoch": 0.5013787916771121,
665
+ "eval_loss": 3.4219019412994385,
666
+ "eval_runtime": 33.8062,
667
+ "eval_samples_per_second": 131.988,
668
+ "eval_steps_per_second": 16.506,
669
+ "step": 30000
670
+ },
671
+ {
672
+ "epoch": 0.5097351048717306,
673
+ "grad_norm": 5.273784160614014,
674
+ "learning_rate": 2.473076598971939e-05,
675
+ "loss": 3.4951,
676
+ "step": 30500
677
+ },
678
+ {
679
+ "epoch": 0.5180914180663492,
680
+ "grad_norm": 6.109909534454346,
681
+ "learning_rate": 2.430942951040701e-05,
682
+ "loss": 3.4879,
683
+ "step": 31000
684
+ },
685
+ {
686
+ "epoch": 0.5180914180663492,
687
+ "eval_loss": 3.419405698776245,
688
+ "eval_runtime": 33.7875,
689
+ "eval_samples_per_second": 132.061,
690
+ "eval_steps_per_second": 16.515,
691
+ "step": 31000
692
+ },
693
+ {
694
+ "epoch": 0.5264477312609677,
695
+ "grad_norm": 7.194368839263916,
696
+ "learning_rate": 2.3888935704053257e-05,
697
+ "loss": 3.5259,
698
+ "step": 31500
699
+ },
700
+ {
701
+ "epoch": 0.5348040444555862,
702
+ "grad_norm": 4.303890228271484,
703
+ "learning_rate": 2.3468441897699503e-05,
704
+ "loss": 3.4956,
705
+ "step": 32000
706
+ },
707
+ {
708
+ "epoch": 0.5348040444555862,
709
+ "eval_loss": 3.4062435626983643,
710
+ "eval_runtime": 33.8183,
711
+ "eval_samples_per_second": 131.94,
712
+ "eval_steps_per_second": 16.5,
713
+ "step": 32000
714
+ },
715
+ {
716
+ "epoch": 0.5431603576502048,
717
+ "grad_norm": 4.180033206939697,
718
+ "learning_rate": 2.3047105418387125e-05,
719
+ "loss": 3.5137,
720
+ "step": 32500
721
+ },
722
+ {
723
+ "epoch": 0.5515166708448233,
724
+ "grad_norm": 4.65141487121582,
725
+ "learning_rate": 2.2625768939074744e-05,
726
+ "loss": 3.4635,
727
+ "step": 33000
728
+ },
729
+ {
730
+ "epoch": 0.5515166708448233,
731
+ "eval_loss": 3.3979876041412354,
732
+ "eval_runtime": 33.8238,
733
+ "eval_samples_per_second": 131.919,
734
+ "eval_steps_per_second": 16.497,
735
+ "step": 33000
736
+ },
737
+ {
738
+ "epoch": 0.5598729840394417,
739
+ "grad_norm": 7.3677215576171875,
740
+ "learning_rate": 2.2204432459762367e-05,
741
+ "loss": 3.5071,
742
+ "step": 33500
743
+ },
744
+ {
745
+ "epoch": 0.5682292972340603,
746
+ "grad_norm": 4.072124481201172,
747
+ "learning_rate": 2.178309598044999e-05,
748
+ "loss": 3.4836,
749
+ "step": 34000
750
+ },
751
+ {
752
+ "epoch": 0.5682292972340603,
753
+ "eval_loss": 3.3961925506591797,
754
+ "eval_runtime": 33.8074,
755
+ "eval_samples_per_second": 131.983,
756
+ "eval_steps_per_second": 16.505,
757
+ "step": 34000
758
+ },
759
+ {
760
+ "epoch": 0.5765856104286788,
761
+ "grad_norm": 4.479409217834473,
762
+ "learning_rate": 2.136175950113761e-05,
763
+ "loss": 3.4763,
764
+ "step": 34500
765
+ },
766
+ {
767
+ "epoch": 0.5849419236232974,
768
+ "grad_norm": 3.3214428424835205,
769
+ "learning_rate": 2.094042302182523e-05,
770
+ "loss": 3.4378,
771
+ "step": 35000
772
+ },
773
+ {
774
+ "epoch": 0.5849419236232974,
775
+ "eval_loss": 3.389758825302124,
776
+ "eval_runtime": 33.7986,
777
+ "eval_samples_per_second": 132.017,
778
+ "eval_steps_per_second": 16.51,
779
+ "step": 35000
780
+ },
781
+ {
782
+ "epoch": 0.5932982368179159,
783
+ "grad_norm": 6.104400157928467,
784
+ "learning_rate": 2.0519086542512852e-05,
785
+ "loss": 3.4959,
786
+ "step": 35500
787
+ },
788
+ {
789
+ "epoch": 0.6016545500125344,
790
+ "grad_norm": 5.734145641326904,
791
+ "learning_rate": 2.0097750063200475e-05,
792
+ "loss": 3.4673,
793
+ "step": 36000
794
+ },
795
+ {
796
+ "epoch": 0.6016545500125344,
797
+ "eval_loss": 3.3848955631256104,
798
+ "eval_runtime": 33.8025,
799
+ "eval_samples_per_second": 132.002,
800
+ "eval_steps_per_second": 16.508,
801
+ "step": 36000
802
+ },
803
+ {
804
+ "epoch": 0.610010863207153,
805
+ "grad_norm": 4.3256683349609375,
806
+ "learning_rate": 1.9677256256846717e-05,
807
+ "loss": 3.4903,
808
+ "step": 36500
809
+ },
810
+ {
811
+ "epoch": 0.6183671764017715,
812
+ "grad_norm": 5.769290447235107,
813
+ "learning_rate": 1.925591977753434e-05,
814
+ "loss": 3.4722,
815
+ "step": 37000
816
+ },
817
+ {
818
+ "epoch": 0.6183671764017715,
819
+ "eval_loss": 3.3791496753692627,
820
+ "eval_runtime": 33.8066,
821
+ "eval_samples_per_second": 131.986,
822
+ "eval_steps_per_second": 16.506,
823
+ "step": 37000
824
+ },
825
+ {
826
+ "epoch": 0.6267234895963901,
827
+ "grad_norm": 4.285965442657471,
828
+ "learning_rate": 1.8834583298221962e-05,
829
+ "loss": 3.4922,
830
+ "step": 37500
831
+ },
832
+ {
833
+ "epoch": 0.6350798027910086,
834
+ "grad_norm": 4.97435998916626,
835
+ "learning_rate": 1.8413246818909584e-05,
836
+ "loss": 3.4814,
837
+ "step": 38000
838
+ },
839
+ {
840
+ "epoch": 0.6350798027910086,
841
+ "eval_loss": 3.3762271404266357,
842
+ "eval_runtime": 33.8323,
843
+ "eval_samples_per_second": 131.886,
844
+ "eval_steps_per_second": 16.493,
845
+ "step": 38000
846
+ },
847
+ {
848
+ "epoch": 0.6434361159856271,
849
+ "grad_norm": 4.531788349151611,
850
+ "learning_rate": 1.7992753012555827e-05,
851
+ "loss": 3.4144,
852
+ "step": 38500
853
+ },
854
+ {
855
+ "epoch": 0.6517924291802457,
856
+ "grad_norm": 5.197137355804443,
857
+ "learning_rate": 1.757141653324345e-05,
858
+ "loss": 3.4358,
859
+ "step": 39000
860
+ },
861
+ {
862
+ "epoch": 0.6517924291802457,
863
+ "eval_loss": 3.3685717582702637,
864
+ "eval_runtime": 33.7899,
865
+ "eval_samples_per_second": 132.051,
866
+ "eval_steps_per_second": 16.514,
867
+ "step": 39000
868
+ },
869
+ {
870
+ "epoch": 0.6601487423748642,
871
+ "grad_norm": 5.673033714294434,
872
+ "learning_rate": 1.715008005393107e-05,
873
+ "loss": 3.3936,
874
+ "step": 39500
875
+ },
876
+ {
877
+ "epoch": 0.6685050555694827,
878
+ "grad_norm": 4.699774742126465,
879
+ "learning_rate": 1.672874357461869e-05,
880
+ "loss": 3.4711,
881
+ "step": 40000
882
+ },
883
+ {
884
+ "epoch": 0.6685050555694827,
885
+ "eval_loss": 3.3646111488342285,
886
+ "eval_runtime": 33.8073,
887
+ "eval_samples_per_second": 131.983,
888
+ "eval_steps_per_second": 16.505,
889
+ "step": 40000
890
+ },
891
+ {
892
+ "epoch": 0.6768613687641013,
893
+ "grad_norm": 4.5016889572143555,
894
+ "learning_rate": 1.630824976826494e-05,
895
+ "loss": 3.4073,
896
+ "step": 40500
897
+ },
898
+ {
899
+ "epoch": 0.6852176819587198,
900
+ "grad_norm": 9.633309364318848,
901
+ "learning_rate": 1.5886913288952558e-05,
902
+ "loss": 3.4437,
903
+ "step": 41000
904
+ },
905
+ {
906
+ "epoch": 0.6852176819587198,
907
+ "eval_loss": 3.3601598739624023,
908
+ "eval_runtime": 33.7786,
909
+ "eval_samples_per_second": 132.096,
910
+ "eval_steps_per_second": 16.519,
911
+ "step": 41000
912
+ },
913
+ {
914
+ "epoch": 0.6935739951533384,
915
+ "grad_norm": 3.356966733932495,
916
+ "learning_rate": 1.5465576809640177e-05,
917
+ "loss": 3.3706,
918
+ "step": 41500
919
+ },
920
+ {
921
+ "epoch": 0.7019303083479569,
922
+ "grad_norm": 4.3639678955078125,
923
+ "learning_rate": 1.50442403303278e-05,
924
+ "loss": 3.4171,
925
+ "step": 42000
926
+ },
927
+ {
928
+ "epoch": 0.7019303083479569,
929
+ "eval_loss": 3.358569622039795,
930
+ "eval_runtime": 33.8063,
931
+ "eval_samples_per_second": 131.987,
932
+ "eval_steps_per_second": 16.506,
933
+ "step": 42000
934
+ },
935
+ {
936
+ "epoch": 0.7102866215425754,
937
+ "grad_norm": 6.138998985290527,
938
+ "learning_rate": 1.4623746523974047e-05,
939
+ "loss": 3.3822,
940
+ "step": 42500
941
+ },
942
+ {
943
+ "epoch": 0.718642934737194,
944
+ "grad_norm": 5.975950717926025,
945
+ "learning_rate": 1.4202410044661668e-05,
946
+ "loss": 3.442,
947
+ "step": 43000
948
+ },
949
+ {
950
+ "epoch": 0.718642934737194,
951
+ "eval_loss": 3.3521432876586914,
952
+ "eval_runtime": 33.7743,
953
+ "eval_samples_per_second": 132.112,
954
+ "eval_steps_per_second": 16.521,
955
+ "step": 43000
956
+ },
957
+ {
958
+ "epoch": 0.7269992479318125,
959
+ "grad_norm": 5.316736698150635,
960
+ "learning_rate": 1.378107356534929e-05,
961
+ "loss": 3.3837,
962
+ "step": 43500
963
+ },
964
+ {
965
+ "epoch": 0.735355561126431,
966
+ "grad_norm": 4.64693021774292,
967
+ "learning_rate": 1.335973708603691e-05,
968
+ "loss": 3.3846,
969
+ "step": 44000
970
+ },
971
+ {
972
+ "epoch": 0.735355561126431,
973
+ "eval_loss": 3.3473236560821533,
974
+ "eval_runtime": 33.8058,
975
+ "eval_samples_per_second": 131.989,
976
+ "eval_steps_per_second": 16.506,
977
+ "step": 44000
978
+ },
979
+ {
980
+ "epoch": 0.7437118743210496,
981
+ "grad_norm": 7.507499694824219,
982
+ "learning_rate": 1.293840060672453e-05,
983
+ "loss": 3.4229,
984
+ "step": 44500
985
+ },
986
+ {
987
+ "epoch": 0.7520681875156681,
988
+ "grad_norm": 4.695137023925781,
989
+ "learning_rate": 1.2517906800370777e-05,
990
+ "loss": 3.4064,
991
+ "step": 45000
992
+ },
993
+ {
994
+ "epoch": 0.7520681875156681,
995
+ "eval_loss": 3.3478496074676514,
996
+ "eval_runtime": 33.7907,
997
+ "eval_samples_per_second": 132.048,
998
+ "eval_steps_per_second": 16.513,
999
+ "step": 45000
1000
+ },
1001
+ {
1002
+ "epoch": 0.7604245007102867,
1003
+ "grad_norm": 6.782580375671387,
1004
+ "learning_rate": 1.2096570321058398e-05,
1005
+ "loss": 3.4665,
1006
+ "step": 45500
1007
+ },
1008
+ {
1009
+ "epoch": 0.7687808139049052,
1010
+ "grad_norm": 7.160044193267822,
1011
+ "learning_rate": 1.167523384174602e-05,
1012
+ "loss": 3.4181,
1013
+ "step": 46000
1014
+ },
1015
+ {
1016
+ "epoch": 0.7687808139049052,
1017
+ "eval_loss": 3.339061737060547,
1018
+ "eval_runtime": 33.8357,
1019
+ "eval_samples_per_second": 131.872,
1020
+ "eval_steps_per_second": 16.491,
1021
+ "step": 46000
1022
+ },
1023
+ {
1024
+ "epoch": 0.7771371270995237,
1025
+ "grad_norm": 6.559309482574463,
1026
+ "learning_rate": 1.1253897362433639e-05,
1027
+ "loss": 3.3691,
1028
+ "step": 46500
1029
+ },
1030
+ {
1031
+ "epoch": 0.7854934402941423,
1032
+ "grad_norm": 4.2476043701171875,
1033
+ "learning_rate": 1.0832560883121261e-05,
1034
+ "loss": 3.3825,
1035
+ "step": 47000
1036
+ },
1037
+ {
1038
+ "epoch": 0.7854934402941423,
1039
+ "eval_loss": 3.3343887329101562,
1040
+ "eval_runtime": 33.7813,
1041
+ "eval_samples_per_second": 132.085,
1042
+ "eval_steps_per_second": 16.518,
1043
+ "step": 47000
1044
+ },
1045
+ {
1046
+ "epoch": 0.7938497534887607,
1047
+ "grad_norm": 8.163984298706055,
1048
+ "learning_rate": 1.0411224403808882e-05,
1049
+ "loss": 3.4073,
1050
+ "step": 47500
1051
+ },
1052
+ {
1053
+ "epoch": 0.8022060666833793,
1054
+ "grad_norm": 4.705440998077393,
1055
+ "learning_rate": 9.989887924496504e-06,
1056
+ "loss": 3.4277,
1057
+ "step": 48000
1058
+ },
1059
+ {
1060
+ "epoch": 0.8022060666833793,
1061
+ "eval_loss": 3.334386110305786,
1062
+ "eval_runtime": 33.8549,
1063
+ "eval_samples_per_second": 131.798,
1064
+ "eval_steps_per_second": 16.482,
1065
+ "step": 48000
1066
+ },
1067
+ {
1068
+ "epoch": 0.8105623798779978,
1069
+ "grad_norm": 5.080317497253418,
1070
+ "learning_rate": 9.56939411814275e-06,
1071
+ "loss": 3.4014,
1072
+ "step": 48500
1073
+ },
1074
+ {
1075
+ "epoch": 0.8189186930726163,
1076
+ "grad_norm": 4.523305892944336,
1077
+ "learning_rate": 9.14805763883037e-06,
1078
+ "loss": 3.331,
1079
+ "step": 49000
1080
+ },
1081
+ {
1082
+ "epoch": 0.8189186930726163,
1083
+ "eval_loss": 3.33245587348938,
1084
+ "eval_runtime": 33.9406,
1085
+ "eval_samples_per_second": 131.465,
1086
+ "eval_steps_per_second": 16.44,
1087
+ "step": 49000
1088
+ },
1089
+ {
1090
+ "epoch": 0.8272750062672349,
1091
+ "grad_norm": 5.878600597381592,
1092
+ "learning_rate": 8.726721159517993e-06,
1093
+ "loss": 3.4097,
1094
+ "step": 49500
1095
+ },
1096
+ {
1097
+ "epoch": 0.8356313194618534,
1098
+ "grad_norm": 4.23352575302124,
1099
+ "learning_rate": 8.305384680205612e-06,
1100
+ "loss": 3.4073,
1101
+ "step": 50000
1102
+ },
1103
+ {
1104
+ "epoch": 0.8356313194618534,
1105
+ "eval_loss": 3.328063488006592,
1106
+ "eval_runtime": 33.9523,
1107
+ "eval_samples_per_second": 131.42,
1108
+ "eval_steps_per_second": 16.435,
1109
+ "step": 50000
1110
+ },
1111
+ {
1112
+ "epoch": 0.8439876326564719,
1113
+ "grad_norm": 6.142092227935791,
1114
+ "learning_rate": 7.884890873851858e-06,
1115
+ "loss": 3.3904,
1116
+ "step": 50500
1117
+ },
1118
+ {
1119
+ "epoch": 0.8523439458510905,
1120
+ "grad_norm": 7.910597324371338,
1121
+ "learning_rate": 7.463554394539479e-06,
1122
+ "loss": 3.3741,
1123
+ "step": 51000
1124
+ },
1125
+ {
1126
+ "epoch": 0.8523439458510905,
1127
+ "eval_loss": 3.327683448791504,
1128
+ "eval_runtime": 33.9417,
1129
+ "eval_samples_per_second": 131.461,
1130
+ "eval_steps_per_second": 16.44,
1131
+ "step": 51000
1132
+ },
1133
+ {
1134
+ "epoch": 0.860700259045709,
1135
+ "grad_norm": 4.868673324584961,
1136
+ "learning_rate": 7.0422179152271005e-06,
1137
+ "loss": 3.3615,
1138
+ "step": 51500
1139
+ },
1140
+ {
1141
+ "epoch": 0.8690565722403276,
1142
+ "grad_norm": 4.47282075881958,
1143
+ "learning_rate": 6.620881435914722e-06,
1144
+ "loss": 3.3467,
1145
+ "step": 52000
1146
+ },
1147
+ {
1148
+ "epoch": 0.8690565722403276,
1149
+ "eval_loss": 3.3206417560577393,
1150
+ "eval_runtime": 33.9544,
1151
+ "eval_samples_per_second": 131.412,
1152
+ "eval_steps_per_second": 16.434,
1153
+ "step": 52000
1154
+ },
1155
+ {
1156
+ "epoch": 0.8774128854349461,
1157
+ "grad_norm": 5.413076877593994,
1158
+ "learning_rate": 6.200387629560968e-06,
1159
+ "loss": 3.3221,
1160
+ "step": 52500
1161
+ },
1162
+ {
1163
+ "epoch": 0.8857691986295646,
1164
+ "grad_norm": 5.672983169555664,
1165
+ "learning_rate": 5.779051150248588e-06,
1166
+ "loss": 3.418,
1167
+ "step": 53000
1168
+ },
1169
+ {
1170
+ "epoch": 0.8857691986295646,
1171
+ "eval_loss": 3.320605516433716,
1172
+ "eval_runtime": 33.884,
1173
+ "eval_samples_per_second": 131.684,
1174
+ "eval_steps_per_second": 16.468,
1175
+ "step": 53000
1176
+ },
1177
+ {
1178
+ "epoch": 0.8941255118241832,
1179
+ "grad_norm": 8.368507385253906,
1180
+ "learning_rate": 5.35771467093621e-06,
1181
+ "loss": 3.3747,
1182
+ "step": 53500
1183
+ },
1184
+ {
1185
+ "epoch": 0.9024818250188017,
1186
+ "grad_norm": 6.918625831604004,
1187
+ "learning_rate": 4.936378191623831e-06,
1188
+ "loss": 3.416,
1189
+ "step": 54000
1190
+ },
1191
+ {
1192
+ "epoch": 0.9024818250188017,
1193
+ "eval_loss": 3.317139148712158,
1194
+ "eval_runtime": 33.9331,
1195
+ "eval_samples_per_second": 131.494,
1196
+ "eval_steps_per_second": 16.444,
1197
+ "step": 54000
1198
+ },
1199
+ {
1200
+ "epoch": 0.9108381382134202,
1201
+ "grad_norm": 6.566315174102783,
1202
+ "learning_rate": 4.515884385270077e-06,
1203
+ "loss": 3.3985,
1204
+ "step": 54500
1205
+ },
1206
+ {
1207
+ "epoch": 0.9191944514080388,
1208
+ "grad_norm": 4.790822982788086,
1209
+ "learning_rate": 4.094547905957698e-06,
1210
+ "loss": 3.3607,
1211
+ "step": 55000
1212
+ },
1213
+ {
1214
+ "epoch": 0.9191944514080388,
1215
+ "eval_loss": 3.3170626163482666,
1216
+ "eval_runtime": 33.9426,
1217
+ "eval_samples_per_second": 131.457,
1218
+ "eval_steps_per_second": 16.439,
1219
+ "step": 55000
1220
+ },
1221
+ {
1222
+ "epoch": 0.9275507646026573,
1223
+ "grad_norm": 5.501828193664551,
1224
+ "learning_rate": 3.6732114266453192e-06,
1225
+ "loss": 3.3586,
1226
+ "step": 55500
1227
+ },
1228
+ {
1229
+ "epoch": 0.9359070777972759,
1230
+ "grad_norm": 8.011107444763184,
1231
+ "learning_rate": 3.2518749473329403e-06,
1232
+ "loss": 3.4076,
1233
+ "step": 56000
1234
+ },
1235
+ {
1236
+ "epoch": 0.9359070777972759,
1237
+ "eval_loss": 3.3138890266418457,
1238
+ "eval_runtime": 33.9309,
1239
+ "eval_samples_per_second": 131.503,
1240
+ "eval_steps_per_second": 16.445,
1241
+ "step": 56000
1242
+ },
1243
+ {
1244
+ "epoch": 0.9442633909918944,
1245
+ "grad_norm": 5.630584239959717,
1246
+ "learning_rate": 2.8305384680205613e-06,
1247
+ "loss": 3.4309,
1248
+ "step": 56500
1249
+ },
1250
+ {
1251
+ "epoch": 0.9526197041865129,
1252
+ "grad_norm": 3.990445137023926,
1253
+ "learning_rate": 2.4092019887081824e-06,
1254
+ "loss": 3.3257,
1255
+ "step": 57000
1256
+ },
1257
+ {
1258
+ "epoch": 0.9526197041865129,
1259
+ "eval_loss": 3.3135273456573486,
1260
+ "eval_runtime": 33.8948,
1261
+ "eval_samples_per_second": 131.643,
1262
+ "eval_steps_per_second": 16.463,
1263
+ "step": 57000
1264
+ },
1265
+ {
1266
+ "epoch": 0.9609760173811315,
1267
+ "grad_norm": 4.2760396003723145,
1268
+ "learning_rate": 1.9878655093958034e-06,
1269
+ "loss": 3.3362,
1270
+ "step": 57500
1271
+ },
1272
+ {
1273
+ "epoch": 0.96933233057575,
1274
+ "grad_norm": 4.5603532791137695,
1275
+ "learning_rate": 1.5665290300834245e-06,
1276
+ "loss": 3.3657,
1277
+ "step": 58000
1278
+ },
1279
+ {
1280
+ "epoch": 0.96933233057575,
1281
+ "eval_loss": 3.3120391368865967,
1282
+ "eval_runtime": 33.957,
1283
+ "eval_samples_per_second": 131.401,
1284
+ "eval_steps_per_second": 16.433,
1285
+ "step": 58000
1286
+ },
1287
+ {
1288
+ "epoch": 0.9776886437703685,
1289
+ "grad_norm": 3.6152896881103516,
1290
+ "learning_rate": 1.1460352237296705e-06,
1291
+ "loss": 3.3847,
1292
+ "step": 58500
1293
+ },
1294
+ {
1295
+ "epoch": 0.9860449569649871,
1296
+ "grad_norm": 5.7242751121521,
1297
+ "learning_rate": 7.246987444172916e-07,
1298
+ "loss": 3.3509,
1299
+ "step": 59000
1300
+ },
1301
+ {
1302
+ "epoch": 0.9860449569649871,
1303
+ "eval_loss": 3.3113863468170166,
1304
+ "eval_runtime": 33.9749,
1305
+ "eval_samples_per_second": 131.332,
1306
+ "eval_steps_per_second": 16.424,
1307
+ "step": 59000
1308
+ },
1309
+ {
1310
+ "epoch": 0.9944012701596056,
1311
+ "grad_norm": 8.690220832824707,
1312
+ "learning_rate": 3.033622651049128e-07,
1313
+ "loss": 3.3511,
1314
+ "step": 59500
1315
+ }
1316
+ ],
1317
+ "logging_steps": 500,
1318
+ "max_steps": 59835,
1319
+ "num_input_tokens_seen": 0,
1320
+ "num_train_epochs": 1,
1321
+ "save_steps": 2000,
1322
+ "stateful_callbacks": {
1323
+ "TrainerControl": {
1324
+ "args": {
1325
+ "should_epoch_stop": false,
1326
+ "should_evaluate": false,
1327
+ "should_log": false,
1328
+ "should_save": true,
1329
+ "should_training_stop": true
1330
+ },
1331
+ "attributes": {}
1332
+ }
1333
+ },
1334
+ "total_flos": 2.50149494587392e+17,
1335
+ "train_batch_size": 8,
1336
+ "trial_name": null,
1337
+ "trial_params": null
1338
+ }
full-finetuning/german-gpt2/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2630695e31d01dc3422c408e2b55254fe3e00fac7a8d16574c59ef8d085fdefc
3
+ size 5304
instruct-finetuning/base/LLaMmlein_120M/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: LSX-UniWue/LLaMmlein_120M
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
instruct-finetuning/base/LLaMmlein_120M/adapter_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "LSX-UniWue/LLaMmlein_120M",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 4,
14
+ "lora_dropout": 0.1,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 32,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "up_proj",
24
+ "gate_proj",
25
+ "o_proj",
26
+ "v_proj",
27
+ "q_proj",
28
+ "k_proj",
29
+ "lm_head",
30
+ "down_proj"
31
+ ],
32
+ "task_type": "CAUSAL_LM",
33
+ "use_dora": false,
34
+ "use_rslora": false
35
+ }
instruct-finetuning/base/LLaMmlein_120M/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a839a6e5514599a2a43cc1f29f435b0f9a726019a5cdec1abd0a91a44ef66eb8
3
+ size 123361608
instruct-finetuning/base/LLaMmlein_120M/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f506a64c04df6456eb1373f4f11de4e0552f2cd77656720971d58b429541a8
3
+ size 850871994
instruct-finetuning/base/LLaMmlein_120M/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e42ccd97e89dcb0ecb4e53ebfd9919fde6e4d4c5c0a12a689387afb57967751d
3
+ size 14244
instruct-finetuning/base/LLaMmlein_120M/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0af176d761d71fce3fbce7001f4850782b022af8f40338e8e88b22363a32018f
3
+ size 1064
instruct-finetuning/base/LLaMmlein_120M/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
instruct-finetuning/base/LLaMmlein_120M/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
instruct-finetuning/base/LLaMmlein_120M/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "left",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": true
43
+ }
instruct-finetuning/base/LLaMmlein_120M/trainer_state.json ADDED
@@ -0,0 +1,1593 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 6.343393325805664,
3
+ "best_model_checkpoint": "./models/instruct-finetuning/base/LLaMmlein_120M/checkpoint-100",
4
+ "epoch": 2000.0,
5
+ "eval_steps": 100,
6
+ "global_step": 2000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 10.0,
13
+ "grad_norm": 7.257441997528076,
14
+ "learning_rate": 2.9999999999999997e-05,
15
+ "loss": 4.0845,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 20.0,
20
+ "grad_norm": 6.133562088012695,
21
+ "learning_rate": 5.9999999999999995e-05,
22
+ "loss": 1.8532,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 30.0,
27
+ "grad_norm": 2.961812734603882,
28
+ "learning_rate": 8.999999999999999e-05,
29
+ "loss": 0.3387,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 40.0,
34
+ "grad_norm": 1.0544193983078003,
35
+ "learning_rate": 0.00011999999999999999,
36
+ "loss": 0.0728,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 50.0,
41
+ "grad_norm": 0.11237581074237823,
42
+ "learning_rate": 0.00015,
43
+ "loss": 0.0122,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 60.0,
48
+ "grad_norm": 0.04610421508550644,
49
+ "learning_rate": 0.00017999999999999998,
50
+ "loss": 0.0066,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 70.0,
55
+ "grad_norm": 0.031031867489218712,
56
+ "learning_rate": 0.00020999999999999998,
57
+ "loss": 0.0057,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 80.0,
62
+ "grad_norm": 0.017072567716240883,
63
+ "learning_rate": 0.00023999999999999998,
64
+ "loss": 0.0055,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 90.0,
69
+ "grad_norm": 0.00996345654129982,
70
+ "learning_rate": 0.00027,
71
+ "loss": 0.0055,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 100.0,
76
+ "grad_norm": 0.009029845707118511,
77
+ "learning_rate": 0.0003,
78
+ "loss": 0.0055,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 100.0,
83
+ "eval_loss": 6.343393325805664,
84
+ "eval_runtime": 0.0563,
85
+ "eval_samples_per_second": 283.996,
86
+ "eval_steps_per_second": 17.75,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 110.0,
91
+ "grad_norm": 0.0047378516755998135,
92
+ "learning_rate": 0.00029842105263157894,
93
+ "loss": 0.0055,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 120.0,
98
+ "grad_norm": 0.003162564244121313,
99
+ "learning_rate": 0.00029684210526315785,
100
+ "loss": 0.0055,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 130.0,
105
+ "grad_norm": 0.009896782226860523,
106
+ "learning_rate": 0.0002952631578947368,
107
+ "loss": 0.0054,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 140.0,
112
+ "grad_norm": 0.005971991922706366,
113
+ "learning_rate": 0.0002936842105263158,
114
+ "loss": 0.0055,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 150.0,
119
+ "grad_norm": 0.006593422032892704,
120
+ "learning_rate": 0.0002921052631578947,
121
+ "loss": 0.0054,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 160.0,
126
+ "grad_norm": 0.01387177873402834,
127
+ "learning_rate": 0.00029052631578947366,
128
+ "loss": 0.0054,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 170.0,
133
+ "grad_norm": 0.01074151135981083,
134
+ "learning_rate": 0.00028894736842105263,
135
+ "loss": 0.0055,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 180.0,
140
+ "grad_norm": 0.004622858017683029,
141
+ "learning_rate": 0.00028736842105263154,
142
+ "loss": 0.0054,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 190.0,
147
+ "grad_norm": 0.0028191779274493456,
148
+ "learning_rate": 0.0002857894736842105,
149
+ "loss": 0.0054,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 200.0,
154
+ "grad_norm": 0.007395514752715826,
155
+ "learning_rate": 0.0002842105263157894,
156
+ "loss": 0.0054,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 200.0,
161
+ "eval_loss": 6.482385158538818,
162
+ "eval_runtime": 0.0567,
163
+ "eval_samples_per_second": 282.327,
164
+ "eval_steps_per_second": 17.645,
165
+ "step": 200
166
+ },
167
+ {
168
+ "epoch": 210.0,
169
+ "grad_norm": 0.002630041679367423,
170
+ "learning_rate": 0.0002826315789473684,
171
+ "loss": 0.0054,
172
+ "step": 210
173
+ },
174
+ {
175
+ "epoch": 220.0,
176
+ "grad_norm": 0.009388357400894165,
177
+ "learning_rate": 0.00028105263157894735,
178
+ "loss": 0.0054,
179
+ "step": 220
180
+ },
181
+ {
182
+ "epoch": 230.0,
183
+ "grad_norm": 0.012319635599851608,
184
+ "learning_rate": 0.0002794736842105263,
185
+ "loss": 0.0054,
186
+ "step": 230
187
+ },
188
+ {
189
+ "epoch": 240.0,
190
+ "grad_norm": 0.008075610734522343,
191
+ "learning_rate": 0.00027789473684210523,
192
+ "loss": 0.0054,
193
+ "step": 240
194
+ },
195
+ {
196
+ "epoch": 250.0,
197
+ "grad_norm": 0.0070420210249722,
198
+ "learning_rate": 0.0002763157894736842,
199
+ "loss": 0.0054,
200
+ "step": 250
201
+ },
202
+ {
203
+ "epoch": 260.0,
204
+ "grad_norm": 0.0077827088534832,
205
+ "learning_rate": 0.0002747368421052631,
206
+ "loss": 0.0054,
207
+ "step": 260
208
+ },
209
+ {
210
+ "epoch": 270.0,
211
+ "grad_norm": 0.009190627373754978,
212
+ "learning_rate": 0.00027315789473684207,
213
+ "loss": 0.0054,
214
+ "step": 270
215
+ },
216
+ {
217
+ "epoch": 280.0,
218
+ "grad_norm": 0.006282865069806576,
219
+ "learning_rate": 0.00027157894736842104,
220
+ "loss": 0.0054,
221
+ "step": 280
222
+ },
223
+ {
224
+ "epoch": 290.0,
225
+ "grad_norm": 0.00803539901971817,
226
+ "learning_rate": 0.00027,
227
+ "loss": 0.0054,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 300.0,
232
+ "grad_norm": 0.005915256217122078,
233
+ "learning_rate": 0.0002684210526315789,
234
+ "loss": 0.0054,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 300.0,
239
+ "eval_loss": 6.544665336608887,
240
+ "eval_runtime": 0.0575,
241
+ "eval_samples_per_second": 278.46,
242
+ "eval_steps_per_second": 17.404,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 310.0,
247
+ "grad_norm": 0.00953945517539978,
248
+ "learning_rate": 0.0002668421052631579,
249
+ "loss": 0.0054,
250
+ "step": 310
251
+ },
252
+ {
253
+ "epoch": 320.0,
254
+ "grad_norm": 0.0184449665248394,
255
+ "learning_rate": 0.0002652631578947368,
256
+ "loss": 0.0054,
257
+ "step": 320
258
+ },
259
+ {
260
+ "epoch": 330.0,
261
+ "grad_norm": 0.009298360906541348,
262
+ "learning_rate": 0.00026368421052631576,
263
+ "loss": 0.0054,
264
+ "step": 330
265
+ },
266
+ {
267
+ "epoch": 340.0,
268
+ "grad_norm": 0.007060893811285496,
269
+ "learning_rate": 0.0002621052631578947,
270
+ "loss": 0.0054,
271
+ "step": 340
272
+ },
273
+ {
274
+ "epoch": 350.0,
275
+ "grad_norm": 0.005190024618059397,
276
+ "learning_rate": 0.0002605263157894737,
277
+ "loss": 0.0054,
278
+ "step": 350
279
+ },
280
+ {
281
+ "epoch": 360.0,
282
+ "grad_norm": 0.007410724181681871,
283
+ "learning_rate": 0.0002589473684210526,
284
+ "loss": 0.0054,
285
+ "step": 360
286
+ },
287
+ {
288
+ "epoch": 370.0,
289
+ "grad_norm": 0.0066852509044110775,
290
+ "learning_rate": 0.00025736842105263157,
291
+ "loss": 0.0054,
292
+ "step": 370
293
+ },
294
+ {
295
+ "epoch": 380.0,
296
+ "grad_norm": 0.00933538842946291,
297
+ "learning_rate": 0.0002557894736842105,
298
+ "loss": 0.0054,
299
+ "step": 380
300
+ },
301
+ {
302
+ "epoch": 390.0,
303
+ "grad_norm": 0.006421348080039024,
304
+ "learning_rate": 0.00025421052631578945,
305
+ "loss": 0.0054,
306
+ "step": 390
307
+ },
308
+ {
309
+ "epoch": 400.0,
310
+ "grad_norm": 0.013186627998948097,
311
+ "learning_rate": 0.00025263157894736836,
312
+ "loss": 0.0054,
313
+ "step": 400
314
+ },
315
+ {
316
+ "epoch": 400.0,
317
+ "eval_loss": 6.592121601104736,
318
+ "eval_runtime": 0.0566,
319
+ "eval_samples_per_second": 282.605,
320
+ "eval_steps_per_second": 17.663,
321
+ "step": 400
322
+ },
323
+ {
324
+ "epoch": 410.0,
325
+ "grad_norm": 0.008023693226277828,
326
+ "learning_rate": 0.0002510526315789474,
327
+ "loss": 0.0054,
328
+ "step": 410
329
+ },
330
+ {
331
+ "epoch": 420.0,
332
+ "grad_norm": 0.006304403301328421,
333
+ "learning_rate": 0.0002494736842105263,
334
+ "loss": 0.0054,
335
+ "step": 420
336
+ },
337
+ {
338
+ "epoch": 430.0,
339
+ "grad_norm": 0.006279406137764454,
340
+ "learning_rate": 0.00024789473684210526,
341
+ "loss": 0.0054,
342
+ "step": 430
343
+ },
344
+ {
345
+ "epoch": 440.0,
346
+ "grad_norm": 0.005663767457008362,
347
+ "learning_rate": 0.00024631578947368417,
348
+ "loss": 0.0054,
349
+ "step": 440
350
+ },
351
+ {
352
+ "epoch": 450.0,
353
+ "grad_norm": 0.007969029247760773,
354
+ "learning_rate": 0.00024473684210526314,
355
+ "loss": 0.0054,
356
+ "step": 450
357
+ },
358
+ {
359
+ "epoch": 460.0,
360
+ "grad_norm": 0.004170434549450874,
361
+ "learning_rate": 0.00024315789473684207,
362
+ "loss": 0.0054,
363
+ "step": 460
364
+ },
365
+ {
366
+ "epoch": 470.0,
367
+ "grad_norm": 0.007959424518048763,
368
+ "learning_rate": 0.000241578947368421,
369
+ "loss": 0.0054,
370
+ "step": 470
371
+ },
372
+ {
373
+ "epoch": 480.0,
374
+ "grad_norm": 0.007390080485492945,
375
+ "learning_rate": 0.00023999999999999998,
376
+ "loss": 0.0054,
377
+ "step": 480
378
+ },
379
+ {
380
+ "epoch": 490.0,
381
+ "grad_norm": 0.006555990315973759,
382
+ "learning_rate": 0.00023842105263157895,
383
+ "loss": 0.0054,
384
+ "step": 490
385
+ },
386
+ {
387
+ "epoch": 500.0,
388
+ "grad_norm": 0.007358189672231674,
389
+ "learning_rate": 0.00023684210526315788,
390
+ "loss": 0.0054,
391
+ "step": 500
392
+ },
393
+ {
394
+ "epoch": 500.0,
395
+ "eval_loss": 6.633509159088135,
396
+ "eval_runtime": 0.0562,
397
+ "eval_samples_per_second": 284.773,
398
+ "eval_steps_per_second": 17.798,
399
+ "step": 500
400
+ },
401
+ {
402
+ "epoch": 510.0,
403
+ "grad_norm": 0.008251392282545567,
404
+ "learning_rate": 0.00023526315789473682,
405
+ "loss": 0.0054,
406
+ "step": 510
407
+ },
408
+ {
409
+ "epoch": 520.0,
410
+ "grad_norm": 0.0032486410345882177,
411
+ "learning_rate": 0.00023368421052631576,
412
+ "loss": 0.0054,
413
+ "step": 520
414
+ },
415
+ {
416
+ "epoch": 530.0,
417
+ "grad_norm": 0.01079108752310276,
418
+ "learning_rate": 0.0002321052631578947,
419
+ "loss": 0.0054,
420
+ "step": 530
421
+ },
422
+ {
423
+ "epoch": 540.0,
424
+ "grad_norm": 0.006488645449280739,
425
+ "learning_rate": 0.00023052631578947364,
426
+ "loss": 0.0054,
427
+ "step": 540
428
+ },
429
+ {
430
+ "epoch": 550.0,
431
+ "grad_norm": 0.006889748852699995,
432
+ "learning_rate": 0.00022894736842105263,
433
+ "loss": 0.0054,
434
+ "step": 550
435
+ },
436
+ {
437
+ "epoch": 560.0,
438
+ "grad_norm": 0.005966802127659321,
439
+ "learning_rate": 0.00022736842105263157,
440
+ "loss": 0.0054,
441
+ "step": 560
442
+ },
443
+ {
444
+ "epoch": 570.0,
445
+ "grad_norm": 0.00871422328054905,
446
+ "learning_rate": 0.0002257894736842105,
447
+ "loss": 0.0054,
448
+ "step": 570
449
+ },
450
+ {
451
+ "epoch": 580.0,
452
+ "grad_norm": 0.007486347574740648,
453
+ "learning_rate": 0.00022421052631578945,
454
+ "loss": 0.0054,
455
+ "step": 580
456
+ },
457
+ {
458
+ "epoch": 590.0,
459
+ "grad_norm": 0.006703991908580065,
460
+ "learning_rate": 0.0002226315789473684,
461
+ "loss": 0.0054,
462
+ "step": 590
463
+ },
464
+ {
465
+ "epoch": 600.0,
466
+ "grad_norm": 0.0069758091121912,
467
+ "learning_rate": 0.00022105263157894733,
468
+ "loss": 0.0054,
469
+ "step": 600
470
+ },
471
+ {
472
+ "epoch": 600.0,
473
+ "eval_loss": 6.659405708312988,
474
+ "eval_runtime": 0.0561,
475
+ "eval_samples_per_second": 284.975,
476
+ "eval_steps_per_second": 17.811,
477
+ "step": 600
478
+ },
479
+ {
480
+ "epoch": 610.0,
481
+ "grad_norm": 0.006505531724542379,
482
+ "learning_rate": 0.00021947368421052632,
483
+ "loss": 0.0054,
484
+ "step": 610
485
+ },
486
+ {
487
+ "epoch": 620.0,
488
+ "grad_norm": 0.006151702255010605,
489
+ "learning_rate": 0.00021789473684210526,
490
+ "loss": 0.0054,
491
+ "step": 620
492
+ },
493
+ {
494
+ "epoch": 630.0,
495
+ "grad_norm": 0.006619542371481657,
496
+ "learning_rate": 0.0002163157894736842,
497
+ "loss": 0.0054,
498
+ "step": 630
499
+ },
500
+ {
501
+ "epoch": 640.0,
502
+ "grad_norm": 0.0047877514734864235,
503
+ "learning_rate": 0.00021473684210526314,
504
+ "loss": 0.0054,
505
+ "step": 640
506
+ },
507
+ {
508
+ "epoch": 650.0,
509
+ "grad_norm": 0.005860659293830395,
510
+ "learning_rate": 0.00021315789473684208,
511
+ "loss": 0.0054,
512
+ "step": 650
513
+ },
514
+ {
515
+ "epoch": 660.0,
516
+ "grad_norm": 0.0046273041516542435,
517
+ "learning_rate": 0.00021157894736842102,
518
+ "loss": 0.0054,
519
+ "step": 660
520
+ },
521
+ {
522
+ "epoch": 670.0,
523
+ "grad_norm": 0.00666681258007884,
524
+ "learning_rate": 0.00020999999999999998,
525
+ "loss": 0.0054,
526
+ "step": 670
527
+ },
528
+ {
529
+ "epoch": 680.0,
530
+ "grad_norm": 0.0070931087248027325,
531
+ "learning_rate": 0.00020842105263157895,
532
+ "loss": 0.0054,
533
+ "step": 680
534
+ },
535
+ {
536
+ "epoch": 690.0,
537
+ "grad_norm": 0.005947079975157976,
538
+ "learning_rate": 0.0002068421052631579,
539
+ "loss": 0.0054,
540
+ "step": 690
541
+ },
542
+ {
543
+ "epoch": 700.0,
544
+ "grad_norm": 0.007008504122495651,
545
+ "learning_rate": 0.00020526315789473683,
546
+ "loss": 0.0054,
547
+ "step": 700
548
+ },
549
+ {
550
+ "epoch": 700.0,
551
+ "eval_loss": 6.695831775665283,
552
+ "eval_runtime": 0.0572,
553
+ "eval_samples_per_second": 279.963,
554
+ "eval_steps_per_second": 17.498,
555
+ "step": 700
556
+ },
557
+ {
558
+ "epoch": 710.0,
559
+ "grad_norm": 0.00678026070818305,
560
+ "learning_rate": 0.00020368421052631576,
561
+ "loss": 0.0054,
562
+ "step": 710
563
+ },
564
+ {
565
+ "epoch": 720.0,
566
+ "grad_norm": 0.005221761297434568,
567
+ "learning_rate": 0.0002021052631578947,
568
+ "loss": 0.0054,
569
+ "step": 720
570
+ },
571
+ {
572
+ "epoch": 730.0,
573
+ "grad_norm": 0.005231580231338739,
574
+ "learning_rate": 0.00020052631578947367,
575
+ "loss": 0.0054,
576
+ "step": 730
577
+ },
578
+ {
579
+ "epoch": 740.0,
580
+ "grad_norm": 0.006937530357390642,
581
+ "learning_rate": 0.0001989473684210526,
582
+ "loss": 0.0054,
583
+ "step": 740
584
+ },
585
+ {
586
+ "epoch": 750.0,
587
+ "grad_norm": 0.007865114137530327,
588
+ "learning_rate": 0.00019736842105263157,
589
+ "loss": 0.0054,
590
+ "step": 750
591
+ },
592
+ {
593
+ "epoch": 760.0,
594
+ "grad_norm": 0.014306128025054932,
595
+ "learning_rate": 0.0001957894736842105,
596
+ "loss": 0.0054,
597
+ "step": 760
598
+ },
599
+ {
600
+ "epoch": 770.0,
601
+ "grad_norm": 0.006088100839406252,
602
+ "learning_rate": 0.00019421052631578945,
603
+ "loss": 0.0054,
604
+ "step": 770
605
+ },
606
+ {
607
+ "epoch": 780.0,
608
+ "grad_norm": 0.004863563925027847,
609
+ "learning_rate": 0.0001926315789473684,
610
+ "loss": 0.0054,
611
+ "step": 780
612
+ },
613
+ {
614
+ "epoch": 790.0,
615
+ "grad_norm": 0.005216915160417557,
616
+ "learning_rate": 0.00019105263157894736,
617
+ "loss": 0.0054,
618
+ "step": 790
619
+ },
620
+ {
621
+ "epoch": 800.0,
622
+ "grad_norm": 0.0025104102678596973,
623
+ "learning_rate": 0.0001894736842105263,
624
+ "loss": 0.0054,
625
+ "step": 800
626
+ },
627
+ {
628
+ "epoch": 800.0,
629
+ "eval_loss": 6.713556289672852,
630
+ "eval_runtime": 0.0569,
631
+ "eval_samples_per_second": 281.395,
632
+ "eval_steps_per_second": 17.587,
633
+ "step": 800
634
+ },
635
+ {
636
+ "epoch": 810.0,
637
+ "grad_norm": 0.004186335019767284,
638
+ "learning_rate": 0.00018789473684210524,
639
+ "loss": 0.0054,
640
+ "step": 810
641
+ },
642
+ {
643
+ "epoch": 820.0,
644
+ "grad_norm": 0.005255494732409716,
645
+ "learning_rate": 0.0001863157894736842,
646
+ "loss": 0.0054,
647
+ "step": 820
648
+ },
649
+ {
650
+ "epoch": 830.0,
651
+ "grad_norm": 0.005517592187970877,
652
+ "learning_rate": 0.00018473684210526314,
653
+ "loss": 0.0054,
654
+ "step": 830
655
+ },
656
+ {
657
+ "epoch": 840.0,
658
+ "grad_norm": 0.0052512213587760925,
659
+ "learning_rate": 0.00018315789473684208,
660
+ "loss": 0.0054,
661
+ "step": 840
662
+ },
663
+ {
664
+ "epoch": 850.0,
665
+ "grad_norm": 0.005355685483664274,
666
+ "learning_rate": 0.00018157894736842105,
667
+ "loss": 0.0054,
668
+ "step": 850
669
+ },
670
+ {
671
+ "epoch": 860.0,
672
+ "grad_norm": 0.0037962698843330145,
673
+ "learning_rate": 0.00017999999999999998,
674
+ "loss": 0.0054,
675
+ "step": 860
676
+ },
677
+ {
678
+ "epoch": 870.0,
679
+ "grad_norm": 0.0035633454099297523,
680
+ "learning_rate": 0.00017842105263157892,
681
+ "loss": 0.0054,
682
+ "step": 870
683
+ },
684
+ {
685
+ "epoch": 880.0,
686
+ "grad_norm": 0.00413065729662776,
687
+ "learning_rate": 0.00017684210526315786,
688
+ "loss": 0.0054,
689
+ "step": 880
690
+ },
691
+ {
692
+ "epoch": 890.0,
693
+ "grad_norm": 0.0030881876591593027,
694
+ "learning_rate": 0.00017526315789473683,
695
+ "loss": 0.0054,
696
+ "step": 890
697
+ },
698
+ {
699
+ "epoch": 900.0,
700
+ "grad_norm": 0.005164352711290121,
701
+ "learning_rate": 0.0001736842105263158,
702
+ "loss": 0.0054,
703
+ "step": 900
704
+ },
705
+ {
706
+ "epoch": 900.0,
707
+ "eval_loss": 6.732208251953125,
708
+ "eval_runtime": 0.0561,
709
+ "eval_samples_per_second": 285.23,
710
+ "eval_steps_per_second": 17.827,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 910.0,
715
+ "grad_norm": 0.0037032829131931067,
716
+ "learning_rate": 0.00017210526315789473,
717
+ "loss": 0.0054,
718
+ "step": 910
719
+ },
720
+ {
721
+ "epoch": 920.0,
722
+ "grad_norm": 0.009146653115749359,
723
+ "learning_rate": 0.00017052631578947367,
724
+ "loss": 0.0054,
725
+ "step": 920
726
+ },
727
+ {
728
+ "epoch": 930.0,
729
+ "grad_norm": 0.004405967425554991,
730
+ "learning_rate": 0.0001689473684210526,
731
+ "loss": 0.0054,
732
+ "step": 930
733
+ },
734
+ {
735
+ "epoch": 940.0,
736
+ "grad_norm": 0.004398548975586891,
737
+ "learning_rate": 0.00016736842105263155,
738
+ "loss": 0.0054,
739
+ "step": 940
740
+ },
741
+ {
742
+ "epoch": 950.0,
743
+ "grad_norm": 0.004243563394993544,
744
+ "learning_rate": 0.00016578947368421052,
745
+ "loss": 0.0054,
746
+ "step": 950
747
+ },
748
+ {
749
+ "epoch": 960.0,
750
+ "grad_norm": 0.004099991172552109,
751
+ "learning_rate": 0.00016421052631578948,
752
+ "loss": 0.0054,
753
+ "step": 960
754
+ },
755
+ {
756
+ "epoch": 970.0,
757
+ "grad_norm": 0.004985535517334938,
758
+ "learning_rate": 0.00016263157894736842,
759
+ "loss": 0.0054,
760
+ "step": 970
761
+ },
762
+ {
763
+ "epoch": 980.0,
764
+ "grad_norm": 0.005821101367473602,
765
+ "learning_rate": 0.00016105263157894736,
766
+ "loss": 0.0054,
767
+ "step": 980
768
+ },
769
+ {
770
+ "epoch": 990.0,
771
+ "grad_norm": 0.006319540087133646,
772
+ "learning_rate": 0.0001594736842105263,
773
+ "loss": 0.0054,
774
+ "step": 990
775
+ },
776
+ {
777
+ "epoch": 1000.0,
778
+ "grad_norm": 0.004705901723355055,
779
+ "learning_rate": 0.00015789473684210524,
780
+ "loss": 0.0054,
781
+ "step": 1000
782
+ },
783
+ {
784
+ "epoch": 1000.0,
785
+ "eval_loss": 6.75187349319458,
786
+ "eval_runtime": 0.0567,
787
+ "eval_samples_per_second": 282.37,
788
+ "eval_steps_per_second": 17.648,
789
+ "step": 1000
790
+ },
791
+ {
792
+ "epoch": 1010.0,
793
+ "grad_norm": 0.0037577070761471987,
794
+ "learning_rate": 0.00015631578947368418,
795
+ "loss": 0.0054,
796
+ "step": 1010
797
+ },
798
+ {
799
+ "epoch": 1020.0,
800
+ "grad_norm": 0.0032031466253101826,
801
+ "learning_rate": 0.00015473684210526317,
802
+ "loss": 0.0054,
803
+ "step": 1020
804
+ },
805
+ {
806
+ "epoch": 1030.0,
807
+ "grad_norm": 0.0021940330043435097,
808
+ "learning_rate": 0.0001531578947368421,
809
+ "loss": 0.0054,
810
+ "step": 1030
811
+ },
812
+ {
813
+ "epoch": 1040.0,
814
+ "grad_norm": 0.0029385159723460674,
815
+ "learning_rate": 0.00015157894736842105,
816
+ "loss": 0.0054,
817
+ "step": 1040
818
+ },
819
+ {
820
+ "epoch": 1050.0,
821
+ "grad_norm": 0.003867937019094825,
822
+ "learning_rate": 0.00015,
823
+ "loss": 0.0054,
824
+ "step": 1050
825
+ },
826
+ {
827
+ "epoch": 1060.0,
828
+ "grad_norm": 0.003548321081325412,
829
+ "learning_rate": 0.00014842105263157893,
830
+ "loss": 0.0054,
831
+ "step": 1060
832
+ },
833
+ {
834
+ "epoch": 1070.0,
835
+ "grad_norm": 0.004459850490093231,
836
+ "learning_rate": 0.0001468421052631579,
837
+ "loss": 0.0054,
838
+ "step": 1070
839
+ },
840
+ {
841
+ "epoch": 1080.0,
842
+ "grad_norm": 0.008197680115699768,
843
+ "learning_rate": 0.00014526315789473683,
844
+ "loss": 0.0054,
845
+ "step": 1080
846
+ },
847
+ {
848
+ "epoch": 1090.0,
849
+ "grad_norm": 0.0062789651565253735,
850
+ "learning_rate": 0.00014368421052631577,
851
+ "loss": 0.0054,
852
+ "step": 1090
853
+ },
854
+ {
855
+ "epoch": 1100.0,
856
+ "grad_norm": 0.003925560973584652,
857
+ "learning_rate": 0.0001421052631578947,
858
+ "loss": 0.0054,
859
+ "step": 1100
860
+ },
861
+ {
862
+ "epoch": 1100.0,
863
+ "eval_loss": 6.772552013397217,
864
+ "eval_runtime": 0.0757,
865
+ "eval_samples_per_second": 211.273,
866
+ "eval_steps_per_second": 13.205,
867
+ "step": 1100
868
+ },
869
+ {
870
+ "epoch": 1110.0,
871
+ "grad_norm": 0.003982131835073233,
872
+ "learning_rate": 0.00014052631578947367,
873
+ "loss": 0.0054,
874
+ "step": 1110
875
+ },
876
+ {
877
+ "epoch": 1120.0,
878
+ "grad_norm": 0.003327371319755912,
879
+ "learning_rate": 0.00013894736842105261,
880
+ "loss": 0.0054,
881
+ "step": 1120
882
+ },
883
+ {
884
+ "epoch": 1130.0,
885
+ "grad_norm": 0.0032421585638076067,
886
+ "learning_rate": 0.00013736842105263155,
887
+ "loss": 0.0054,
888
+ "step": 1130
889
+ },
890
+ {
891
+ "epoch": 1140.0,
892
+ "grad_norm": 0.004894908983260393,
893
+ "learning_rate": 0.00013578947368421052,
894
+ "loss": 0.0054,
895
+ "step": 1140
896
+ },
897
+ {
898
+ "epoch": 1150.0,
899
+ "grad_norm": 0.0032253412064164877,
900
+ "learning_rate": 0.00013421052631578946,
901
+ "loss": 0.0054,
902
+ "step": 1150
903
+ },
904
+ {
905
+ "epoch": 1160.0,
906
+ "grad_norm": 0.0022986368276178837,
907
+ "learning_rate": 0.0001326315789473684,
908
+ "loss": 0.0054,
909
+ "step": 1160
910
+ },
911
+ {
912
+ "epoch": 1170.0,
913
+ "grad_norm": 0.004993563052266836,
914
+ "learning_rate": 0.00013105263157894736,
915
+ "loss": 0.0054,
916
+ "step": 1170
917
+ },
918
+ {
919
+ "epoch": 1180.0,
920
+ "grad_norm": 0.0036353906616568565,
921
+ "learning_rate": 0.0001294736842105263,
922
+ "loss": 0.0054,
923
+ "step": 1180
924
+ },
925
+ {
926
+ "epoch": 1190.0,
927
+ "grad_norm": 0.00476466491818428,
928
+ "learning_rate": 0.00012789473684210524,
929
+ "loss": 0.0054,
930
+ "step": 1190
931
+ },
932
+ {
933
+ "epoch": 1200.0,
934
+ "grad_norm": 0.003141681896522641,
935
+ "learning_rate": 0.00012631578947368418,
936
+ "loss": 0.0054,
937
+ "step": 1200
938
+ },
939
+ {
940
+ "epoch": 1200.0,
941
+ "eval_loss": 6.784540176391602,
942
+ "eval_runtime": 0.0567,
943
+ "eval_samples_per_second": 282.119,
944
+ "eval_steps_per_second": 17.632,
945
+ "step": 1200
946
+ },
947
+ {
948
+ "epoch": 1210.0,
949
+ "grad_norm": 0.004309048876166344,
950
+ "learning_rate": 0.00012473684210526315,
951
+ "loss": 0.0054,
952
+ "step": 1210
953
+ },
954
+ {
955
+ "epoch": 1220.0,
956
+ "grad_norm": 0.004496368113905191,
957
+ "learning_rate": 0.00012315789473684208,
958
+ "loss": 0.0054,
959
+ "step": 1220
960
+ },
961
+ {
962
+ "epoch": 1230.0,
963
+ "grad_norm": 0.0035161725245416164,
964
+ "learning_rate": 0.00012157894736842104,
965
+ "loss": 0.0054,
966
+ "step": 1230
967
+ },
968
+ {
969
+ "epoch": 1240.0,
970
+ "grad_norm": 0.002308123977854848,
971
+ "learning_rate": 0.00011999999999999999,
972
+ "loss": 0.0054,
973
+ "step": 1240
974
+ },
975
+ {
976
+ "epoch": 1250.0,
977
+ "grad_norm": 0.003306175349280238,
978
+ "learning_rate": 0.00011842105263157894,
979
+ "loss": 0.0054,
980
+ "step": 1250
981
+ },
982
+ {
983
+ "epoch": 1260.0,
984
+ "grad_norm": 0.0042304969392716885,
985
+ "learning_rate": 0.00011684210526315788,
986
+ "loss": 0.0054,
987
+ "step": 1260
988
+ },
989
+ {
990
+ "epoch": 1270.0,
991
+ "grad_norm": 0.0038438441697508097,
992
+ "learning_rate": 0.00011526315789473682,
993
+ "loss": 0.0054,
994
+ "step": 1270
995
+ },
996
+ {
997
+ "epoch": 1280.0,
998
+ "grad_norm": 0.004704161547124386,
999
+ "learning_rate": 0.00011368421052631579,
1000
+ "loss": 0.0054,
1001
+ "step": 1280
1002
+ },
1003
+ {
1004
+ "epoch": 1290.0,
1005
+ "grad_norm": 0.00560784712433815,
1006
+ "learning_rate": 0.00011210526315789472,
1007
+ "loss": 0.0054,
1008
+ "step": 1290
1009
+ },
1010
+ {
1011
+ "epoch": 1300.0,
1012
+ "grad_norm": 0.003551292000338435,
1013
+ "learning_rate": 0.00011052631578947366,
1014
+ "loss": 0.0054,
1015
+ "step": 1300
1016
+ },
1017
+ {
1018
+ "epoch": 1300.0,
1019
+ "eval_loss": 6.797518730163574,
1020
+ "eval_runtime": 0.0715,
1021
+ "eval_samples_per_second": 223.82,
1022
+ "eval_steps_per_second": 13.989,
1023
+ "step": 1300
1024
+ },
1025
+ {
1026
+ "epoch": 1310.0,
1027
+ "grad_norm": 0.0029719718731939793,
1028
+ "learning_rate": 0.00010894736842105263,
1029
+ "loss": 0.0054,
1030
+ "step": 1310
1031
+ },
1032
+ {
1033
+ "epoch": 1320.0,
1034
+ "grad_norm": 0.0032612320501357317,
1035
+ "learning_rate": 0.00010736842105263157,
1036
+ "loss": 0.0054,
1037
+ "step": 1320
1038
+ },
1039
+ {
1040
+ "epoch": 1330.0,
1041
+ "grad_norm": 0.003693740116432309,
1042
+ "learning_rate": 0.00010578947368421051,
1043
+ "loss": 0.0054,
1044
+ "step": 1330
1045
+ },
1046
+ {
1047
+ "epoch": 1340.0,
1048
+ "grad_norm": 0.0029208508785814047,
1049
+ "learning_rate": 0.00010421052631578947,
1050
+ "loss": 0.0054,
1051
+ "step": 1340
1052
+ },
1053
+ {
1054
+ "epoch": 1350.0,
1055
+ "grad_norm": 0.00595273170620203,
1056
+ "learning_rate": 0.00010263157894736841,
1057
+ "loss": 0.0054,
1058
+ "step": 1350
1059
+ },
1060
+ {
1061
+ "epoch": 1360.0,
1062
+ "grad_norm": 0.003722916590049863,
1063
+ "learning_rate": 0.00010105263157894735,
1064
+ "loss": 0.0054,
1065
+ "step": 1360
1066
+ },
1067
+ {
1068
+ "epoch": 1370.0,
1069
+ "grad_norm": 0.0032132056076079607,
1070
+ "learning_rate": 9.94736842105263e-05,
1071
+ "loss": 0.0054,
1072
+ "step": 1370
1073
+ },
1074
+ {
1075
+ "epoch": 1380.0,
1076
+ "grad_norm": 0.004051781725138426,
1077
+ "learning_rate": 9.789473684210526e-05,
1078
+ "loss": 0.0054,
1079
+ "step": 1380
1080
+ },
1081
+ {
1082
+ "epoch": 1390.0,
1083
+ "grad_norm": 0.0027944352477788925,
1084
+ "learning_rate": 9.63157894736842e-05,
1085
+ "loss": 0.0054,
1086
+ "step": 1390
1087
+ },
1088
+ {
1089
+ "epoch": 1400.0,
1090
+ "grad_norm": 0.003489309921860695,
1091
+ "learning_rate": 9.473684210526315e-05,
1092
+ "loss": 0.0054,
1093
+ "step": 1400
1094
+ },
1095
+ {
1096
+ "epoch": 1400.0,
1097
+ "eval_loss": 6.807862281799316,
1098
+ "eval_runtime": 0.0784,
1099
+ "eval_samples_per_second": 204.11,
1100
+ "eval_steps_per_second": 12.757,
1101
+ "step": 1400
1102
+ },
1103
+ {
1104
+ "epoch": 1410.0,
1105
+ "grad_norm": 0.0035617060493677855,
1106
+ "learning_rate": 9.31578947368421e-05,
1107
+ "loss": 0.0054,
1108
+ "step": 1410
1109
+ },
1110
+ {
1111
+ "epoch": 1420.0,
1112
+ "grad_norm": 0.00412606680765748,
1113
+ "learning_rate": 9.157894736842104e-05,
1114
+ "loss": 0.0054,
1115
+ "step": 1420
1116
+ },
1117
+ {
1118
+ "epoch": 1430.0,
1119
+ "grad_norm": 0.0025130140129476786,
1120
+ "learning_rate": 8.999999999999999e-05,
1121
+ "loss": 0.0054,
1122
+ "step": 1430
1123
+ },
1124
+ {
1125
+ "epoch": 1440.0,
1126
+ "grad_norm": 0.003538177814334631,
1127
+ "learning_rate": 8.842105263157893e-05,
1128
+ "loss": 0.0054,
1129
+ "step": 1440
1130
+ },
1131
+ {
1132
+ "epoch": 1450.0,
1133
+ "grad_norm": 0.0038532898761332035,
1134
+ "learning_rate": 8.68421052631579e-05,
1135
+ "loss": 0.0054,
1136
+ "step": 1450
1137
+ },
1138
+ {
1139
+ "epoch": 1460.0,
1140
+ "grad_norm": 0.0030542940367013216,
1141
+ "learning_rate": 8.526315789473684e-05,
1142
+ "loss": 0.0054,
1143
+ "step": 1460
1144
+ },
1145
+ {
1146
+ "epoch": 1470.0,
1147
+ "grad_norm": 0.003488674759864807,
1148
+ "learning_rate": 8.368421052631578e-05,
1149
+ "loss": 0.0054,
1150
+ "step": 1470
1151
+ },
1152
+ {
1153
+ "epoch": 1480.0,
1154
+ "grad_norm": 0.0034734217915683985,
1155
+ "learning_rate": 8.210526315789474e-05,
1156
+ "loss": 0.0054,
1157
+ "step": 1480
1158
+ },
1159
+ {
1160
+ "epoch": 1490.0,
1161
+ "grad_norm": 0.00296577624976635,
1162
+ "learning_rate": 8.052631578947368e-05,
1163
+ "loss": 0.0054,
1164
+ "step": 1490
1165
+ },
1166
+ {
1167
+ "epoch": 1500.0,
1168
+ "grad_norm": 0.004232010338455439,
1169
+ "learning_rate": 7.894736842105262e-05,
1170
+ "loss": 0.0054,
1171
+ "step": 1500
1172
+ },
1173
+ {
1174
+ "epoch": 1500.0,
1175
+ "eval_loss": 6.816506862640381,
1176
+ "eval_runtime": 0.0566,
1177
+ "eval_samples_per_second": 282.528,
1178
+ "eval_steps_per_second": 17.658,
1179
+ "step": 1500
1180
+ },
1181
+ {
1182
+ "epoch": 1510.0,
1183
+ "grad_norm": 0.00355426874011755,
1184
+ "learning_rate": 7.736842105263159e-05,
1185
+ "loss": 0.0054,
1186
+ "step": 1510
1187
+ },
1188
+ {
1189
+ "epoch": 1520.0,
1190
+ "grad_norm": 0.0033336167689412832,
1191
+ "learning_rate": 7.578947368421052e-05,
1192
+ "loss": 0.0054,
1193
+ "step": 1520
1194
+ },
1195
+ {
1196
+ "epoch": 1530.0,
1197
+ "grad_norm": 0.0036274876911193132,
1198
+ "learning_rate": 7.421052631578946e-05,
1199
+ "loss": 0.0054,
1200
+ "step": 1530
1201
+ },
1202
+ {
1203
+ "epoch": 1540.0,
1204
+ "grad_norm": 0.004252485930919647,
1205
+ "learning_rate": 7.263157894736842e-05,
1206
+ "loss": 0.0054,
1207
+ "step": 1540
1208
+ },
1209
+ {
1210
+ "epoch": 1550.0,
1211
+ "grad_norm": 0.0027540773153305054,
1212
+ "learning_rate": 7.105263157894735e-05,
1213
+ "loss": 0.0054,
1214
+ "step": 1550
1215
+ },
1216
+ {
1217
+ "epoch": 1560.0,
1218
+ "grad_norm": 0.004094436764717102,
1219
+ "learning_rate": 6.947368421052631e-05,
1220
+ "loss": 0.0054,
1221
+ "step": 1560
1222
+ },
1223
+ {
1224
+ "epoch": 1570.0,
1225
+ "grad_norm": 0.006011336576193571,
1226
+ "learning_rate": 6.789473684210526e-05,
1227
+ "loss": 0.0054,
1228
+ "step": 1570
1229
+ },
1230
+ {
1231
+ "epoch": 1580.0,
1232
+ "grad_norm": 0.006121497601270676,
1233
+ "learning_rate": 6.63157894736842e-05,
1234
+ "loss": 0.0054,
1235
+ "step": 1580
1236
+ },
1237
+ {
1238
+ "epoch": 1590.0,
1239
+ "grad_norm": 0.00510810874402523,
1240
+ "learning_rate": 6.473684210526315e-05,
1241
+ "loss": 0.0054,
1242
+ "step": 1590
1243
+ },
1244
+ {
1245
+ "epoch": 1600.0,
1246
+ "grad_norm": 0.0036733602173626423,
1247
+ "learning_rate": 6.315789473684209e-05,
1248
+ "loss": 0.0054,
1249
+ "step": 1600
1250
+ },
1251
+ {
1252
+ "epoch": 1600.0,
1253
+ "eval_loss": 6.823525428771973,
1254
+ "eval_runtime": 0.0562,
1255
+ "eval_samples_per_second": 284.573,
1256
+ "eval_steps_per_second": 17.786,
1257
+ "step": 1600
1258
+ },
1259
+ {
1260
+ "epoch": 1610.0,
1261
+ "grad_norm": 0.0023464614059776068,
1262
+ "learning_rate": 6.157894736842104e-05,
1263
+ "loss": 0.0054,
1264
+ "step": 1610
1265
+ },
1266
+ {
1267
+ "epoch": 1620.0,
1268
+ "grad_norm": 0.003976115491241217,
1269
+ "learning_rate": 5.9999999999999995e-05,
1270
+ "loss": 0.0054,
1271
+ "step": 1620
1272
+ },
1273
+ {
1274
+ "epoch": 1630.0,
1275
+ "grad_norm": 0.002651224611327052,
1276
+ "learning_rate": 5.842105263157894e-05,
1277
+ "loss": 0.0054,
1278
+ "step": 1630
1279
+ },
1280
+ {
1281
+ "epoch": 1640.0,
1282
+ "grad_norm": 0.002472205553203821,
1283
+ "learning_rate": 5.684210526315789e-05,
1284
+ "loss": 0.0054,
1285
+ "step": 1640
1286
+ },
1287
+ {
1288
+ "epoch": 1650.0,
1289
+ "grad_norm": 0.0021227358374744654,
1290
+ "learning_rate": 5.526315789473683e-05,
1291
+ "loss": 0.0054,
1292
+ "step": 1650
1293
+ },
1294
+ {
1295
+ "epoch": 1660.0,
1296
+ "grad_norm": 0.002482566749677062,
1297
+ "learning_rate": 5.3684210526315784e-05,
1298
+ "loss": 0.0054,
1299
+ "step": 1660
1300
+ },
1301
+ {
1302
+ "epoch": 1670.0,
1303
+ "grad_norm": 0.00298318755812943,
1304
+ "learning_rate": 5.210526315789474e-05,
1305
+ "loss": 0.0054,
1306
+ "step": 1670
1307
+ },
1308
+ {
1309
+ "epoch": 1680.0,
1310
+ "grad_norm": 0.002869119867682457,
1311
+ "learning_rate": 5.0526315789473676e-05,
1312
+ "loss": 0.0054,
1313
+ "step": 1680
1314
+ },
1315
+ {
1316
+ "epoch": 1690.0,
1317
+ "grad_norm": 0.0015972091350704432,
1318
+ "learning_rate": 4.894736842105263e-05,
1319
+ "loss": 0.0054,
1320
+ "step": 1690
1321
+ },
1322
+ {
1323
+ "epoch": 1700.0,
1324
+ "grad_norm": 0.0019471285631880164,
1325
+ "learning_rate": 4.7368421052631574e-05,
1326
+ "loss": 0.0054,
1327
+ "step": 1700
1328
+ },
1329
+ {
1330
+ "epoch": 1700.0,
1331
+ "eval_loss": 6.832122802734375,
1332
+ "eval_runtime": 0.0565,
1333
+ "eval_samples_per_second": 283.107,
1334
+ "eval_steps_per_second": 17.694,
1335
+ "step": 1700
1336
+ },
1337
+ {
1338
+ "epoch": 1710.0,
1339
+ "grad_norm": 0.0015862587606534362,
1340
+ "learning_rate": 4.578947368421052e-05,
1341
+ "loss": 0.0054,
1342
+ "step": 1710
1343
+ },
1344
+ {
1345
+ "epoch": 1720.0,
1346
+ "grad_norm": 0.0018773003248497844,
1347
+ "learning_rate": 4.4210526315789466e-05,
1348
+ "loss": 0.0054,
1349
+ "step": 1720
1350
+ },
1351
+ {
1352
+ "epoch": 1730.0,
1353
+ "grad_norm": 0.0024244124069809914,
1354
+ "learning_rate": 4.263157894736842e-05,
1355
+ "loss": 0.0054,
1356
+ "step": 1730
1357
+ },
1358
+ {
1359
+ "epoch": 1740.0,
1360
+ "grad_norm": 0.0032190545462071896,
1361
+ "learning_rate": 4.105263157894737e-05,
1362
+ "loss": 0.0054,
1363
+ "step": 1740
1364
+ },
1365
+ {
1366
+ "epoch": 1750.0,
1367
+ "grad_norm": 0.002826133742928505,
1368
+ "learning_rate": 3.947368421052631e-05,
1369
+ "loss": 0.0054,
1370
+ "step": 1750
1371
+ },
1372
+ {
1373
+ "epoch": 1760.0,
1374
+ "grad_norm": 0.002389115747064352,
1375
+ "learning_rate": 3.789473684210526e-05,
1376
+ "loss": 0.0054,
1377
+ "step": 1760
1378
+ },
1379
+ {
1380
+ "epoch": 1770.0,
1381
+ "grad_norm": 0.0018089566146954894,
1382
+ "learning_rate": 3.631578947368421e-05,
1383
+ "loss": 0.0054,
1384
+ "step": 1770
1385
+ },
1386
+ {
1387
+ "epoch": 1780.0,
1388
+ "grad_norm": 0.001812565722502768,
1389
+ "learning_rate": 3.4736842105263153e-05,
1390
+ "loss": 0.0054,
1391
+ "step": 1780
1392
+ },
1393
+ {
1394
+ "epoch": 1790.0,
1395
+ "grad_norm": 0.0015028449706733227,
1396
+ "learning_rate": 3.31578947368421e-05,
1397
+ "loss": 0.0054,
1398
+ "step": 1790
1399
+ },
1400
+ {
1401
+ "epoch": 1800.0,
1402
+ "grad_norm": 0.00215025688521564,
1403
+ "learning_rate": 3.1578947368421045e-05,
1404
+ "loss": 0.0054,
1405
+ "step": 1800
1406
+ },
1407
+ {
1408
+ "epoch": 1800.0,
1409
+ "eval_loss": 6.83547306060791,
1410
+ "eval_runtime": 0.0569,
1411
+ "eval_samples_per_second": 281.101,
1412
+ "eval_steps_per_second": 17.569,
1413
+ "step": 1800
1414
+ },
1415
+ {
1416
+ "epoch": 1810.0,
1417
+ "grad_norm": 0.002039752434939146,
1418
+ "learning_rate": 2.9999999999999997e-05,
1419
+ "loss": 0.0054,
1420
+ "step": 1810
1421
+ },
1422
+ {
1423
+ "epoch": 1820.0,
1424
+ "grad_norm": 0.0030053190421313047,
1425
+ "learning_rate": 2.8421052631578946e-05,
1426
+ "loss": 0.0054,
1427
+ "step": 1820
1428
+ },
1429
+ {
1430
+ "epoch": 1830.0,
1431
+ "grad_norm": 0.0024026986211538315,
1432
+ "learning_rate": 2.6842105263157892e-05,
1433
+ "loss": 0.0054,
1434
+ "step": 1830
1435
+ },
1436
+ {
1437
+ "epoch": 1840.0,
1438
+ "grad_norm": 0.001505296560935676,
1439
+ "learning_rate": 2.5263157894736838e-05,
1440
+ "loss": 0.0054,
1441
+ "step": 1840
1442
+ },
1443
+ {
1444
+ "epoch": 1850.0,
1445
+ "grad_norm": 0.001797975623048842,
1446
+ "learning_rate": 2.3684210526315787e-05,
1447
+ "loss": 0.0054,
1448
+ "step": 1850
1449
+ },
1450
+ {
1451
+ "epoch": 1860.0,
1452
+ "grad_norm": 0.0021602166816592216,
1453
+ "learning_rate": 2.2105263157894733e-05,
1454
+ "loss": 0.0054,
1455
+ "step": 1860
1456
+ },
1457
+ {
1458
+ "epoch": 1870.0,
1459
+ "grad_norm": 0.0020166405010968447,
1460
+ "learning_rate": 2.0526315789473685e-05,
1461
+ "loss": 0.0054,
1462
+ "step": 1870
1463
+ },
1464
+ {
1465
+ "epoch": 1880.0,
1466
+ "grad_norm": 0.0018515754491090775,
1467
+ "learning_rate": 1.894736842105263e-05,
1468
+ "loss": 0.0054,
1469
+ "step": 1880
1470
+ },
1471
+ {
1472
+ "epoch": 1890.0,
1473
+ "grad_norm": 0.0014232598477974534,
1474
+ "learning_rate": 1.7368421052631577e-05,
1475
+ "loss": 0.0054,
1476
+ "step": 1890
1477
+ },
1478
+ {
1479
+ "epoch": 1900.0,
1480
+ "grad_norm": 0.0013582637766376138,
1481
+ "learning_rate": 1.5789473684210522e-05,
1482
+ "loss": 0.0054,
1483
+ "step": 1900
1484
+ },
1485
+ {
1486
+ "epoch": 1900.0,
1487
+ "eval_loss": 6.838393688201904,
1488
+ "eval_runtime": 0.0566,
1489
+ "eval_samples_per_second": 282.616,
1490
+ "eval_steps_per_second": 17.663,
1491
+ "step": 1900
1492
+ },
1493
+ {
1494
+ "epoch": 1910.0,
1495
+ "grad_norm": 0.0013303229352459311,
1496
+ "learning_rate": 1.4210526315789473e-05,
1497
+ "loss": 0.0054,
1498
+ "step": 1910
1499
+ },
1500
+ {
1501
+ "epoch": 1920.0,
1502
+ "grad_norm": 0.0014441731618717313,
1503
+ "learning_rate": 1.2631578947368419e-05,
1504
+ "loss": 0.0054,
1505
+ "step": 1920
1506
+ },
1507
+ {
1508
+ "epoch": 1930.0,
1509
+ "grad_norm": 0.0011644844198599458,
1510
+ "learning_rate": 1.1052631578947366e-05,
1511
+ "loss": 0.0054,
1512
+ "step": 1930
1513
+ },
1514
+ {
1515
+ "epoch": 1940.0,
1516
+ "grad_norm": 0.0011815495090559125,
1517
+ "learning_rate": 9.473684210526315e-06,
1518
+ "loss": 0.0054,
1519
+ "step": 1940
1520
+ },
1521
+ {
1522
+ "epoch": 1950.0,
1523
+ "grad_norm": 0.0019422987243160605,
1524
+ "learning_rate": 7.894736842105261e-06,
1525
+ "loss": 0.0054,
1526
+ "step": 1950
1527
+ },
1528
+ {
1529
+ "epoch": 1960.0,
1530
+ "grad_norm": 0.0012022488517686725,
1531
+ "learning_rate": 6.3157894736842095e-06,
1532
+ "loss": 0.0054,
1533
+ "step": 1960
1534
+ },
1535
+ {
1536
+ "epoch": 1970.0,
1537
+ "grad_norm": 0.0013600951060652733,
1538
+ "learning_rate": 4.736842105263158e-06,
1539
+ "loss": 0.0054,
1540
+ "step": 1970
1541
+ },
1542
+ {
1543
+ "epoch": 1980.0,
1544
+ "grad_norm": 0.0014138800324872136,
1545
+ "learning_rate": 3.1578947368421047e-06,
1546
+ "loss": 0.0054,
1547
+ "step": 1980
1548
+ },
1549
+ {
1550
+ "epoch": 1990.0,
1551
+ "grad_norm": 0.0012338694650679827,
1552
+ "learning_rate": 1.5789473684210524e-06,
1553
+ "loss": 0.0054,
1554
+ "step": 1990
1555
+ },
1556
+ {
1557
+ "epoch": 2000.0,
1558
+ "grad_norm": 0.0009407839388586581,
1559
+ "learning_rate": 0.0,
1560
+ "loss": 0.0054,
1561
+ "step": 2000
1562
+ },
1563
+ {
1564
+ "epoch": 2000.0,
1565
+ "eval_loss": 6.840120792388916,
1566
+ "eval_runtime": 0.0569,
1567
+ "eval_samples_per_second": 280.972,
1568
+ "eval_steps_per_second": 17.561,
1569
+ "step": 2000
1570
+ }
1571
+ ],
1572
+ "logging_steps": 10,
1573
+ "max_steps": 2000,
1574
+ "num_input_tokens_seen": 0,
1575
+ "num_train_epochs": 2000,
1576
+ "save_steps": 100,
1577
+ "stateful_callbacks": {
1578
+ "TrainerControl": {
1579
+ "args": {
1580
+ "should_epoch_stop": false,
1581
+ "should_evaluate": false,
1582
+ "should_log": false,
1583
+ "should_save": true,
1584
+ "should_training_stop": true
1585
+ },
1586
+ "attributes": {}
1587
+ }
1588
+ },
1589
+ "total_flos": 1.045476409344e+16,
1590
+ "train_batch_size": 30,
1591
+ "trial_name": null,
1592
+ "trial_params": null
1593
+ }
instruct-finetuning/base/LLaMmlein_120M/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49b230fcb8bb33325a80b3b5f9018ab82781c698b9f593e510fec637714f64d6
3
+ size 5496
instruct-finetuning/base/german-gpt2/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: dbmdz/german-gpt2
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
instruct-finetuning/base/german-gpt2/adapter_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "dbmdz/german-gpt2",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 4,
14
+ "lora_dropout": 0.1,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 32,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "down_proj",
25
+ "v_proj",
26
+ "lm_head",
27
+ "gate_proj",
28
+ "q_proj",
29
+ "up_proj",
30
+ "o_proj"
31
+ ],
32
+ "task_type": "CAUSAL_LM",
33
+ "use_dora": false,
34
+ "use_rslora": false
35
+ }
instruct-finetuning/base/german-gpt2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c02604806e39e6f29920141a48c20f0df2a13c16f1755d10617a733316a23f3
3
+ size 160949880
instruct-finetuning/base/german-gpt2/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 50265
3
+ }
instruct-finetuning/base/german-gpt2/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
instruct-finetuning/base/german-gpt2/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84356959f433ca56d0418c13cc0eb6e657c4ebac43251dced6e86ba9a281708d
3
+ size 13067612
instruct-finetuning/base/german-gpt2/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e862fec9943ccd0a1ec4c4847bbc61f171f1c24a21f76d7a53e951110d236b4
3
+ size 14244