Desm0nt committed on
Commit
820dbd8
1 Parent(s): 61b1920

Update 11.07 build v13

Browse files

Trained on 1400 images. More varied dataset.

Files changed (3) hide show
  1. sft_args.json +16 -16
  2. trainer_state.json +1230 -1448
  3. training_args.bin +1 -1
sft_args.json CHANGED
@@ -7,7 +7,7 @@
7
  "additional_trainable_parameters": [],
8
  "tuner_backend": "peft",
9
  "template_type": "phi3-vl",
10
- "output_dir": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240629-080216",
11
  "add_output_dir_suffix": true,
12
  "ddp_backend": null,
13
  "ddp_find_unused_parameters": null,
@@ -22,7 +22,7 @@
22
  ],
23
  "val_dataset": [],
24
  "dataset_seed": 42,
25
- "dataset_test_ratio": 0.07,
26
  "use_loss_scale": false,
27
  "system": null,
28
  "max_length": 2048,
@@ -45,18 +45,18 @@
45
  "bnb_4bit_use_double_quant": true,
46
  "bnb_4bit_quant_storage": null,
47
  "lora_target_modules": [
48
- "fc1",
49
- "img_projection.0",
50
  "img_projection.2",
 
 
 
 
51
  "k_proj",
 
 
 
52
  "out_proj",
53
  "fc2",
54
- "qkv_proj",
55
- "o_proj",
56
- "q_proj",
57
- "down_proj",
58
- "v_proj",
59
- "gate_up_proj"
60
  ],
61
  "lora_rank": 128,
62
  "lora_alpha": 128,
@@ -122,15 +122,15 @@
122
  "optim": "adamw_torch",
123
  "adam_beta1": 0.9,
124
  "adam_beta2": 0.95,
125
- "learning_rate": 0.00014,
126
- "weight_decay": 0.1,
127
  "gradient_accumulation_steps": 2,
128
- "max_grad_norm": 0.5,
129
  "predict_with_generate": false,
130
  "lr_scheduler_type": "cosine",
131
  "warmup_ratio": 0.05,
132
  "eval_steps": 50,
133
- "save_steps": 300,
134
  "save_only_model": false,
135
  "save_total_limit": 8,
136
  "logging_steps": 5,
@@ -149,7 +149,7 @@
149
  "use_flash_attn": null,
150
  "ignore_args_error": false,
151
  "check_model_is_latest": true,
152
- "logging_dir": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240629-080216/runs",
153
  "report_to": [
154
  "tensorboard"
155
  ],
@@ -206,5 +206,5 @@
206
  "load_in_4bit": false,
207
  "load_in_8bit": false,
208
  "train_sampler_random": true,
209
- "training_args": "Seq2SeqTrainingArguments(output_dir='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v0-20240629-080216', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, learning_rate=0.00014, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=0.5, num_train_epochs=4, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs={}, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v0-20240629-080216/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=300, save_total_limit=8, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=None, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v0-20240629-080216', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=None, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None), deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=False, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy=None, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=False, include_num_input_tokens_seen=False, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, sortish_sampler=True, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=GenerationConfig {\n \"do_sample\": true,\n \"eos_token_id\": 32000,\n \"max_new_tokens\": 2048,\n \"pad_token_id\": 32000,\n \"temperature\": 0.3,\n \"top_k\": 20,\n \"top_p\": 0.7\n}\n, 
train_sampler_random=True, push_hub_strategy='push_best', acc_strategy='token', additional_saved_files=[], metric_warmup_step=0, train_dataset_sample=1021)"
210
  }
 
7
  "additional_trainable_parameters": [],
8
  "tuner_backend": "peft",
9
  "template_type": "phi3-vl",
10
+ "output_dir": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v9-20240710-235159",
11
  "add_output_dir_suffix": true,
12
  "ddp_backend": null,
13
  "ddp_find_unused_parameters": null,
 
22
  ],
23
  "val_dataset": [],
24
  "dataset_seed": 42,
25
+ "dataset_test_ratio": 0.12,
26
  "use_loss_scale": false,
27
  "system": null,
28
  "max_length": 2048,
 
45
  "bnb_4bit_use_double_quant": true,
46
  "bnb_4bit_quant_storage": null,
47
  "lora_target_modules": [
 
 
48
  "img_projection.2",
49
+ "gate_up_proj",
50
+ "q_proj",
51
+ "v_proj",
52
+ "img_projection.0",
53
  "k_proj",
54
+ "fc1",
55
+ "down_proj",
56
+ "o_proj",
57
  "out_proj",
58
  "fc2",
59
+ "qkv_proj"
 
 
 
 
 
60
  ],
61
  "lora_rank": 128,
62
  "lora_alpha": 128,
 
122
  "optim": "adamw_torch",
123
  "adam_beta1": 0.9,
124
  "adam_beta2": 0.95,
125
+ "learning_rate": 0.000135,
126
+ "weight_decay": 0.2,
127
  "gradient_accumulation_steps": 2,
128
+ "max_grad_norm": 0.4,
129
  "predict_with_generate": false,
130
  "lr_scheduler_type": "cosine",
131
  "warmup_ratio": 0.05,
132
  "eval_steps": 50,
133
+ "save_steps": 100,
134
  "save_only_model": false,
135
  "save_total_limit": 8,
136
  "logging_steps": 5,
 
149
  "use_flash_attn": null,
150
  "ignore_args_error": false,
151
  "check_model_is_latest": true,
152
+ "logging_dir": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v9-20240710-235159/runs",
153
  "report_to": [
154
  "tensorboard"
155
  ],
 
206
  "load_in_4bit": false,
207
  "load_in_8bit": false,
208
  "train_sampler_random": true,
209
+ "training_args": "Seq2SeqTrainingArguments(output_dir='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v9-20240710-235159', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, learning_rate=0.000135, weight_decay=0.2, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=0.4, num_train_epochs=4, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs={}, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v9-20240710-235159/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=8, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=None, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v9-20240710-235159', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=None, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None), deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=False, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy=None, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=False, include_num_input_tokens_seen=False, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, sortish_sampler=True, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=GenerationConfig {\n \"do_sample\": true,\n \"eos_token_id\": 32000,\n \"max_new_tokens\": 2048,\n \"pad_token_id\": 32000,\n \"temperature\": 0.3,\n \"top_k\": 20,\n \"top_p\": 0.7\n}\n, 
train_sampler_random=True, push_hub_strategy='push_best', acc_strategy='token', additional_saved_files=[], metric_warmup_step=0, train_dataset_sample=1110)"
210
  }
trainer_state.json CHANGED
@@ -1,1991 +1,1773 @@
1
  {
2
- "best_metric": 1.52509904,
3
- "best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240629-080216\\checkpoint-300",
4
- "epoch": 3.5225048923679063,
5
  "eval_steps": 50,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "acc": 0.49833804,
13
- "epoch": 0.003913894324853229,
14
- "grad_norm": 0.77734375,
15
- "learning_rate": 2.745098039215686e-06,
16
- "loss": 2.37747383,
17
- "memory(GiB)": 17.35,
18
  "step": 1,
19
- "train_speed(iter/s)": 0.076826
20
  },
21
  {
22
- "acc": 0.50652587,
23
- "epoch": 0.019569471624266144,
24
- "grad_norm": 1.140625,
25
- "learning_rate": 1.372549019607843e-05,
26
- "loss": 2.29183841,
27
- "memory(GiB)": 19.33,
28
  "step": 5,
29
- "train_speed(iter/s)": 0.082188
30
  },
31
  {
32
- "acc": 0.52587533,
33
- "epoch": 0.03913894324853229,
34
- "grad_norm": 0.68359375,
35
- "learning_rate": 2.745098039215686e-05,
36
- "loss": 2.22724895,
37
- "memory(GiB)": 19.89,
38
  "step": 10,
39
- "train_speed(iter/s)": 0.082805
40
  },
41
  {
42
- "acc": 0.52128973,
43
- "epoch": 0.05870841487279843,
44
- "grad_norm": 0.8359375,
45
- "learning_rate": 4.117647058823529e-05,
46
- "loss": 2.27491264,
47
- "memory(GiB)": 19.24,
48
  "step": 15,
49
- "train_speed(iter/s)": 0.082482
50
  },
51
  {
52
- "acc": 0.51135335,
53
- "epoch": 0.07827788649706457,
54
- "grad_norm": 0.66015625,
55
- "learning_rate": 5.490196078431372e-05,
56
- "loss": 2.32762127,
57
- "memory(GiB)": 19.86,
58
  "step": 20,
59
- "train_speed(iter/s)": 0.082557
60
  },
61
  {
62
- "acc": 0.54442377,
63
- "epoch": 0.09784735812133072,
64
- "grad_norm": 0.65625,
65
- "learning_rate": 6.862745098039214e-05,
66
- "loss": 2.09772224,
67
- "memory(GiB)": 19.05,
68
  "step": 25,
69
- "train_speed(iter/s)": 0.082348
70
  },
71
  {
72
- "acc": 0.5545311,
73
- "epoch": 0.11741682974559686,
74
- "grad_norm": 0.62109375,
75
- "learning_rate": 8.235294117647058e-05,
76
- "loss": 2.00072975,
77
- "memory(GiB)": 19.89,
78
  "step": 30,
79
- "train_speed(iter/s)": 0.082166
80
  },
81
  {
82
- "acc": 0.57092514,
83
- "epoch": 0.136986301369863,
84
- "grad_norm": 0.9296875,
85
- "learning_rate": 9.6078431372549e-05,
86
- "loss": 1.94450474,
87
- "memory(GiB)": 19.16,
88
  "step": 35,
89
- "train_speed(iter/s)": 0.081966
90
  },
91
  {
92
- "acc": 0.56716595,
93
- "epoch": 0.15655577299412915,
94
- "grad_norm": 0.7734375,
95
- "learning_rate": 0.00010980392156862745,
96
- "loss": 1.90242462,
97
- "memory(GiB)": 19.62,
98
  "step": 40,
99
- "train_speed(iter/s)": 0.081987
100
  },
101
  {
102
- "acc": 0.57822714,
103
- "epoch": 0.1761252446183953,
104
- "grad_norm": 0.74609375,
105
- "learning_rate": 0.00012352941176470587,
106
- "loss": 1.83147659,
107
- "memory(GiB)": 19.99,
108
  "step": 45,
109
- "train_speed(iter/s)": 0.081878
110
  },
111
  {
112
- "acc": 0.57696896,
113
- "epoch": 0.19569471624266144,
114
- "grad_norm": 0.85546875,
115
- "learning_rate": 0.00013725490196078428,
116
- "loss": 1.82299595,
117
- "memory(GiB)": 19.11,
118
  "step": 50,
119
- "train_speed(iter/s)": 0.081843
120
  },
121
  {
122
- "epoch": 0.19569471624266144,
123
- "eval_acc": 0.583503534956795,
124
- "eval_loss": 1.8029242753982544,
125
- "eval_runtime": 85.1254,
126
- "eval_samples_per_second": 0.893,
127
- "eval_steps_per_second": 0.446,
128
  "step": 50
129
  },
130
  {
131
- "acc": 0.59343066,
132
- "epoch": 0.21526418786692758,
133
- "grad_norm": 1.0,
134
- "learning_rate": 0.0001399941138119636,
135
- "loss": 1.82339039,
136
- "memory(GiB)": 22.92,
137
  "step": 55,
138
- "train_speed(iter/s)": 0.072544
139
  },
140
  {
141
- "acc": 0.58571839,
142
- "epoch": 0.23483365949119372,
143
- "grad_norm": 0.7734375,
144
- "learning_rate": 0.00013997020286964757,
145
- "loss": 1.80549526,
146
- "memory(GiB)": 19.43,
147
  "step": 60,
148
- "train_speed(iter/s)": 0.073269
149
  },
150
  {
151
- "acc": 0.60369935,
152
- "epoch": 0.25440313111545987,
153
- "grad_norm": 0.99609375,
154
- "learning_rate": 0.0001399279055646442,
155
- "loss": 1.6768074,
156
- "memory(GiB)": 19.57,
157
  "step": 65,
158
- "train_speed(iter/s)": 0.073897
159
  },
160
  {
161
- "acc": 0.58763909,
162
- "epoch": 0.273972602739726,
163
- "grad_norm": 1.1640625,
164
- "learning_rate": 0.00013986723301159307,
165
- "loss": 1.79169483,
166
- "memory(GiB)": 19.48,
167
  "step": 70,
168
- "train_speed(iter/s)": 0.074533
169
  },
170
  {
171
- "acc": 0.58979025,
172
- "epoch": 0.29354207436399216,
173
- "grad_norm": 0.69140625,
174
- "learning_rate": 0.00013978820115367462,
175
- "loss": 1.72388344,
176
- "memory(GiB)": 19.35,
177
  "step": 75,
178
- "train_speed(iter/s)": 0.075045
179
  },
180
  {
181
- "acc": 0.59725327,
182
- "epoch": 0.3131115459882583,
183
- "grad_norm": 0.75,
184
- "learning_rate": 0.00013969083075842048,
185
- "loss": 1.70864868,
186
- "memory(GiB)": 19.49,
187
  "step": 80,
188
- "train_speed(iter/s)": 0.075523
189
  },
190
  {
191
- "acc": 0.60098982,
192
- "epoch": 0.33268101761252444,
193
- "grad_norm": 4.59375,
194
- "learning_rate": 0.00013957514741225646,
195
- "loss": 1.67311764,
196
- "memory(GiB)": 20.01,
197
  "step": 85,
198
- "train_speed(iter/s)": 0.075928
199
  },
200
  {
201
- "acc": 0.58315139,
202
- "epoch": 0.3522504892367906,
203
- "grad_norm": 0.8359375,
204
- "learning_rate": 0.00013944118151377894,
205
- "loss": 1.74437752,
206
- "memory(GiB)": 20.14,
207
  "step": 90,
208
- "train_speed(iter/s)": 0.076154
209
  },
210
  {
211
- "acc": 0.6138227,
212
- "epoch": 0.37181996086105673,
213
- "grad_norm": 0.75,
214
- "learning_rate": 0.0001392889682657671,
215
- "loss": 1.63750076,
216
- "memory(GiB)": 19.59,
217
  "step": 95,
218
- "train_speed(iter/s)": 0.076253
219
  },
220
  {
221
- "acc": 0.63383026,
222
- "epoch": 0.3913894324853229,
223
- "grad_norm": 0.8515625,
224
- "learning_rate": 0.00013911854766593233,
225
- "loss": 1.56653557,
226
- "memory(GiB)": 19.5,
227
  "step": 100,
228
- "train_speed(iter/s)": 0.076386
229
  },
230
  {
231
- "epoch": 0.3913894324853229,
232
- "eval_acc": 0.604241948153967,
233
- "eval_loss": 1.6681365966796875,
234
- "eval_runtime": 72.2811,
235
- "eval_samples_per_second": 1.051,
236
- "eval_steps_per_second": 0.526,
237
  "step": 100
238
  },
239
  {
240
- "acc": 0.61646304,
241
- "epoch": 0.410958904109589,
242
- "grad_norm": 0.73046875,
243
- "learning_rate": 0.00013892996449640807,
244
- "loss": 1.59651537,
245
- "memory(GiB)": 22.5,
246
  "step": 105,
247
- "train_speed(iter/s)": 0.072857
248
  },
249
  {
250
- "acc": 0.60897431,
251
- "epoch": 0.43052837573385516,
252
- "grad_norm": 0.83984375,
253
- "learning_rate": 0.00013872326831198205,
254
- "loss": 1.70257473,
255
- "memory(GiB)": 19.42,
256
  "step": 110,
257
- "train_speed(iter/s)": 0.073309
258
  },
259
  {
260
- "acc": 0.58328586,
261
- "epoch": 0.4500978473581213,
262
- "grad_norm": 0.9453125,
263
- "learning_rate": 0.00013849851342707462,
264
- "loss": 1.71216717,
265
- "memory(GiB)": 19.47,
266
  "step": 115,
267
- "train_speed(iter/s)": 0.073753
268
  },
269
  {
270
- "acc": 0.62397904,
271
- "epoch": 0.46966731898238745,
272
- "grad_norm": 0.80078125,
273
- "learning_rate": 0.0001382557589014664,
274
- "loss": 1.54239073,
275
- "memory(GiB)": 19.33,
276
  "step": 120,
277
- "train_speed(iter/s)": 0.074078
278
  },
279
  {
280
- "acc": 0.60271235,
281
- "epoch": 0.4892367906066536,
282
- "grad_norm": 1.171875,
283
- "learning_rate": 0.0001379950685247788,
284
- "loss": 1.72333088,
285
- "memory(GiB)": 19.37,
286
  "step": 125,
287
- "train_speed(iter/s)": 0.074428
288
  },
289
  {
290
- "acc": 0.5755064,
291
- "epoch": 0.5088062622309197,
292
- "grad_norm": 0.94921875,
293
- "learning_rate": 0.00013771651079971182,
294
- "loss": 1.81728477,
295
- "memory(GiB)": 19.52,
296
  "step": 130,
297
- "train_speed(iter/s)": 0.074768
298
  },
299
  {
300
- "acc": 0.5844254,
301
- "epoch": 0.5283757338551859,
302
- "grad_norm": 0.8515625,
303
- "learning_rate": 0.00013742015892404325,
304
- "loss": 1.77252998,
305
- "memory(GiB)": 19.51,
306
  "step": 135,
307
- "train_speed(iter/s)": 0.075066
308
  },
309
  {
310
- "acc": 0.5998323,
311
- "epoch": 0.547945205479452,
312
- "grad_norm": 0.8671875,
313
- "learning_rate": 0.0001371060907713942,
314
- "loss": 1.69012871,
315
- "memory(GiB)": 19.54,
316
  "step": 140,
317
- "train_speed(iter/s)": 0.07528
318
  },
319
  {
320
- "acc": 0.62686119,
321
- "epoch": 0.5675146771037182,
322
- "grad_norm": 0.68359375,
323
- "learning_rate": 0.00013677438887076603,
324
- "loss": 1.66314449,
325
- "memory(GiB)": 19.54,
326
  "step": 145,
327
- "train_speed(iter/s)": 0.075467
328
  },
329
  {
330
- "acc": 0.59954901,
331
- "epoch": 0.5870841487279843,
332
- "grad_norm": 0.6328125,
333
- "learning_rate": 0.00013642514038485367,
334
- "loss": 1.67525444,
335
- "memory(GiB)": 19.55,
336
  "step": 150,
337
- "train_speed(iter/s)": 0.075722
338
  },
339
  {
340
- "epoch": 0.5870841487279843,
341
- "eval_acc": 0.6184603299293009,
342
- "eval_loss": 1.5965631008148193,
343
- "eval_runtime": 72.3005,
344
- "eval_samples_per_second": 1.051,
345
- "eval_steps_per_second": 0.526,
346
  "step": 150
347
  },
348
  {
349
- "acc": 0.585955,
350
- "epoch": 0.6066536203522505,
351
- "grad_norm": 0.9375,
352
- "learning_rate": 0.00013605843708714162,
353
- "loss": 1.7486639,
354
- "memory(GiB)": 23.22,
355
  "step": 155,
356
- "train_speed(iter/s)": 0.073368
357
  },
358
  {
359
- "acc": 0.62769904,
360
- "epoch": 0.6262230919765166,
361
- "grad_norm": 0.7265625,
362
- "learning_rate": 0.00013567437533778826,
363
- "loss": 1.55238762,
364
- "memory(GiB)": 19.62,
365
  "step": 160,
366
- "train_speed(iter/s)": 0.073628
367
  },
368
  {
369
- "acc": 0.63651643,
370
- "epoch": 0.6457925636007827,
371
- "grad_norm": 0.80078125,
372
- "learning_rate": 0.00013527305605830488,
373
- "loss": 1.54306393,
374
- "memory(GiB)": 19.88,
375
  "step": 165,
376
- "train_speed(iter/s)": 0.073903
377
  },
378
  {
379
- "acc": 0.59288979,
380
- "epoch": 0.6653620352250489,
381
- "grad_norm": 0.703125,
382
- "learning_rate": 0.0001348545847050361,
383
- "loss": 1.69727612,
384
- "memory(GiB)": 19.58,
385
  "step": 170,
386
- "train_speed(iter/s)": 0.074077
387
  },
388
  {
389
- "acc": 0.61248484,
390
- "epoch": 0.684931506849315,
391
- "grad_norm": 0.9140625,
392
- "learning_rate": 0.00013441907124144866,
393
- "loss": 1.65900764,
394
- "memory(GiB)": 19.49,
395
  "step": 175,
396
- "train_speed(iter/s)": 0.074329
397
  },
398
  {
399
- "acc": 0.61740661,
400
- "epoch": 0.7045009784735812,
401
- "grad_norm": 0.90625,
402
- "learning_rate": 0.0001339666301092358,
403
- "loss": 1.6518961,
404
- "memory(GiB)": 19.68,
405
  "step": 180,
406
- "train_speed(iter/s)": 0.074558
407
  },
408
  {
409
- "acc": 0.62250223,
410
- "epoch": 0.7240704500978473,
411
- "grad_norm": 0.84765625,
412
- "learning_rate": 0.00013349738019824512,
413
- "loss": 1.55100412,
414
- "memory(GiB)": 19.34,
415
  "step": 185,
416
- "train_speed(iter/s)": 0.07477
417
  },
418
  {
419
- "acc": 0.61055808,
420
- "epoch": 0.7436399217221135,
421
- "grad_norm": 0.90625,
422
- "learning_rate": 0.00013301144481523718,
423
- "loss": 1.67241592,
424
- "memory(GiB)": 19.56,
425
  "step": 190,
426
- "train_speed(iter/s)": 0.075006
427
  },
428
  {
429
- "acc": 0.6389596,
430
- "epoch": 0.7632093933463796,
431
- "grad_norm": 0.83203125,
432
- "learning_rate": 0.00013250895165148384,
433
- "loss": 1.54227753,
434
- "memory(GiB)": 19.29,
435
  "step": 195,
436
- "train_speed(iter/s)": 0.075192
437
  },
438
  {
439
- "acc": 0.59149747,
440
- "epoch": 0.7827788649706457,
441
- "grad_norm": 0.68359375,
442
- "learning_rate": 0.00013199003274921416,
443
- "loss": 1.71190453,
444
- "memory(GiB)": 19.35,
445
  "step": 200,
446
- "train_speed(iter/s)": 0.075393
447
  },
448
  {
449
- "epoch": 0.7827788649706457,
450
- "eval_acc": 0.6241162608012569,
451
- "eval_loss": 1.5573129653930664,
452
- "eval_runtime": 69.5471,
453
- "eval_samples_per_second": 1.093,
454
- "eval_steps_per_second": 0.546,
455
  "step": 200
456
  },
457
  {
458
- "acc": 0.62623324,
459
- "epoch": 0.8023483365949119,
460
- "grad_norm": 0.81640625,
461
- "learning_rate": 0.00013145482446691724,
462
- "loss": 1.55779324,
463
- "memory(GiB)": 20.56,
464
  "step": 205,
465
- "train_speed(iter/s)": 0.073671
466
  },
467
  {
468
- "acc": 0.61495056,
469
- "epoch": 0.821917808219178,
470
- "grad_norm": 1.03125,
471
- "learning_rate": 0.00013090346744351058,
472
- "loss": 1.56424398,
473
- "memory(GiB)": 19.48,
474
  "step": 210,
475
- "train_speed(iter/s)": 0.073902
476
  },
477
  {
478
- "acc": 0.59643593,
479
- "epoch": 0.8414872798434442,
480
- "grad_norm": 1.0703125,
481
- "learning_rate": 0.00013033610656138395,
482
- "loss": 1.62190418,
483
- "memory(GiB)": 19.5,
484
  "step": 215,
485
- "train_speed(iter/s)": 0.074133
486
  },
487
  {
488
- "acc": 0.63052382,
489
- "epoch": 0.8610567514677103,
490
- "grad_norm": 0.59765625,
491
- "learning_rate": 0.00012975289090832792,
492
- "loss": 1.53521852,
493
- "memory(GiB)": 19.53,
494
  "step": 220,
495
- "train_speed(iter/s)": 0.074334
496
  },
497
  {
498
- "acc": 0.61408448,
499
- "epoch": 0.8806262230919765,
500
- "grad_norm": 0.7734375,
501
- "learning_rate": 0.00012915397373835754,
502
- "loss": 1.59712257,
503
- "memory(GiB)": 19.52,
504
  "step": 225,
505
- "train_speed(iter/s)": 0.074533
506
  },
507
  {
508
- "acc": 0.62307076,
509
- "epoch": 0.9001956947162426,
510
- "grad_norm": 0.66796875,
511
- "learning_rate": 0.00012853951243144105,
512
- "loss": 1.57903328,
513
- "memory(GiB)": 19.49,
514
  "step": 230,
515
- "train_speed(iter/s)": 0.074719
516
  },
517
  {
518
- "acc": 0.61717134,
519
- "epoch": 0.9197651663405088,
520
- "grad_norm": 0.84375,
521
- "learning_rate": 0.00012790966845214457,
522
- "loss": 1.61422024,
523
- "memory(GiB)": 19.25,
524
  "step": 235,
525
- "train_speed(iter/s)": 0.074916
526
  },
527
  {
528
- "acc": 0.62549253,
529
- "epoch": 0.9393346379647749,
530
- "grad_norm": 0.8125,
531
- "learning_rate": 0.0001272646073072033,
532
- "loss": 1.62806015,
533
- "memory(GiB)": 19.36,
534
  "step": 240,
535
- "train_speed(iter/s)": 0.0751
536
  },
537
  {
538
- "acc": 0.61903515,
539
- "epoch": 0.958904109589041,
540
- "grad_norm": 0.74609375,
541
- "learning_rate": 0.0001266044985020307,
542
- "loss": 1.55927486,
543
- "memory(GiB)": 19.36,
544
  "step": 245,
545
- "train_speed(iter/s)": 0.075266
546
  },
547
  {
548
- "acc": 0.61238952,
549
- "epoch": 0.9784735812133072,
550
- "grad_norm": 0.87890625,
551
- "learning_rate": 0.00012592951549617683,
552
- "loss": 1.52888412,
553
- "memory(GiB)": 19.33,
554
  "step": 250,
555
- "train_speed(iter/s)": 0.075438
556
  },
557
  {
558
- "epoch": 0.9784735812133072,
559
- "eval_acc": 0.6267085624509033,
560
- "eval_loss": 1.5281730890274048,
561
- "eval_runtime": 69.069,
562
- "eval_samples_per_second": 1.1,
563
- "eval_steps_per_second": 0.55,
564
  "step": 250
565
  },
566
  {
567
- "acc": 0.63230977,
568
- "epoch": 0.9980430528375733,
569
- "grad_norm": 0.84765625,
570
- "learning_rate": 0.00012523983565774753,
571
- "loss": 1.53058205,
572
- "memory(GiB)": 19.46,
573
  "step": 255,
574
- "train_speed(iter/s)": 0.074081
575
  },
576
  {
577
- "acc": 0.66042156,
578
- "epoch": 1.0176125244618395,
579
- "grad_norm": 0.76171875,
580
- "learning_rate": 0.00012453564021679692,
581
- "loss": 1.37123928,
582
- "memory(GiB)": 20.18,
583
  "step": 260,
584
- "train_speed(iter/s)": 0.074295
585
  },
586
  {
587
- "acc": 0.67253222,
588
- "epoch": 1.0371819960861057,
589
- "grad_norm": 0.76953125,
590
- "learning_rate": 0.00012381711421770455,
591
- "loss": 1.28407507,
592
- "memory(GiB)": 19.7,
593
  "step": 265,
594
- "train_speed(iter/s)": 0.074448
595
  },
596
  {
597
- "acc": 0.66850777,
598
- "epoch": 1.0567514677103718,
599
- "grad_norm": 0.98046875,
600
- "learning_rate": 0.0001230844464705507,
601
- "loss": 1.27961807,
602
- "memory(GiB)": 19.58,
603
  "step": 270,
604
- "train_speed(iter/s)": 0.07459
605
  },
606
  {
607
- "acc": 0.67196817,
608
- "epoch": 1.076320939334638,
609
- "grad_norm": 0.9140625,
610
- "learning_rate": 0.00012233782950150186,
611
- "loss": 1.28494987,
612
- "memory(GiB)": 19.61,
613
  "step": 275,
614
- "train_speed(iter/s)": 0.074728
615
  },
616
  {
617
- "acc": 0.67708378,
618
- "epoch": 1.095890410958904,
619
- "grad_norm": 0.87109375,
620
- "learning_rate": 0.00012157745950221989,
621
- "loss": 1.29551096,
622
- "memory(GiB)": 19.63,
623
  "step": 280,
624
- "train_speed(iter/s)": 0.074881
625
  },
626
  {
627
- "acc": 0.66973438,
628
- "epoch": 1.1154598825831703,
629
- "grad_norm": 1.0859375,
630
- "learning_rate": 0.0001208035362783079,
631
- "loss": 1.27705774,
632
- "memory(GiB)": 19.49,
633
  "step": 285,
634
- "train_speed(iter/s)": 0.075029
635
  },
636
  {
637
- "acc": 0.6750237,
638
- "epoch": 1.1350293542074363,
639
- "grad_norm": 1.0859375,
640
- "learning_rate": 0.00012001626319680648,
641
- "loss": 1.25660419,
642
- "memory(GiB)": 19.55,
643
  "step": 290,
644
- "train_speed(iter/s)": 0.07515
645
  },
646
  {
647
- "acc": 0.624368,
648
- "epoch": 1.1545988258317026,
649
- "grad_norm": 1.1953125,
650
- "learning_rate": 0.00011921584713275411,
651
- "loss": 1.5070508,
652
- "memory(GiB)": 19.52,
653
  "step": 295,
654
- "train_speed(iter/s)": 0.075278
655
  },
656
  {
657
- "acc": 0.66252189,
658
- "epoch": 1.1741682974559686,
659
- "grad_norm": 0.828125,
660
- "learning_rate": 0.0001184024984148257,
661
- "loss": 1.32014723,
662
- "memory(GiB)": 19.92,
663
  "step": 300,
664
- "train_speed(iter/s)": 0.075433
665
  },
666
  {
667
- "epoch": 1.1741682974559686,
668
- "eval_acc": 0.6282796543597801,
669
- "eval_loss": 1.5250990390777588,
670
- "eval_runtime": 70.3986,
671
- "eval_samples_per_second": 1.08,
672
- "eval_steps_per_second": 0.54,
673
  "step": 300
674
  },
675
  {
676
- "acc": 0.67028356,
677
- "epoch": 1.1937377690802349,
678
- "grad_norm": 1.7109375,
679
- "learning_rate": 0.00011757643077006372,
680
- "loss": 1.28037386,
681
- "memory(GiB)": 22.6,
682
  "step": 305,
683
- "train_speed(iter/s)": 0.074243
684
  },
685
  {
686
- "acc": 0.655305,
687
- "epoch": 1.213307240704501,
688
- "grad_norm": 1.1015625,
689
- "learning_rate": 0.00011673786126771617,
690
- "loss": 1.31057158,
691
- "memory(GiB)": 19.72,
692
  "step": 310,
693
- "train_speed(iter/s)": 0.074392
694
  },
695
  {
696
- "acc": 0.66528535,
697
- "epoch": 1.2328767123287672,
698
- "grad_norm": 1.6171875,
699
- "learning_rate": 0.0001158870102621965,
700
- "loss": 1.29698696,
701
- "memory(GiB)": 19.08,
702
  "step": 315,
703
- "train_speed(iter/s)": 0.074534
704
  },
705
  {
706
- "acc": 0.66950455,
707
- "epoch": 1.2524461839530332,
708
- "grad_norm": 1.2421875,
709
- "learning_rate": 0.00011502410133517998,
710
- "loss": 1.27706356,
711
- "memory(GiB)": 19.87,
712
  "step": 320,
713
- "train_speed(iter/s)": 0.074667
714
  },
715
  {
716
- "acc": 0.65843534,
717
- "epoch": 1.2720156555772995,
718
- "grad_norm": 1.2265625,
719
- "learning_rate": 0.0001141493612368524,
720
- "loss": 1.30308371,
721
- "memory(GiB)": 19.87,
722
  "step": 325,
723
- "train_speed(iter/s)": 0.0748
724
  },
725
  {
726
- "acc": 0.66441913,
727
- "epoch": 1.2915851272015655,
728
- "grad_norm": 1.2578125,
729
- "learning_rate": 0.00011326301982632583,
730
- "loss": 1.26109972,
731
- "memory(GiB)": 19.09,
732
  "step": 330,
733
- "train_speed(iter/s)": 0.074935
734
  },
735
  {
736
- "acc": 0.68711085,
737
- "epoch": 1.3111545988258317,
738
- "grad_norm": 0.95703125,
739
- "learning_rate": 0.00011236531001123771,
740
- "loss": 1.19278584,
741
- "memory(GiB)": 19.73,
742
  "step": 335,
743
- "train_speed(iter/s)": 0.075053
744
  },
745
  {
746
- "acc": 0.66676803,
747
- "epoch": 1.3307240704500978,
748
- "grad_norm": 1.96875,
749
- "learning_rate": 0.0001114564676865486,
750
- "loss": 1.3068346,
751
- "memory(GiB)": 19.84,
752
  "step": 340,
753
- "train_speed(iter/s)": 0.075151
754
  },
755
  {
756
- "acc": 0.66865935,
757
- "epoch": 1.350293542074364,
758
- "grad_norm": 1.2421875,
759
- "learning_rate": 0.00011053673167255516,
760
- "loss": 1.30573978,
761
- "memory(GiB)": 19.66,
762
  "step": 345,
763
- "train_speed(iter/s)": 0.075271
764
  },
765
  {
766
- "acc": 0.66606102,
767
- "epoch": 1.36986301369863,
768
- "grad_norm": 0.76171875,
769
- "learning_rate": 0.00010960634365213437,
770
- "loss": 1.26872787,
771
- "memory(GiB)": 19.73,
772
  "step": 350,
773
- "train_speed(iter/s)": 0.075377
774
  },
775
  {
776
- "epoch": 1.36986301369863,
777
- "eval_acc": 0.6315003927729772,
778
- "eval_loss": 1.5066882371902466,
779
- "eval_runtime": 72.5685,
780
- "eval_samples_per_second": 1.047,
781
- "eval_steps_per_second": 0.524,
782
  "step": 350
783
  },
784
  {
785
- "acc": 0.67307239,
786
- "epoch": 1.3894324853228963,
787
- "grad_norm": 1.1796875,
788
- "learning_rate": 0.0001086655481072354,
789
- "loss": 1.27917318,
790
- "memory(GiB)": 22.92,
791
  "step": 355,
792
- "train_speed(iter/s)": 0.074318
793
  },
794
  {
795
- "acc": 0.65870218,
796
- "epoch": 1.4090019569471623,
797
- "grad_norm": 3.609375,
798
- "learning_rate": 0.00010771459225463617,
799
- "loss": 1.33731461,
800
- "memory(GiB)": 19.67,
801
  "step": 360,
802
- "train_speed(iter/s)": 0.074416
803
  },
804
  {
805
- "acc": 0.68150563,
806
- "epoch": 1.4285714285714286,
807
- "grad_norm": 0.9296875,
808
- "learning_rate": 0.00010675372598098113,
809
- "loss": 1.20515957,
810
- "memory(GiB)": 19.99,
811
  "step": 365,
812
- "train_speed(iter/s)": 0.07451
813
  },
814
  {
815
- "acc": 0.66793504,
816
- "epoch": 1.4481409001956946,
817
- "grad_norm": 1.03125,
818
- "learning_rate": 0.00010578320177711743,
819
- "loss": 1.31133595,
820
- "memory(GiB)": 19.9,
821
  "step": 370,
822
- "train_speed(iter/s)": 0.074613
823
  },
824
  {
825
- "acc": 0.66840873,
826
- "epoch": 1.467710371819961,
827
- "grad_norm": 0.9453125,
828
- "learning_rate": 0.00010480327467174705,
829
- "loss": 1.27730675,
830
- "memory(GiB)": 19.91,
831
  "step": 375,
832
- "train_speed(iter/s)": 0.074709
833
  },
834
  {
835
- "acc": 0.6621439,
836
- "epoch": 1.487279843444227,
837
- "grad_norm": 0.7890625,
838
- "learning_rate": 0.00010381420216441152,
839
- "loss": 1.29670372,
840
- "memory(GiB)": 19.65,
841
  "step": 380,
842
- "train_speed(iter/s)": 0.074824
843
  },
844
  {
845
- "acc": 0.66805882,
846
- "epoch": 1.5068493150684932,
847
- "grad_norm": 0.8203125,
848
- "learning_rate": 0.00010281624415782804,
849
- "loss": 1.23922901,
850
- "memory(GiB)": 19.77,
851
  "step": 385,
852
- "train_speed(iter/s)": 0.074927
853
  },
854
  {
855
- "acc": 0.66435666,
856
- "epoch": 1.5264187866927594,
857
- "grad_norm": 0.82421875,
858
- "learning_rate": 0.0001018096628895935,
859
- "loss": 1.27945633,
860
- "memory(GiB)": 19.79,
861
  "step": 390,
862
- "train_speed(iter/s)": 0.075033
863
  },
864
  {
865
- "acc": 0.68444743,
866
- "epoch": 1.5459882583170255,
867
- "grad_norm": 0.98046875,
868
- "learning_rate": 0.00010079472286327533,
869
- "loss": 1.2325819,
870
- "memory(GiB)": 19.55,
871
  "step": 395,
872
- "train_speed(iter/s)": 0.075133
873
  },
874
  {
875
- "acc": 0.68633671,
876
- "epoch": 1.5655577299412915,
877
- "grad_norm": 1.171875,
878
- "learning_rate": 9.977169077890672e-05,
879
- "loss": 1.26248102,
880
- "memory(GiB)": 19.79,
881
  "step": 400,
882
- "train_speed(iter/s)": 0.075233
883
  },
884
  {
885
- "epoch": 1.5655577299412915,
886
- "eval_acc": 0.6297721916732129,
887
- "eval_loss": 1.5114485025405884,
888
- "eval_runtime": 70.7985,
889
- "eval_samples_per_second": 1.073,
890
- "eval_steps_per_second": 0.537,
891
  "step": 400
892
  },
893
  {
894
- "acc": 0.67859097,
895
- "epoch": 1.5851272015655578,
896
- "grad_norm": 1.046875,
897
- "learning_rate": 9.874083546290482e-05,
898
- "loss": 1.2065486,
899
- "memory(GiB)": 22.72,
900
  "step": 405,
901
- "train_speed(iter/s)": 0.074347
902
  },
903
  {
904
- "acc": 0.66178751,
905
- "epoch": 1.604696673189824,
906
- "grad_norm": 0.96484375,
907
- "learning_rate": 9.770242779743008e-05,
908
- "loss": 1.30969448,
909
- "memory(GiB)": 20.13,
910
  "step": 410,
911
- "train_speed(iter/s)": 0.074453
912
  },
913
  {
914
- "acc": 0.65872512,
915
- "epoch": 1.62426614481409,
916
- "grad_norm": 0.74609375,
917
- "learning_rate": 9.665674064920533e-05,
918
- "loss": 1.27483397,
919
- "memory(GiB)": 20.17,
920
  "step": 415,
921
- "train_speed(iter/s)": 0.074534
922
  },
923
  {
924
- "acc": 0.66567349,
925
- "epoch": 1.643835616438356,
926
- "grad_norm": 0.87109375,
927
- "learning_rate": 9.560404879781353e-05,
928
- "loss": 1.31585007,
929
- "memory(GiB)": 20.07,
930
  "step": 420,
931
- "train_speed(iter/s)": 0.074639
932
  },
933
  {
934
- "acc": 0.66216898,
935
- "epoch": 1.6634050880626223,
936
- "grad_norm": 0.85546875,
937
- "learning_rate": 9.454462886349281e-05,
938
- "loss": 1.32738457,
939
- "memory(GiB)": 19.43,
940
  "step": 425,
941
- "train_speed(iter/s)": 0.074732
942
  },
943
  {
944
- "acc": 0.6608973,
945
- "epoch": 1.6829745596868886,
946
- "grad_norm": 1.1328125,
947
- "learning_rate": 9.347875923444772e-05,
948
- "loss": 1.2792593,
949
- "memory(GiB)": 20.05,
950
  "step": 430,
951
- "train_speed(iter/s)": 0.074827
952
  },
953
  {
954
- "acc": 0.65830297,
955
- "epoch": 1.7025440313111546,
956
- "grad_norm": 0.94921875,
957
- "learning_rate": 9.240671999369607e-05,
958
- "loss": 1.34132614,
959
- "memory(GiB)": 19.82,
960
  "step": 435,
961
- "train_speed(iter/s)": 0.074914
962
  },
963
  {
964
- "acc": 0.68926673,
965
- "epoch": 1.7221135029354206,
966
- "grad_norm": 0.76953125,
967
- "learning_rate": 9.132879284547038e-05,
968
- "loss": 1.15266266,
969
- "memory(GiB)": 19.28,
970
  "step": 440,
971
- "train_speed(iter/s)": 0.074997
972
  },
973
  {
974
- "acc": 0.65699558,
975
- "epoch": 1.741682974559687,
976
- "grad_norm": 0.96484375,
977
- "learning_rate": 9.024526104119312e-05,
978
- "loss": 1.32417459,
979
- "memory(GiB)": 19.29,
980
  "step": 445,
981
- "train_speed(iter/s)": 0.075079
982
  },
983
  {
984
- "acc": 0.68860197,
985
- "epoch": 1.7612524461839532,
986
- "grad_norm": 0.8203125,
987
- "learning_rate": 8.91564093050458e-05,
988
- "loss": 1.20134068,
989
- "memory(GiB)": 19.33,
990
  "step": 450,
991
- "train_speed(iter/s)": 0.07515
992
  },
993
  {
994
- "epoch": 1.7612524461839532,
995
- "eval_acc": 0.6351924587588373,
996
- "eval_loss": 1.4908838272094727,
997
- "eval_runtime": 71.5161,
998
- "eval_samples_per_second": 1.063,
999
- "eval_steps_per_second": 0.531,
1000
  "step": 450
1001
  },
1002
  {
1003
- "acc": 0.65404687,
1004
- "epoch": 1.7808219178082192,
1005
- "grad_norm": 1.0078125,
1006
- "learning_rate": 8.806252375915052e-05,
1007
- "loss": 1.31502724,
1008
- "memory(GiB)": 19.13,
1009
  "step": 455,
1010
- "train_speed(iter/s)": 0.074358
1011
  },
1012
  {
1013
- "acc": 0.69379678,
1014
- "epoch": 1.8003913894324852,
1015
- "grad_norm": 1.1015625,
1016
- "learning_rate": 8.696389184838471e-05,
1017
- "loss": 1.1870966,
1018
- "memory(GiB)": 20.18,
1019
  "step": 460,
1020
- "train_speed(iter/s)": 0.074437
1021
  },
1022
  {
1023
- "acc": 0.67447538,
1024
- "epoch": 1.8199608610567515,
1025
- "grad_norm": 1.2890625,
1026
- "learning_rate": 8.586080226484789e-05,
1027
- "loss": 1.19511604,
1028
- "memory(GiB)": 20.09,
1029
  "step": 465,
1030
- "train_speed(iter/s)": 0.074531
1031
  },
1032
  {
1033
- "acc": 0.67230067,
1034
- "epoch": 1.8395303326810177,
1035
- "grad_norm": 1.0390625,
1036
- "learning_rate": 8.475354487200092e-05,
1037
- "loss": 1.30591021,
1038
- "memory(GiB)": 19.29,
1039
  "step": 470,
1040
- "train_speed(iter/s)": 0.074608
1041
  },
1042
  {
1043
- "acc": 0.65006552,
1044
- "epoch": 1.8590998043052838,
1045
- "grad_norm": 3.21875,
1046
- "learning_rate": 8.364241062849732e-05,
1047
- "loss": 1.35613279,
1048
- "memory(GiB)": 19.51,
1049
  "step": 475,
1050
- "train_speed(iter/s)": 0.07469
1051
  },
1052
  {
1053
- "acc": 0.66248426,
1054
- "epoch": 1.8786692759295498,
1055
- "grad_norm": 1.0703125,
1056
- "learning_rate": 8.252769151172682e-05,
1057
- "loss": 1.34706697,
1058
- "memory(GiB)": 19.16,
1059
  "step": 480,
1060
- "train_speed(iter/s)": 0.074779
1061
  },
1062
  {
1063
- "acc": 0.66462736,
1064
- "epoch": 1.898238747553816,
1065
- "grad_norm": 0.8515625,
1066
- "learning_rate": 8.140968044109134e-05,
1067
- "loss": 1.31343336,
1068
- "memory(GiB)": 19.17,
1069
  "step": 485,
1070
- "train_speed(iter/s)": 0.07486
1071
  },
1072
  {
1073
- "acc": 0.65373287,
1074
- "epoch": 1.9178082191780823,
1075
- "grad_norm": 1.078125,
1076
- "learning_rate": 8.028867120103326e-05,
1077
- "loss": 1.31145601,
1078
- "memory(GiB)": 19.46,
1079
  "step": 490,
1080
- "train_speed(iter/s)": 0.074941
1081
  },
1082
  {
1083
- "acc": 0.6731041,
1084
- "epoch": 1.9373776908023483,
1085
- "grad_norm": 0.89453125,
1086
- "learning_rate": 7.916495836383648e-05,
1087
- "loss": 1.24272699,
1088
- "memory(GiB)": 19.45,
1089
  "step": 495,
1090
- "train_speed(iter/s)": 0.075011
1091
  },
1092
  {
1093
- "acc": 0.66485052,
1094
- "epoch": 1.9569471624266144,
1095
- "grad_norm": 1.03125,
1096
- "learning_rate": 7.80388372122204e-05,
1097
- "loss": 1.28164721,
1098
- "memory(GiB)": 19.24,
1099
  "step": 500,
1100
- "train_speed(iter/s)": 0.07509
1101
  },
1102
  {
1103
- "epoch": 1.9569471624266144,
1104
- "eval_acc": 0.6349567949725059,
1105
- "eval_loss": 1.483258843421936,
1106
- "eval_runtime": 72.4797,
1107
- "eval_samples_per_second": 1.049,
1108
- "eval_steps_per_second": 0.524,
1109
  "step": 500
1110
  },
1111
  {
1112
- "acc": 0.68325486,
1113
- "epoch": 1.9765166340508806,
1114
- "grad_norm": 1.2890625,
1115
- "learning_rate": 7.691060366174728e-05,
1116
- "loss": 1.2257865,
1117
- "memory(GiB)": 22.98,
1118
  "step": 505,
1119
- "train_speed(iter/s)": 0.074371
1120
  },
1121
  {
1122
- "acc": 0.68977013,
1123
- "epoch": 1.9960861056751469,
1124
- "grad_norm": 1.0234375,
1125
- "learning_rate": 7.578055418306327e-05,
1126
- "loss": 1.25723343,
1127
- "memory(GiB)": 19.56,
1128
  "step": 510,
1129
- "train_speed(iter/s)": 0.074471
1130
  },
1131
  {
1132
- "acc": 0.72185702,
1133
- "epoch": 2.015655577299413,
1134
- "grad_norm": 0.7890625,
1135
- "learning_rate": 7.464898572399353e-05,
1136
- "loss": 1.01715631,
1137
- "memory(GiB)": 20.07,
1138
  "step": 515,
1139
- "train_speed(iter/s)": 0.074591
1140
  },
1141
  {
1142
- "acc": 0.71889682,
1143
- "epoch": 2.035225048923679,
1144
- "grad_norm": 1.0625,
1145
- "learning_rate": 7.351619563151208e-05,
1146
- "loss": 1.03077154,
1147
- "memory(GiB)": 19.92,
1148
  "step": 520,
1149
- "train_speed(iter/s)": 0.074683
1150
  },
1151
  {
1152
- "acc": 0.7505311,
1153
- "epoch": 2.0547945205479454,
1154
- "grad_norm": 1.9609375,
1155
- "learning_rate": 7.238248157360663e-05,
1156
- "loss": 0.93218956,
1157
- "memory(GiB)": 19.85,
1158
  "step": 525,
1159
- "train_speed(iter/s)": 0.07477
1160
  },
1161
  {
1162
- "acc": 0.7315311,
1163
- "epoch": 2.0743639921722115,
1164
- "grad_norm": 1.1875,
1165
- "learning_rate": 7.124814146105921e-05,
1166
- "loss": 0.96330833,
1167
- "memory(GiB)": 19.87,
1168
  "step": 530,
1169
- "train_speed(iter/s)": 0.074853
1170
  },
1171
  {
1172
- "acc": 0.75555606,
1173
- "epoch": 2.0939334637964775,
1174
- "grad_norm": 1.3515625,
1175
- "learning_rate": 7.011347336916277e-05,
1176
- "loss": 0.86877937,
1177
- "memory(GiB)": 18.46,
1178
  "step": 535,
1179
- "train_speed(iter/s)": 0.074938
1180
  },
1181
  {
1182
- "acc": 0.74034052,
1183
- "epoch": 2.1135029354207435,
1184
- "grad_norm": 1.546875,
1185
- "learning_rate": 6.897877545939475e-05,
1186
- "loss": 0.90922012,
1187
- "memory(GiB)": 19.89,
1188
  "step": 540,
1189
- "train_speed(iter/s)": 0.075027
1190
  },
1191
  {
1192
- "acc": 0.72400937,
1193
- "epoch": 2.1330724070450096,
1194
- "grad_norm": 1.90625,
1195
- "learning_rate": 6.784434590106808e-05,
1196
- "loss": 0.98424711,
1197
- "memory(GiB)": 19.11,
1198
  "step": 545,
1199
- "train_speed(iter/s)": 0.075114
1200
  },
1201
  {
1202
- "acc": 0.77706275,
1203
- "epoch": 2.152641878669276,
1204
- "grad_norm": 1.359375,
1205
- "learning_rate": 6.671048279297972e-05,
1206
- "loss": 0.80820856,
1207
- "memory(GiB)": 19.86,
1208
  "step": 550,
1209
- "train_speed(iter/s)": 0.075193
1210
  },
1211
  {
1212
- "epoch": 2.152641878669276,
1213
- "eval_acc": 0.6260015710919089,
1214
- "eval_loss": 1.6081812381744385,
1215
- "eval_runtime": 68.6973,
1216
- "eval_samples_per_second": 1.106,
1217
- "eval_steps_per_second": 0.553,
1218
  "step": 550
1219
  },
1220
  {
1221
- "acc": 0.75351696,
1222
- "epoch": 2.172211350293542,
1223
- "grad_norm": 2.015625,
1224
- "learning_rate": 6.55774840850782e-05,
1225
- "loss": 0.86192131,
1226
- "memory(GiB)": 22.21,
1227
  "step": 555,
1228
- "train_speed(iter/s)": 0.074578
1229
  },
1230
  {
1231
- "acc": 0.74249997,
1232
- "epoch": 2.191780821917808,
1233
- "grad_norm": 1.4609375,
1234
- "learning_rate": 6.444564750017003e-05,
1235
- "loss": 0.91982813,
1236
- "memory(GiB)": 19.87,
1237
  "step": 560,
1238
- "train_speed(iter/s)": 0.074665
1239
  },
1240
  {
1241
- "acc": 0.73636398,
1242
- "epoch": 2.2113502935420746,
1243
- "grad_norm": 1.9375,
1244
- "learning_rate": 6.331527045568573e-05,
1245
- "loss": 0.93448582,
1246
- "memory(GiB)": 19.33,
1247
  "step": 565,
1248
- "train_speed(iter/s)": 0.074752
1249
  },
1250
  {
1251
- "acc": 0.74081583,
1252
- "epoch": 2.2309197651663406,
1253
- "grad_norm": 2.21875,
1254
- "learning_rate": 6.218664998552634e-05,
1255
- "loss": 0.94956303,
1256
- "memory(GiB)": 19.8,
1257
  "step": 570,
1258
- "train_speed(iter/s)": 0.074842
1259
  },
1260
  {
1261
- "acc": 0.74573116,
1262
- "epoch": 2.2504892367906066,
1263
- "grad_norm": 2.546875,
1264
- "learning_rate": 6.106008266201046e-05,
1265
- "loss": 0.88486786,
1266
- "memory(GiB)": 19.92,
1267
  "step": 575,
1268
- "train_speed(iter/s)": 0.074925
1269
  },
1270
  {
1271
- "acc": 0.75495067,
1272
- "epoch": 2.2700587084148727,
1273
- "grad_norm": 2.09375,
1274
- "learning_rate": 5.9935864517942844e-05,
1275
- "loss": 0.84776802,
1276
- "memory(GiB)": 19.89,
1277
  "step": 580,
1278
- "train_speed(iter/s)": 0.075
1279
  },
1280
  {
1281
- "acc": 0.74743519,
1282
- "epoch": 2.2896281800391387,
1283
- "grad_norm": 1.5859375,
1284
- "learning_rate": 5.881429096882449e-05,
1285
- "loss": 0.92330503,
1286
- "memory(GiB)": 19.03,
1287
  "step": 585,
1288
- "train_speed(iter/s)": 0.075076
1289
  },
1290
  {
1291
- "acc": 0.74913769,
1292
- "epoch": 2.309197651663405,
1293
- "grad_norm": 1.6640625,
1294
- "learning_rate": 5.769565673522515e-05,
1295
- "loss": 0.92942295,
1296
- "memory(GiB)": 20.04,
1297
  "step": 590,
1298
- "train_speed(iter/s)": 0.075149
1299
  },
1300
  {
1301
- "acc": 0.74875064,
1302
- "epoch": 2.328767123287671,
1303
- "grad_norm": 1.25,
1304
- "learning_rate": 5.658025576533832e-05,
1305
- "loss": 0.90142069,
1306
- "memory(GiB)": 19.96,
1307
  "step": 595,
1308
- "train_speed(iter/s)": 0.075215
1309
  },
1310
  {
1311
- "acc": 0.74648356,
1312
- "epoch": 2.3483365949119372,
1313
- "grad_norm": 1.65625,
1314
- "learning_rate": 5.546838115773929e-05,
1315
- "loss": 0.91528139,
1316
- "memory(GiB)": 19.84,
1317
  "step": 600,
1318
- "train_speed(iter/s)": 0.075292
1319
  },
1320
  {
1321
- "epoch": 2.3483365949119372,
1322
- "eval_acc": 0.6284367635506677,
1323
- "eval_loss": 1.593437910079956,
1324
- "eval_runtime": 68.9856,
1325
- "eval_samples_per_second": 1.102,
1326
- "eval_steps_per_second": 0.551,
1327
  "step": 600
1328
  },
1329
  {
1330
- "acc": 0.75246172,
1331
- "epoch": 2.3679060665362037,
1332
- "grad_norm": 1.2109375,
1333
- "learning_rate": 5.4360325084366416e-05,
1334
- "loss": 0.87402363,
1335
- "memory(GiB)": 22.69,
1336
  "step": 605,
1337
- "train_speed(iter/s)": 0.074706
1338
  },
1339
  {
1340
- "acc": 0.74078665,
1341
- "epoch": 2.3874755381604698,
1342
- "grad_norm": 1.0390625,
1343
- "learning_rate": 5.3256378713745815e-05,
1344
- "loss": 0.91142588,
1345
- "memory(GiB)": 20.15,
1346
  "step": 610,
1347
- "train_speed(iter/s)": 0.074788
1348
  },
1349
  {
1350
- "acc": 0.75772052,
1351
- "epoch": 2.407045009784736,
1352
- "grad_norm": 2.03125,
1353
- "learning_rate": 5.21568321344799e-05,
1354
- "loss": 0.85517597,
1355
- "memory(GiB)": 19.37,
1356
  "step": 615,
1357
- "train_speed(iter/s)": 0.074857
1358
  },
1359
  {
1360
- "acc": 0.75341692,
1361
- "epoch": 2.426614481409002,
1362
- "grad_norm": 1.40625,
1363
- "learning_rate": 5.10619742790194e-05,
1364
- "loss": 0.87981377,
1365
- "memory(GiB)": 18.91,
1366
  "step": 620,
1367
- "train_speed(iter/s)": 0.074925
1368
  },
1369
  {
1370
- "acc": 0.76221485,
1371
- "epoch": 2.446183953033268,
1372
- "grad_norm": 5.5625,
1373
- "learning_rate": 4.9972092847739603e-05,
1374
- "loss": 0.89623175,
1375
- "memory(GiB)": 20.27,
1376
  "step": 625,
1377
- "train_speed(iter/s)": 0.074994
1378
  },
1379
  {
1380
- "acc": 0.74322577,
1381
- "epoch": 2.4657534246575343,
1382
- "grad_norm": 1.6796875,
1383
- "learning_rate": 4.8887474233339963e-05,
1384
- "loss": 0.89493027,
1385
- "memory(GiB)": 19.38,
1386
  "step": 630,
1387
- "train_speed(iter/s)": 0.075068
1388
  },
1389
  {
1390
- "acc": 0.74455509,
1391
- "epoch": 2.4853228962818004,
1392
- "grad_norm": 1.3046875,
1393
- "learning_rate": 4.780840344558753e-05,
1394
- "loss": 0.92399101,
1395
- "memory(GiB)": 19.32,
1396
  "step": 635,
1397
- "train_speed(iter/s)": 0.075143
1398
  },
1399
  {
1400
- "acc": 0.75597148,
1401
- "epoch": 2.5048923679060664,
1402
- "grad_norm": 1.65625,
1403
- "learning_rate": 4.673516403642383e-05,
1404
- "loss": 0.86396818,
1405
- "memory(GiB)": 19.52,
1406
  "step": 640,
1407
- "train_speed(iter/s)": 0.075214
1408
  },
1409
  {
1410
- "acc": 0.75100412,
1411
- "epoch": 2.524461839530333,
1412
- "grad_norm": 1.5390625,
1413
- "learning_rate": 4.5668038025454554e-05,
1414
- "loss": 0.89630232,
1415
- "memory(GiB)": 19.54,
1416
  "step": 645,
1417
- "train_speed(iter/s)": 0.07528
1418
  },
1419
  {
1420
- "acc": 0.74814,
1421
- "epoch": 2.544031311154599,
1422
- "grad_norm": 1.7265625,
1423
- "learning_rate": 4.460730582584228e-05,
1424
- "loss": 0.90660105,
1425
- "memory(GiB)": 19.46,
1426
  "step": 650,
1427
- "train_speed(iter/s)": 0.075343
1428
  },
1429
  {
1430
- "epoch": 2.544031311154599,
1431
- "eval_acc": 0.6304006284367636,
1432
- "eval_loss": 1.6207610368728638,
1433
- "eval_runtime": 68.9365,
1434
- "eval_samples_per_second": 1.102,
1435
- "eval_steps_per_second": 0.551,
1436
  "step": 650
1437
  },
1438
  {
1439
- "acc": 0.74153934,
1440
- "epoch": 2.563600782778865,
1441
- "grad_norm": 2.328125,
1442
- "learning_rate": 4.3553246170621e-05,
1443
- "loss": 0.90404129,
1444
- "memory(GiB)": 19.38,
1445
  "step": 655,
1446
- "train_speed(iter/s)": 0.074813
1447
  },
1448
  {
1449
- "acc": 0.76082869,
1450
- "epoch": 2.583170254403131,
1451
- "grad_norm": 1.5390625,
1452
- "learning_rate": 4.2506136039452357e-05,
1453
- "loss": 0.90251627,
1454
- "memory(GiB)": 20.24,
1455
  "step": 660,
1456
- "train_speed(iter/s)": 0.074877
1457
  },
1458
  {
1459
- "acc": 0.76424356,
1460
- "epoch": 2.602739726027397,
1461
- "grad_norm": 1.109375,
1462
- "learning_rate": 4.146625058584251e-05,
1463
- "loss": 0.85076065,
1464
- "memory(GiB)": 19.4,
1465
  "step": 665,
1466
- "train_speed(iter/s)": 0.07494
1467
  },
1468
  {
1469
- "acc": 0.75788155,
1470
- "epoch": 2.6223091976516635,
1471
- "grad_norm": 1.828125,
1472
- "learning_rate": 4.043386306483886e-05,
1473
- "loss": 0.8638917,
1474
- "memory(GiB)": 18.71,
1475
  "step": 670,
1476
- "train_speed(iter/s)": 0.075
1477
  },
1478
  {
1479
- "acc": 0.74567804,
1480
- "epoch": 2.6418786692759295,
1481
- "grad_norm": 1.5078125,
1482
- "learning_rate": 3.940924476122573e-05,
1483
- "loss": 0.91406345,
1484
- "memory(GiB)": 19.53,
1485
  "step": 675,
1486
- "train_speed(iter/s)": 0.075062
1487
  },
1488
  {
1489
- "acc": 0.77229648,
1490
- "epoch": 2.6614481409001955,
1491
- "grad_norm": 1.3984375,
1492
- "learning_rate": 3.839266491823776e-05,
1493
- "loss": 0.79556112,
1494
- "memory(GiB)": 19.59,
1495
  "step": 680,
1496
- "train_speed(iter/s)": 0.075125
1497
  },
1498
  {
1499
- "acc": 0.7331708,
1500
- "epoch": 2.681017612524462,
1501
- "grad_norm": 1.6015625,
1502
- "learning_rate": 3.73843906668096e-05,
1503
- "loss": 0.95133247,
1504
- "memory(GiB)": 19.69,
1505
  "step": 685,
1506
- "train_speed(iter/s)": 0.075185
1507
  },
1508
  {
1509
- "acc": 0.76955137,
1510
- "epoch": 2.700587084148728,
1511
- "grad_norm": 1.4140625,
1512
- "learning_rate": 3.6384686955380996e-05,
1513
- "loss": 0.82770052,
1514
- "memory(GiB)": 19.53,
1515
  "step": 690,
1516
- "train_speed(iter/s)": 0.075245
1517
  },
1518
  {
1519
- "acc": 0.73245583,
1520
- "epoch": 2.720156555772994,
1521
- "grad_norm": 1.59375,
1522
- "learning_rate": 3.539381648027495e-05,
1523
- "loss": 0.93347349,
1524
- "memory(GiB)": 19.38,
1525
  "step": 695,
1526
- "train_speed(iter/s)": 0.075313
1527
  },
1528
  {
1529
- "acc": 0.7664053,
1530
- "epoch": 2.73972602739726,
1531
- "grad_norm": 1.4296875,
1532
- "learning_rate": 3.441203961666818e-05,
1533
- "loss": 0.84118309,
1534
- "memory(GiB)": 19.55,
1535
  "step": 700,
1536
- "train_speed(iter/s)": 0.075373
1537
  },
1538
  {
1539
- "epoch": 2.73972602739726,
1540
- "eval_acc": 0.628750981932443,
1541
- "eval_loss": 1.5982366800308228,
1542
- "eval_runtime": 69.1268,
1543
- "eval_samples_per_second": 1.099,
1544
- "eval_steps_per_second": 0.55,
1545
  "step": 700
1546
  },
1547
  {
1548
- "acc": 0.74386759,
1549
- "epoch": 2.759295499021526,
1550
- "grad_norm": 2.21875,
1551
- "learning_rate": 3.343961435017094e-05,
1552
- "loss": 0.92712116,
1553
- "memory(GiB)": 23.1,
1554
  "step": 705,
1555
- "train_speed(iter/s)": 0.074881
1556
  },
1557
  {
1558
- "acc": 0.75352135,
1559
- "epoch": 2.7788649706457926,
1560
- "grad_norm": 1.5625,
1561
- "learning_rate": 3.247679620903533e-05,
1562
- "loss": 0.90610752,
1563
- "memory(GiB)": 19.56,
1564
  "step": 710,
1565
- "train_speed(iter/s)": 0.074934
1566
  },
1567
  {
1568
- "acc": 0.75765467,
1569
- "epoch": 2.7984344422700587,
1570
- "grad_norm": 4.4375,
1571
- "learning_rate": 3.1523838197008956e-05,
1572
- "loss": 0.88628139,
1573
- "memory(GiB)": 19.44,
1574
  "step": 715,
1575
- "train_speed(iter/s)": 0.074999
1576
  },
1577
  {
1578
- "acc": 0.763375,
1579
- "epoch": 2.8180039138943247,
1580
- "grad_norm": 1.1640625,
1581
- "learning_rate": 3.058099072685204e-05,
1582
- "loss": 0.86159172,
1583
- "memory(GiB)": 19.5,
1584
  "step": 720,
1585
- "train_speed(iter/s)": 0.075059
1586
  },
1587
  {
1588
- "acc": 0.75694184,
1589
- "epoch": 2.837573385518591,
1590
- "grad_norm": 1.6171875,
1591
- "learning_rate": 2.964850155453543e-05,
1592
- "loss": 0.85433092,
1593
- "memory(GiB)": 19.38,
1594
  "step": 725,
1595
- "train_speed(iter/s)": 0.075121
1596
  },
1597
  {
1598
- "acc": 0.76086893,
1599
- "epoch": 2.857142857142857,
1600
- "grad_norm": 1.5859375,
1601
- "learning_rate": 2.8726615714136827e-05,
1602
- "loss": 0.8608798,
1603
- "memory(GiB)": 19.58,
1604
  "step": 730,
1605
- "train_speed(iter/s)": 0.075181
1606
  },
1607
  {
1608
- "acc": 0.74008894,
1609
- "epoch": 2.8767123287671232,
1610
- "grad_norm": 1.4375,
1611
- "learning_rate": 2.7815575453452058e-05,
1612
- "loss": 0.98413734,
1613
- "memory(GiB)": 19.59,
1614
  "step": 735,
1615
- "train_speed(iter/s)": 0.075242
1616
  },
1617
  {
1618
- "acc": 0.75941825,
1619
- "epoch": 2.8962818003913893,
1620
- "grad_norm": 1.7734375,
1621
- "learning_rate": 2.6915620170338612e-05,
1622
- "loss": 0.85438929,
1623
- "memory(GiB)": 19.39,
1624
  "step": 740,
1625
- "train_speed(iter/s)": 0.075307
1626
  },
1627
  {
1628
- "acc": 0.77891464,
1629
- "epoch": 2.9158512720156553,
1630
- "grad_norm": 1.7265625,
1631
- "learning_rate": 2.6026986349808058e-05,
1632
- "loss": 0.79716868,
1633
- "memory(GiB)": 19.61,
1634
  "step": 745,
1635
- "train_speed(iter/s)": 0.075361
1636
  },
1637
  {
1638
- "acc": 0.75023217,
1639
- "epoch": 2.935420743639922,
1640
- "grad_norm": 1.28125,
1641
- "learning_rate": 2.514990750188399e-05,
1642
- "loss": 0.85774508,
1643
- "memory(GiB)": 18.86,
1644
  "step": 750,
1645
- "train_speed(iter/s)": 0.075417
1646
  },
1647
  {
1648
- "epoch": 2.935420743639922,
1649
- "eval_acc": 0.6324430479183032,
1650
- "eval_loss": 1.5986852645874023,
1651
- "eval_runtime": 69.3348,
1652
- "eval_samples_per_second": 1.096,
1653
- "eval_steps_per_second": 0.548,
1654
  "step": 750
1655
  },
1656
  {
1657
- "acc": 0.74531512,
1658
- "epoch": 2.954990215264188,
1659
- "grad_norm": 1.5625,
1660
- "learning_rate": 2.4284614100241538e-05,
1661
- "loss": 0.93483381,
1662
- "memory(GiB)": 23.14,
1663
  "step": 755,
1664
- "train_speed(iter/s)": 0.074953
1665
  },
1666
  {
1667
- "acc": 0.76761031,
1668
- "epoch": 2.974559686888454,
1669
- "grad_norm": 1.6171875,
1670
- "learning_rate": 2.343133352164477e-05,
1671
- "loss": 0.84630623,
1672
- "memory(GiB)": 19.36,
1673
  "step": 760,
1674
- "train_speed(iter/s)": 0.075015
1675
  },
1676
  {
1677
- "acc": 0.75018072,
1678
- "epoch": 2.9941291585127203,
1679
- "grad_norm": 1.5703125,
1680
- "learning_rate": 2.2590289986198136e-05,
1681
- "loss": 0.89352074,
1682
- "memory(GiB)": 19.6,
1683
  "step": 765,
1684
- "train_speed(iter/s)": 0.075072
1685
  },
1686
  {
1687
- "acc": 0.80383377,
1688
- "epoch": 3.0136986301369864,
1689
- "grad_norm": 1.453125,
1690
- "learning_rate": 2.1761704498427003e-05,
1691
- "loss": 0.68276234,
1692
- "memory(GiB)": 19.62,
1693
  "step": 770,
1694
- "train_speed(iter/s)": 0.075153
1695
  },
1696
  {
1697
- "acc": 0.82252359,
1698
- "epoch": 3.0332681017612524,
1699
- "grad_norm": 1.328125,
1700
- "learning_rate": 2.094579478920358e-05,
1701
- "loss": 0.64008789,
1702
- "memory(GiB)": 19.76,
1703
  "step": 775,
1704
- "train_speed(iter/s)": 0.075213
1705
  },
1706
  {
1707
- "acc": 0.83448801,
1708
- "epoch": 3.0528375733855184,
1709
- "grad_norm": 1.8828125,
1710
- "learning_rate": 2.0142775258532654e-05,
1711
- "loss": 0.61610913,
1712
- "memory(GiB)": 19.59,
1713
  "step": 780,
1714
- "train_speed(iter/s)": 0.075271
1715
  },
1716
  {
1717
- "acc": 0.83116817,
1718
- "epoch": 3.072407045009785,
1719
- "grad_norm": 1.5546875,
1720
- "learning_rate": 1.9352856919212994e-05,
1721
- "loss": 0.58688097,
1722
- "memory(GiB)": 19.53,
1723
  "step": 785,
1724
- "train_speed(iter/s)": 0.075323
1725
  },
1726
  {
1727
- "acc": 0.82525949,
1728
- "epoch": 3.091976516634051,
1729
- "grad_norm": 1.4375,
1730
- "learning_rate": 1.8576247341388544e-05,
1731
- "loss": 0.62312498,
1732
- "memory(GiB)": 19.85,
1733
  "step": 790,
1734
- "train_speed(iter/s)": 0.07537
1735
  },
1736
  {
1737
- "acc": 0.81645441,
1738
- "epoch": 3.111545988258317,
1739
- "grad_norm": 1.65625,
1740
- "learning_rate": 1.7813150598004313e-05,
1741
- "loss": 0.62203112,
1742
- "memory(GiB)": 19.79,
1743
  "step": 795,
1744
- "train_speed(iter/s)": 0.075423
1745
  },
1746
  {
1747
- "acc": 0.83432789,
1748
- "epoch": 3.131115459882583,
1749
- "grad_norm": 1.5859375,
1750
- "learning_rate": 1.7063767211181333e-05,
1751
- "loss": 0.60077624,
1752
- "memory(GiB)": 19.52,
1753
  "step": 800,
1754
- "train_speed(iter/s)": 0.07548
1755
  },
1756
  {
1757
- "epoch": 3.131115459882583,
1758
- "eval_acc": 0.6209740769835035,
1759
- "eval_loss": 1.7955598831176758,
1760
- "eval_runtime": 69.0109,
1761
- "eval_samples_per_second": 1.101,
1762
- "eval_steps_per_second": 0.551,
1763
  "step": 800
1764
- },
1765
- {
1766
- "acc": 0.82124023,
1767
- "epoch": 3.1506849315068495,
1768
- "grad_norm": 1.7578125,
1769
- "learning_rate": 1.6328294099524644e-05,
1770
- "loss": 0.60847788,
1771
- "memory(GiB)": 22.65,
1772
- "step": 805,
1773
- "train_speed(iter/s)": 0.075043
1774
- },
1775
- {
1776
- "acc": 0.83265171,
1777
- "epoch": 3.1702544031311155,
1778
- "grad_norm": 4.09375,
1779
- "learning_rate": 1.5606924526378136e-05,
1780
- "loss": 0.57863126,
1781
- "memory(GiB)": 18.89,
1782
- "step": 810,
1783
- "train_speed(iter/s)": 0.07509
1784
- },
1785
- {
1786
- "acc": 0.8407362,
1787
- "epoch": 3.1898238747553815,
1788
- "grad_norm": 1.1796875,
1789
- "learning_rate": 1.4899848049039881e-05,
1790
- "loss": 0.53706379,
1791
- "memory(GiB)": 19.37,
1792
- "step": 815,
1793
- "train_speed(iter/s)": 0.075142
1794
- },
1795
- {
1796
- "acc": 0.82116756,
1797
- "epoch": 3.2093933463796476,
1798
- "grad_norm": 1.859375,
1799
- "learning_rate": 1.4207250468951426e-05,
1800
- "loss": 0.64039102,
1801
- "memory(GiB)": 19.52,
1802
- "step": 820,
1803
- "train_speed(iter/s)": 0.075197
1804
- },
1805
- {
1806
- "acc": 0.85004549,
1807
- "epoch": 3.228962818003914,
1808
- "grad_norm": 1.0390625,
1809
- "learning_rate": 1.3529313782874023e-05,
1810
- "loss": 0.53315983,
1811
- "memory(GiB)": 19.52,
1812
- "step": 825,
1813
- "train_speed(iter/s)": 0.07525
1814
- },
1815
- {
1816
- "acc": 0.83273296,
1817
- "epoch": 3.24853228962818,
1818
- "grad_norm": 1.578125,
1819
- "learning_rate": 1.2866216135064487e-05,
1820
- "loss": 0.58545351,
1821
- "memory(GiB)": 19.36,
1822
- "step": 830,
1823
- "train_speed(iter/s)": 0.075303
1824
- },
1825
- {
1826
- "acc": 0.80788403,
1827
- "epoch": 3.268101761252446,
1828
- "grad_norm": 2.296875,
1829
- "learning_rate": 1.2218131770463487e-05,
1830
- "loss": 0.67468171,
1831
- "memory(GiB)": 19.28,
1832
- "step": 835,
1833
- "train_speed(iter/s)": 0.075356
1834
- },
1835
- {
1836
- "acc": 0.8440134,
1837
- "epoch": 3.287671232876712,
1838
- "grad_norm": 1.21875,
1839
- "learning_rate": 1.1585230988908576e-05,
1840
- "loss": 0.55293651,
1841
- "memory(GiB)": 19.37,
1842
- "step": 840,
1843
- "train_speed(iter/s)": 0.07541
1844
- },
1845
- {
1846
- "acc": 0.81569691,
1847
- "epoch": 3.3072407045009786,
1848
- "grad_norm": 1.671875,
1849
- "learning_rate": 1.0967680100383645e-05,
1850
- "loss": 0.61190109,
1851
- "memory(GiB)": 18.09,
1852
- "step": 845,
1853
- "train_speed(iter/s)": 0.075466
1854
- },
1855
- {
1856
- "acc": 0.84766483,
1857
- "epoch": 3.3268101761252447,
1858
- "grad_norm": 1.8046875,
1859
- "learning_rate": 1.0365641381317113e-05,
1860
- "loss": 0.52525816,
1861
- "memory(GiB)": 19.31,
1862
- "step": 850,
1863
- "train_speed(iter/s)": 0.075523
1864
- },
1865
- {
1866
- "epoch": 3.3268101761252447,
1867
- "eval_acc": 0.6203456402199529,
1868
- "eval_loss": 1.7881730794906616,
1869
- "eval_runtime": 69.1552,
1870
- "eval_samples_per_second": 1.099,
1871
- "eval_steps_per_second": 0.549,
1872
- "step": 850
1873
- },
1874
- {
1875
- "acc": 0.84491625,
1876
- "epoch": 3.3463796477495107,
1877
- "grad_norm": 1.8046875,
1878
- "learning_rate": 9.779273031939692e-06,
1879
- "loss": 0.56272998,
1880
- "memory(GiB)": 23.04,
1881
- "step": 855,
1882
- "train_speed(iter/s)": 0.07511
1883
- },
1884
- {
1885
- "acc": 0.84104662,
1886
- "epoch": 3.3659491193737767,
1887
- "grad_norm": 1.796875,
1888
- "learning_rate": 9.20872913471363e-06,
1889
- "loss": 0.57019663,
1890
- "memory(GiB)": 19.42,
1891
- "step": 860,
1892
- "train_speed(iter/s)": 0.075157
1893
- },
1894
- {
1895
- "acc": 0.84433002,
1896
- "epoch": 3.385518590998043,
1897
- "grad_norm": 1.6484375,
1898
- "learning_rate": 8.654159613843715e-06,
1899
- "loss": 0.55449514,
1900
- "memory(GiB)": 19.59,
1901
- "step": 865,
1902
- "train_speed(iter/s)": 0.07521
1903
- },
1904
- {
1905
- "acc": 0.80005312,
1906
- "epoch": 3.4050880626223092,
1907
- "grad_norm": 1.46875,
1908
- "learning_rate": 8.115710195881068e-06,
1909
- "loss": 0.73595409,
1910
- "memory(GiB)": 19.36,
1911
- "step": 870,
1912
- "train_speed(iter/s)": 0.075258
1913
- },
1914
- {
1915
- "acc": 0.83217945,
1916
- "epoch": 3.4246575342465753,
1917
- "grad_norm": 3.328125,
1918
- "learning_rate": 7.593522371429972e-06,
1919
- "loss": 0.58270836,
1920
- "memory(GiB)": 19.58,
1921
- "step": 875,
1922
- "train_speed(iter/s)": 0.075306
1923
- },
1924
- {
1925
- "acc": 0.82742786,
1926
- "epoch": 3.4442270058708413,
1927
- "grad_norm": 1.234375,
1928
- "learning_rate": 7.0877333579678585e-06,
1929
- "loss": 0.59052157,
1930
- "memory(GiB)": 19.6,
1931
- "step": 880,
1932
- "train_speed(iter/s)": 0.075358
1933
- },
1934
- {
1935
- "acc": 0.81994705,
1936
- "epoch": 3.4637964774951078,
1937
- "grad_norm": 1.7578125,
1938
- "learning_rate": 6.598476063788036e-06,
1939
- "loss": 0.62256751,
1940
- "memory(GiB)": 19.56,
1941
- "step": 885,
1942
- "train_speed(iter/s)": 0.075405
1943
- },
1944
- {
1945
- "acc": 0.8157341,
1946
- "epoch": 3.483365949119374,
1947
- "grad_norm": 1.8203125,
1948
- "learning_rate": 6.12587905307477e-06,
1949
- "loss": 0.66806622,
1950
- "memory(GiB)": 19.49,
1951
- "step": 890,
1952
- "train_speed(iter/s)": 0.075454
1953
- },
1954
- {
1955
- "acc": 0.82838688,
1956
- "epoch": 3.50293542074364,
1957
- "grad_norm": 1.515625,
1958
- "learning_rate": 5.67006651212008e-06,
1959
- "loss": 0.63044977,
1960
- "memory(GiB)": 19.54,
1961
- "step": 895,
1962
- "train_speed(iter/s)": 0.075497
1963
- },
1964
- {
1965
- "acc": 0.79130597,
1966
- "epoch": 3.5225048923679063,
1967
- "grad_norm": 1.640625,
1968
- "learning_rate": 5.2311582166906605e-06,
1969
- "loss": 0.7558567,
1970
- "memory(GiB)": 19.28,
1971
- "step": 900,
1972
- "train_speed(iter/s)": 0.07555
1973
- },
1974
- {
1975
- "epoch": 3.5225048923679063,
1976
- "eval_acc": 0.6211311861743912,
1977
- "eval_loss": 1.7854998111724854,
1978
- "eval_runtime": 69.2434,
1979
- "eval_samples_per_second": 1.098,
1980
- "eval_steps_per_second": 0.549,
1981
- "step": 900
1982
  }
1983
  ],
1984
  "logging_steps": 5,
1985
- "max_steps": 1020,
1986
  "num_input_tokens_seen": 0,
1987
  "num_train_epochs": 4,
1988
- "save_steps": 300,
1989
  "stateful_callbacks": {
1990
  "TrainerControl": {
1991
  "args": {
@@ -1998,7 +1780,7 @@
1998
  "attributes": {}
1999
  }
2000
  },
2001
- "total_flos": 2.605539502350213e+17,
2002
  "train_batch_size": 2,
2003
  "trial_name": null,
2004
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.47908163,
3
+ "best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v9-20240710-235159\\checkpoint-500",
4
+ "epoch": 2.8828828828828827,
5
  "eval_steps": 50,
6
+ "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "acc": 0.4856407,
13
+ "epoch": 0.0036036036036036037,
14
+ "grad_norm": 0.734375,
15
+ "learning_rate": 2.4107142857142856e-06,
16
+ "loss": 2.42667556,
17
+ "memory(GiB)": 18.11,
18
  "step": 1,
19
+ "train_speed(iter/s)": 0.072451
20
  },
21
  {
22
+ "acc": 0.50815099,
23
+ "epoch": 0.018018018018018018,
24
+ "grad_norm": 0.671875,
25
+ "learning_rate": 1.2053571428571429e-05,
26
+ "loss": 2.28746271,
27
+ "memory(GiB)": 19.3,
28
  "step": 5,
29
+ "train_speed(iter/s)": 0.081978
30
  },
31
  {
32
+ "acc": 0.50680609,
33
+ "epoch": 0.036036036036036036,
34
+ "grad_norm": 0.76953125,
35
+ "learning_rate": 2.4107142857142858e-05,
36
+ "loss": 2.29894772,
37
+ "memory(GiB)": 19.3,
38
  "step": 10,
39
+ "train_speed(iter/s)": 0.084125
40
  },
41
  {
42
+ "acc": 0.51412601,
43
+ "epoch": 0.05405405405405406,
44
+ "grad_norm": 0.76171875,
45
+ "learning_rate": 3.616071428571428e-05,
46
+ "loss": 2.34161263,
47
+ "memory(GiB)": 19.7,
48
  "step": 15,
49
+ "train_speed(iter/s)": 0.08456
50
  },
51
  {
52
+ "acc": 0.52338777,
53
+ "epoch": 0.07207207207207207,
54
+ "grad_norm": 0.6015625,
55
+ "learning_rate": 4.8214285714285716e-05,
56
+ "loss": 2.23036633,
57
+ "memory(GiB)": 19.88,
58
  "step": 20,
59
+ "train_speed(iter/s)": 0.084117
60
  },
61
  {
62
+ "acc": 0.55944238,
63
+ "epoch": 0.09009009009009009,
64
+ "grad_norm": 0.66796875,
65
+ "learning_rate": 6.026785714285715e-05,
66
+ "loss": 2.01084595,
67
+ "memory(GiB)": 19.93,
68
  "step": 25,
69
+ "train_speed(iter/s)": 0.084444
70
  },
71
  {
72
+ "acc": 0.57758675,
73
+ "epoch": 0.10810810810810811,
74
+ "grad_norm": 0.765625,
75
+ "learning_rate": 7.232142857142856e-05,
76
+ "loss": 1.94100876,
77
+ "memory(GiB)": 20.21,
78
  "step": 30,
79
+ "train_speed(iter/s)": 0.085158
80
  },
81
  {
82
+ "acc": 0.5666451,
83
+ "epoch": 0.12612612612612611,
84
+ "grad_norm": 0.796875,
85
+ "learning_rate": 8.4375e-05,
86
+ "loss": 1.96992569,
87
+ "memory(GiB)": 19.42,
88
  "step": 35,
89
+ "train_speed(iter/s)": 0.085562
90
  },
91
  {
92
+ "acc": 0.55766659,
93
+ "epoch": 0.14414414414414414,
94
+ "grad_norm": 0.828125,
95
+ "learning_rate": 9.642857142857143e-05,
96
+ "loss": 2.01305885,
97
+ "memory(GiB)": 19.71,
98
  "step": 40,
99
+ "train_speed(iter/s)": 0.0857
100
  },
101
  {
102
+ "acc": 0.56964116,
103
+ "epoch": 0.16216216216216217,
104
+ "grad_norm": 0.83203125,
105
+ "learning_rate": 0.00010848214285714286,
106
+ "loss": 1.925914,
107
+ "memory(GiB)": 19.68,
108
  "step": 45,
109
+ "train_speed(iter/s)": 0.08577
110
  },
111
  {
112
+ "acc": 0.56270452,
113
+ "epoch": 0.18018018018018017,
114
+ "grad_norm": 0.9375,
115
+ "learning_rate": 0.0001205357142857143,
116
+ "loss": 1.94923038,
117
+ "memory(GiB)": 19.65,
118
  "step": 50,
119
+ "train_speed(iter/s)": 0.085942
120
  },
121
  {
122
+ "epoch": 0.18018018018018017,
123
+ "eval_acc": 0.5890983000739098,
124
+ "eval_loss": 1.795773983001709,
125
+ "eval_runtime": 136.6505,
126
+ "eval_samples_per_second": 1.105,
127
+ "eval_steps_per_second": 0.556,
128
  "step": 50
129
  },
130
  {
131
+ "acc": 0.57772484,
132
+ "epoch": 0.1981981981981982,
133
+ "grad_norm": 0.7265625,
134
+ "learning_rate": 0.00013258928571428571,
135
+ "loss": 1.86195869,
136
+ "memory(GiB)": 23.11,
137
  "step": 55,
138
+ "train_speed(iter/s)": 0.070857
139
  },
140
  {
141
+ "acc": 0.59196444,
142
+ "epoch": 0.21621621621621623,
143
+ "grad_norm": 0.8125,
144
+ "learning_rate": 0.00013499518432841625,
145
+ "loss": 1.74724998,
146
+ "memory(GiB)": 19.42,
147
  "step": 60,
148
+ "train_speed(iter/s)": 0.071911
149
  },
150
  {
151
+ "acc": 0.57253065,
152
+ "epoch": 0.23423423423423423,
153
+ "grad_norm": 0.69921875,
154
+ "learning_rate": 0.00013497562184025362,
155
+ "loss": 1.87580814,
156
+ "memory(GiB)": 19.61,
157
  "step": 65,
158
+ "train_speed(iter/s)": 0.072807
159
  },
160
  {
161
+ "acc": 0.59546819,
162
+ "epoch": 0.25225225225225223,
163
+ "grad_norm": 0.73046875,
164
+ "learning_rate": 0.00013494101591406666,
165
+ "loss": 1.73464546,
166
+ "memory(GiB)": 19.58,
167
  "step": 70,
168
+ "train_speed(iter/s)": 0.073652
169
  },
170
  {
171
+ "acc": 0.59667702,
172
+ "epoch": 0.2702702702702703,
173
+ "grad_norm": 0.8203125,
174
+ "learning_rate": 0.00013489137426511745,
175
+ "loss": 1.69518318,
176
+ "memory(GiB)": 18.19,
177
  "step": 75,
178
+ "train_speed(iter/s)": 0.074445
179
  },
180
  {
181
+ "acc": 0.61824327,
182
+ "epoch": 0.2882882882882883,
183
+ "grad_norm": 0.828125,
184
+ "learning_rate": 0.00013482670796082633,
185
+ "loss": 1.64374161,
186
+ "memory(GiB)": 19.52,
187
  "step": 80,
188
+ "train_speed(iter/s)": 0.075071
189
  },
190
  {
191
+ "acc": 0.60798159,
192
+ "epoch": 0.3063063063063063,
193
+ "grad_norm": 0.7734375,
194
+ "learning_rate": 0.00013474703141830443,
195
+ "loss": 1.68669338,
196
+ "memory(GiB)": 19.57,
197
  "step": 85,
198
+ "train_speed(iter/s)": 0.07562
199
  },
200
  {
201
+ "acc": 0.5981144,
202
+ "epoch": 0.32432432432432434,
203
+ "grad_norm": 0.80078125,
204
+ "learning_rate": 0.00013465236240113953,
205
+ "loss": 1.701264,
206
+ "memory(GiB)": 20.19,
207
  "step": 90,
208
+ "train_speed(iter/s)": 0.076188
209
  },
210
  {
211
+ "acc": 0.59871612,
212
+ "epoch": 0.34234234234234234,
213
+ "grad_norm": 1.0234375,
214
+ "learning_rate": 0.00013454272201543564,
215
+ "loss": 1.76608849,
216
+ "memory(GiB)": 19.35,
217
  "step": 95,
218
+ "train_speed(iter/s)": 0.076637
219
  },
220
  {
221
+ "acc": 0.61396523,
222
+ "epoch": 0.36036036036036034,
223
+ "grad_norm": 0.7109375,
224
+ "learning_rate": 0.00013441813470510747,
225
+ "loss": 1.61449242,
226
+ "memory(GiB)": 19.69,
227
  "step": 100,
228
+ "train_speed(iter/s)": 0.077075
229
  },
230
  {
231
+ "epoch": 0.36036036036036034,
232
+ "eval_acc": 0.6091648189209165,
233
+ "eval_loss": 1.6449466943740845,
234
+ "eval_runtime": 134.5726,
235
+ "eval_samples_per_second": 1.122,
236
+ "eval_steps_per_second": 0.565,
237
  "step": 100
238
  },
239
  {
240
+ "acc": 0.61147785,
241
+ "epoch": 0.3783783783783784,
242
+ "grad_norm": 0.69921875,
243
+ "learning_rate": 0.00013427862824643083,
244
+ "loss": 1.60589867,
245
+ "memory(GiB)": 21.03,
246
  "step": 105,
247
+ "train_speed(iter/s)": 0.070426
248
  },
249
  {
250
+ "acc": 0.6038115,
251
+ "epoch": 0.3963963963963964,
252
+ "grad_norm": 0.88671875,
253
+ "learning_rate": 0.00013412423374184996,
254
+ "loss": 1.69055023,
255
+ "memory(GiB)": 19.44,
256
  "step": 110,
257
+ "train_speed(iter/s)": 0.07105
258
  },
259
  {
260
+ "acc": 0.62303677,
261
+ "epoch": 0.4144144144144144,
262
+ "grad_norm": 0.84375,
263
+ "learning_rate": 0.00013395498561304334,
264
+ "loss": 1.5716897,
265
+ "memory(GiB)": 19.27,
266
  "step": 115,
267
+ "train_speed(iter/s)": 0.071618
268
  },
269
  {
270
+ "acc": 0.6214046,
271
+ "epoch": 0.43243243243243246,
272
+ "grad_norm": 0.640625,
273
+ "learning_rate": 0.00013377092159324956,
274
+ "loss": 1.57531881,
275
+ "memory(GiB)": 19.36,
276
  "step": 120,
277
+ "train_speed(iter/s)": 0.07209
278
  },
279
  {
280
+ "acc": 0.58676672,
281
+ "epoch": 0.45045045045045046,
282
+ "grad_norm": 0.68359375,
283
+ "learning_rate": 0.00013357208271885473,
284
+ "loss": 1.74933128,
285
+ "memory(GiB)": 19.32,
286
  "step": 125,
287
+ "train_speed(iter/s)": 0.072581
288
  },
289
  {
290
+ "acc": 0.59380612,
291
+ "epoch": 0.46846846846846846,
292
+ "grad_norm": 0.7890625,
293
+ "learning_rate": 0.00013335851332024374,
294
+ "loss": 1.69583378,
295
+ "memory(GiB)": 20.18,
296
  "step": 130,
297
+ "train_speed(iter/s)": 0.073016
298
  },
299
  {
300
+ "acc": 0.62007999,
301
+ "epoch": 0.4864864864864865,
302
+ "grad_norm": 0.73828125,
303
+ "learning_rate": 0.0001331302610119168,
304
+ "loss": 1.60020466,
305
+ "memory(GiB)": 19.52,
306
  "step": 135,
307
+ "train_speed(iter/s)": 0.073417
308
  },
309
  {
310
+ "acc": 0.6116991,
311
+ "epoch": 0.5045045045045045,
312
+ "grad_norm": 1.1015625,
313
+ "learning_rate": 0.00013288737668187408,
314
+ "loss": 1.62470894,
315
+ "memory(GiB)": 19.47,
316
  "step": 140,
317
+ "train_speed(iter/s)": 0.073817
318
  },
319
  {
320
+ "acc": 0.60051751,
321
+ "epoch": 0.5225225225225225,
322
+ "grad_norm": 0.87109375,
323
+ "learning_rate": 0.00013262991448027034,
324
+ "loss": 1.6651041,
325
+ "memory(GiB)": 19.42,
326
  "step": 145,
327
+ "train_speed(iter/s)": 0.074194
328
  },
329
  {
330
+ "acc": 0.60736594,
331
+ "epoch": 0.5405405405405406,
332
+ "grad_norm": 0.76953125,
333
+ "learning_rate": 0.00013235793180734238,
334
+ "loss": 1.64281559,
335
+ "memory(GiB)": 19.53,
336
  "step": 150,
337
+ "train_speed(iter/s)": 0.074547
338
  },
339
  {
340
+ "epoch": 0.5405405405405406,
341
+ "eval_acc": 0.6190317812269032,
342
+ "eval_loss": 1.5917434692382812,
343
+ "eval_runtime": 135.0141,
344
+ "eval_samples_per_second": 1.118,
345
+ "eval_steps_per_second": 0.563,
346
  "step": 150
347
  },
348
  {
349
+ "acc": 0.61663914,
350
+ "epoch": 0.5585585585585585,
351
+ "grad_norm": 1.0625,
352
+ "learning_rate": 0.00013207148930061195,
353
+ "loss": 1.60914173,
354
+ "memory(GiB)": 23.05,
355
  "step": 155,
356
+ "train_speed(iter/s)": 0.070306
357
  },
358
  {
359
+ "acc": 0.60967774,
360
+ "epoch": 0.5765765765765766,
361
+ "grad_norm": 0.76953125,
362
+ "learning_rate": 0.00013177065082136668,
363
+ "loss": 1.59582939,
364
+ "memory(GiB)": 19.47,
365
  "step": 160,
366
+ "train_speed(iter/s)": 0.070712
367
  },
368
  {
369
+ "acc": 0.63630972,
370
+ "epoch": 0.5945945945945946,
371
+ "grad_norm": 0.70703125,
372
+ "learning_rate": 0.00013145548344042262,
373
+ "loss": 1.50356016,
374
+ "memory(GiB)": 19.62,
375
  "step": 165,
376
+ "train_speed(iter/s)": 0.071104
377
  },
378
  {
379
+ "acc": 0.60439692,
380
+ "epoch": 0.6126126126126126,
381
+ "grad_norm": 0.73046875,
382
+ "learning_rate": 0.00013112605742317095,
383
+ "loss": 1.67050171,
384
+ "memory(GiB)": 19.41,
385
  "step": 170,
386
+ "train_speed(iter/s)": 0.071478
387
  },
388
  {
389
+ "acc": 0.62380457,
390
+ "epoch": 0.6306306306306306,
391
+ "grad_norm": 0.76171875,
392
+ "learning_rate": 0.0001307824462139125,
393
+ "loss": 1.53042831,
394
+ "memory(GiB)": 19.5,
395
  "step": 175,
396
+ "train_speed(iter/s)": 0.071843
397
  },
398
  {
399
+ "acc": 0.61549187,
400
+ "epoch": 0.6486486486486487,
401
+ "grad_norm": 0.7578125,
402
+ "learning_rate": 0.00013042472641948386,
403
+ "loss": 1.59476538,
404
+ "memory(GiB)": 19.53,
405
  "step": 180,
406
+ "train_speed(iter/s)": 0.072168
407
  },
408
  {
409
+ "acc": 0.64418182,
410
+ "epoch": 0.6666666666666666,
411
+ "grad_norm": 1.1796875,
412
+ "learning_rate": 0.0001300529777921779,
413
+ "loss": 1.47999802,
414
+ "memory(GiB)": 19.32,
415
  "step": 185,
416
+ "train_speed(iter/s)": 0.072501
417
  },
418
  {
419
+ "acc": 0.62201657,
420
+ "epoch": 0.6846846846846847,
421
+ "grad_norm": 0.6484375,
422
+ "learning_rate": 0.00012966728321196346,
423
+ "loss": 1.5685544,
424
+ "memory(GiB)": 19.47,
425
  "step": 190,
426
+ "train_speed(iter/s)": 0.072821
427
  },
428
  {
429
+ "acc": 0.61418505,
430
+ "epoch": 0.7027027027027027,
431
+ "grad_norm": 0.8984375,
432
+ "learning_rate": 0.00012926772866800757,
433
+ "loss": 1.6284462,
434
+ "memory(GiB)": 19.45,
435
  "step": 195,
436
+ "train_speed(iter/s)": 0.073127
437
  },
438
  {
439
+ "acc": 0.62820964,
440
+ "epoch": 0.7207207207207207,
441
+ "grad_norm": 0.8515625,
442
+ "learning_rate": 0.00012885440323950434,
443
+ "loss": 1.54364405,
444
+ "memory(GiB)": 19.53,
445
  "step": 200,
446
+ "train_speed(iter/s)": 0.073413
447
  },
448
  {
449
+ "epoch": 0.7207207207207207,
450
+ "eval_acc": 0.6269770879526977,
451
+ "eval_loss": 1.5466336011886597,
452
+ "eval_runtime": 134.7868,
453
+ "eval_samples_per_second": 1.12,
454
+ "eval_steps_per_second": 0.564,
455
  "step": 200
456
  },
457
  {
458
+ "acc": 0.6605804,
459
+ "epoch": 0.7387387387387387,
460
+ "grad_norm": 0.7578125,
461
+ "learning_rate": 0.00012842739907581525,
462
+ "loss": 1.42957153,
463
+ "memory(GiB)": 23.0,
464
  "step": 205,
465
+ "train_speed(iter/s)": 0.070232
466
  },
467
  {
468
+ "acc": 0.61267309,
469
+ "epoch": 0.7567567567567568,
470
+ "grad_norm": 0.90234375,
471
+ "learning_rate": 0.00012798681137592477,
472
+ "loss": 1.62853241,
473
+ "memory(GiB)": 17.96,
474
  "step": 210,
475
+ "train_speed(iter/s)": 0.070571
476
  },
477
  {
478
+ "acc": 0.63069816,
479
+ "epoch": 0.7747747747747747,
480
+ "grad_norm": 0.89453125,
481
+ "learning_rate": 0.00012753273836721597,
482
+ "loss": 1.56295233,
483
+ "memory(GiB)": 19.4,
484
  "step": 215,
485
+ "train_speed(iter/s)": 0.070892
486
  },
487
  {
488
+ "acc": 0.60362072,
489
+ "epoch": 0.7927927927927928,
490
+ "grad_norm": 1.0703125,
491
+ "learning_rate": 0.00012706528128357127,
492
+ "loss": 1.63038826,
493
+ "memory(GiB)": 19.37,
494
  "step": 220,
495
+ "train_speed(iter/s)": 0.071181
496
  },
497
  {
498
+ "acc": 0.62272639,
499
+ "epoch": 0.8108108108108109,
500
+ "grad_norm": 0.8828125,
501
+ "learning_rate": 0.00012658454434280253,
502
+ "loss": 1.5756237,
503
+ "memory(GiB)": 19.62,
504
  "step": 225,
505
+ "train_speed(iter/s)": 0.071466
506
  },
507
  {
508
+ "acc": 0.59926658,
509
+ "epoch": 0.8288288288288288,
510
+ "grad_norm": 0.75390625,
511
+ "learning_rate": 0.00012609063472341633,
512
+ "loss": 1.60503426,
513
+ "memory(GiB)": 19.63,
514
  "step": 230,
515
+ "train_speed(iter/s)": 0.071751
516
  },
517
  {
518
+ "acc": 0.60133944,
519
+ "epoch": 0.8468468468468469,
520
+ "grad_norm": 1.3515625,
521
+ "learning_rate": 0.0001255836625407187,
522
+ "loss": 1.64450779,
523
+ "memory(GiB)": 19.31,
524
  "step": 235,
525
+ "train_speed(iter/s)": 0.072034
526
  },
527
  {
528
+ "acc": 0.64020758,
529
+ "epoch": 0.8648648648648649,
530
+ "grad_norm": 0.9375,
531
+ "learning_rate": 0.00012506374082226534,
532
+ "loss": 1.47053967,
533
+ "memory(GiB)": 18.85,
534
  "step": 240,
535
+ "train_speed(iter/s)": 0.072286
536
  },
537
  {
538
+ "acc": 0.62713485,
539
+ "epoch": 0.8828828828828829,
540
+ "grad_norm": 0.82421875,
541
+ "learning_rate": 0.00012453098548266276,
542
+ "loss": 1.51464148,
543
+ "memory(GiB)": 19.35,
544
  "step": 245,
545
+ "train_speed(iter/s)": 0.07254
546
  },
547
  {
548
+ "acc": 0.6202302,
549
+ "epoch": 0.9009009009009009,
550
+ "grad_norm": 0.625,
551
+ "learning_rate": 0.0001239855152977253,
552
+ "loss": 1.54778471,
553
+ "memory(GiB)": 19.53,
554
  "step": 250,
555
+ "train_speed(iter/s)": 0.072758
556
  },
557
  {
558
+ "epoch": 0.9009009009009009,
559
+ "eval_acc": 0.6308573540280857,
560
+ "eval_loss": 1.510523796081543,
561
+ "eval_runtime": 134.5445,
562
+ "eval_samples_per_second": 1.122,
563
+ "eval_steps_per_second": 0.565,
564
  "step": 250
565
  },
566
  {
567
+ "acc": 0.63671951,
568
+ "epoch": 0.918918918918919,
569
+ "grad_norm": 1.7109375,
570
+ "learning_rate": 0.00012342745187799459,
571
+ "loss": 1.48321924,
572
+ "memory(GiB)": 19.53,
573
  "step": 255,
574
+ "train_speed(iter/s)": 0.070273
575
  },
576
  {
577
+ "acc": 0.63577223,
578
+ "epoch": 0.9369369369369369,
579
+ "grad_norm": 0.7890625,
580
+ "learning_rate": 0.000122856919641627,
581
+ "loss": 1.50699987,
582
+ "memory(GiB)": 19.94,
583
  "step": 260,
584
+ "train_speed(iter/s)": 0.070553
585
  },
586
  {
587
+ "acc": 0.64953299,
588
+ "epoch": 0.954954954954955,
589
+ "grad_norm": 0.85546875,
590
+ "learning_rate": 0.000122274045786655,
591
+ "loss": 1.46005678,
592
+ "memory(GiB)": 20.1,
593
  "step": 265,
594
+ "train_speed(iter/s)": 0.070802
595
  },
596
  {
597
+ "acc": 0.62153759,
598
+ "epoch": 0.972972972972973,
599
+ "grad_norm": 1.0625,
600
+ "learning_rate": 0.00012167896026262893,
601
+ "loss": 1.55834417,
602
+ "memory(GiB)": 19.86,
603
  "step": 270,
604
+ "train_speed(iter/s)": 0.071052
605
  },
606
  {
607
+ "acc": 0.64055209,
608
+ "epoch": 0.990990990990991,
609
+ "grad_norm": 1.125,
610
+ "learning_rate": 0.00012107179574164504,
611
+ "loss": 1.54932261,
612
+ "memory(GiB)": 20.06,
613
  "step": 275,
614
+ "train_speed(iter/s)": 0.071274
615
  },
616
  {
617
+ "acc": 0.62708969,
618
+ "epoch": 1.009009009009009,
619
+ "grad_norm": 0.671875,
620
+ "learning_rate": 0.00012045268758876699,
621
+ "loss": 1.49731979,
622
+ "memory(GiB)": 19.82,
623
  "step": 280,
624
+ "train_speed(iter/s)": 0.07152
625
  },
626
  {
627
+ "acc": 0.6689836,
628
+ "epoch": 1.027027027027027,
629
+ "grad_norm": 0.859375,
630
+ "learning_rate": 0.00011982177383184648,
631
+ "loss": 1.2817215,
632
+ "memory(GiB)": 19.85,
633
  "step": 285,
634
+ "train_speed(iter/s)": 0.07175
635
  },
636
  {
637
+ "acc": 0.67519293,
638
+ "epoch": 1.045045045045045,
639
+ "grad_norm": 1.046875,
640
+ "learning_rate": 0.00011917919513075066,
641
+ "loss": 1.28632126,
642
+ "memory(GiB)": 19.98,
643
  "step": 290,
644
+ "train_speed(iter/s)": 0.071951
645
  },
646
  {
647
+ "acc": 0.67276659,
648
+ "epoch": 1.063063063063063,
649
+ "grad_norm": 0.8984375,
650
+ "learning_rate": 0.00011852509474600237,
651
+ "loss": 1.27065611,
652
+ "memory(GiB)": 20.03,
653
  "step": 295,
654
+ "train_speed(iter/s)": 0.072155
655
  },
656
  {
657
+ "acc": 0.64641519,
658
+ "epoch": 1.0810810810810811,
659
+ "grad_norm": 0.98046875,
660
+ "learning_rate": 0.00011785961850684083,
661
+ "loss": 1.38271847,
662
+ "memory(GiB)": 19.09,
663
  "step": 300,
664
+ "train_speed(iter/s)": 0.072371
665
  },
666
  {
667
+ "epoch": 1.0810810810810811,
668
+ "eval_acc": 0.6305617147080562,
669
+ "eval_loss": 1.523685097694397,
670
+ "eval_runtime": 134.8234,
671
+ "eval_samples_per_second": 1.12,
672
+ "eval_steps_per_second": 0.564,
673
  "step": 300
674
  },
675
  {
676
+ "acc": 0.67837138,
677
+ "epoch": 1.0990990990990992,
678
+ "grad_norm": 0.953125,
679
+ "learning_rate": 0.00011718291477870959,
680
+ "loss": 1.29290819,
681
+ "memory(GiB)": 22.8,
682
  "step": 305,
683
+ "train_speed(iter/s)": 0.070277
684
  },
685
  {
686
+ "acc": 0.67195911,
687
+ "epoch": 1.117117117117117,
688
+ "grad_norm": 1.796875,
689
+ "learning_rate": 0.00011649513443017889,
690
+ "loss": 1.24073734,
691
+ "memory(GiB)": 19.39,
692
  "step": 310,
693
+ "train_speed(iter/s)": 0.070516
694
  },
695
  {
696
+ "acc": 0.69478951,
697
+ "epoch": 1.135135135135135,
698
+ "grad_norm": 1.203125,
699
+ "learning_rate": 0.00011579643079931018,
700
+ "loss": 1.20378675,
701
+ "memory(GiB)": 19.38,
702
  "step": 315,
703
+ "train_speed(iter/s)": 0.070713
704
  },
705
  {
706
+ "acc": 0.68726826,
707
+ "epoch": 1.1531531531531531,
708
+ "grad_norm": 0.98828125,
709
+ "learning_rate": 0.00011508695965946992,
710
+ "loss": 1.23284683,
711
+ "memory(GiB)": 19.98,
712
  "step": 320,
713
+ "train_speed(iter/s)": 0.070919
714
  },
715
  {
716
+ "acc": 0.65419765,
717
+ "epoch": 1.1711711711711712,
718
+ "grad_norm": 0.93359375,
719
+ "learning_rate": 0.00011436687918460052,
720
+ "loss": 1.37520065,
721
+ "memory(GiB)": 20.02,
722
  "step": 325,
723
+ "train_speed(iter/s)": 0.071117
724
  },
725
  {
726
+ "acc": 0.66610641,
727
+ "epoch": 1.1891891891891893,
728
+ "grad_norm": 0.8671875,
729
+ "learning_rate": 0.000113636349913956,
730
+ "loss": 1.30743008,
731
+ "memory(GiB)": 19.35,
732
  "step": 330,
733
+ "train_speed(iter/s)": 0.071322
734
  },
735
  {
736
+ "acc": 0.67390976,
737
+ "epoch": 1.2072072072072073,
738
+ "grad_norm": 1.6640625,
739
+ "learning_rate": 0.00011289553471631045,
740
+ "loss": 1.28322783,
741
+ "memory(GiB)": 19.49,
742
  "step": 335,
743
+ "train_speed(iter/s)": 0.071518
744
  },
745
  {
746
+ "acc": 0.68137512,
747
+ "epoch": 1.2252252252252251,
748
+ "grad_norm": 0.6953125,
749
+ "learning_rate": 0.00011214459875364693,
750
+ "loss": 1.23027716,
751
+ "memory(GiB)": 19.38,
752
  "step": 340,
753
+ "train_speed(iter/s)": 0.071692
754
  },
755
  {
756
+ "acc": 0.67859125,
757
+ "epoch": 1.2432432432432432,
758
+ "grad_norm": 0.78515625,
759
+ "learning_rate": 0.00011138370944433531,
760
+ "loss": 1.22896252,
761
+ "memory(GiB)": 20.06,
762
  "step": 345,
763
+ "train_speed(iter/s)": 0.071876
764
  },
765
  {
766
+ "acc": 0.66445112,
767
+ "epoch": 1.2612612612612613,
768
+ "grad_norm": 0.90234375,
769
+ "learning_rate": 0.00011061303642580694,
770
+ "loss": 1.30674038,
771
+ "memory(GiB)": 19.49,
772
  "step": 350,
773
+ "train_speed(iter/s)": 0.072045
774
  },
775
  {
776
+ "epoch": 1.2612612612612613,
777
+ "eval_acc": 0.6356245380635624,
778
+ "eval_loss": 1.5072119235992432,
779
+ "eval_runtime": 134.5232,
780
+ "eval_samples_per_second": 1.122,
781
+ "eval_steps_per_second": 0.565,
782
  "step": 350
783
  },
784
  {
785
+ "acc": 0.67729836,
786
+ "epoch": 1.2792792792792793,
787
+ "grad_norm": 0.90625,
788
+ "learning_rate": 0.00010983275151673467,
789
+ "loss": 1.24173574,
790
+ "memory(GiB)": 18.93,
791
  "step": 355,
792
+ "train_speed(iter/s)": 0.07029
793
  },
794
  {
795
+ "acc": 0.7040791,
796
+ "epoch": 1.2972972972972974,
797
+ "grad_norm": 0.84765625,
798
+ "learning_rate": 0.00010904302867872639,
799
+ "loss": 1.17582674,
800
+ "memory(GiB)": 19.29,
801
  "step": 360,
802
+ "train_speed(iter/s)": 0.070479
803
  },
804
  {
805
+ "acc": 0.66356058,
806
+ "epoch": 1.3153153153153152,
807
+ "grad_norm": 0.82421875,
808
+ "learning_rate": 0.00010824404397754104,
809
+ "loss": 1.26798725,
810
+ "memory(GiB)": 19.36,
811
  "step": 365,
812
+ "train_speed(iter/s)": 0.070661
813
  },
814
  {
815
+ "acc": 0.69379635,
816
+ "epoch": 1.3333333333333333,
817
+ "grad_norm": 0.98828125,
818
+ "learning_rate": 0.0001074359755438354,
819
+ "loss": 1.24331112,
820
+ "memory(GiB)": 20.16,
821
  "step": 370,
822
+ "train_speed(iter/s)": 0.070843
823
  },
824
  {
825
+ "acc": 0.68220735,
826
+ "epoch": 1.3513513513513513,
827
+ "grad_norm": 0.94140625,
828
+ "learning_rate": 0.00010661900353345051,
829
+ "loss": 1.20891714,
830
+ "memory(GiB)": 19.61,
831
  "step": 375,
832
+ "train_speed(iter/s)": 0.071015
833
  },
834
  {
835
+ "acc": 0.67620883,
836
+ "epoch": 1.3693693693693694,
837
+ "grad_norm": 1.0625,
838
+ "learning_rate": 0.0001057933100872466,
839
+ "loss": 1.23957863,
840
+ "memory(GiB)": 20.17,
841
  "step": 380,
842
+ "train_speed(iter/s)": 0.071181
843
  },
844
  {
845
+ "acc": 0.63655629,
846
+ "epoch": 1.3873873873873874,
847
+ "grad_norm": 0.78515625,
848
+ "learning_rate": 0.00010495907929049546,
849
+ "loss": 1.44390507,
850
+ "memory(GiB)": 19.25,
851
  "step": 385,
852
+ "train_speed(iter/s)": 0.071356
853
  },
854
  {
855
+ "acc": 0.67883902,
856
+ "epoch": 1.4054054054054055,
857
+ "grad_norm": 0.8828125,
858
+ "learning_rate": 0.00010411649713183925,
859
+ "loss": 1.29691544,
860
+ "memory(GiB)": 18.78,
861
  "step": 390,
862
+ "train_speed(iter/s)": 0.071515
863
  },
864
  {
865
+ "acc": 0.67202511,
866
+ "epoch": 1.4234234234234235,
867
+ "grad_norm": 0.953125,
868
+ "learning_rate": 0.00010326575146182521,
869
+ "loss": 1.31318274,
870
+ "memory(GiB)": 19.88,
871
  "step": 395,
872
+ "train_speed(iter/s)": 0.071677
873
  },
874
  {
875
+ "acc": 0.69274058,
876
+ "epoch": 1.4414414414414414,
877
+ "grad_norm": 0.82421875,
878
+ "learning_rate": 0.00010240703195102489,
879
+ "loss": 1.15976305,
880
+ "memory(GiB)": 19.46,
881
  "step": 400,
882
+ "train_speed(iter/s)": 0.071832
883
  },
884
  {
885
+ "epoch": 1.4414414414414414,
886
+ "eval_acc": 0.6368440502586844,
887
+ "eval_loss": 1.4986343383789062,
888
+ "eval_runtime": 134.3425,
889
+ "eval_samples_per_second": 1.124,
890
+ "eval_steps_per_second": 0.566,
891
  "step": 400
892
  },
893
  {
894
+ "acc": 0.71039405,
895
+ "epoch": 1.4594594594594594,
896
+ "grad_norm": 0.77734375,
897
+ "learning_rate": 0.0001015405300477479,
898
+ "loss": 1.12253609,
899
+ "memory(GiB)": 19.92,
900
  "step": 405,
901
+ "train_speed(iter/s)": 0.070298
902
  },
903
  {
904
+ "acc": 0.71356583,
905
+ "epoch": 1.4774774774774775,
906
+ "grad_norm": 0.84375,
907
+ "learning_rate": 0.0001006664389353592,
908
+ "loss": 1.13753939,
909
+ "memory(GiB)": 19.31,
910
  "step": 410,
911
+ "train_speed(iter/s)": 0.070457
912
  },
913
  {
914
+ "acc": 0.675458,
915
+ "epoch": 1.4954954954954955,
916
+ "grad_norm": 1.1328125,
917
+ "learning_rate": 9.978495348920958e-05,
918
+ "loss": 1.29233532,
919
+ "memory(GiB)": 19.06,
920
  "step": 415,
921
+ "train_speed(iter/s)": 0.070616
922
  },
923
  {
924
+ "acc": 0.67761598,
925
+ "epoch": 1.5135135135135136,
926
+ "grad_norm": 0.6875,
927
+ "learning_rate": 9.889627023318897e-05,
928
+ "loss": 1.22440186,
929
+ "memory(GiB)": 19.16,
930
  "step": 420,
931
+ "train_speed(iter/s)": 0.070773
932
  },
933
  {
934
+ "acc": 0.67492404,
935
+ "epoch": 1.5315315315315314,
936
+ "grad_norm": 0.81640625,
937
+ "learning_rate": 9.800058729591212e-05,
938
+ "loss": 1.22408361,
939
+ "memory(GiB)": 19.97,
940
  "step": 425,
941
+ "train_speed(iter/s)": 0.070935
942
  },
943
  {
944
+ "acc": 0.68050842,
945
+ "epoch": 1.5495495495495497,
946
+ "grad_norm": 0.84765625,
947
+ "learning_rate": 9.70981043665466e-05,
948
+ "loss": 1.2078824,
949
+ "memory(GiB)": 19.92,
950
  "step": 430,
951
+ "train_speed(iter/s)": 0.07109
952
  },
953
  {
954
+ "acc": 0.6750885,
955
+ "epoch": 1.5675675675675675,
956
+ "grad_norm": 0.66796875,
957
+ "learning_rate": 9.618902265029284e-05,
958
+ "loss": 1.28742075,
959
+ "memory(GiB)": 19.27,
960
  "step": 435,
961
+ "train_speed(iter/s)": 0.071229
962
  },
963
  {
964
+ "acc": 0.64411507,
965
+ "epoch": 1.5855855855855856,
966
+ "grad_norm": 0.95703125,
967
+ "learning_rate": 9.527354482352616e-05,
968
+ "loss": 1.37240067,
969
+ "memory(GiB)": 20.21,
970
  "step": 440,
971
+ "train_speed(iter/s)": 0.071374
972
  },
973
  {
974
+ "acc": 0.67574663,
975
+ "epoch": 1.6036036036036037,
976
+ "grad_norm": 0.83984375,
977
+ "learning_rate": 9.435187498861085e-05,
978
+ "loss": 1.27780771,
979
+ "memory(GiB)": 19.95,
980
  "step": 445,
981
+ "train_speed(iter/s)": 0.071519
982
  },
983
  {
984
+ "acc": 0.67897987,
985
+ "epoch": 1.6216216216216215,
986
+ "grad_norm": 1.2265625,
987
+ "learning_rate": 9.342421862839632e-05,
988
+ "loss": 1.26616125,
989
+ "memory(GiB)": 19.32,
990
  "step": 450,
991
+ "train_speed(iter/s)": 0.071661
992
  },
993
  {
994
+ "epoch": 1.6216216216216215,
995
+ "eval_acc": 0.6424611973392461,
996
+ "eval_loss": 1.4772522449493408,
997
+ "eval_runtime": 134.5995,
998
+ "eval_samples_per_second": 1.122,
999
+ "eval_steps_per_second": 0.565,
1000
  "step": 450
1001
  },
1002
  {
1003
+ "acc": 0.66755495,
1004
+ "epoch": 1.6396396396396398,
1005
+ "grad_norm": 1.0390625,
1006
+ "learning_rate": 9.249078256040541e-05,
1007
+ "loss": 1.30118093,
1008
+ "memory(GiB)": 22.82,
1009
  "step": 455,
1010
+ "train_speed(iter/s)": 0.070312
1011
  },
1012
  {
1013
+ "acc": 0.66560607,
1014
+ "epoch": 1.6576576576576576,
1015
+ "grad_norm": 1.0546875,
1016
+ "learning_rate": 9.155177489072527e-05,
1017
+ "loss": 1.31042576,
1018
+ "memory(GiB)": 19.56,
1019
  "step": 460,
1020
+ "train_speed(iter/s)": 0.070454
1021
  },
1022
  {
1023
+ "acc": 0.67957892,
1024
+ "epoch": 1.6756756756756757,
1025
+ "grad_norm": 1.3828125,
1026
+ "learning_rate": 9.060740496761082e-05,
1027
+ "loss": 1.31165123,
1028
+ "memory(GiB)": 19.38,
1029
  "step": 465,
1030
+ "train_speed(iter/s)": 0.070592
1031
  },
1032
  {
1033
+ "acc": 0.6744031,
1034
+ "epoch": 1.6936936936936937,
1035
+ "grad_norm": 1.4140625,
1036
+ "learning_rate": 8.965788333481144e-05,
1037
+ "loss": 1.26758223,
1038
+ "memory(GiB)": 19.42,
1039
  "step": 470,
1040
+ "train_speed(iter/s)": 0.070726
1041
  },
1042
  {
1043
+ "acc": 0.66551232,
1044
+ "epoch": 1.7117117117117115,
1045
+ "grad_norm": 0.98046875,
1046
+ "learning_rate": 8.870342168463085e-05,
1047
+ "loss": 1.27216129,
1048
+ "memory(GiB)": 19.27,
1049
  "step": 475,
1050
+ "train_speed(iter/s)": 0.070864
1051
  },
1052
  {
1053
+ "acc": 0.65833273,
1054
+ "epoch": 1.7297297297297298,
1055
+ "grad_norm": 0.9140625,
1056
+ "learning_rate": 8.77442328107313e-05,
1057
+ "loss": 1.32684155,
1058
+ "memory(GiB)": 19.48,
1059
  "step": 480,
1060
+ "train_speed(iter/s)": 0.070997
1061
  },
1062
  {
1063
+ "acc": 0.68646383,
1064
+ "epoch": 1.7477477477477477,
1065
+ "grad_norm": 1.3671875,
1066
+ "learning_rate": 8.678053056069184e-05,
1067
+ "loss": 1.2200016,
1068
+ "memory(GiB)": 19.24,
1069
  "step": 485,
1070
+ "train_speed(iter/s)": 0.071136
1071
  },
1072
  {
1073
+ "acc": 0.69040904,
1074
+ "epoch": 1.7657657657657657,
1075
+ "grad_norm": 1.6171875,
1076
+ "learning_rate": 8.581252978833194e-05,
1077
+ "loss": 1.18706884,
1078
+ "memory(GiB)": 19.53,
1079
  "step": 490,
1080
+ "train_speed(iter/s)": 0.07127
1081
  },
1082
  {
1083
+ "acc": 0.66571455,
1084
+ "epoch": 1.7837837837837838,
1085
+ "grad_norm": 0.8515625,
1086
+ "learning_rate": 8.484044630581057e-05,
1087
+ "loss": 1.29456005,
1088
+ "memory(GiB)": 20.09,
1089
  "step": 495,
1090
+ "train_speed(iter/s)": 0.071401
1091
  },
1092
  {
1093
+ "acc": 0.67682033,
1094
+ "epoch": 1.8018018018018018,
1095
+ "grad_norm": 1.0,
1096
+ "learning_rate": 8.386449683551164e-05,
1097
+ "loss": 1.20547714,
1098
+ "memory(GiB)": 19.95,
1099
  "step": 500,
1100
+ "train_speed(iter/s)": 0.071533
1101
  },
1102
  {
1103
+ "epoch": 1.8018018018018018,
1104
+ "eval_acc": 0.6413155949741316,
1105
+ "eval_loss": 1.479081630706787,
1106
+ "eval_runtime": 134.2299,
1107
+ "eval_samples_per_second": 1.125,
1108
+ "eval_steps_per_second": 0.566,
1109
  "step": 500
1110
  },
1111
  {
1112
+ "acc": 0.67326751,
1113
+ "epoch": 1.8198198198198199,
1114
+ "grad_norm": 1.0546875,
1115
+ "learning_rate": 8.288489896172669e-05,
1116
+ "loss": 1.25247726,
1117
+ "memory(GiB)": 20.29,
1118
  "step": 505,
1119
+ "train_speed(iter/s)": 0.070304
1120
  },
1121
  {
1122
+ "acc": 0.66375732,
1123
+ "epoch": 1.8378378378378377,
1124
+ "grad_norm": 0.9296875,
1125
+ "learning_rate": 8.190187108214514e-05,
1126
+ "loss": 1.28065901,
1127
+ "memory(GiB)": 20.04,
1128
  "step": 510,
1129
+ "train_speed(iter/s)": 0.070438
1130
  },
1131
  {
1132
+ "acc": 0.69006267,
1133
+ "epoch": 1.855855855855856,
1134
+ "grad_norm": 1.0234375,
1135
+ "learning_rate": 8.091563235916343e-05,
1136
+ "loss": 1.13905525,
1137
+ "memory(GiB)": 20.03,
1138
  "step": 515,
1139
+ "train_speed(iter/s)": 0.070569
1140
  },
1141
  {
1142
+ "acc": 0.69745221,
1143
+ "epoch": 1.8738738738738738,
1144
+ "grad_norm": 0.96484375,
1145
+ "learning_rate": 7.992640267102351e-05,
1146
+ "loss": 1.14712362,
1147
+ "memory(GiB)": 18.5,
1148
  "step": 520,
1149
+ "train_speed(iter/s)": 0.070709
1150
  },
1151
  {
1152
+ "acc": 0.6707756,
1153
+ "epoch": 1.8918918918918919,
1154
+ "grad_norm": 1.328125,
1155
+ "learning_rate": 7.893440256279186e-05,
1156
+ "loss": 1.30717278,
1157
+ "memory(GiB)": 20.66,
1158
  "step": 525,
1159
+ "train_speed(iter/s)": 0.07083
1160
  },
1161
  {
1162
+ "acc": 0.66872559,
1163
+ "epoch": 1.90990990990991,
1164
+ "grad_norm": 0.9765625,
1165
+ "learning_rate": 7.793985319718982e-05,
1166
+ "loss": 1.28408003,
1167
+ "memory(GiB)": 19.48,
1168
  "step": 530,
1169
+ "train_speed(iter/s)": 0.070948
1170
  },
1171
  {
1172
+ "acc": 0.68111048,
1173
+ "epoch": 1.9279279279279278,
1174
+ "grad_norm": 0.76171875,
1175
+ "learning_rate": 7.694297630528612e-05,
1176
+ "loss": 1.21391411,
1177
+ "memory(GiB)": 19.88,
1178
  "step": 535,
1179
+ "train_speed(iter/s)": 0.071071
1180
  },
1181
  {
1182
+ "acc": 0.65094652,
1183
+ "epoch": 1.945945945945946,
1184
+ "grad_norm": 0.83203125,
1185
+ "learning_rate": 7.594399413706277e-05,
1186
+ "loss": 1.34138126,
1187
+ "memory(GiB)": 19.9,
1188
  "step": 540,
1189
+ "train_speed(iter/s)": 0.071193
1190
  },
1191
  {
1192
+ "acc": 0.67896776,
1193
+ "epoch": 1.9639639639639639,
1194
+ "grad_norm": 0.796875,
1195
+ "learning_rate": 7.494312941186529e-05,
1196
+ "loss": 1.22575331,
1197
+ "memory(GiB)": 19.43,
1198
  "step": 545,
1199
+ "train_speed(iter/s)": 0.071302
1200
  },
1201
  {
1202
+ "acc": 0.6839644,
1203
+ "epoch": 1.981981981981982,
1204
+ "grad_norm": 0.78515625,
1205
+ "learning_rate": 7.394060526874825e-05,
1206
+ "loss": 1.25017443,
1207
+ "memory(GiB)": 19.25,
1208
  "step": 550,
1209
+ "train_speed(iter/s)": 0.07142
1210
  },
1211
  {
1212
+ "epoch": 1.981981981981982,
1213
+ "eval_acc": 0.645269770879527,
1214
+ "eval_loss": 1.4606801271438599,
1215
+ "eval_runtime": 134.7756,
1216
+ "eval_samples_per_second": 1.12,
1217
+ "eval_steps_per_second": 0.564,
1218
  "step": 550
1219
  },
1220
  {
1221
+ "acc": 0.68771811,
1222
+ "epoch": 2.0,
1223
+ "grad_norm": 0.81640625,
1224
+ "learning_rate": 7.293664521672729e-05,
1225
+ "loss": 1.22415581,
1226
+ "memory(GiB)": 22.67,
1227
  "step": 555,
1228
+ "train_speed(iter/s)": 0.070304
1229
  },
1230
  {
1231
+ "acc": 0.741537,
1232
+ "epoch": 2.018018018018018,
1233
+ "grad_norm": 0.6171875,
1234
+ "learning_rate": 7.193147308494851e-05,
1235
+ "loss": 0.95370378,
1236
+ "memory(GiB)": 19.64,
1237
  "step": 560,
1238
+ "train_speed(iter/s)": 0.070425
1239
  },
1240
  {
1241
+ "acc": 0.75044699,
1242
+ "epoch": 2.036036036036036,
1243
+ "grad_norm": 1.09375,
1244
+ "learning_rate": 7.09253129727867e-05,
1245
+ "loss": 0.95568914,
1246
+ "memory(GiB)": 19.4,
1247
  "step": 565,
1248
+ "train_speed(iter/s)": 0.070541
1249
  },
1250
  {
1251
+ "acc": 0.75126195,
1252
+ "epoch": 2.054054054054054,
1253
+ "grad_norm": 1.3671875,
1254
+ "learning_rate": 6.991838919988322e-05,
1255
+ "loss": 0.92719631,
1256
+ "memory(GiB)": 19.54,
1257
  "step": 570,
1258
+ "train_speed(iter/s)": 0.070658
1259
  },
1260
  {
1261
+ "acc": 0.74883032,
1262
+ "epoch": 2.0720720720720722,
1263
+ "grad_norm": 1.0078125,
1264
+ "learning_rate": 6.891092625613469e-05,
1265
+ "loss": 0.92080975,
1266
+ "memory(GiB)": 20.17,
1267
  "step": 575,
1268
+ "train_speed(iter/s)": 0.07077
1269
  },
1270
  {
1271
+ "acc": 0.76222944,
1272
+ "epoch": 2.09009009009009,
1273
+ "grad_norm": 0.99609375,
1274
+ "learning_rate": 6.790314875164393e-05,
1275
+ "loss": 0.88407106,
1276
+ "memory(GiB)": 19.57,
1277
  "step": 580,
1278
+ "train_speed(iter/s)": 0.070882
1279
  },
1280
  {
1281
+ "acc": 0.76224823,
1282
+ "epoch": 2.108108108108108,
1283
+ "grad_norm": 1.0859375,
1284
+ "learning_rate": 6.689528136664377e-05,
1285
+ "loss": 0.85150976,
1286
+ "memory(GiB)": 19.54,
1287
  "step": 585,
1288
+ "train_speed(iter/s)": 0.070995
1289
  },
1290
  {
1291
+ "acc": 0.73958569,
1292
+ "epoch": 2.126126126126126,
1293
+ "grad_norm": 1.3828125,
1294
+ "learning_rate": 6.588754880140573e-05,
1295
+ "loss": 0.92128286,
1296
+ "memory(GiB)": 19.58,
1297
  "step": 590,
1298
+ "train_speed(iter/s)": 0.071101
1299
  },
1300
  {
1301
+ "acc": 0.74549003,
1302
+ "epoch": 2.144144144144144,
1303
+ "grad_norm": 1.359375,
1304
+ "learning_rate": 6.488017572614363e-05,
1305
+ "loss": 0.90851021,
1306
+ "memory(GiB)": 18.59,
1307
  "step": 595,
1308
+ "train_speed(iter/s)": 0.071211
1309
  },
1310
  {
1311
+ "acc": 0.73912826,
1312
+ "epoch": 2.1621621621621623,
1313
+ "grad_norm": 1.3125,
1314
+ "learning_rate": 6.387338673092443e-05,
1315
+ "loss": 0.92900734,
1316
+ "memory(GiB)": 19.54,
1317
  "step": 600,
1318
+ "train_speed(iter/s)": 0.071321
1319
  },
1320
  {
1321
+ "epoch": 2.1621621621621623,
1322
+ "eval_acc": 0.6320768662232077,
1323
+ "eval_loss": 1.5818341970443726,
1324
+ "eval_runtime": 134.4691,
1325
+ "eval_samples_per_second": 1.123,
1326
+ "eval_steps_per_second": 0.565,
1327
  "step": 600
1328
  },
1329
  {
1330
+ "acc": 0.75979438,
1331
+ "epoch": 2.18018018018018,
1332
+ "grad_norm": 1.09375,
1333
+ "learning_rate": 6.286740627559656e-05,
1334
+ "loss": 0.89129753,
1335
+ "memory(GiB)": 22.37,
1336
  "step": 605,
1337
+ "train_speed(iter/s)": 0.070301
1338
  },
1339
  {
1340
+ "acc": 0.72820721,
1341
+ "epoch": 2.1981981981981984,
1342
+ "grad_norm": 2.15625,
1343
+ "learning_rate": 6.186245863974757e-05,
1344
+ "loss": 0.96495447,
1345
+ "memory(GiB)": 19.6,
1346
  "step": 610,
1347
+ "train_speed(iter/s)": 0.070413
1348
  },
1349
  {
1350
+ "acc": 0.75764585,
1351
+ "epoch": 2.2162162162162162,
1352
+ "grad_norm": 1.0078125,
1353
+ "learning_rate": 6.0858767872701715e-05,
1354
+ "loss": 0.89218092,
1355
+ "memory(GiB)": 20.15,
1356
  "step": 615,
1357
+ "train_speed(iter/s)": 0.070515
1358
  },
1359
  {
1360
+ "acc": 0.75772595,
1361
+ "epoch": 2.234234234234234,
1362
+ "grad_norm": 1.6328125,
1363
+ "learning_rate": 5.985655774356901e-05,
1364
+ "loss": 0.89191771,
1365
+ "memory(GiB)": 19.46,
1366
  "step": 620,
1367
+ "train_speed(iter/s)": 0.070627
1368
  },
1369
  {
1370
+ "acc": 0.7377079,
1371
+ "epoch": 2.2522522522522523,
1372
+ "grad_norm": 1.1875,
1373
+ "learning_rate": 5.8856051691356884e-05,
1374
+ "loss": 0.94241228,
1375
+ "memory(GiB)": 19.35,
1376
  "step": 625,
1377
+ "train_speed(iter/s)": 0.070733
1378
  },
1379
  {
1380
+ "acc": 0.77948771,
1381
+ "epoch": 2.27027027027027,
1382
+ "grad_norm": 1.2890625,
1383
+ "learning_rate": 5.785747277515506e-05,
1384
+ "loss": 0.79317036,
1385
+ "memory(GiB)": 20.48,
1386
  "step": 630,
1387
+ "train_speed(iter/s)": 0.070844
1388
  },
1389
  {
1390
+ "acc": 0.76766949,
1391
+ "epoch": 2.2882882882882885,
1392
+ "grad_norm": 0.97265625,
1393
+ "learning_rate": 5.686104362440552e-05,
1394
+ "loss": 0.82855272,
1395
+ "memory(GiB)": 20.12,
1396
  "step": 635,
1397
+ "train_speed(iter/s)": 0.070945
1398
  },
1399
  {
1400
+ "acc": 0.74998231,
1401
+ "epoch": 2.3063063063063063,
1402
+ "grad_norm": 2.9375,
1403
+ "learning_rate": 5.586698638926811e-05,
1404
+ "loss": 0.93049393,
1405
+ "memory(GiB)": 20.06,
1406
  "step": 640,
1407
+ "train_speed(iter/s)": 0.071044
1408
  },
1409
  {
1410
+ "acc": 0.75094385,
1411
+ "epoch": 2.3243243243243246,
1412
+ "grad_norm": 1.1875,
1413
+ "learning_rate": 5.487552269109287e-05,
1414
+ "loss": 0.86875353,
1415
+ "memory(GiB)": 19.33,
1416
  "step": 645,
1417
+ "train_speed(iter/s)": 0.071146
1418
  },
1419
  {
1420
+ "acc": 0.74836354,
1421
+ "epoch": 2.3423423423423424,
1422
+ "grad_norm": 1.1328125,
1423
+ "learning_rate": 5.388687357301051e-05,
1424
+ "loss": 0.88861446,
1425
+ "memory(GiB)": 20.11,
1426
  "step": 650,
1427
+ "train_speed(iter/s)": 0.071249
1428
  },
1429
  {
1430
+ "epoch": 2.3423423423423424,
1431
+ "eval_acc": 0.630709534368071,
1432
+ "eval_loss": 1.5767972469329834,
1433
+ "eval_runtime": 134.3063,
1434
+ "eval_samples_per_second": 1.124,
1435
+ "eval_steps_per_second": 0.566,
1436
  "step": 650
1437
  },
1438
  {
1439
+ "acc": 0.76697993,
1440
+ "epoch": 2.3603603603603602,
1441
+ "grad_norm": 1.2734375,
1442
+ "learning_rate": 5.290125945065162e-05,
1443
+ "loss": 0.85701361,
1444
+ "memory(GiB)": 22.96,
1445
  "step": 655,
1446
+ "train_speed(iter/s)": 0.070324
1447
  },
1448
  {
1449
+ "acc": 0.76252317,
1450
+ "epoch": 2.3783783783783785,
1451
+ "grad_norm": 1.0390625,
1452
+ "learning_rate": 5.191890006300573e-05,
1453
+ "loss": 0.85787058,
1454
+ "memory(GiB)": 20.13,
1455
  "step": 660,
1456
+ "train_speed(iter/s)": 0.070422
1457
  },
1458
  {
1459
+ "acc": 0.7651772,
1460
+ "epoch": 2.3963963963963963,
1461
+ "grad_norm": 1.1875,
1462
+ "learning_rate": 5.094001442343155e-05,
1463
+ "loss": 0.8521904,
1464
+ "memory(GiB)": 19.86,
1465
  "step": 665,
1466
+ "train_speed(iter/s)": 0.070523
1467
  },
1468
  {
1469
+ "acc": 0.73847542,
1470
+ "epoch": 2.4144144144144146,
1471
+ "grad_norm": 1.2734375,
1472
+ "learning_rate": 4.996482077082849e-05,
1473
+ "loss": 0.95858736,
1474
+ "memory(GiB)": 19.29,
1475
  "step": 670,
1476
+ "train_speed(iter/s)": 0.070628
1477
  },
1478
  {
1479
+ "acc": 0.74675932,
1480
+ "epoch": 2.4324324324324325,
1481
+ "grad_norm": 1.2734375,
1482
+ "learning_rate": 4.899353652098139e-05,
1483
+ "loss": 0.86487961,
1484
+ "memory(GiB)": 18.64,
1485
  "step": 675,
1486
+ "train_speed(iter/s)": 0.070727
1487
  },
1488
  {
1489
+ "acc": 0.73309464,
1490
+ "epoch": 2.4504504504504503,
1491
+ "grad_norm": 1.8671875,
1492
+ "learning_rate": 4.802637821808819e-05,
1493
+ "loss": 0.93775883,
1494
+ "memory(GiB)": 19.78,
1495
  "step": 680,
1496
+ "train_speed(iter/s)": 0.070825
1497
  },
1498
  {
1499
+ "acc": 0.76575212,
1500
+ "epoch": 2.4684684684684686,
1501
+ "grad_norm": 1.03125,
1502
+ "learning_rate": 4.706356148648246e-05,
1503
+ "loss": 0.8259285,
1504
+ "memory(GiB)": 19.9,
1505
  "step": 685,
1506
+ "train_speed(iter/s)": 0.07092
1507
  },
1508
  {
1509
+ "acc": 0.76865396,
1510
+ "epoch": 2.4864864864864864,
1511
+ "grad_norm": 1.3125,
1512
+ "learning_rate": 4.6105300982560625e-05,
1513
+ "loss": 0.84868517,
1514
+ "memory(GiB)": 19.19,
1515
  "step": 690,
1516
+ "train_speed(iter/s)": 0.071014
1517
  },
1518
  {
1519
+ "acc": 0.75694928,
1520
+ "epoch": 2.5045045045045047,
1521
+ "grad_norm": 1.03125,
1522
+ "learning_rate": 4.515181034692515e-05,
1523
+ "loss": 0.87043924,
1524
+ "memory(GiB)": 19.95,
1525
  "step": 695,
1526
+ "train_speed(iter/s)": 0.071105
1527
  },
1528
  {
1529
+ "acc": 0.75771561,
1530
+ "epoch": 2.5225225225225225,
1531
+ "grad_norm": 1.3515625,
1532
+ "learning_rate": 4.420330215675415e-05,
1533
+ "loss": 0.86245804,
1534
+ "memory(GiB)": 19.18,
1535
  "step": 700,
1536
+ "train_speed(iter/s)": 0.071194
1537
  },
1538
  {
1539
+ "epoch": 2.5225225225225225,
1540
+ "eval_acc": 0.6335181079083518,
1541
+ "eval_loss": 1.5894646644592285,
1542
+ "eval_runtime": 134.225,
1543
+ "eval_samples_per_second": 1.125,
1544
+ "eval_steps_per_second": 0.566,
1545
  "step": 700
1546
  },
1547
  {
1548
+ "acc": 0.76191721,
1549
+ "epoch": 2.5405405405405403,
1550
+ "grad_norm": 1.71875,
1551
+ "learning_rate": 4.325998787840818e-05,
1552
+ "loss": 0.85848246,
1553
+ "memory(GiB)": 19.14,
1554
  "step": 705,
1555
+ "train_speed(iter/s)": 0.070324
1556
  },
1557
  {
1558
+ "acc": 0.76571012,
1559
+ "epoch": 2.5585585585585586,
1560
+ "grad_norm": 1.15625,
1561
+ "learning_rate": 4.2322077820284477e-05,
1562
+ "loss": 0.85979414,
1563
+ "memory(GiB)": 20.01,
1564
  "step": 710,
1565
+ "train_speed(iter/s)": 0.070422
1566
  },
1567
  {
1568
+ "acc": 0.73852654,
1569
+ "epoch": 2.5765765765765765,
1570
+ "grad_norm": 1.6484375,
1571
+ "learning_rate": 4.138978108592962e-05,
1572
+ "loss": 0.90148897,
1573
+ "memory(GiB)": 19.05,
1574
  "step": 715,
1575
+ "train_speed(iter/s)": 0.070518
1576
  },
1577
  {
1578
+ "acc": 0.76960816,
1579
+ "epoch": 2.5945945945945947,
1580
+ "grad_norm": 3.71875,
1581
+ "learning_rate": 4.046330552742053e-05,
1582
+ "loss": 0.88053255,
1583
+ "memory(GiB)": 19.25,
1584
  "step": 720,
1585
+ "train_speed(iter/s)": 0.070616
1586
  },
1587
  {
1588
+ "acc": 0.77552128,
1589
+ "epoch": 2.6126126126126126,
1590
+ "grad_norm": 0.96484375,
1591
+ "learning_rate": 3.954285769902474e-05,
1592
+ "loss": 0.83608866,
1593
+ "memory(GiB)": 19.96,
1594
  "step": 725,
1595
+ "train_speed(iter/s)": 0.070707
1596
  },
1597
  {
1598
+ "acc": 0.76034231,
1599
+ "epoch": 2.6306306306306304,
1600
+ "grad_norm": 1.078125,
1601
+ "learning_rate": 3.8628642811149894e-05,
1602
+ "loss": 0.84258709,
1603
+ "memory(GiB)": 19.75,
1604
  "step": 730,
1605
+ "train_speed(iter/s)": 0.070796
1606
  },
1607
  {
1608
+ "acc": 0.73506665,
1609
+ "epoch": 2.6486486486486487,
1610
+ "grad_norm": 2.125,
1611
+ "learning_rate": 3.772086468459271e-05,
1612
+ "loss": 0.96418314,
1613
+ "memory(GiB)": 19.94,
1614
  "step": 735,
1615
+ "train_speed(iter/s)": 0.070887
1616
  },
1617
  {
1618
+ "acc": 0.74339218,
1619
+ "epoch": 2.6666666666666665,
1620
+ "grad_norm": 1.3359375,
1621
+ "learning_rate": 3.6819725705098094e-05,
1622
+ "loss": 0.94632616,
1623
+ "memory(GiB)": 19.98,
1624
  "step": 740,
1625
+ "train_speed(iter/s)": 0.070978
1626
  },
1627
  {
1628
+ "acc": 0.75258017,
1629
+ "epoch": 2.684684684684685,
1630
+ "grad_norm": 1.328125,
1631
+ "learning_rate": 3.592542677823787e-05,
1632
+ "loss": 0.89630384,
1633
+ "memory(GiB)": 19.9,
1634
  "step": 745,
1635
+ "train_speed(iter/s)": 0.071065
1636
  },
1637
  {
1638
+ "acc": 0.7422905,
1639
+ "epoch": 2.7027027027027026,
1640
+ "grad_norm": 1.46875,
1641
+ "learning_rate": 3.503816728461963e-05,
1642
+ "loss": 0.92554636,
1643
+ "memory(GiB)": 19.94,
1644
  "step": 750,
1645
+ "train_speed(iter/s)": 0.071152
1646
  },
1647
  {
1648
+ "epoch": 2.7027027027027026,
1649
+ "eval_acc": 0.6360679970436068,
1650
+ "eval_loss": 1.577430248260498,
1651
+ "eval_runtime": 134.0595,
1652
+ "eval_samples_per_second": 1.126,
1653
+ "eval_steps_per_second": 0.567,
1654
  "step": 750
1655
  },
1656
  {
1657
+ "acc": 0.76009235,
1658
+ "epoch": 2.7207207207207205,
1659
+ "grad_norm": 1.7265625,
1660
+ "learning_rate": 3.415814503543563e-05,
1661
+ "loss": 0.89433851,
1662
+ "memory(GiB)": 19.38,
1663
  "step": 755,
1664
+ "train_speed(iter/s)": 0.070345
1665
  },
1666
  {
1667
+ "acc": 0.75049233,
1668
+ "epoch": 2.7387387387387387,
1669
+ "grad_norm": 1.453125,
1670
+ "learning_rate": 3.3285556228361483e-05,
1671
+ "loss": 0.90194426,
1672
+ "memory(GiB)": 19.78,
1673
  "step": 760,
1674
+ "train_speed(iter/s)": 0.070432
1675
  },
1676
  {
1677
+ "acc": 0.73652792,
1678
+ "epoch": 2.756756756756757,
1679
+ "grad_norm": 1.375,
1680
+ "learning_rate": 3.2420595403814615e-05,
1681
+ "loss": 0.94170513,
1682
+ "memory(GiB)": 19.18,
1683
  "step": 765,
1684
+ "train_speed(iter/s)": 0.070517
1685
  },
1686
  {
1687
+ "acc": 0.74097948,
1688
+ "epoch": 2.774774774774775,
1689
+ "grad_norm": 1.171875,
1690
+ "learning_rate": 3.156345540158226e-05,
1691
+ "loss": 0.92526283,
1692
+ "memory(GiB)": 19.96,
1693
  "step": 770,
1694
+ "train_speed(iter/s)": 0.070603
1695
  },
1696
  {
1697
+ "acc": 0.77357135,
1698
+ "epoch": 2.7927927927927927,
1699
+ "grad_norm": 1.21875,
1700
+ "learning_rate": 3.0714327317828445e-05,
1701
+ "loss": 0.84344234,
1702
+ "memory(GiB)": 19.42,
1703
  "step": 775,
1704
+ "train_speed(iter/s)": 0.070681
1705
  },
1706
  {
1707
+ "acc": 0.76570077,
1708
+ "epoch": 2.810810810810811,
1709
+ "grad_norm": 1.4765625,
1710
+ "learning_rate": 2.9873400462489982e-05,
1711
+ "loss": 0.85261898,
1712
+ "memory(GiB)": 19.91,
1713
  "step": 780,
1714
+ "train_speed(iter/s)": 0.070768
1715
  },
1716
  {
1717
+ "acc": 0.73979292,
1718
+ "epoch": 2.828828828828829,
1719
+ "grad_norm": 1.375,
1720
+ "learning_rate": 2.904086231707032e-05,
1721
+ "loss": 0.94777365,
1722
+ "memory(GiB)": 19.72,
1723
  "step": 785,
1724
+ "train_speed(iter/s)": 0.07085
1725
  },
1726
  {
1727
+ "acc": 0.75035534,
1728
+ "epoch": 2.846846846846847,
1729
+ "grad_norm": 1.1484375,
1730
+ "learning_rate": 2.8216898492841355e-05,
1731
+ "loss": 0.88380022,
1732
+ "memory(GiB)": 19.09,
1733
  "step": 790,
1734
+ "train_speed(iter/s)": 0.070936
1735
  },
1736
  {
1737
+ "acc": 0.76033754,
1738
+ "epoch": 2.864864864864865,
1739
+ "grad_norm": 1.078125,
1740
+ "learning_rate": 2.7401692689462153e-05,
1741
+ "loss": 0.84767551,
1742
+ "memory(GiB)": 20.02,
1743
  "step": 795,
1744
+ "train_speed(iter/s)": 0.071016
1745
  },
1746
  {
1747
+ "acc": 0.74806399,
1748
+ "epoch": 2.8828828828828827,
1749
+ "grad_norm": 1.53125,
1750
+ "learning_rate": 2.6595426654023643e-05,
1751
+ "loss": 0.92544088,
1752
+ "memory(GiB)": 19.88,
1753
  "step": 800,
1754
+ "train_speed(iter/s)": 0.0711
1755
  },
1756
  {
1757
+ "epoch": 2.8828828828828827,
1758
+ "eval_acc": 0.635920177383592,
1759
+ "eval_loss": 1.5869847536087036,
1760
+ "eval_runtime": 134.517,
1761
+ "eval_samples_per_second": 1.123,
1762
+ "eval_steps_per_second": 0.565,
1763
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1764
  }
1765
  ],
1766
  "logging_steps": 5,
1767
+ "max_steps": 1108,
1768
  "num_input_tokens_seen": 0,
1769
  "num_train_epochs": 4,
1770
+ "save_steps": 100,
1771
  "stateful_callbacks": {
1772
  "TrainerControl": {
1773
  "args": {
 
1780
  "attributes": {}
1781
  }
1782
  },
1783
+ "total_flos": 2.3166381763355443e+17,
1784
  "train_batch_size": 2,
1785
  "trial_name": null,
1786
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f78ee04ba526a9cc075440a6469f13f84190c66bb1135fbaf243aba05b3ef977
3
  size 7224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98ab222ecb986b0dd4618d234f15502efb691678dd397c47e5eae69b67e78b68
3
  size 7224