Desm0nt commited on
Commit
b194f04
1 Parent(s): 6b7014c

Upload 5 files

Browse files
Files changed (4) hide show
  1. scheduler.pt +3 -0
  2. sft_args.json +209 -0
  3. trainer_state.json +3251 -0
  4. training_args.bin +3 -0
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8a8fcfee27e2311e99fceeb21fe6187b6a8ab14c19134e1a91af67755049cd7
3
+ size 1064
sft_args.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "phi3-vision-128k-instruct",
3
+ "model_id_or_path": "LLM-Research/Phi-3-vision-128k-instruct",
4
+ "model_revision": "master",
5
+ "sft_type": "lora",
6
+ "freeze_parameters": 0.0,
7
+ "additional_trainable_parameters": [],
8
+ "tuner_backend": "peft",
9
+ "template_type": "phi3-vl",
10
+ "output_dir": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240531-071942",
11
+ "add_output_dir_suffix": true,
12
+ "ddp_backend": null,
13
+ "ddp_find_unused_parameters": null,
14
+ "ddp_broadcast_buffers": null,
15
+ "seed": 42,
16
+ "resume_from_checkpoint": null,
17
+ "ignore_data_skip": false,
18
+ "dtype": "bf16",
19
+ "packing": false,
20
+ "dataset": [
21
+ "test2"
22
+ ],
23
+ "val_dataset": [],
24
+ "dataset_seed": 42,
25
+ "dataset_test_ratio": 0.05,
26
+ "use_loss_scale": false,
27
+ "system": null,
28
+ "max_length": 2048,
29
+ "truncation_strategy": "delete",
30
+ "check_dataset_strategy": "none",
31
+ "model_name": [
32
+ null,
33
+ null
34
+ ],
35
+ "model_author": [
36
+ null,
37
+ null
38
+ ],
39
+ "quant_method": null,
40
+ "quantization_bit": 0,
41
+ "hqq_axis": 0,
42
+ "hqq_dynamic_config_path": null,
43
+ "bnb_4bit_comp_dtype": "bf16",
44
+ "bnb_4bit_quant_type": "nf4",
45
+ "bnb_4bit_use_double_quant": true,
46
+ "bnb_4bit_quant_storage": null,
47
+ "lora_target_modules": [
48
+ "v_proj",
49
+ "q_proj",
50
+ "out_proj",
51
+ "gate_up_proj",
52
+ "img_projection.0",
53
+ "qkv_proj",
54
+ "img_projection.2",
55
+ "fc1",
56
+ "k_proj",
57
+ "fc2",
58
+ "o_proj",
59
+ "down_proj"
60
+ ],
61
+ "lora_rank": 64,
62
+ "lora_alpha": 64,
63
+ "lora_dropout_p": 0.05,
64
+ "lora_bias_trainable": "none",
65
+ "lora_modules_to_save": [],
66
+ "lora_dtype": null,
67
+ "lora_lr_ratio": null,
68
+ "use_rslora": false,
69
+ "use_dora": false,
70
+ "init_lora_weights": true,
71
+ "boft_block_size": 4,
72
+ "boft_block_num": 0,
73
+ "boft_n_butterfly_factor": 1,
74
+ "boft_target_modules": [
75
+ "DEFAULT"
76
+ ],
77
+ "boft_dropout": 0.0,
78
+ "boft_modules_to_save": [],
79
+ "vera_rank": 256,
80
+ "vera_target_modules": [
81
+ "DEFAULT"
82
+ ],
83
+ "vera_projection_prng_key": 0,
84
+ "vera_dropout": 0.0,
85
+ "vera_d_initial": 0.1,
86
+ "vera_modules_to_save": [],
87
+ "adapter_act": "gelu",
88
+ "adapter_length": 128,
89
+ "use_galore": false,
90
+ "galore_rank": 128,
91
+ "galore_target_modules": null,
92
+ "galore_update_proj_gap": 50,
93
+ "galore_scale": 1.0,
94
+ "galore_proj_type": "std",
95
+ "galore_optim_per_parameter": false,
96
+ "galore_with_embedding": false,
97
+ "adalora_target_r": 8,
98
+ "adalora_init_r": 12,
99
+ "adalora_tinit": 0,
100
+ "adalora_tfinal": 0,
101
+ "adalora_deltaT": 1,
102
+ "adalora_beta1": 0.85,
103
+ "adalora_beta2": 0.85,
104
+ "adalora_orth_reg_weight": 0.5,
105
+ "ia3_target_modules": [
106
+ "DEFAULT"
107
+ ],
108
+ "ia3_feedforward_modules": [],
109
+ "ia3_modules_to_save": [],
110
+ "llamapro_num_new_blocks": 4,
111
+ "llamapro_num_groups": null,
112
+ "neftune_noise_alpha": null,
113
+ "neftune_backend": "transformers",
114
+ "lisa_activated_layers": 0,
115
+ "lisa_step_interval": 20,
116
+ "gradient_checkpointing": true,
117
+ "deepspeed": null,
118
+ "batch_size": 1,
119
+ "eval_batch_size": 1,
120
+ "num_train_epochs": 4,
121
+ "max_steps": -1,
122
+ "optim": "adamw_torch",
123
+ "adam_beta1": 0.9,
124
+ "adam_beta2": 0.999,
125
+ "learning_rate": 0.00015,
126
+ "weight_decay": 0.1,
127
+ "gradient_accumulation_steps": 2,
128
+ "max_grad_norm": 0.5,
129
+ "predict_with_generate": false,
130
+ "lr_scheduler_type": "linear",
131
+ "warmup_ratio": 0.05,
132
+ "eval_steps": 50,
133
+ "save_steps": 369,
134
+ "save_only_model": false,
135
+ "save_total_limit": 2,
136
+ "logging_steps": 5,
137
+ "dataloader_num_workers": 1,
138
+ "dataloader_pin_memory": true,
139
+ "dataloader_drop_last": false,
140
+ "push_to_hub": false,
141
+ "hub_model_id": null,
142
+ "hub_token": null,
143
+ "hub_private_repo": false,
144
+ "push_hub_strategy": "push_best",
145
+ "test_oom_error": false,
146
+ "disable_tqdm": false,
147
+ "lazy_tokenize": true,
148
+ "preprocess_num_proc": 1,
149
+ "use_flash_attn": null,
150
+ "ignore_args_error": false,
151
+ "check_model_is_latest": true,
152
+ "logging_dir": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240531-071942/runs",
153
+ "report_to": [
154
+ "tensorboard"
155
+ ],
156
+ "acc_strategy": "token",
157
+ "save_on_each_node": true,
158
+ "evaluation_strategy": "steps",
159
+ "save_strategy": "steps",
160
+ "save_safetensors": true,
161
+ "gpu_memory_fraction": null,
162
+ "include_num_input_tokens_seen": false,
163
+ "local_repo_path": null,
164
+ "custom_register_path": null,
165
+ "custom_dataset_info": null,
166
+ "device_map_config_path": null,
167
+ "max_new_tokens": 2048,
168
+ "do_sample": true,
169
+ "temperature": 0.3,
170
+ "top_k": 20,
171
+ "top_p": 0.7,
172
+ "repetition_penalty": 1.0,
173
+ "num_beams": 1,
174
+ "fsdp": "",
175
+ "fsdp_config": null,
176
+ "sequence_parallel_size": 1,
177
+ "model_layer_cls_name": null,
178
+ "metric_warmup_step": 0,
179
+ "fsdp_num": 1,
180
+ "per_device_train_batch_size": null,
181
+ "per_device_eval_batch_size": null,
182
+ "self_cognition_sample": 0,
183
+ "train_dataset_mix_ratio": 0.0,
184
+ "train_dataset_mix_ds": [
185
+ "ms-bench"
186
+ ],
187
+ "train_dataset_sample": -1,
188
+ "val_dataset_sample": null,
189
+ "safe_serialization": null,
190
+ "only_save_model": null,
191
+ "neftune_alpha": null,
192
+ "deepspeed_config_path": null,
193
+ "model_cache_dir": null,
194
+ "custom_train_dataset_path": [],
195
+ "custom_val_dataset_path": [],
196
+ "use_self_cognition": false,
197
+ "lora_use_embedding": false,
198
+ "lora_use_all": true,
199
+ "lora_m2s_use_embedding": false,
200
+ "lora_m2s_use_ln": false,
201
+ "torch_dtype": "torch.bfloat16",
202
+ "fp16": false,
203
+ "bf16": true,
204
+ "bnb_4bit_compute_dtype": "torch.bfloat16",
205
+ "load_in_4bit": false,
206
+ "load_in_8bit": false,
207
+ "train_sampler_random": true,
208
+ "training_args": "Seq2SeqTrainingArguments(output_dir='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v0-20240531-071942', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, learning_rate=0.00015, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.5, num_train_epochs=4, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, lr_scheduler_kwargs={}, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v0-20240531-071942/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=369, save_total_limit=2, save_safetensors=True, save_on_each_node=True, save_only_model=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=None, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=1, dataloader_prefetch_factor=None, past_index=-1, run_name='D:\\\\_____NEW_NN\\\\LLM\\\\MiniCPM-V\\\\finetune\\\\output\\\\phi3-vision-128k-instruct\\\\v0-20240531-071942', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=None, even_batches=True, use_seedable_sampler=True, gradient_accumulation_kwargs=None), deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=False, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=False, include_num_input_tokens_seen=False, neftune_noise_alpha=None, optim_target_modules=None, sortish_sampler=True, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=GenerationConfig {\n \"do_sample\": true,\n \"eos_token_id\": 32000,\n \"max_new_tokens\": 2048,\n \"pad_token_id\": 32000,\n \"temperature\": 0.3,\n \"top_k\": 20,\n \"top_p\": 0.7\n}\n, train_sampler_random=True, push_hub_strategy='push_best', acc_strategy='token', additional_saved_files=[], metric_warmup_step=0, train_dataset_sample=738)"
209
+ }
trainer_state.json ADDED
@@ -0,0 +1,3251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.89783895,
3
+ "best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240531-071942\\checkpoint-1476",
4
+ "epoch": 4.0,
5
+ "eval_steps": 50,
6
+ "global_step": 1476,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "acc": 0.47843874,
13
+ "epoch": 0.0027100271002710027,
14
+ "grad_norm": 0.921875,
15
+ "learning_rate": 2.027027027027027e-06,
16
+ "loss": 2.52092218,
17
+ "memory(GiB)": 12.33,
18
+ "step": 1,
19
+ "train_speed(iter/s)": 0.066334
20
+ },
21
+ {
22
+ "acc": 0.51126236,
23
+ "epoch": 0.013550135501355014,
24
+ "grad_norm": 0.796875,
25
+ "learning_rate": 1.0135135135135135e-05,
26
+ "loss": 2.42214346,
27
+ "memory(GiB)": 13.38,
28
+ "step": 5,
29
+ "train_speed(iter/s)": 0.127981
30
+ },
31
+ {
32
+ "acc": 0.48636103,
33
+ "epoch": 0.02710027100271003,
34
+ "grad_norm": 1.09375,
35
+ "learning_rate": 2.027027027027027e-05,
36
+ "loss": 2.53938828,
37
+ "memory(GiB)": 13.38,
38
+ "step": 10,
39
+ "train_speed(iter/s)": 0.144622
40
+ },
41
+ {
42
+ "acc": 0.47292571,
43
+ "epoch": 0.04065040650406504,
44
+ "grad_norm": 1.1328125,
45
+ "learning_rate": 3.0405405405405404e-05,
46
+ "loss": 2.50403137,
47
+ "memory(GiB)": 13.38,
48
+ "step": 15,
49
+ "train_speed(iter/s)": 0.150329
50
+ },
51
+ {
52
+ "acc": 0.52069569,
53
+ "epoch": 0.05420054200542006,
54
+ "grad_norm": 0.83984375,
55
+ "learning_rate": 4.054054054054054e-05,
56
+ "loss": 2.28312988,
57
+ "memory(GiB)": 13.38,
58
+ "step": 20,
59
+ "train_speed(iter/s)": 0.152832
60
+ },
61
+ {
62
+ "acc": 0.53072515,
63
+ "epoch": 0.06775067750677506,
64
+ "grad_norm": 0.59765625,
65
+ "learning_rate": 5.067567567567567e-05,
66
+ "loss": 2.20206394,
67
+ "memory(GiB)": 13.38,
68
+ "step": 25,
69
+ "train_speed(iter/s)": 0.154186
70
+ },
71
+ {
72
+ "acc": 0.52423077,
73
+ "epoch": 0.08130081300813008,
74
+ "grad_norm": 0.89453125,
75
+ "learning_rate": 6.081081081081081e-05,
76
+ "loss": 2.31712227,
77
+ "memory(GiB)": 14.44,
78
+ "step": 30,
79
+ "train_speed(iter/s)": 0.155403
80
+ },
81
+ {
82
+ "acc": 0.54341574,
83
+ "epoch": 0.0948509485094851,
84
+ "grad_norm": 0.7109375,
85
+ "learning_rate": 7.094594594594594e-05,
86
+ "loss": 2.03825722,
87
+ "memory(GiB)": 14.44,
88
+ "step": 35,
89
+ "train_speed(iter/s)": 0.156383
90
+ },
91
+ {
92
+ "acc": 0.56128917,
93
+ "epoch": 0.10840108401084012,
94
+ "grad_norm": 0.734375,
95
+ "learning_rate": 8.108108108108108e-05,
96
+ "loss": 1.98176575,
97
+ "memory(GiB)": 14.44,
98
+ "step": 40,
99
+ "train_speed(iter/s)": 0.156973
100
+ },
101
+ {
102
+ "acc": 0.53511982,
103
+ "epoch": 0.12195121951219512,
104
+ "grad_norm": 1.1484375,
105
+ "learning_rate": 9.121621621621621e-05,
106
+ "loss": 2.07999401,
107
+ "memory(GiB)": 14.44,
108
+ "step": 45,
109
+ "train_speed(iter/s)": 0.157395
110
+ },
111
+ {
112
+ "acc": 0.56112757,
113
+ "epoch": 0.13550135501355012,
114
+ "grad_norm": 0.83984375,
115
+ "learning_rate": 0.00010135135135135135,
116
+ "loss": 1.92188244,
117
+ "memory(GiB)": 14.44,
118
+ "step": 50,
119
+ "train_speed(iter/s)": 0.157924
120
+ },
121
+ {
122
+ "epoch": 0.13550135501355012,
123
+ "eval_acc": 0.5618983279851376,
124
+ "eval_loss": 1.901513695716858,
125
+ "eval_runtime": 44.6741,
126
+ "eval_samples_per_second": 0.851,
127
+ "eval_steps_per_second": 0.851,
128
+ "step": 50
129
+ },
130
+ {
131
+ "acc": 0.57467785,
132
+ "epoch": 0.14905149051490515,
133
+ "grad_norm": 0.9921875,
134
+ "learning_rate": 0.00011148648648648647,
135
+ "loss": 1.92710114,
136
+ "memory(GiB)": 15.21,
137
+ "step": 55,
138
+ "train_speed(iter/s)": 0.140365
139
+ },
140
+ {
141
+ "acc": 0.55041471,
142
+ "epoch": 0.16260162601626016,
143
+ "grad_norm": 1.1484375,
144
+ "learning_rate": 0.00012162162162162162,
145
+ "loss": 2.09731407,
146
+ "memory(GiB)": 15.21,
147
+ "step": 60,
148
+ "train_speed(iter/s)": 0.141894
149
+ },
150
+ {
151
+ "acc": 0.53853202,
152
+ "epoch": 0.17615176151761516,
153
+ "grad_norm": 1.03125,
154
+ "learning_rate": 0.00013175675675675675,
155
+ "loss": 2.01167774,
156
+ "memory(GiB)": 15.21,
157
+ "step": 65,
158
+ "train_speed(iter/s)": 0.143162
159
+ },
160
+ {
161
+ "acc": 0.57625084,
162
+ "epoch": 0.1897018970189702,
163
+ "grad_norm": 0.81640625,
164
+ "learning_rate": 0.00014189189189189188,
165
+ "loss": 1.77697067,
166
+ "memory(GiB)": 15.21,
167
+ "step": 70,
168
+ "train_speed(iter/s)": 0.144318
169
+ },
170
+ {
171
+ "acc": 0.57468171,
172
+ "epoch": 0.2032520325203252,
173
+ "grad_norm": 0.96484375,
174
+ "learning_rate": 0.00014989300998573466,
175
+ "loss": 1.92596684,
176
+ "memory(GiB)": 15.21,
177
+ "step": 75,
178
+ "train_speed(iter/s)": 0.145344
179
+ },
180
+ {
181
+ "acc": 0.58551345,
182
+ "epoch": 0.21680216802168023,
183
+ "grad_norm": 1.0546875,
184
+ "learning_rate": 0.00014935805991440798,
185
+ "loss": 1.85387783,
186
+ "memory(GiB)": 15.21,
187
+ "step": 80,
188
+ "train_speed(iter/s)": 0.146049
189
+ },
190
+ {
191
+ "acc": 0.54782548,
192
+ "epoch": 0.23035230352303523,
193
+ "grad_norm": 1.0078125,
194
+ "learning_rate": 0.0001488231098430813,
195
+ "loss": 2.04980812,
196
+ "memory(GiB)": 15.21,
197
+ "step": 85,
198
+ "train_speed(iter/s)": 0.146726
199
+ },
200
+ {
201
+ "acc": 0.56594706,
202
+ "epoch": 0.24390243902439024,
203
+ "grad_norm": 1.0,
204
+ "learning_rate": 0.0001482881597717546,
205
+ "loss": 1.93099308,
206
+ "memory(GiB)": 15.21,
207
+ "step": 90,
208
+ "train_speed(iter/s)": 0.147437
209
+ },
210
+ {
211
+ "acc": 0.57289362,
212
+ "epoch": 0.25745257452574527,
213
+ "grad_norm": 0.8046875,
214
+ "learning_rate": 0.00014775320970042795,
215
+ "loss": 1.84038963,
216
+ "memory(GiB)": 15.58,
217
+ "step": 95,
218
+ "train_speed(iter/s)": 0.147976
219
+ },
220
+ {
221
+ "acc": 0.5938961,
222
+ "epoch": 0.27100271002710025,
223
+ "grad_norm": 1.015625,
224
+ "learning_rate": 0.00014721825962910127,
225
+ "loss": 1.71151619,
226
+ "memory(GiB)": 15.58,
227
+ "step": 100,
228
+ "train_speed(iter/s)": 0.148573
229
+ },
230
+ {
231
+ "epoch": 0.27100271002710025,
232
+ "eval_acc": 0.587907448066205,
233
+ "eval_loss": 1.7437644004821777,
234
+ "eval_runtime": 44.775,
235
+ "eval_samples_per_second": 0.849,
236
+ "eval_steps_per_second": 0.849,
237
+ "step": 100
238
+ },
239
+ {
240
+ "acc": 0.5828393,
241
+ "epoch": 0.2845528455284553,
242
+ "grad_norm": 1.078125,
243
+ "learning_rate": 0.00014668330955777461,
244
+ "loss": 1.90272522,
245
+ "memory(GiB)": 15.58,
246
+ "step": 105,
247
+ "train_speed(iter/s)": 0.140241
248
+ },
249
+ {
250
+ "acc": 0.57981925,
251
+ "epoch": 0.2981029810298103,
252
+ "grad_norm": 0.859375,
253
+ "learning_rate": 0.00014614835948644793,
254
+ "loss": 1.88997154,
255
+ "memory(GiB)": 15.58,
256
+ "step": 110,
257
+ "train_speed(iter/s)": 0.141057
258
+ },
259
+ {
260
+ "acc": 0.56191244,
261
+ "epoch": 0.3116531165311653,
262
+ "grad_norm": 0.94140625,
263
+ "learning_rate": 0.00014561340941512125,
264
+ "loss": 1.92102909,
265
+ "memory(GiB)": 15.58,
266
+ "step": 115,
267
+ "train_speed(iter/s)": 0.141824
268
+ },
269
+ {
270
+ "acc": 0.57420607,
271
+ "epoch": 0.3252032520325203,
272
+ "grad_norm": 1.171875,
273
+ "learning_rate": 0.00014507845934379456,
274
+ "loss": 1.80924416,
275
+ "memory(GiB)": 15.58,
276
+ "step": 120,
277
+ "train_speed(iter/s)": 0.14256
278
+ },
279
+ {
280
+ "acc": 0.57703466,
281
+ "epoch": 0.33875338753387535,
282
+ "grad_norm": 0.84765625,
283
+ "learning_rate": 0.00014454350927246788,
284
+ "loss": 1.85804043,
285
+ "memory(GiB)": 15.58,
286
+ "step": 125,
287
+ "train_speed(iter/s)": 0.143173
288
+ },
289
+ {
290
+ "acc": 0.58581357,
291
+ "epoch": 0.3523035230352303,
292
+ "grad_norm": 0.96484375,
293
+ "learning_rate": 0.0001440085592011412,
294
+ "loss": 1.75103779,
295
+ "memory(GiB)": 15.58,
296
+ "step": 130,
297
+ "train_speed(iter/s)": 0.143759
298
+ },
299
+ {
300
+ "acc": 0.59179254,
301
+ "epoch": 0.36585365853658536,
302
+ "grad_norm": 0.93359375,
303
+ "learning_rate": 0.00014347360912981454,
304
+ "loss": 1.67790794,
305
+ "memory(GiB)": 15.58,
306
+ "step": 135,
307
+ "train_speed(iter/s)": 0.14428
308
+ },
309
+ {
310
+ "acc": 0.59449296,
311
+ "epoch": 0.3794037940379404,
312
+ "grad_norm": 0.74609375,
313
+ "learning_rate": 0.00014293865905848786,
314
+ "loss": 1.72561359,
315
+ "memory(GiB)": 15.58,
316
+ "step": 140,
317
+ "train_speed(iter/s)": 0.144799
318
+ },
319
+ {
320
+ "acc": 0.57711525,
321
+ "epoch": 0.39295392953929537,
322
+ "grad_norm": 1.109375,
323
+ "learning_rate": 0.0001424037089871612,
324
+ "loss": 1.79506092,
325
+ "memory(GiB)": 15.58,
326
+ "step": 145,
327
+ "train_speed(iter/s)": 0.14532
328
+ },
329
+ {
330
+ "acc": 0.57541366,
331
+ "epoch": 0.4065040650406504,
332
+ "grad_norm": 5.46875,
333
+ "learning_rate": 0.00014186875891583452,
334
+ "loss": 1.80377541,
335
+ "memory(GiB)": 15.58,
336
+ "step": 150,
337
+ "train_speed(iter/s)": 0.145757
338
+ },
339
+ {
340
+ "epoch": 0.4065040650406504,
341
+ "eval_acc": 0.5983786522546867,
342
+ "eval_loss": 1.676965594291687,
343
+ "eval_runtime": 44.3227,
344
+ "eval_samples_per_second": 0.857,
345
+ "eval_steps_per_second": 0.857,
346
+ "step": 150
347
+ },
348
+ {
349
+ "acc": 0.58702893,
350
+ "epoch": 0.42005420054200543,
351
+ "grad_norm": 1.0625,
352
+ "learning_rate": 0.00014133380884450783,
353
+ "loss": 1.86865826,
354
+ "memory(GiB)": 15.58,
355
+ "step": 155,
356
+ "train_speed(iter/s)": 0.140358
357
+ },
358
+ {
359
+ "acc": 0.58541198,
360
+ "epoch": 0.43360433604336046,
361
+ "grad_norm": 1.015625,
362
+ "learning_rate": 0.00014079885877318115,
363
+ "loss": 1.82644196,
364
+ "memory(GiB)": 15.58,
365
+ "step": 160,
366
+ "train_speed(iter/s)": 0.140936
367
+ },
368
+ {
369
+ "acc": 0.58856125,
370
+ "epoch": 0.44715447154471544,
371
+ "grad_norm": 0.9921875,
372
+ "learning_rate": 0.00014026390870185447,
373
+ "loss": 1.68714104,
374
+ "memory(GiB)": 15.58,
375
+ "step": 165,
376
+ "train_speed(iter/s)": 0.141495
377
+ },
378
+ {
379
+ "acc": 0.59458299,
380
+ "epoch": 0.46070460704607047,
381
+ "grad_norm": 0.91015625,
382
+ "learning_rate": 0.0001397289586305278,
383
+ "loss": 1.72063637,
384
+ "memory(GiB)": 15.58,
385
+ "step": 170,
386
+ "train_speed(iter/s)": 0.142005
387
+ },
388
+ {
389
+ "acc": 0.57354069,
390
+ "epoch": 0.4742547425474255,
391
+ "grad_norm": 0.7734375,
392
+ "learning_rate": 0.00013919400855920113,
393
+ "loss": 1.83291931,
394
+ "memory(GiB)": 15.58,
395
+ "step": 175,
396
+ "train_speed(iter/s)": 0.142509
397
+ },
398
+ {
399
+ "acc": 0.58851786,
400
+ "epoch": 0.4878048780487805,
401
+ "grad_norm": 1.234375,
402
+ "learning_rate": 0.00013865905848787447,
403
+ "loss": 1.70540199,
404
+ "memory(GiB)": 15.58,
405
+ "step": 180,
406
+ "train_speed(iter/s)": 0.143002
407
+ },
408
+ {
409
+ "acc": 0.59666262,
410
+ "epoch": 0.5013550135501355,
411
+ "grad_norm": 0.87890625,
412
+ "learning_rate": 0.0001381241084165478,
413
+ "loss": 1.70772285,
414
+ "memory(GiB)": 15.58,
415
+ "step": 185,
416
+ "train_speed(iter/s)": 0.143463
417
+ },
418
+ {
419
+ "acc": 0.58001013,
420
+ "epoch": 0.5149051490514905,
421
+ "grad_norm": 0.890625,
422
+ "learning_rate": 0.0001375891583452211,
423
+ "loss": 1.73730106,
424
+ "memory(GiB)": 15.58,
425
+ "step": 190,
426
+ "train_speed(iter/s)": 0.143883
427
+ },
428
+ {
429
+ "acc": 0.5934463,
430
+ "epoch": 0.5284552845528455,
431
+ "grad_norm": 1.421875,
432
+ "learning_rate": 0.00013705420827389442,
433
+ "loss": 1.68941402,
434
+ "memory(GiB)": 15.58,
435
+ "step": 195,
436
+ "train_speed(iter/s)": 0.144297
437
+ },
438
+ {
439
+ "acc": 0.58476119,
440
+ "epoch": 0.5420054200542005,
441
+ "grad_norm": 0.91015625,
442
+ "learning_rate": 0.00013651925820256774,
443
+ "loss": 1.74896088,
444
+ "memory(GiB)": 15.58,
445
+ "step": 200,
446
+ "train_speed(iter/s)": 0.144655
447
+ },
448
+ {
449
+ "epoch": 0.5420054200542005,
450
+ "eval_acc": 0.6105387603445364,
451
+ "eval_loss": 1.6297248601913452,
452
+ "eval_runtime": 44.3331,
453
+ "eval_samples_per_second": 0.857,
454
+ "eval_steps_per_second": 0.857,
455
+ "step": 200
456
+ },
457
+ {
458
+ "acc": 0.58323898,
459
+ "epoch": 0.5555555555555556,
460
+ "grad_norm": 0.79296875,
461
+ "learning_rate": 0.00013598430813124105,
462
+ "loss": 1.74196358,
463
+ "memory(GiB)": 15.58,
464
+ "step": 205,
465
+ "train_speed(iter/s)": 0.140612
466
+ },
467
+ {
468
+ "acc": 0.59854908,
469
+ "epoch": 0.5691056910569106,
470
+ "grad_norm": 1.109375,
471
+ "learning_rate": 0.0001354493580599144,
472
+ "loss": 1.6279623,
473
+ "memory(GiB)": 15.58,
474
+ "step": 210,
475
+ "train_speed(iter/s)": 0.141053
476
+ },
477
+ {
478
+ "acc": 0.58306313,
479
+ "epoch": 0.5826558265582655,
480
+ "grad_norm": 0.98828125,
481
+ "learning_rate": 0.00013491440798858771,
482
+ "loss": 1.85492191,
483
+ "memory(GiB)": 15.58,
484
+ "step": 215,
485
+ "train_speed(iter/s)": 0.141461
486
+ },
487
+ {
488
+ "acc": 0.58454275,
489
+ "epoch": 0.5962059620596206,
490
+ "grad_norm": 0.890625,
491
+ "learning_rate": 0.00013437945791726106,
492
+ "loss": 1.75104046,
493
+ "memory(GiB)": 15.58,
494
+ "step": 220,
495
+ "train_speed(iter/s)": 0.141869
496
+ },
497
+ {
498
+ "acc": 0.60898943,
499
+ "epoch": 0.6097560975609756,
500
+ "grad_norm": 0.828125,
501
+ "learning_rate": 0.00013384450784593437,
502
+ "loss": 1.57786808,
503
+ "memory(GiB)": 15.58,
504
+ "step": 225,
505
+ "train_speed(iter/s)": 0.142237
506
+ },
507
+ {
508
+ "acc": 0.58954048,
509
+ "epoch": 0.6233062330623306,
510
+ "grad_norm": 1.1640625,
511
+ "learning_rate": 0.0001333095577746077,
512
+ "loss": 1.72581081,
513
+ "memory(GiB)": 15.58,
514
+ "step": 230,
515
+ "train_speed(iter/s)": 0.142622
516
+ },
517
+ {
518
+ "acc": 0.59608021,
519
+ "epoch": 0.6368563685636857,
520
+ "grad_norm": 1.2265625,
521
+ "learning_rate": 0.000132774607703281,
522
+ "loss": 1.70160866,
523
+ "memory(GiB)": 15.58,
524
+ "step": 235,
525
+ "train_speed(iter/s)": 0.142983
526
+ },
527
+ {
528
+ "acc": 0.57084417,
529
+ "epoch": 0.6504065040650406,
530
+ "grad_norm": 1.1640625,
531
+ "learning_rate": 0.00013223965763195432,
532
+ "loss": 1.81941319,
533
+ "memory(GiB)": 15.58,
534
+ "step": 240,
535
+ "train_speed(iter/s)": 0.143329
536
+ },
537
+ {
538
+ "acc": 0.61476159,
539
+ "epoch": 0.6639566395663956,
540
+ "grad_norm": 0.9453125,
541
+ "learning_rate": 0.00013170470756062767,
542
+ "loss": 1.68505306,
543
+ "memory(GiB)": 15.58,
544
+ "step": 245,
545
+ "train_speed(iter/s)": 0.143652
546
+ },
547
+ {
548
+ "acc": 0.5995626,
549
+ "epoch": 0.6775067750677507,
550
+ "grad_norm": 0.9140625,
551
+ "learning_rate": 0.00013116975748930098,
552
+ "loss": 1.77789631,
553
+ "memory(GiB)": 15.58,
554
+ "step": 250,
555
+ "train_speed(iter/s)": 0.144002
556
+ },
557
+ {
558
+ "epoch": 0.6775067750677507,
559
+ "eval_acc": 0.61087654112481,
560
+ "eval_loss": 1.6108067035675049,
561
+ "eval_runtime": 44.1094,
562
+ "eval_samples_per_second": 0.861,
563
+ "eval_steps_per_second": 0.861,
564
+ "step": 250
565
+ },
566
+ {
567
+ "acc": 0.62165804,
568
+ "epoch": 0.6910569105691057,
569
+ "grad_norm": 1.6171875,
570
+ "learning_rate": 0.0001306348074179743,
571
+ "loss": 1.57510653,
572
+ "memory(GiB)": 15.58,
573
+ "step": 255,
574
+ "train_speed(iter/s)": 0.140805
575
+ },
576
+ {
577
+ "acc": 0.59346585,
578
+ "epoch": 0.7046070460704607,
579
+ "grad_norm": 1.046875,
580
+ "learning_rate": 0.00013009985734664764,
581
+ "loss": 1.76646061,
582
+ "memory(GiB)": 15.58,
583
+ "step": 260,
584
+ "train_speed(iter/s)": 0.141165
585
+ },
586
+ {
587
+ "acc": 0.60632119,
588
+ "epoch": 0.7181571815718157,
589
+ "grad_norm": 1.078125,
590
+ "learning_rate": 0.00012956490727532096,
591
+ "loss": 1.59940271,
592
+ "memory(GiB)": 15.95,
593
+ "step": 265,
594
+ "train_speed(iter/s)": 0.141502
595
+ },
596
+ {
597
+ "acc": 0.59642992,
598
+ "epoch": 0.7317073170731707,
599
+ "grad_norm": 1.203125,
600
+ "learning_rate": 0.00012902995720399428,
601
+ "loss": 1.6549921,
602
+ "memory(GiB)": 15.95,
603
+ "step": 270,
604
+ "train_speed(iter/s)": 0.141855
605
+ },
606
+ {
607
+ "acc": 0.60280037,
608
+ "epoch": 0.7452574525745257,
609
+ "grad_norm": 1.0625,
610
+ "learning_rate": 0.0001284950071326676,
611
+ "loss": 1.63179569,
612
+ "memory(GiB)": 15.95,
613
+ "step": 275,
614
+ "train_speed(iter/s)": 0.142164
615
+ },
616
+ {
617
+ "acc": 0.594034,
618
+ "epoch": 0.7588075880758808,
619
+ "grad_norm": 1.1796875,
620
+ "learning_rate": 0.00012796005706134094,
621
+ "loss": 1.61152973,
622
+ "memory(GiB)": 15.95,
623
+ "step": 280,
624
+ "train_speed(iter/s)": 0.142474
625
+ },
626
+ {
627
+ "acc": 0.59763761,
628
+ "epoch": 0.7723577235772358,
629
+ "grad_norm": 1.0859375,
630
+ "learning_rate": 0.00012742510699001425,
631
+ "loss": 1.66443863,
632
+ "memory(GiB)": 15.95,
633
+ "step": 285,
634
+ "train_speed(iter/s)": 0.142791
635
+ },
636
+ {
637
+ "acc": 0.58316121,
638
+ "epoch": 0.7859078590785907,
639
+ "grad_norm": 0.73828125,
640
+ "learning_rate": 0.00012689015691868757,
641
+ "loss": 1.77028389,
642
+ "memory(GiB)": 15.95,
643
+ "step": 290,
644
+ "train_speed(iter/s)": 0.143067
645
+ },
646
+ {
647
+ "acc": 0.60665183,
648
+ "epoch": 0.7994579945799458,
649
+ "grad_norm": 1.25,
650
+ "learning_rate": 0.00012635520684736091,
651
+ "loss": 1.64849663,
652
+ "memory(GiB)": 15.95,
653
+ "step": 295,
654
+ "train_speed(iter/s)": 0.143366
655
+ },
656
+ {
657
+ "acc": 0.55526123,
658
+ "epoch": 0.8130081300813008,
659
+ "grad_norm": 1.203125,
660
+ "learning_rate": 0.00012582025677603423,
661
+ "loss": 1.86367321,
662
+ "memory(GiB)": 15.95,
663
+ "step": 300,
664
+ "train_speed(iter/s)": 0.143649
665
+ },
666
+ {
667
+ "epoch": 0.8130081300813008,
668
+ "eval_acc": 0.6129032258064516,
669
+ "eval_loss": 1.5865528583526611,
670
+ "eval_runtime": 44.0975,
671
+ "eval_samples_per_second": 0.862,
672
+ "eval_steps_per_second": 0.862,
673
+ "step": 300
674
+ },
675
+ {
676
+ "acc": 0.6019598,
677
+ "epoch": 0.8265582655826558,
678
+ "grad_norm": 1.1328125,
679
+ "learning_rate": 0.00012528530670470755,
680
+ "loss": 1.71853142,
681
+ "memory(GiB)": 15.95,
682
+ "step": 305,
683
+ "train_speed(iter/s)": 0.14098
684
+ },
685
+ {
686
+ "acc": 0.6009192,
687
+ "epoch": 0.8401084010840109,
688
+ "grad_norm": 1.359375,
689
+ "learning_rate": 0.00012475035663338086,
690
+ "loss": 1.72751312,
691
+ "memory(GiB)": 16.34,
692
+ "step": 310,
693
+ "train_speed(iter/s)": 0.141284
694
+ },
695
+ {
696
+ "acc": 0.58907671,
697
+ "epoch": 0.8536585365853658,
698
+ "grad_norm": 0.98046875,
699
+ "learning_rate": 0.0001242154065620542,
700
+ "loss": 1.71860523,
701
+ "memory(GiB)": 16.34,
702
+ "step": 315,
703
+ "train_speed(iter/s)": 0.141562
704
+ },
705
+ {
706
+ "acc": 0.62246222,
707
+ "epoch": 0.8672086720867209,
708
+ "grad_norm": 1.0234375,
709
+ "learning_rate": 0.00012368045649072752,
710
+ "loss": 1.58847914,
711
+ "memory(GiB)": 16.34,
712
+ "step": 320,
713
+ "train_speed(iter/s)": 0.141848
714
+ },
715
+ {
716
+ "acc": 0.61927052,
717
+ "epoch": 0.8807588075880759,
718
+ "grad_norm": 1.59375,
719
+ "learning_rate": 0.00012314550641940084,
720
+ "loss": 1.54207745,
721
+ "memory(GiB)": 16.34,
722
+ "step": 325,
723
+ "train_speed(iter/s)": 0.142133
724
+ },
725
+ {
726
+ "acc": 0.60672359,
727
+ "epoch": 0.8943089430894309,
728
+ "grad_norm": 2.140625,
729
+ "learning_rate": 0.00012261055634807416,
730
+ "loss": 1.71789646,
731
+ "memory(GiB)": 16.34,
732
+ "step": 330,
733
+ "train_speed(iter/s)": 0.142392
734
+ },
735
+ {
736
+ "acc": 0.63249903,
737
+ "epoch": 0.907859078590786,
738
+ "grad_norm": 1.1328125,
739
+ "learning_rate": 0.0001220756062767475,
740
+ "loss": 1.46934195,
741
+ "memory(GiB)": 16.34,
742
+ "step": 335,
743
+ "train_speed(iter/s)": 0.142658
744
+ },
745
+ {
746
+ "acc": 0.60973873,
747
+ "epoch": 0.9214092140921409,
748
+ "grad_norm": 0.84375,
749
+ "learning_rate": 0.00012154065620542082,
750
+ "loss": 1.67951736,
751
+ "memory(GiB)": 16.34,
752
+ "step": 340,
753
+ "train_speed(iter/s)": 0.142914
754
+ },
755
+ {
756
+ "acc": 0.58657136,
757
+ "epoch": 0.9349593495934959,
758
+ "grad_norm": 1.2734375,
759
+ "learning_rate": 0.00012100570613409413,
760
+ "loss": 1.68692303,
761
+ "memory(GiB)": 16.34,
762
+ "step": 345,
763
+ "train_speed(iter/s)": 0.143167
764
+ },
765
+ {
766
+ "acc": 0.59965162,
767
+ "epoch": 0.948509485094851,
768
+ "grad_norm": 1.421875,
769
+ "learning_rate": 0.00012047075606276746,
770
+ "loss": 1.71949959,
771
+ "memory(GiB)": 16.34,
772
+ "step": 350,
773
+ "train_speed(iter/s)": 0.143387
774
+ },
775
+ {
776
+ "epoch": 0.948509485094851,
777
+ "eval_acc": 0.6171254855598717,
778
+ "eval_loss": 1.562721610069275,
779
+ "eval_runtime": 44.0566,
780
+ "eval_samples_per_second": 0.863,
781
+ "eval_steps_per_second": 0.863,
782
+ "step": 350
783
+ },
784
+ {
785
+ "acc": 0.59322553,
786
+ "epoch": 0.962059620596206,
787
+ "grad_norm": 1.3046875,
788
+ "learning_rate": 0.0001199358059914408,
789
+ "loss": 1.66712799,
790
+ "memory(GiB)": 16.34,
791
+ "step": 355,
792
+ "train_speed(iter/s)": 0.14111
793
+ },
794
+ {
795
+ "acc": 0.59898105,
796
+ "epoch": 0.975609756097561,
797
+ "grad_norm": 1.265625,
798
+ "learning_rate": 0.00011940085592011411,
799
+ "loss": 1.66754684,
800
+ "memory(GiB)": 16.34,
801
+ "step": 360,
802
+ "train_speed(iter/s)": 0.141373
803
+ },
804
+ {
805
+ "acc": 0.60211391,
806
+ "epoch": 0.989159891598916,
807
+ "grad_norm": 1.359375,
808
+ "learning_rate": 0.00011886590584878744,
809
+ "loss": 1.67542057,
810
+ "memory(GiB)": 16.34,
811
+ "step": 365,
812
+ "train_speed(iter/s)": 0.141638
813
+ },
814
+ {
815
+ "acc": 0.63076615,
816
+ "epoch": 1.002710027100271,
817
+ "grad_norm": 0.8515625,
818
+ "learning_rate": 0.00011833095577746076,
819
+ "loss": 1.51196938,
820
+ "memory(GiB)": 16.34,
821
+ "step": 370,
822
+ "train_speed(iter/s)": 0.141355
823
+ },
824
+ {
825
+ "acc": 0.63542643,
826
+ "epoch": 1.016260162601626,
827
+ "grad_norm": 0.828125,
828
+ "learning_rate": 0.00011779600570613407,
829
+ "loss": 1.38132334,
830
+ "memory(GiB)": 16.34,
831
+ "step": 375,
832
+ "train_speed(iter/s)": 0.141603
833
+ },
834
+ {
835
+ "acc": 0.66358423,
836
+ "epoch": 1.029810298102981,
837
+ "grad_norm": 0.859375,
838
+ "learning_rate": 0.00011726105563480742,
839
+ "loss": 1.41512909,
840
+ "memory(GiB)": 16.34,
841
+ "step": 380,
842
+ "train_speed(iter/s)": 0.141845
843
+ },
844
+ {
845
+ "acc": 0.63692493,
846
+ "epoch": 1.043360433604336,
847
+ "grad_norm": 1.0859375,
848
+ "learning_rate": 0.00011672610556348073,
849
+ "loss": 1.38078823,
850
+ "memory(GiB)": 16.34,
851
+ "step": 385,
852
+ "train_speed(iter/s)": 0.142082
853
+ },
854
+ {
855
+ "acc": 0.6641499,
856
+ "epoch": 1.056910569105691,
857
+ "grad_norm": 1.125,
858
+ "learning_rate": 0.00011619115549215406,
859
+ "loss": 1.3458046,
860
+ "memory(GiB)": 16.34,
861
+ "step": 390,
862
+ "train_speed(iter/s)": 0.142305
863
+ },
864
+ {
865
+ "acc": 0.67276783,
866
+ "epoch": 1.070460704607046,
867
+ "grad_norm": 0.98828125,
868
+ "learning_rate": 0.00011565620542082738,
869
+ "loss": 1.2076004,
870
+ "memory(GiB)": 16.34,
871
+ "step": 395,
872
+ "train_speed(iter/s)": 0.142533
873
+ },
874
+ {
875
+ "acc": 0.65821433,
876
+ "epoch": 1.084010840108401,
877
+ "grad_norm": 1.15625,
878
+ "learning_rate": 0.0001151212553495007,
879
+ "loss": 1.38641891,
880
+ "memory(GiB)": 16.34,
881
+ "step": 400,
882
+ "train_speed(iter/s)": 0.142742
883
+ },
884
+ {
885
+ "epoch": 1.084010840108401,
886
+ "eval_acc": 0.6162810336091876,
887
+ "eval_loss": 1.5887514352798462,
888
+ "eval_runtime": 44.1624,
889
+ "eval_samples_per_second": 0.86,
890
+ "eval_steps_per_second": 0.86,
891
+ "step": 400
892
+ },
893
+ {
894
+ "acc": 0.63352804,
895
+ "epoch": 1.0975609756097562,
896
+ "grad_norm": 1.2421875,
897
+ "learning_rate": 0.00011458630527817403,
898
+ "loss": 1.44696302,
899
+ "memory(GiB)": 16.34,
900
+ "step": 405,
901
+ "train_speed(iter/s)": 0.140759
902
+ },
903
+ {
904
+ "acc": 0.61955671,
905
+ "epoch": 1.1111111111111112,
906
+ "grad_norm": 1.5546875,
907
+ "learning_rate": 0.00011405135520684734,
908
+ "loss": 1.48915672,
909
+ "memory(GiB)": 16.34,
910
+ "step": 410,
911
+ "train_speed(iter/s)": 0.140987
912
+ },
913
+ {
914
+ "acc": 0.69259667,
915
+ "epoch": 1.1246612466124661,
916
+ "grad_norm": 0.796875,
917
+ "learning_rate": 0.00011351640513552069,
918
+ "loss": 1.22069292,
919
+ "memory(GiB)": 16.34,
920
+ "step": 415,
921
+ "train_speed(iter/s)": 0.141191
922
+ },
923
+ {
924
+ "acc": 0.67346158,
925
+ "epoch": 1.1382113821138211,
926
+ "grad_norm": 1.171875,
927
+ "learning_rate": 0.000112981455064194,
928
+ "loss": 1.36795778,
929
+ "memory(GiB)": 16.34,
930
+ "step": 420,
931
+ "train_speed(iter/s)": 0.141417
932
+ },
933
+ {
934
+ "acc": 0.67346702,
935
+ "epoch": 1.151761517615176,
936
+ "grad_norm": 1.0390625,
937
+ "learning_rate": 0.00011244650499286732,
938
+ "loss": 1.27111759,
939
+ "memory(GiB)": 16.34,
940
+ "step": 425,
941
+ "train_speed(iter/s)": 0.141634
942
+ },
943
+ {
944
+ "acc": 0.61625972,
945
+ "epoch": 1.165311653116531,
946
+ "grad_norm": 1.2578125,
947
+ "learning_rate": 0.00011191155492154065,
948
+ "loss": 1.42474003,
949
+ "memory(GiB)": 16.34,
950
+ "step": 430,
951
+ "train_speed(iter/s)": 0.141853
952
+ },
953
+ {
954
+ "acc": 0.65880041,
955
+ "epoch": 1.1788617886178863,
956
+ "grad_norm": 1.2578125,
957
+ "learning_rate": 0.00011137660485021397,
958
+ "loss": 1.33981524,
959
+ "memory(GiB)": 16.34,
960
+ "step": 435,
961
+ "train_speed(iter/s)": 0.142048
962
+ },
963
+ {
964
+ "acc": 0.63503499,
965
+ "epoch": 1.1924119241192412,
966
+ "grad_norm": 1.2265625,
967
+ "learning_rate": 0.00011084165477888728,
968
+ "loss": 1.44122829,
969
+ "memory(GiB)": 16.34,
970
+ "step": 440,
971
+ "train_speed(iter/s)": 0.142254
972
+ },
973
+ {
974
+ "acc": 0.61518383,
975
+ "epoch": 1.2059620596205962,
976
+ "grad_norm": 1.4765625,
977
+ "learning_rate": 0.00011030670470756061,
978
+ "loss": 1.54818697,
979
+ "memory(GiB)": 16.34,
980
+ "step": 445,
981
+ "train_speed(iter/s)": 0.142446
982
+ },
983
+ {
984
+ "acc": 0.68010144,
985
+ "epoch": 1.2195121951219512,
986
+ "grad_norm": 1.3828125,
987
+ "learning_rate": 0.00010977175463623394,
988
+ "loss": 1.28560524,
989
+ "memory(GiB)": 16.34,
990
+ "step": 450,
991
+ "train_speed(iter/s)": 0.142639
992
+ },
993
+ {
994
+ "epoch": 1.2195121951219512,
995
+ "eval_acc": 0.6230366492146597,
996
+ "eval_loss": 1.5483555793762207,
997
+ "eval_runtime": 44.0588,
998
+ "eval_samples_per_second": 0.862,
999
+ "eval_steps_per_second": 0.862,
1000
+ "step": 450
1001
+ },
1002
+ {
1003
+ "acc": 0.64718337,
1004
+ "epoch": 1.2330623306233062,
1005
+ "grad_norm": 1.4375,
1006
+ "learning_rate": 0.00010923680456490727,
1007
+ "loss": 1.3778326,
1008
+ "memory(GiB)": 16.34,
1009
+ "step": 455,
1010
+ "train_speed(iter/s)": 0.140879
1011
+ },
1012
+ {
1013
+ "acc": 0.68637676,
1014
+ "epoch": 1.2466124661246614,
1015
+ "grad_norm": 1.453125,
1016
+ "learning_rate": 0.00010870185449358059,
1017
+ "loss": 1.21860657,
1018
+ "memory(GiB)": 16.34,
1019
+ "step": 460,
1020
+ "train_speed(iter/s)": 0.141097
1021
+ },
1022
+ {
1023
+ "acc": 0.63144946,
1024
+ "epoch": 1.2601626016260163,
1025
+ "grad_norm": 1.0703125,
1026
+ "learning_rate": 0.00010816690442225392,
1027
+ "loss": 1.38577023,
1028
+ "memory(GiB)": 16.34,
1029
+ "step": 465,
1030
+ "train_speed(iter/s)": 0.141294
1031
+ },
1032
+ {
1033
+ "acc": 0.66895781,
1034
+ "epoch": 1.2737127371273713,
1035
+ "grad_norm": 1.296875,
1036
+ "learning_rate": 0.00010763195435092724,
1037
+ "loss": 1.29167385,
1038
+ "memory(GiB)": 16.34,
1039
+ "step": 470,
1040
+ "train_speed(iter/s)": 0.141488
1041
+ },
1042
+ {
1043
+ "acc": 0.6553885,
1044
+ "epoch": 1.2872628726287263,
1045
+ "grad_norm": 1.1484375,
1046
+ "learning_rate": 0.00010709700427960055,
1047
+ "loss": 1.35011473,
1048
+ "memory(GiB)": 16.34,
1049
+ "step": 475,
1050
+ "train_speed(iter/s)": 0.141687
1051
+ },
1052
+ {
1053
+ "acc": 0.65576148,
1054
+ "epoch": 1.3008130081300813,
1055
+ "grad_norm": 1.1796875,
1056
+ "learning_rate": 0.00010656205420827388,
1057
+ "loss": 1.39116755,
1058
+ "memory(GiB)": 16.34,
1059
+ "step": 480,
1060
+ "train_speed(iter/s)": 0.141863
1061
+ },
1062
+ {
1063
+ "acc": 0.62025633,
1064
+ "epoch": 1.3143631436314362,
1065
+ "grad_norm": 1.375,
1066
+ "learning_rate": 0.00010602710413694721,
1067
+ "loss": 1.55190315,
1068
+ "memory(GiB)": 16.34,
1069
+ "step": 485,
1070
+ "train_speed(iter/s)": 0.142058
1071
+ },
1072
+ {
1073
+ "acc": 0.65667233,
1074
+ "epoch": 1.3279132791327912,
1075
+ "grad_norm": 1.1484375,
1076
+ "learning_rate": 0.00010549215406562054,
1077
+ "loss": 1.30497828,
1078
+ "memory(GiB)": 16.34,
1079
+ "step": 490,
1080
+ "train_speed(iter/s)": 0.142237
1081
+ },
1082
+ {
1083
+ "acc": 0.6458045,
1084
+ "epoch": 1.3414634146341464,
1085
+ "grad_norm": 2.421875,
1086
+ "learning_rate": 0.00010495720399429386,
1087
+ "loss": 1.40175285,
1088
+ "memory(GiB)": 16.34,
1089
+ "step": 495,
1090
+ "train_speed(iter/s)": 0.14241
1091
+ },
1092
+ {
1093
+ "acc": 0.65004997,
1094
+ "epoch": 1.3550135501355014,
1095
+ "grad_norm": 1.265625,
1096
+ "learning_rate": 0.00010442225392296718,
1097
+ "loss": 1.31625471,
1098
+ "memory(GiB)": 16.34,
1099
+ "step": 500,
1100
+ "train_speed(iter/s)": 0.142583
1101
+ },
1102
+ {
1103
+ "epoch": 1.3550135501355014,
1104
+ "eval_acc": 0.61932106063165,
1105
+ "eval_loss": 1.535814642906189,
1106
+ "eval_runtime": 44.07,
1107
+ "eval_samples_per_second": 0.862,
1108
+ "eval_steps_per_second": 0.862,
1109
+ "step": 500
1110
+ },
1111
+ {
1112
+ "acc": 0.65512385,
1113
+ "epoch": 1.3685636856368564,
1114
+ "grad_norm": 0.9375,
1115
+ "learning_rate": 0.00010388730385164051,
1116
+ "loss": 1.2989893,
1117
+ "memory(GiB)": 16.34,
1118
+ "step": 505,
1119
+ "train_speed(iter/s)": 0.140995
1120
+ },
1121
+ {
1122
+ "acc": 0.65444102,
1123
+ "epoch": 1.3821138211382114,
1124
+ "grad_norm": 1.5625,
1125
+ "learning_rate": 0.00010335235378031382,
1126
+ "loss": 1.32140932,
1127
+ "memory(GiB)": 16.34,
1128
+ "step": 510,
1129
+ "train_speed(iter/s)": 0.141172
1130
+ },
1131
+ {
1132
+ "acc": 0.66242504,
1133
+ "epoch": 1.3956639566395663,
1134
+ "grad_norm": 0.9140625,
1135
+ "learning_rate": 0.00010281740370898714,
1136
+ "loss": 1.34259834,
1137
+ "memory(GiB)": 16.34,
1138
+ "step": 515,
1139
+ "train_speed(iter/s)": 0.141336
1140
+ },
1141
+ {
1142
+ "acc": 0.62716827,
1143
+ "epoch": 1.4092140921409215,
1144
+ "grad_norm": 1.4296875,
1145
+ "learning_rate": 0.00010228245363766048,
1146
+ "loss": 1.46508284,
1147
+ "memory(GiB)": 16.34,
1148
+ "step": 520,
1149
+ "train_speed(iter/s)": 0.141508
1150
+ },
1151
+ {
1152
+ "acc": 0.61702657,
1153
+ "epoch": 1.4227642276422765,
1154
+ "grad_norm": 1.5546875,
1155
+ "learning_rate": 0.0001017475035663338,
1156
+ "loss": 1.43188276,
1157
+ "memory(GiB)": 16.34,
1158
+ "step": 525,
1159
+ "train_speed(iter/s)": 0.141681
1160
+ },
1161
+ {
1162
+ "acc": 0.63238263,
1163
+ "epoch": 1.4363143631436315,
1164
+ "grad_norm": 1.3671875,
1165
+ "learning_rate": 0.00010121255349500713,
1166
+ "loss": 1.52214499,
1167
+ "memory(GiB)": 16.34,
1168
+ "step": 530,
1169
+ "train_speed(iter/s)": 0.141854
1170
+ },
1171
+ {
1172
+ "acc": 0.6323854,
1173
+ "epoch": 1.4498644986449865,
1174
+ "grad_norm": 1.375,
1175
+ "learning_rate": 0.00010067760342368045,
1176
+ "loss": 1.38910236,
1177
+ "memory(GiB)": 16.34,
1178
+ "step": 535,
1179
+ "train_speed(iter/s)": 0.142019
1180
+ },
1181
+ {
1182
+ "acc": 0.60772176,
1183
+ "epoch": 1.4634146341463414,
1184
+ "grad_norm": 1.21875,
1185
+ "learning_rate": 0.00010014265335235376,
1186
+ "loss": 1.46768923,
1187
+ "memory(GiB)": 16.34,
1188
+ "step": 540,
1189
+ "train_speed(iter/s)": 0.14218
1190
+ },
1191
+ {
1192
+ "acc": 0.67315965,
1193
+ "epoch": 1.4769647696476964,
1194
+ "grad_norm": 1.34375,
1195
+ "learning_rate": 9.96077032810271e-05,
1196
+ "loss": 1.25136938,
1197
+ "memory(GiB)": 16.34,
1198
+ "step": 545,
1199
+ "train_speed(iter/s)": 0.142338
1200
+ },
1201
+ {
1202
+ "acc": 0.66408758,
1203
+ "epoch": 1.4905149051490514,
1204
+ "grad_norm": 2.046875,
1205
+ "learning_rate": 9.907275320970041e-05,
1206
+ "loss": 1.25655928,
1207
+ "memory(GiB)": 16.34,
1208
+ "step": 550,
1209
+ "train_speed(iter/s)": 0.142495
1210
+ },
1211
+ {
1212
+ "epoch": 1.4905149051490514,
1213
+ "eval_acc": 0.6254011146765749,
1214
+ "eval_loss": 1.5346177816390991,
1215
+ "eval_runtime": 44.1805,
1216
+ "eval_samples_per_second": 0.86,
1217
+ "eval_steps_per_second": 0.86,
1218
+ "step": 550
1219
+ },
1220
+ {
1221
+ "acc": 0.68655195,
1222
+ "epoch": 1.5040650406504064,
1223
+ "grad_norm": 1.125,
1224
+ "learning_rate": 9.853780313837375e-05,
1225
+ "loss": 1.08906736,
1226
+ "memory(GiB)": 16.34,
1227
+ "step": 555,
1228
+ "train_speed(iter/s)": 0.141037
1229
+ },
1230
+ {
1231
+ "acc": 0.65898876,
1232
+ "epoch": 1.5176151761517616,
1233
+ "grad_norm": 1.4296875,
1234
+ "learning_rate": 9.800285306704707e-05,
1235
+ "loss": 1.26696377,
1236
+ "memory(GiB)": 16.34,
1237
+ "step": 560,
1238
+ "train_speed(iter/s)": 0.14119
1239
+ },
1240
+ {
1241
+ "acc": 0.63674688,
1242
+ "epoch": 1.5311653116531165,
1243
+ "grad_norm": 1.5390625,
1244
+ "learning_rate": 9.746790299572039e-05,
1245
+ "loss": 1.38277016,
1246
+ "memory(GiB)": 16.34,
1247
+ "step": 565,
1248
+ "train_speed(iter/s)": 0.141349
1249
+ },
1250
+ {
1251
+ "acc": 0.67204466,
1252
+ "epoch": 1.5447154471544715,
1253
+ "grad_norm": 1.234375,
1254
+ "learning_rate": 9.693295292439372e-05,
1255
+ "loss": 1.25243311,
1256
+ "memory(GiB)": 16.34,
1257
+ "step": 570,
1258
+ "train_speed(iter/s)": 0.141502
1259
+ },
1260
+ {
1261
+ "acc": 0.67878027,
1262
+ "epoch": 1.5582655826558267,
1263
+ "grad_norm": 1.3828125,
1264
+ "learning_rate": 9.639800285306703e-05,
1265
+ "loss": 1.18253031,
1266
+ "memory(GiB)": 16.34,
1267
+ "step": 575,
1268
+ "train_speed(iter/s)": 0.141663
1269
+ },
1270
+ {
1271
+ "acc": 0.64645357,
1272
+ "epoch": 1.5718157181571817,
1273
+ "grad_norm": 1.671875,
1274
+ "learning_rate": 9.586305278174036e-05,
1275
+ "loss": 1.39789114,
1276
+ "memory(GiB)": 16.34,
1277
+ "step": 580,
1278
+ "train_speed(iter/s)": 0.141812
1279
+ },
1280
+ {
1281
+ "acc": 0.63896599,
1282
+ "epoch": 1.5853658536585367,
1283
+ "grad_norm": 1.5546875,
1284
+ "learning_rate": 9.532810271041368e-05,
1285
+ "loss": 1.37434454,
1286
+ "memory(GiB)": 16.34,
1287
+ "step": 585,
1288
+ "train_speed(iter/s)": 0.141961
1289
+ },
1290
+ {
1291
+ "acc": 0.67930498,
1292
+ "epoch": 1.5989159891598916,
1293
+ "grad_norm": 1.3671875,
1294
+ "learning_rate": 9.479315263908701e-05,
1295
+ "loss": 1.24675446,
1296
+ "memory(GiB)": 16.34,
1297
+ "step": 590,
1298
+ "train_speed(iter/s)": 0.142113
1299
+ },
1300
+ {
1301
+ "acc": 0.66582651,
1302
+ "epoch": 1.6124661246612466,
1303
+ "grad_norm": 1.1171875,
1304
+ "learning_rate": 9.425820256776034e-05,
1305
+ "loss": 1.33937092,
1306
+ "memory(GiB)": 16.34,
1307
+ "step": 595,
1308
+ "train_speed(iter/s)": 0.142256
1309
+ },
1310
+ {
1311
+ "acc": 0.65553112,
1312
+ "epoch": 1.6260162601626016,
1313
+ "grad_norm": 1.5859375,
1314
+ "learning_rate": 9.372325249643366e-05,
1315
+ "loss": 1.25127707,
1316
+ "memory(GiB)": 16.34,
1317
+ "step": 600,
1318
+ "train_speed(iter/s)": 0.142394
1319
+ },
1320
+ {
1321
+ "epoch": 1.6260162601626016,
1322
+ "eval_acc": 0.6233744299949333,
1323
+ "eval_loss": 1.5426762104034424,
1324
+ "eval_runtime": 44.3564,
1325
+ "eval_samples_per_second": 0.857,
1326
+ "eval_steps_per_second": 0.857,
1327
+ "step": 600
1328
+ },
1329
+ {
1330
+ "acc": 0.64805899,
1331
+ "epoch": 1.6395663956639566,
1332
+ "grad_norm": 1.078125,
1333
+ "learning_rate": 9.318830242510699e-05,
1334
+ "loss": 1.30654621,
1335
+ "memory(GiB)": 16.34,
1336
+ "step": 605,
1337
+ "train_speed(iter/s)": 0.141073
1338
+ },
1339
+ {
1340
+ "acc": 0.67441335,
1341
+ "epoch": 1.6531165311653115,
1342
+ "grad_norm": 1.34375,
1343
+ "learning_rate": 9.26533523537803e-05,
1344
+ "loss": 1.20785751,
1345
+ "memory(GiB)": 16.34,
1346
+ "step": 610,
1347
+ "train_speed(iter/s)": 0.141227
1348
+ },
1349
+ {
1350
+ "acc": 0.64989614,
1351
+ "epoch": 1.6666666666666665,
1352
+ "grad_norm": 2.25,
1353
+ "learning_rate": 9.211840228245362e-05,
1354
+ "loss": 1.41231976,
1355
+ "memory(GiB)": 16.34,
1356
+ "step": 615,
1357
+ "train_speed(iter/s)": 0.141374
1358
+ },
1359
+ {
1360
+ "acc": 0.65118213,
1361
+ "epoch": 1.6802168021680217,
1362
+ "grad_norm": 1.34375,
1363
+ "learning_rate": 9.158345221112695e-05,
1364
+ "loss": 1.33156195,
1365
+ "memory(GiB)": 16.34,
1366
+ "step": 620,
1367
+ "train_speed(iter/s)": 0.141527
1368
+ },
1369
+ {
1370
+ "acc": 0.68491917,
1371
+ "epoch": 1.6937669376693767,
1372
+ "grad_norm": 2.078125,
1373
+ "learning_rate": 9.104850213980028e-05,
1374
+ "loss": 1.15608921,
1375
+ "memory(GiB)": 16.34,
1376
+ "step": 625,
1377
+ "train_speed(iter/s)": 0.141656
1378
+ },
1379
+ {
1380
+ "acc": 0.6636075,
1381
+ "epoch": 1.7073170731707317,
1382
+ "grad_norm": 1.2890625,
1383
+ "learning_rate": 9.051355206847361e-05,
1384
+ "loss": 1.3377409,
1385
+ "memory(GiB)": 16.34,
1386
+ "step": 630,
1387
+ "train_speed(iter/s)": 0.141795
1388
+ },
1389
+ {
1390
+ "acc": 0.66115265,
1391
+ "epoch": 1.7208672086720869,
1392
+ "grad_norm": 1.796875,
1393
+ "learning_rate": 8.997860199714693e-05,
1394
+ "loss": 1.41666918,
1395
+ "memory(GiB)": 16.34,
1396
+ "step": 635,
1397
+ "train_speed(iter/s)": 0.141928
1398
+ },
1399
+ {
1400
+ "acc": 0.65316691,
1401
+ "epoch": 1.7344173441734418,
1402
+ "grad_norm": 1.875,
1403
+ "learning_rate": 8.944365192582024e-05,
1404
+ "loss": 1.37885714,
1405
+ "memory(GiB)": 16.34,
1406
+ "step": 640,
1407
+ "train_speed(iter/s)": 0.14206
1408
+ },
1409
+ {
1410
+ "acc": 0.63365035,
1411
+ "epoch": 1.7479674796747968,
1412
+ "grad_norm": 2.03125,
1413
+ "learning_rate": 8.890870185449357e-05,
1414
+ "loss": 1.43597651,
1415
+ "memory(GiB)": 16.34,
1416
+ "step": 645,
1417
+ "train_speed(iter/s)": 0.142195
1418
+ },
1419
+ {
1420
+ "acc": 0.6461009,
1421
+ "epoch": 1.7615176151761518,
1422
+ "grad_norm": 1.390625,
1423
+ "learning_rate": 8.837375178316689e-05,
1424
+ "loss": 1.40720987,
1425
+ "memory(GiB)": 16.34,
1426
+ "step": 650,
1427
+ "train_speed(iter/s)": 0.142326
1428
+ },
1429
+ {
1430
+ "epoch": 1.7615176151761518,
1431
+ "eval_acc": 0.6223610876541125,
1432
+ "eval_loss": 1.5279418230056763,
1433
+ "eval_runtime": 44.3195,
1434
+ "eval_samples_per_second": 0.857,
1435
+ "eval_steps_per_second": 0.857,
1436
+ "step": 650
1437
+ },
1438
+ {
1439
+ "acc": 0.68952079,
1440
+ "epoch": 1.7750677506775068,
1441
+ "grad_norm": 1.5859375,
1442
+ "learning_rate": 8.783880171184023e-05,
1443
+ "loss": 1.1804883,
1444
+ "memory(GiB)": 16.34,
1445
+ "step": 655,
1446
+ "train_speed(iter/s)": 0.141093
1447
+ },
1448
+ {
1449
+ "acc": 0.63278737,
1450
+ "epoch": 1.7886178861788617,
1451
+ "grad_norm": 1.3125,
1452
+ "learning_rate": 8.730385164051355e-05,
1453
+ "loss": 1.45864544,
1454
+ "memory(GiB)": 16.34,
1455
+ "step": 660,
1456
+ "train_speed(iter/s)": 0.141227
1457
+ },
1458
+ {
1459
+ "acc": 0.6499536,
1460
+ "epoch": 1.8021680216802167,
1461
+ "grad_norm": 1.0859375,
1462
+ "learning_rate": 8.676890156918687e-05,
1463
+ "loss": 1.36629667,
1464
+ "memory(GiB)": 16.34,
1465
+ "step": 665,
1466
+ "train_speed(iter/s)": 0.14135
1467
+ },
1468
+ {
1469
+ "acc": 0.66262636,
1470
+ "epoch": 1.8157181571815717,
1471
+ "grad_norm": 2.8125,
1472
+ "learning_rate": 8.62339514978602e-05,
1473
+ "loss": 1.28030796,
1474
+ "memory(GiB)": 16.34,
1475
+ "step": 670,
1476
+ "train_speed(iter/s)": 0.141486
1477
+ },
1478
+ {
1479
+ "acc": 0.6478013,
1480
+ "epoch": 1.8292682926829267,
1481
+ "grad_norm": 1.3203125,
1482
+ "learning_rate": 8.569900142653351e-05,
1483
+ "loss": 1.37910089,
1484
+ "memory(GiB)": 16.34,
1485
+ "step": 675,
1486
+ "train_speed(iter/s)": 0.141617
1487
+ },
1488
+ {
1489
+ "acc": 0.65061078,
1490
+ "epoch": 1.8428184281842819,
1491
+ "grad_norm": 1.859375,
1492
+ "learning_rate": 8.516405135520683e-05,
1493
+ "loss": 1.24110394,
1494
+ "memory(GiB)": 16.34,
1495
+ "step": 680,
1496
+ "train_speed(iter/s)": 0.141749
1497
+ },
1498
+ {
1499
+ "acc": 0.66720443,
1500
+ "epoch": 1.8563685636856369,
1501
+ "grad_norm": 1.2734375,
1502
+ "learning_rate": 8.462910128388016e-05,
1503
+ "loss": 1.36949673,
1504
+ "memory(GiB)": 16.34,
1505
+ "step": 685,
1506
+ "train_speed(iter/s)": 0.141882
1507
+ },
1508
+ {
1509
+ "acc": 0.65051932,
1510
+ "epoch": 1.8699186991869918,
1511
+ "grad_norm": 1.796875,
1512
+ "learning_rate": 8.409415121255349e-05,
1513
+ "loss": 1.3470686,
1514
+ "memory(GiB)": 16.34,
1515
+ "step": 690,
1516
+ "train_speed(iter/s)": 0.142007
1517
+ },
1518
+ {
1519
+ "acc": 0.64647999,
1520
+ "epoch": 1.883468834688347,
1521
+ "grad_norm": 1.0546875,
1522
+ "learning_rate": 8.355920114122682e-05,
1523
+ "loss": 1.27561255,
1524
+ "memory(GiB)": 16.34,
1525
+ "step": 695,
1526
+ "train_speed(iter/s)": 0.142124
1527
+ },
1528
+ {
1529
+ "acc": 0.64771528,
1530
+ "epoch": 1.897018970189702,
1531
+ "grad_norm": 2.5,
1532
+ "learning_rate": 8.302425106990014e-05,
1533
+ "loss": 1.3874403,
1534
+ "memory(GiB)": 16.34,
1535
+ "step": 700,
1536
+ "train_speed(iter/s)": 0.142242
1537
+ },
1538
+ {
1539
+ "epoch": 1.897018970189702,
1540
+ "eval_acc": 0.6274277993582165,
1541
+ "eval_loss": 1.5236802101135254,
1542
+ "eval_runtime": 44.4112,
1543
+ "eval_samples_per_second": 0.856,
1544
+ "eval_steps_per_second": 0.856,
1545
+ "step": 700
1546
+ },
1547
+ {
1548
+ "acc": 0.65817127,
1549
+ "epoch": 1.910569105691057,
1550
+ "grad_norm": 1.3203125,
1551
+ "learning_rate": 8.248930099857345e-05,
1552
+ "loss": 1.29632025,
1553
+ "memory(GiB)": 16.34,
1554
+ "step": 705,
1555
+ "train_speed(iter/s)": 0.141099
1556
+ },
1557
+ {
1558
+ "acc": 0.64647436,
1559
+ "epoch": 1.924119241192412,
1560
+ "grad_norm": 1.234375,
1561
+ "learning_rate": 8.195435092724678e-05,
1562
+ "loss": 1.39382238,
1563
+ "memory(GiB)": 16.34,
1564
+ "step": 710,
1565
+ "train_speed(iter/s)": 0.141217
1566
+ },
1567
+ {
1568
+ "acc": 0.65741391,
1569
+ "epoch": 1.937669376693767,
1570
+ "grad_norm": 0.98828125,
1571
+ "learning_rate": 8.14194008559201e-05,
1572
+ "loss": 1.32606802,
1573
+ "memory(GiB)": 16.34,
1574
+ "step": 715,
1575
+ "train_speed(iter/s)": 0.141337
1576
+ },
1577
+ {
1578
+ "acc": 0.65078535,
1579
+ "epoch": 1.951219512195122,
1580
+ "grad_norm": 1.6171875,
1581
+ "learning_rate": 8.088445078459343e-05,
1582
+ "loss": 1.28092451,
1583
+ "memory(GiB)": 16.34,
1584
+ "step": 720,
1585
+ "train_speed(iter/s)": 0.141457
1586
+ },
1587
+ {
1588
+ "acc": 0.64983764,
1589
+ "epoch": 1.9647696476964769,
1590
+ "grad_norm": 1.7890625,
1591
+ "learning_rate": 8.034950071326676e-05,
1592
+ "loss": 1.35295801,
1593
+ "memory(GiB)": 16.34,
1594
+ "step": 725,
1595
+ "train_speed(iter/s)": 0.141585
1596
+ },
1597
+ {
1598
+ "acc": 0.64880919,
1599
+ "epoch": 1.9783197831978319,
1600
+ "grad_norm": 1.6640625,
1601
+ "learning_rate": 7.981455064194009e-05,
1602
+ "loss": 1.38945732,
1603
+ "memory(GiB)": 16.34,
1604
+ "step": 730,
1605
+ "train_speed(iter/s)": 0.141703
1606
+ },
1607
+ {
1608
+ "acc": 0.64617214,
1609
+ "epoch": 1.9918699186991868,
1610
+ "grad_norm": 1.3671875,
1611
+ "learning_rate": 7.927960057061341e-05,
1612
+ "loss": 1.31189098,
1613
+ "memory(GiB)": 16.34,
1614
+ "step": 735,
1615
+ "train_speed(iter/s)": 0.141817
1616
+ },
1617
+ {
1618
+ "acc": 0.67255039,
1619
+ "epoch": 2.005420054200542,
1620
+ "grad_norm": 1.3359375,
1621
+ "learning_rate": 7.874465049928672e-05,
1622
+ "loss": 1.17746153,
1623
+ "memory(GiB)": 16.34,
1624
+ "step": 740,
1625
+ "train_speed(iter/s)": 0.141669
1626
+ },
1627
+ {
1628
+ "acc": 0.69897633,
1629
+ "epoch": 2.0189701897018972,
1630
+ "grad_norm": 1.1875,
1631
+ "learning_rate": 7.820970042796005e-05,
1632
+ "loss": 1.03704672,
1633
+ "memory(GiB)": 16.34,
1634
+ "step": 745,
1635
+ "train_speed(iter/s)": 0.141785
1636
+ },
1637
+ {
1638
+ "acc": 0.71377811,
1639
+ "epoch": 2.032520325203252,
1640
+ "grad_norm": 1.640625,
1641
+ "learning_rate": 7.767475035663337e-05,
1642
+ "loss": 1.04516726,
1643
+ "memory(GiB)": 16.34,
1644
+ "step": 750,
1645
+ "train_speed(iter/s)": 0.141905
1646
+ },
1647
+ {
1648
+ "epoch": 2.032520325203252,
1649
+ "eval_acc": 0.6208410741428813,
1650
+ "eval_loss": 1.603255271911621,
1651
+ "eval_runtime": 44.4359,
1652
+ "eval_samples_per_second": 0.855,
1653
+ "eval_steps_per_second": 0.855,
1654
+ "step": 750
1655
+ },
1656
+ {
1657
+ "acc": 0.73705945,
1658
+ "epoch": 2.046070460704607,
1659
+ "grad_norm": 1.8203125,
1660
+ "learning_rate": 7.713980028530669e-05,
1661
+ "loss": 0.9383255,
1662
+ "memory(GiB)": 16.34,
1663
+ "step": 755,
1664
+ "train_speed(iter/s)": 0.14084
1665
+ },
1666
+ {
1667
+ "acc": 0.73054934,
1668
+ "epoch": 2.059620596205962,
1669
+ "grad_norm": 1.8203125,
1670
+ "learning_rate": 7.660485021398003e-05,
1671
+ "loss": 0.91379232,
1672
+ "memory(GiB)": 16.34,
1673
+ "step": 760,
1674
+ "train_speed(iter/s)": 0.140958
1675
+ },
1676
+ {
1677
+ "acc": 0.73154573,
1678
+ "epoch": 2.073170731707317,
1679
+ "grad_norm": 1.6796875,
1680
+ "learning_rate": 7.606990014265335e-05,
1681
+ "loss": 0.97700481,
1682
+ "memory(GiB)": 16.34,
1683
+ "step": 765,
1684
+ "train_speed(iter/s)": 0.141076
1685
+ },
1686
+ {
1687
+ "acc": 0.72998781,
1688
+ "epoch": 2.086720867208672,
1689
+ "grad_norm": 1.4140625,
1690
+ "learning_rate": 7.553495007132668e-05,
1691
+ "loss": 0.93853807,
1692
+ "memory(GiB)": 16.34,
1693
+ "step": 770,
1694
+ "train_speed(iter/s)": 0.141186
1695
+ },
1696
+ {
1697
+ "acc": 0.73213534,
1698
+ "epoch": 2.100271002710027,
1699
+ "grad_norm": 1.78125,
1700
+ "learning_rate": 7.5e-05,
1701
+ "loss": 0.94589176,
1702
+ "memory(GiB)": 16.34,
1703
+ "step": 775,
1704
+ "train_speed(iter/s)": 0.141303
1705
+ },
1706
+ {
1707
+ "acc": 0.73035493,
1708
+ "epoch": 2.113821138211382,
1709
+ "grad_norm": 1.6953125,
1710
+ "learning_rate": 7.446504992867331e-05,
1711
+ "loss": 0.95597754,
1712
+ "memory(GiB)": 16.34,
1713
+ "step": 780,
1714
+ "train_speed(iter/s)": 0.141422
1715
+ },
1716
+ {
1717
+ "acc": 0.73428354,
1718
+ "epoch": 2.127371273712737,
1719
+ "grad_norm": 1.484375,
1720
+ "learning_rate": 7.393009985734664e-05,
1721
+ "loss": 0.98254423,
1722
+ "memory(GiB)": 16.34,
1723
+ "step": 785,
1724
+ "train_speed(iter/s)": 0.141543
1725
+ },
1726
+ {
1727
+ "acc": 0.69750729,
1728
+ "epoch": 2.140921409214092,
1729
+ "grad_norm": 1.828125,
1730
+ "learning_rate": 7.339514978601997e-05,
1731
+ "loss": 1.10206041,
1732
+ "memory(GiB)": 16.34,
1733
+ "step": 790,
1734
+ "train_speed(iter/s)": 0.141651
1735
+ },
1736
+ {
1737
+ "acc": 0.73764381,
1738
+ "epoch": 2.154471544715447,
1739
+ "grad_norm": 1.296875,
1740
+ "learning_rate": 7.286019971469329e-05,
1741
+ "loss": 0.84075432,
1742
+ "memory(GiB)": 16.34,
1743
+ "step": 795,
1744
+ "train_speed(iter/s)": 0.141765
1745
+ },
1746
+ {
1747
+ "acc": 0.72124152,
1748
+ "epoch": 2.168021680216802,
1749
+ "grad_norm": 1.4375,
1750
+ "learning_rate": 7.23252496433666e-05,
1751
+ "loss": 1.05863771,
1752
+ "memory(GiB)": 16.34,
1753
+ "step": 800,
1754
+ "train_speed(iter/s)": 0.141874
1755
+ },
1756
+ {
1757
+ "epoch": 2.168021680216802,
1758
+ "eval_acc": 0.6240499915554805,
1759
+ "eval_loss": 1.61227285861969,
1760
+ "eval_runtime": 44.3145,
1761
+ "eval_samples_per_second": 0.858,
1762
+ "eval_steps_per_second": 0.858,
1763
+ "step": 800
1764
+ },
1765
+ {
1766
+ "acc": 0.73518519,
1767
+ "epoch": 2.181571815718157,
1768
+ "grad_norm": 1.546875,
1769
+ "learning_rate": 7.179029957203993e-05,
1770
+ "loss": 0.96204119,
1771
+ "memory(GiB)": 16.34,
1772
+ "step": 805,
1773
+ "train_speed(iter/s)": 0.140881
1774
+ },
1775
+ {
1776
+ "acc": 0.74290161,
1777
+ "epoch": 2.1951219512195124,
1778
+ "grad_norm": 2.015625,
1779
+ "learning_rate": 7.125534950071326e-05,
1780
+ "loss": 0.92387733,
1781
+ "memory(GiB)": 16.34,
1782
+ "step": 810,
1783
+ "train_speed(iter/s)": 0.14021
1784
+ },
1785
+ {
1786
+ "acc": 0.73101602,
1787
+ "epoch": 2.2086720867208673,
1788
+ "grad_norm": 1.6015625,
1789
+ "learning_rate": 7.072039942938658e-05,
1790
+ "loss": 0.96248655,
1791
+ "memory(GiB)": 16.34,
1792
+ "step": 815,
1793
+ "train_speed(iter/s)": 0.140323
1794
+ },
1795
+ {
1796
+ "acc": 0.73887796,
1797
+ "epoch": 2.2222222222222223,
1798
+ "grad_norm": 1.796875,
1799
+ "learning_rate": 7.018544935805991e-05,
1800
+ "loss": 0.94751673,
1801
+ "memory(GiB)": 16.34,
1802
+ "step": 820,
1803
+ "train_speed(iter/s)": 0.140441
1804
+ },
1805
+ {
1806
+ "acc": 0.75793715,
1807
+ "epoch": 2.2357723577235773,
1808
+ "grad_norm": 2.015625,
1809
+ "learning_rate": 6.965049928673323e-05,
1810
+ "loss": 0.84263477,
1811
+ "memory(GiB)": 16.34,
1812
+ "step": 825,
1813
+ "train_speed(iter/s)": 0.140555
1814
+ },
1815
+ {
1816
+ "acc": 0.75873909,
1817
+ "epoch": 2.2493224932249323,
1818
+ "grad_norm": 1.953125,
1819
+ "learning_rate": 6.911554921540656e-05,
1820
+ "loss": 0.89048252,
1821
+ "memory(GiB)": 16.34,
1822
+ "step": 830,
1823
+ "train_speed(iter/s)": 0.14067
1824
+ },
1825
+ {
1826
+ "acc": 0.74456077,
1827
+ "epoch": 2.2628726287262872,
1828
+ "grad_norm": 1.7890625,
1829
+ "learning_rate": 6.858059914407987e-05,
1830
+ "loss": 0.90777779,
1831
+ "memory(GiB)": 16.34,
1832
+ "step": 835,
1833
+ "train_speed(iter/s)": 0.140775
1834
+ },
1835
+ {
1836
+ "acc": 0.75809846,
1837
+ "epoch": 2.2764227642276422,
1838
+ "grad_norm": 2.0625,
1839
+ "learning_rate": 6.80456490727532e-05,
1840
+ "loss": 0.87556753,
1841
+ "memory(GiB)": 16.34,
1842
+ "step": 840,
1843
+ "train_speed(iter/s)": 0.14088
1844
+ },
1845
+ {
1846
+ "acc": 0.73194971,
1847
+ "epoch": 2.289972899728997,
1848
+ "grad_norm": 2.765625,
1849
+ "learning_rate": 6.751069900142653e-05,
1850
+ "loss": 0.9232769,
1851
+ "memory(GiB)": 16.34,
1852
+ "step": 845,
1853
+ "train_speed(iter/s)": 0.140986
1854
+ },
1855
+ {
1856
+ "acc": 0.74470835,
1857
+ "epoch": 2.303523035230352,
1858
+ "grad_norm": 1.8515625,
1859
+ "learning_rate": 6.697574893009985e-05,
1860
+ "loss": 0.88600664,
1861
+ "memory(GiB)": 16.34,
1862
+ "step": 850,
1863
+ "train_speed(iter/s)": 0.141086
1864
+ },
1865
+ {
1866
+ "epoch": 2.303523035230352,
1867
+ "eval_acc": 0.6174632663401453,
1868
+ "eval_loss": 1.660492181777954,
1869
+ "eval_runtime": 44.1393,
1870
+ "eval_samples_per_second": 0.861,
1871
+ "eval_steps_per_second": 0.861,
1872
+ "step": 850
1873
+ },
1874
+ {
1875
+ "acc": 0.76063514,
1876
+ "epoch": 2.317073170731707,
1877
+ "grad_norm": 2.40625,
1878
+ "learning_rate": 6.644079885877318e-05,
1879
+ "loss": 0.80236473,
1880
+ "memory(GiB)": 16.34,
1881
+ "step": 855,
1882
+ "train_speed(iter/s)": 0.140168
1883
+ },
1884
+ {
1885
+ "acc": 0.71505499,
1886
+ "epoch": 2.330623306233062,
1887
+ "grad_norm": 2.671875,
1888
+ "learning_rate": 6.59058487874465e-05,
1889
+ "loss": 1.01317081,
1890
+ "memory(GiB)": 16.34,
1891
+ "step": 860,
1892
+ "train_speed(iter/s)": 0.140281
1893
+ },
1894
+ {
1895
+ "acc": 0.73396034,
1896
+ "epoch": 2.3441734417344176,
1897
+ "grad_norm": 1.703125,
1898
+ "learning_rate": 6.537089871611983e-05,
1899
+ "loss": 0.96496754,
1900
+ "memory(GiB)": 16.34,
1901
+ "step": 865,
1902
+ "train_speed(iter/s)": 0.140387
1903
+ },
1904
+ {
1905
+ "acc": 0.74145699,
1906
+ "epoch": 2.3577235772357725,
1907
+ "grad_norm": 1.9453125,
1908
+ "learning_rate": 6.483594864479316e-05,
1909
+ "loss": 1.00490999,
1910
+ "memory(GiB)": 16.34,
1911
+ "step": 870,
1912
+ "train_speed(iter/s)": 0.140492
1913
+ },
1914
+ {
1915
+ "acc": 0.76035104,
1916
+ "epoch": 2.3712737127371275,
1917
+ "grad_norm": 1.7265625,
1918
+ "learning_rate": 6.430099857346647e-05,
1919
+ "loss": 0.93969469,
1920
+ "memory(GiB)": 16.34,
1921
+ "step": 875,
1922
+ "train_speed(iter/s)": 0.140599
1923
+ },
1924
+ {
1925
+ "acc": 0.75214877,
1926
+ "epoch": 2.3848238482384825,
1927
+ "grad_norm": 1.9296875,
1928
+ "learning_rate": 6.376604850213979e-05,
1929
+ "loss": 0.9385232,
1930
+ "memory(GiB)": 16.34,
1931
+ "step": 880,
1932
+ "train_speed(iter/s)": 0.140705
1933
+ },
1934
+ {
1935
+ "acc": 0.73846526,
1936
+ "epoch": 2.3983739837398375,
1937
+ "grad_norm": 2.421875,
1938
+ "learning_rate": 6.323109843081312e-05,
1939
+ "loss": 0.95887814,
1940
+ "memory(GiB)": 16.34,
1941
+ "step": 885,
1942
+ "train_speed(iter/s)": 0.140812
1943
+ },
1944
+ {
1945
+ "acc": 0.72576594,
1946
+ "epoch": 2.4119241192411924,
1947
+ "grad_norm": 1.46875,
1948
+ "learning_rate": 6.269614835948645e-05,
1949
+ "loss": 0.95252094,
1950
+ "memory(GiB)": 16.34,
1951
+ "step": 890,
1952
+ "train_speed(iter/s)": 0.140912
1953
+ },
1954
+ {
1955
+ "acc": 0.74866586,
1956
+ "epoch": 2.4254742547425474,
1957
+ "grad_norm": 1.5546875,
1958
+ "learning_rate": 6.216119828815977e-05,
1959
+ "loss": 0.91533632,
1960
+ "memory(GiB)": 16.34,
1961
+ "step": 895,
1962
+ "train_speed(iter/s)": 0.141011
1963
+ },
1964
+ {
1965
+ "acc": 0.72289066,
1966
+ "epoch": 2.4390243902439024,
1967
+ "grad_norm": 1.375,
1968
+ "learning_rate": 6.162624821683308e-05,
1969
+ "loss": 0.996418,
1970
+ "memory(GiB)": 16.34,
1971
+ "step": 900,
1972
+ "train_speed(iter/s)": 0.13966
1973
+ },
1974
+ {
1975
+ "epoch": 2.4390243902439024,
1976
+ "eval_acc": 0.6152676912683668,
1977
+ "eval_loss": 1.6503233909606934,
1978
+ "eval_runtime": 44.1939,
1979
+ "eval_samples_per_second": 0.86,
1980
+ "eval_steps_per_second": 0.86,
1981
+ "step": 900
1982
+ },
1983
+ {
1984
+ "acc": 0.73535914,
1985
+ "epoch": 2.4525745257452574,
1986
+ "grad_norm": 1.3125,
1987
+ "learning_rate": 6.109129814550641e-05,
1988
+ "loss": 1.00612879,
1989
+ "memory(GiB)": 16.34,
1990
+ "step": 905,
1991
+ "train_speed(iter/s)": 0.138805
1992
+ },
1993
+ {
1994
+ "acc": 0.7277998,
1995
+ "epoch": 2.4661246612466123,
1996
+ "grad_norm": 2.328125,
1997
+ "learning_rate": 6.0556348074179737e-05,
1998
+ "loss": 1.00636635,
1999
+ "memory(GiB)": 16.34,
2000
+ "step": 910,
2001
+ "train_speed(iter/s)": 0.138904
2002
+ },
2003
+ {
2004
+ "acc": 0.70795035,
2005
+ "epoch": 2.4796747967479673,
2006
+ "grad_norm": 1.6484375,
2007
+ "learning_rate": 6.002139800285306e-05,
2008
+ "loss": 0.98440151,
2009
+ "memory(GiB)": 16.34,
2010
+ "step": 915,
2011
+ "train_speed(iter/s)": 0.139013
2012
+ },
2013
+ {
2014
+ "acc": 0.7650095,
2015
+ "epoch": 2.4932249322493227,
2016
+ "grad_norm": 2.25,
2017
+ "learning_rate": 5.948644793152638e-05,
2018
+ "loss": 0.77239523,
2019
+ "memory(GiB)": 16.34,
2020
+ "step": 920,
2021
+ "train_speed(iter/s)": 0.139126
2022
+ },
2023
+ {
2024
+ "acc": 0.71504927,
2025
+ "epoch": 2.5067750677506773,
2026
+ "grad_norm": 1.6328125,
2027
+ "learning_rate": 5.895149786019971e-05,
2028
+ "loss": 1.01518288,
2029
+ "memory(GiB)": 16.34,
2030
+ "step": 925,
2031
+ "train_speed(iter/s)": 0.139233
2032
+ },
2033
+ {
2034
+ "acc": 0.71582847,
2035
+ "epoch": 2.5203252032520327,
2036
+ "grad_norm": 2.28125,
2037
+ "learning_rate": 5.841654778887303e-05,
2038
+ "loss": 0.97862291,
2039
+ "memory(GiB)": 16.34,
2040
+ "step": 930,
2041
+ "train_speed(iter/s)": 0.13934
2042
+ },
2043
+ {
2044
+ "acc": 0.74984369,
2045
+ "epoch": 2.5338753387533877,
2046
+ "grad_norm": 1.8515625,
2047
+ "learning_rate": 5.788159771754635e-05,
2048
+ "loss": 0.85021992,
2049
+ "memory(GiB)": 16.34,
2050
+ "step": 935,
2051
+ "train_speed(iter/s)": 0.139448
2052
+ },
2053
+ {
2054
+ "acc": 0.74142261,
2055
+ "epoch": 2.5474254742547426,
2056
+ "grad_norm": 1.796875,
2057
+ "learning_rate": 5.734664764621968e-05,
2058
+ "loss": 0.91843338,
2059
+ "memory(GiB)": 16.34,
2060
+ "step": 940,
2061
+ "train_speed(iter/s)": 0.139564
2062
+ },
2063
+ {
2064
+ "acc": 0.71266222,
2065
+ "epoch": 2.5609756097560976,
2066
+ "grad_norm": 1.875,
2067
+ "learning_rate": 5.6811697574893007e-05,
2068
+ "loss": 1.14017658,
2069
+ "memory(GiB)": 16.34,
2070
+ "step": 945,
2071
+ "train_speed(iter/s)": 0.139667
2072
+ },
2073
+ {
2074
+ "acc": 0.73374839,
2075
+ "epoch": 2.5745257452574526,
2076
+ "grad_norm": 1.671875,
2077
+ "learning_rate": 5.627674750356633e-05,
2078
+ "loss": 0.99979248,
2079
+ "memory(GiB)": 16.34,
2080
+ "step": 950,
2081
+ "train_speed(iter/s)": 0.139769
2082
+ },
2083
+ {
2084
+ "epoch": 2.5745257452574526,
2085
+ "eval_acc": 0.6161121432190508,
2086
+ "eval_loss": 1.6511973142623901,
2087
+ "eval_runtime": 44.1881,
2088
+ "eval_samples_per_second": 0.86,
2089
+ "eval_steps_per_second": 0.86,
2090
+ "step": 950
2091
+ },
2092
+ {
2093
+ "acc": 0.74381332,
2094
+ "epoch": 2.5880758807588076,
2095
+ "grad_norm": 1.7734375,
2096
+ "learning_rate": 5.5741797432239646e-05,
2097
+ "loss": 0.93189411,
2098
+ "memory(GiB)": 16.34,
2099
+ "step": 955,
2100
+ "train_speed(iter/s)": 0.138972
2101
+ },
2102
+ {
2103
+ "acc": 0.73305674,
2104
+ "epoch": 2.6016260162601625,
2105
+ "grad_norm": 2.4375,
2106
+ "learning_rate": 5.5206847360912977e-05,
2107
+ "loss": 0.93946905,
2108
+ "memory(GiB)": 16.34,
2109
+ "step": 960,
2110
+ "train_speed(iter/s)": 0.139079
2111
+ },
2112
+ {
2113
+ "acc": 0.72961435,
2114
+ "epoch": 2.6151761517615175,
2115
+ "grad_norm": 1.84375,
2116
+ "learning_rate": 5.46718972895863e-05,
2117
+ "loss": 0.99850683,
2118
+ "memory(GiB)": 16.34,
2119
+ "step": 965,
2120
+ "train_speed(iter/s)": 0.139175
2121
+ },
2122
+ {
2123
+ "acc": 0.7133184,
2124
+ "epoch": 2.6287262872628725,
2125
+ "grad_norm": 1.96875,
2126
+ "learning_rate": 5.413694721825962e-05,
2127
+ "loss": 1.03378878,
2128
+ "memory(GiB)": 16.34,
2129
+ "step": 970,
2130
+ "train_speed(iter/s)": 0.139282
2131
+ },
2132
+ {
2133
+ "acc": 0.71164918,
2134
+ "epoch": 2.642276422764228,
2135
+ "grad_norm": 1.3828125,
2136
+ "learning_rate": 5.360199714693295e-05,
2137
+ "loss": 1.02268944,
2138
+ "memory(GiB)": 16.34,
2139
+ "step": 975,
2140
+ "train_speed(iter/s)": 0.139382
2141
+ },
2142
+ {
2143
+ "acc": 0.71824646,
2144
+ "epoch": 2.6558265582655824,
2145
+ "grad_norm": 1.84375,
2146
+ "learning_rate": 5.306704707560627e-05,
2147
+ "loss": 1.06014738,
2148
+ "memory(GiB)": 16.34,
2149
+ "step": 980,
2150
+ "train_speed(iter/s)": 0.139479
2151
+ },
2152
+ {
2153
+ "acc": 0.75523286,
2154
+ "epoch": 2.669376693766938,
2155
+ "grad_norm": 1.7734375,
2156
+ "learning_rate": 5.253209700427959e-05,
2157
+ "loss": 0.80533791,
2158
+ "memory(GiB)": 16.34,
2159
+ "step": 985,
2160
+ "train_speed(iter/s)": 0.139581
2161
+ },
2162
+ {
2163
+ "acc": 0.72265592,
2164
+ "epoch": 2.682926829268293,
2165
+ "grad_norm": 1.5546875,
2166
+ "learning_rate": 5.199714693295292e-05,
2167
+ "loss": 1.0223958,
2168
+ "memory(GiB)": 16.34,
2169
+ "step": 990,
2170
+ "train_speed(iter/s)": 0.139675
2171
+ },
2172
+ {
2173
+ "acc": 0.74640193,
2174
+ "epoch": 2.696476964769648,
2175
+ "grad_norm": 1.5546875,
2176
+ "learning_rate": 5.1462196861626247e-05,
2177
+ "loss": 0.8896265,
2178
+ "memory(GiB)": 16.34,
2179
+ "step": 995,
2180
+ "train_speed(iter/s)": 0.139767
2181
+ },
2182
+ {
2183
+ "acc": 0.75235152,
2184
+ "epoch": 2.710027100271003,
2185
+ "grad_norm": 1.6484375,
2186
+ "learning_rate": 5.092724679029957e-05,
2187
+ "loss": 0.87937946,
2188
+ "memory(GiB)": 16.34,
2189
+ "step": 1000,
2190
+ "train_speed(iter/s)": 0.139867
2191
+ },
2192
+ {
2193
+ "epoch": 2.710027100271003,
2194
+ "eval_acc": 0.6123965546360413,
2195
+ "eval_loss": 1.6541699171066284,
2196
+ "eval_runtime": 44.2136,
2197
+ "eval_samples_per_second": 0.859,
2198
+ "eval_steps_per_second": 0.859,
2199
+ "step": 1000
2200
+ },
2201
+ {
2202
+ "acc": 0.73264089,
2203
+ "epoch": 2.7235772357723578,
2204
+ "grad_norm": 1.96875,
2205
+ "learning_rate": 5.0392296718972886e-05,
2206
+ "loss": 0.95834293,
2207
+ "memory(GiB)": 16.34,
2208
+ "step": 1005,
2209
+ "train_speed(iter/s)": 0.139102
2210
+ },
2211
+ {
2212
+ "acc": 0.76931543,
2213
+ "epoch": 2.7371273712737128,
2214
+ "grad_norm": 1.7890625,
2215
+ "learning_rate": 4.9857346647646217e-05,
2216
+ "loss": 0.82889891,
2217
+ "memory(GiB)": 16.34,
2218
+ "step": 1010,
2219
+ "train_speed(iter/s)": 0.139199
2220
+ },
2221
+ {
2222
+ "acc": 0.74891815,
2223
+ "epoch": 2.7506775067750677,
2224
+ "grad_norm": 2.65625,
2225
+ "learning_rate": 4.932239657631954e-05,
2226
+ "loss": 0.93503094,
2227
+ "memory(GiB)": 16.34,
2228
+ "step": 1015,
2229
+ "train_speed(iter/s)": 0.139298
2230
+ },
2231
+ {
2232
+ "acc": 0.72551074,
2233
+ "epoch": 2.7642276422764227,
2234
+ "grad_norm": 1.5703125,
2235
+ "learning_rate": 4.878744650499286e-05,
2236
+ "loss": 1.06086788,
2237
+ "memory(GiB)": 16.34,
2238
+ "step": 1020,
2239
+ "train_speed(iter/s)": 0.139383
2240
+ },
2241
+ {
2242
+ "acc": 0.75321589,
2243
+ "epoch": 2.7777777777777777,
2244
+ "grad_norm": 1.7734375,
2245
+ "learning_rate": 4.825249643366619e-05,
2246
+ "loss": 0.83553734,
2247
+ "memory(GiB)": 16.34,
2248
+ "step": 1025,
2249
+ "train_speed(iter/s)": 0.139478
2250
+ },
2251
+ {
2252
+ "acc": 0.73832054,
2253
+ "epoch": 2.7913279132791327,
2254
+ "grad_norm": 1.53125,
2255
+ "learning_rate": 4.771754636233951e-05,
2256
+ "loss": 0.94093418,
2257
+ "memory(GiB)": 16.34,
2258
+ "step": 1030,
2259
+ "train_speed(iter/s)": 0.139572
2260
+ },
2261
+ {
2262
+ "acc": 0.76351671,
2263
+ "epoch": 2.8048780487804876,
2264
+ "grad_norm": 1.2890625,
2265
+ "learning_rate": 4.718259629101283e-05,
2266
+ "loss": 0.84423561,
2267
+ "memory(GiB)": 16.34,
2268
+ "step": 1035,
2269
+ "train_speed(iter/s)": 0.139664
2270
+ },
2271
+ {
2272
+ "acc": 0.75100279,
2273
+ "epoch": 2.818428184281843,
2274
+ "grad_norm": 1.890625,
2275
+ "learning_rate": 4.6647646219686156e-05,
2276
+ "loss": 0.91757078,
2277
+ "memory(GiB)": 16.34,
2278
+ "step": 1040,
2279
+ "train_speed(iter/s)": 0.139754
2280
+ },
2281
+ {
2282
+ "acc": 0.73302913,
2283
+ "epoch": 2.8319783197831976,
2284
+ "grad_norm": 1.3671875,
2285
+ "learning_rate": 4.6112696148359487e-05,
2286
+ "loss": 0.94331446,
2287
+ "memory(GiB)": 16.34,
2288
+ "step": 1045,
2289
+ "train_speed(iter/s)": 0.139841
2290
+ },
2291
+ {
2292
+ "acc": 0.73944206,
2293
+ "epoch": 2.845528455284553,
2294
+ "grad_norm": 1.59375,
2295
+ "learning_rate": 4.55777460770328e-05,
2296
+ "loss": 0.98726377,
2297
+ "memory(GiB)": 16.34,
2298
+ "step": 1050,
2299
+ "train_speed(iter/s)": 0.139931
2300
+ },
2301
+ {
2302
+ "epoch": 2.845528455284553,
2303
+ "eval_acc": 0.6145921297078196,
2304
+ "eval_loss": 1.6571751832962036,
2305
+ "eval_runtime": 44.3705,
2306
+ "eval_samples_per_second": 0.856,
2307
+ "eval_steps_per_second": 0.856,
2308
+ "step": 1050
2309
+ },
2310
+ {
2311
+ "acc": 0.73031135,
2312
+ "epoch": 2.859078590785908,
2313
+ "grad_norm": 2.25,
2314
+ "learning_rate": 4.5042796005706126e-05,
2315
+ "loss": 0.95681438,
2316
+ "memory(GiB)": 16.34,
2317
+ "step": 1055,
2318
+ "train_speed(iter/s)": 0.139201
2319
+ },
2320
+ {
2321
+ "acc": 0.71694627,
2322
+ "epoch": 2.872628726287263,
2323
+ "grad_norm": 1.6328125,
2324
+ "learning_rate": 4.4507845934379456e-05,
2325
+ "loss": 1.02266083,
2326
+ "memory(GiB)": 16.34,
2327
+ "step": 1060,
2328
+ "train_speed(iter/s)": 0.139294
2329
+ },
2330
+ {
2331
+ "acc": 0.72791638,
2332
+ "epoch": 2.886178861788618,
2333
+ "grad_norm": 1.7578125,
2334
+ "learning_rate": 4.397289586305278e-05,
2335
+ "loss": 0.99341927,
2336
+ "memory(GiB)": 16.34,
2337
+ "step": 1065,
2338
+ "train_speed(iter/s)": 0.139385
2339
+ },
2340
+ {
2341
+ "acc": 0.74939594,
2342
+ "epoch": 2.899728997289973,
2343
+ "grad_norm": 1.921875,
2344
+ "learning_rate": 4.34379457917261e-05,
2345
+ "loss": 0.91077061,
2346
+ "memory(GiB)": 16.34,
2347
+ "step": 1070,
2348
+ "train_speed(iter/s)": 0.139478
2349
+ },
2350
+ {
2351
+ "acc": 0.72694654,
2352
+ "epoch": 2.913279132791328,
2353
+ "grad_norm": 1.7265625,
2354
+ "learning_rate": 4.290299572039942e-05,
2355
+ "loss": 0.98774853,
2356
+ "memory(GiB)": 16.34,
2357
+ "step": 1075,
2358
+ "train_speed(iter/s)": 0.139564
2359
+ },
2360
+ {
2361
+ "acc": 0.70588508,
2362
+ "epoch": 2.926829268292683,
2363
+ "grad_norm": 2.15625,
2364
+ "learning_rate": 4.236804564907275e-05,
2365
+ "loss": 1.07887812,
2366
+ "memory(GiB)": 16.34,
2367
+ "step": 1080,
2368
+ "train_speed(iter/s)": 0.139657
2369
+ },
2370
+ {
2371
+ "acc": 0.74654303,
2372
+ "epoch": 2.940379403794038,
2373
+ "grad_norm": 1.7109375,
2374
+ "learning_rate": 4.183309557774607e-05,
2375
+ "loss": 0.91062069,
2376
+ "memory(GiB)": 16.34,
2377
+ "step": 1085,
2378
+ "train_speed(iter/s)": 0.139743
2379
+ },
2380
+ {
2381
+ "acc": 0.73493595,
2382
+ "epoch": 2.953929539295393,
2383
+ "grad_norm": 2.03125,
2384
+ "learning_rate": 4.1298145506419396e-05,
2385
+ "loss": 0.92819033,
2386
+ "memory(GiB)": 16.34,
2387
+ "step": 1090,
2388
+ "train_speed(iter/s)": 0.139829
2389
+ },
2390
+ {
2391
+ "acc": 0.71466756,
2392
+ "epoch": 2.9674796747967482,
2393
+ "grad_norm": 2.109375,
2394
+ "learning_rate": 4.0763195435092727e-05,
2395
+ "loss": 1.01220913,
2396
+ "memory(GiB)": 16.34,
2397
+ "step": 1095,
2398
+ "train_speed(iter/s)": 0.139916
2399
+ },
2400
+ {
2401
+ "acc": 0.7607831,
2402
+ "epoch": 2.9810298102981028,
2403
+ "grad_norm": 2.375,
2404
+ "learning_rate": 4.022824536376604e-05,
2405
+ "loss": 0.8505785,
2406
+ "memory(GiB)": 16.34,
2407
+ "step": 1100,
2408
+ "train_speed(iter/s)": 0.140004
2409
+ },
2410
+ {
2411
+ "epoch": 2.9810298102981028,
2412
+ "eval_acc": 0.6206721837527445,
2413
+ "eval_loss": 1.6525288820266724,
2414
+ "eval_runtime": 44.2124,
2415
+ "eval_samples_per_second": 0.859,
2416
+ "eval_steps_per_second": 0.859,
2417
+ "step": 1100
2418
+ },
2419
+ {
2420
+ "acc": 0.74116454,
2421
+ "epoch": 2.994579945799458,
2422
+ "grad_norm": 2.65625,
2423
+ "learning_rate": 3.9693295292439366e-05,
2424
+ "loss": 0.92264805,
2425
+ "memory(GiB)": 16.34,
2426
+ "step": 1105,
2427
+ "train_speed(iter/s)": 0.139311
2428
+ },
2429
+ {
2430
+ "acc": 0.8182869,
2431
+ "epoch": 3.008130081300813,
2432
+ "grad_norm": 1.3671875,
2433
+ "learning_rate": 3.915834522111269e-05,
2434
+ "loss": 0.6927434,
2435
+ "memory(GiB)": 16.34,
2436
+ "step": 1110,
2437
+ "train_speed(iter/s)": 0.13923
2438
+ },
2439
+ {
2440
+ "acc": 0.83515587,
2441
+ "epoch": 3.021680216802168,
2442
+ "grad_norm": 1.9140625,
2443
+ "learning_rate": 3.862339514978602e-05,
2444
+ "loss": 0.58284421,
2445
+ "memory(GiB)": 16.34,
2446
+ "step": 1115,
2447
+ "train_speed(iter/s)": 0.139319
2448
+ },
2449
+ {
2450
+ "acc": 0.84008894,
2451
+ "epoch": 3.035230352303523,
2452
+ "grad_norm": 1.8828125,
2453
+ "learning_rate": 3.808844507845934e-05,
2454
+ "loss": 0.59128432,
2455
+ "memory(GiB)": 16.34,
2456
+ "step": 1120,
2457
+ "train_speed(iter/s)": 0.139405
2458
+ },
2459
+ {
2460
+ "acc": 0.84298267,
2461
+ "epoch": 3.048780487804878,
2462
+ "grad_norm": 2.4375,
2463
+ "learning_rate": 3.755349500713266e-05,
2464
+ "loss": 0.56685705,
2465
+ "memory(GiB)": 16.34,
2466
+ "step": 1125,
2467
+ "train_speed(iter/s)": 0.139493
2468
+ },
2469
+ {
2470
+ "acc": 0.83680372,
2471
+ "epoch": 3.062330623306233,
2472
+ "grad_norm": 3.171875,
2473
+ "learning_rate": 3.701854493580599e-05,
2474
+ "loss": 0.55599914,
2475
+ "memory(GiB)": 16.34,
2476
+ "step": 1130,
2477
+ "train_speed(iter/s)": 0.139571
2478
+ },
2479
+ {
2480
+ "acc": 0.81269236,
2481
+ "epoch": 3.075880758807588,
2482
+ "grad_norm": 2.40625,
2483
+ "learning_rate": 3.648359486447931e-05,
2484
+ "loss": 0.63413367,
2485
+ "memory(GiB)": 16.34,
2486
+ "step": 1135,
2487
+ "train_speed(iter/s)": 0.139659
2488
+ },
2489
+ {
2490
+ "acc": 0.8300642,
2491
+ "epoch": 3.089430894308943,
2492
+ "grad_norm": 2.71875,
2493
+ "learning_rate": 3.5948644793152636e-05,
2494
+ "loss": 0.59288554,
2495
+ "memory(GiB)": 16.34,
2496
+ "step": 1140,
2497
+ "train_speed(iter/s)": 0.139738
2498
+ },
2499
+ {
2500
+ "acc": 0.84225779,
2501
+ "epoch": 3.102981029810298,
2502
+ "grad_norm": 2.703125,
2503
+ "learning_rate": 3.541369472182596e-05,
2504
+ "loss": 0.57104263,
2505
+ "memory(GiB)": 16.34,
2506
+ "step": 1145,
2507
+ "train_speed(iter/s)": 0.13982
2508
+ },
2509
+ {
2510
+ "acc": 0.86128368,
2511
+ "epoch": 3.116531165311653,
2512
+ "grad_norm": 3.0,
2513
+ "learning_rate": 3.487874465049928e-05,
2514
+ "loss": 0.47776127,
2515
+ "memory(GiB)": 16.34,
2516
+ "step": 1150,
2517
+ "train_speed(iter/s)": 0.139903
2518
+ },
2519
+ {
2520
+ "epoch": 3.116531165311653,
2521
+ "eval_acc": 0.6091876372234419,
2522
+ "eval_loss": 1.9067459106445312,
2523
+ "eval_runtime": 44.213,
2524
+ "eval_samples_per_second": 0.859,
2525
+ "eval_steps_per_second": 0.859,
2526
+ "step": 1150
2527
+ },
2528
+ {
2529
+ "acc": 0.81826935,
2530
+ "epoch": 3.130081300813008,
2531
+ "grad_norm": 2.234375,
2532
+ "learning_rate": 3.4343794579172606e-05,
2533
+ "loss": 0.64635777,
2534
+ "memory(GiB)": 16.34,
2535
+ "step": 1155,
2536
+ "train_speed(iter/s)": 0.13924
2537
+ },
2538
+ {
2539
+ "acc": 0.79423141,
2540
+ "epoch": 3.1436314363143634,
2541
+ "grad_norm": 2.859375,
2542
+ "learning_rate": 3.380884450784593e-05,
2543
+ "loss": 0.71202464,
2544
+ "memory(GiB)": 16.34,
2545
+ "step": 1160,
2546
+ "train_speed(iter/s)": 0.139319
2547
+ },
2548
+ {
2549
+ "acc": 0.81948729,
2550
+ "epoch": 3.1571815718157183,
2551
+ "grad_norm": 2.046875,
2552
+ "learning_rate": 3.327389443651925e-05,
2553
+ "loss": 0.62771091,
2554
+ "memory(GiB)": 16.34,
2555
+ "step": 1165,
2556
+ "train_speed(iter/s)": 0.1394
2557
+ },
2558
+ {
2559
+ "acc": 0.85081501,
2560
+ "epoch": 3.1707317073170733,
2561
+ "grad_norm": 2.125,
2562
+ "learning_rate": 3.2738944365192576e-05,
2563
+ "loss": 0.55290155,
2564
+ "memory(GiB)": 16.34,
2565
+ "step": 1170,
2566
+ "train_speed(iter/s)": 0.139484
2567
+ },
2568
+ {
2569
+ "acc": 0.84410248,
2570
+ "epoch": 3.1842818428184283,
2571
+ "grad_norm": 2.140625,
2572
+ "learning_rate": 3.2203994293865906e-05,
2573
+ "loss": 0.57252893,
2574
+ "memory(GiB)": 16.34,
2575
+ "step": 1175,
2576
+ "train_speed(iter/s)": 0.139567
2577
+ },
2578
+ {
2579
+ "acc": 0.78565025,
2580
+ "epoch": 3.1978319783197833,
2581
+ "grad_norm": 3.203125,
2582
+ "learning_rate": 3.166904422253922e-05,
2583
+ "loss": 0.73501797,
2584
+ "memory(GiB)": 16.34,
2585
+ "step": 1180,
2586
+ "train_speed(iter/s)": 0.13965
2587
+ },
2588
+ {
2589
+ "acc": 0.78738356,
2590
+ "epoch": 3.2113821138211383,
2591
+ "grad_norm": 2.953125,
2592
+ "learning_rate": 3.113409415121255e-05,
2593
+ "loss": 0.72265806,
2594
+ "memory(GiB)": 16.34,
2595
+ "step": 1185,
2596
+ "train_speed(iter/s)": 0.139732
2597
+ },
2598
+ {
2599
+ "acc": 0.818221,
2600
+ "epoch": 3.2249322493224932,
2601
+ "grad_norm": 2.21875,
2602
+ "learning_rate": 3.0599144079885876e-05,
2603
+ "loss": 0.61062384,
2604
+ "memory(GiB)": 16.34,
2605
+ "step": 1190,
2606
+ "train_speed(iter/s)": 0.139813
2607
+ },
2608
+ {
2609
+ "acc": 0.82556448,
2610
+ "epoch": 3.238482384823848,
2611
+ "grad_norm": 2.3125,
2612
+ "learning_rate": 3.00641940085592e-05,
2613
+ "loss": 0.61080799,
2614
+ "memory(GiB)": 16.34,
2615
+ "step": 1195,
2616
+ "train_speed(iter/s)": 0.139893
2617
+ },
2618
+ {
2619
+ "acc": 0.82586126,
2620
+ "epoch": 3.252032520325203,
2621
+ "grad_norm": 2.34375,
2622
+ "learning_rate": 2.952924393723252e-05,
2623
+ "loss": 0.6164794,
2624
+ "memory(GiB)": 16.34,
2625
+ "step": 1200,
2626
+ "train_speed(iter/s)": 0.139974
2627
+ },
2628
+ {
2629
+ "epoch": 3.252032520325203,
2630
+ "eval_acc": 0.6044587062996115,
2631
+ "eval_loss": 1.8586076498031616,
2632
+ "eval_runtime": 44.1645,
2633
+ "eval_samples_per_second": 0.86,
2634
+ "eval_steps_per_second": 0.86,
2635
+ "step": 1200
2636
+ },
2637
+ {
2638
+ "acc": 0.82835131,
2639
+ "epoch": 3.265582655826558,
2640
+ "grad_norm": 6.46875,
2641
+ "learning_rate": 2.8994293865905846e-05,
2642
+ "loss": 0.61753302,
2643
+ "memory(GiB)": 16.34,
2644
+ "step": 1205,
2645
+ "train_speed(iter/s)": 0.139339
2646
+ },
2647
+ {
2648
+ "acc": 0.82603951,
2649
+ "epoch": 3.279132791327913,
2650
+ "grad_norm": 2.90625,
2651
+ "learning_rate": 2.8459343794579173e-05,
2652
+ "loss": 0.58360481,
2653
+ "memory(GiB)": 16.34,
2654
+ "step": 1210,
2655
+ "train_speed(iter/s)": 0.139423
2656
+ },
2657
+ {
2658
+ "acc": 0.79192553,
2659
+ "epoch": 3.292682926829268,
2660
+ "grad_norm": 2.5,
2661
+ "learning_rate": 2.7924393723252493e-05,
2662
+ "loss": 0.75090132,
2663
+ "memory(GiB)": 16.34,
2664
+ "step": 1215,
2665
+ "train_speed(iter/s)": 0.1395
2666
+ },
2667
+ {
2668
+ "acc": 0.83583679,
2669
+ "epoch": 3.306233062330623,
2670
+ "grad_norm": 2.15625,
2671
+ "learning_rate": 2.738944365192582e-05,
2672
+ "loss": 0.56217456,
2673
+ "memory(GiB)": 16.34,
2674
+ "step": 1220,
2675
+ "train_speed(iter/s)": 0.139581
2676
+ },
2677
+ {
2678
+ "acc": 0.80481911,
2679
+ "epoch": 3.3197831978319785,
2680
+ "grad_norm": 4.125,
2681
+ "learning_rate": 2.685449358059914e-05,
2682
+ "loss": 0.73073688,
2683
+ "memory(GiB)": 16.34,
2684
+ "step": 1225,
2685
+ "train_speed(iter/s)": 0.139659
2686
+ },
2687
+ {
2688
+ "acc": 0.8173975,
2689
+ "epoch": 3.3333333333333335,
2690
+ "grad_norm": 3.1875,
2691
+ "learning_rate": 2.6319543509272466e-05,
2692
+ "loss": 0.59854188,
2693
+ "memory(GiB)": 16.34,
2694
+ "step": 1230,
2695
+ "train_speed(iter/s)": 0.139735
2696
+ },
2697
+ {
2698
+ "acc": 0.88763266,
2699
+ "epoch": 3.3468834688346885,
2700
+ "grad_norm": 2.078125,
2701
+ "learning_rate": 2.5784593437945793e-05,
2702
+ "loss": 0.3878314,
2703
+ "memory(GiB)": 16.34,
2704
+ "step": 1235,
2705
+ "train_speed(iter/s)": 0.139826
2706
+ },
2707
+ {
2708
+ "acc": 0.82199507,
2709
+ "epoch": 3.3604336043360434,
2710
+ "grad_norm": 2.28125,
2711
+ "learning_rate": 2.5249643366619113e-05,
2712
+ "loss": 0.65350094,
2713
+ "memory(GiB)": 16.34,
2714
+ "step": 1240,
2715
+ "train_speed(iter/s)": 0.139901
2716
+ },
2717
+ {
2718
+ "acc": 0.83115234,
2719
+ "epoch": 3.3739837398373984,
2720
+ "grad_norm": 2.484375,
2721
+ "learning_rate": 2.471469329529244e-05,
2722
+ "loss": 0.61815515,
2723
+ "memory(GiB)": 16.34,
2724
+ "step": 1245,
2725
+ "train_speed(iter/s)": 0.13998
2726
+ },
2727
+ {
2728
+ "acc": 0.81266117,
2729
+ "epoch": 3.3875338753387534,
2730
+ "grad_norm": 1.78125,
2731
+ "learning_rate": 2.417974322396576e-05,
2732
+ "loss": 0.65289016,
2733
+ "memory(GiB)": 16.34,
2734
+ "step": 1250,
2735
+ "train_speed(iter/s)": 0.140052
2736
+ },
2737
+ {
2738
+ "epoch": 3.3875338753387534,
2739
+ "eval_acc": 0.6037831447390644,
2740
+ "eval_loss": 1.893083095550537,
2741
+ "eval_runtime": 44.1639,
2742
+ "eval_samples_per_second": 0.86,
2743
+ "eval_steps_per_second": 0.86,
2744
+ "step": 1250
2745
+ },
2746
+ {
2747
+ "acc": 0.84776039,
2748
+ "epoch": 3.4010840108401084,
2749
+ "grad_norm": 2.046875,
2750
+ "learning_rate": 2.3644793152639086e-05,
2751
+ "loss": 0.53162961,
2752
+ "memory(GiB)": 16.34,
2753
+ "step": 1255,
2754
+ "train_speed(iter/s)": 0.139443
2755
+ },
2756
+ {
2757
+ "acc": 0.84318447,
2758
+ "epoch": 3.4146341463414633,
2759
+ "grad_norm": 2.234375,
2760
+ "learning_rate": 2.3109843081312406e-05,
2761
+ "loss": 0.52226324,
2762
+ "memory(GiB)": 16.34,
2763
+ "step": 1260,
2764
+ "train_speed(iter/s)": 0.139515
2765
+ },
2766
+ {
2767
+ "acc": 0.80521078,
2768
+ "epoch": 3.4281842818428183,
2769
+ "grad_norm": 2.25,
2770
+ "learning_rate": 2.2574893009985733e-05,
2771
+ "loss": 0.62338996,
2772
+ "memory(GiB)": 16.34,
2773
+ "step": 1265,
2774
+ "train_speed(iter/s)": 0.13959
2775
+ },
2776
+ {
2777
+ "acc": 0.83542995,
2778
+ "epoch": 3.4417344173441733,
2779
+ "grad_norm": 3.0625,
2780
+ "learning_rate": 2.203994293865906e-05,
2781
+ "loss": 0.60965805,
2782
+ "memory(GiB)": 16.34,
2783
+ "step": 1270,
2784
+ "train_speed(iter/s)": 0.139638
2785
+ },
2786
+ {
2787
+ "acc": 0.8333952,
2788
+ "epoch": 3.4552845528455283,
2789
+ "grad_norm": 2.625,
2790
+ "learning_rate": 2.150499286733238e-05,
2791
+ "loss": 0.60031776,
2792
+ "memory(GiB)": 16.34,
2793
+ "step": 1275,
2794
+ "train_speed(iter/s)": 0.139705
2795
+ },
2796
+ {
2797
+ "acc": 0.84367867,
2798
+ "epoch": 3.4688346883468837,
2799
+ "grad_norm": 1.859375,
2800
+ "learning_rate": 2.0970042796005706e-05,
2801
+ "loss": 0.54981174,
2802
+ "memory(GiB)": 16.34,
2803
+ "step": 1280,
2804
+ "train_speed(iter/s)": 0.13978
2805
+ },
2806
+ {
2807
+ "acc": 0.8121892,
2808
+ "epoch": 3.4823848238482387,
2809
+ "grad_norm": 2.046875,
2810
+ "learning_rate": 2.0435092724679026e-05,
2811
+ "loss": 0.67483497,
2812
+ "memory(GiB)": 16.34,
2813
+ "step": 1285,
2814
+ "train_speed(iter/s)": 0.139851
2815
+ },
2816
+ {
2817
+ "acc": 0.87883768,
2818
+ "epoch": 3.4959349593495936,
2819
+ "grad_norm": 1.6875,
2820
+ "learning_rate": 1.9900142653352353e-05,
2821
+ "loss": 0.45111437,
2822
+ "memory(GiB)": 16.34,
2823
+ "step": 1290,
2824
+ "train_speed(iter/s)": 0.139924
2825
+ },
2826
+ {
2827
+ "acc": 0.81952734,
2828
+ "epoch": 3.5094850948509486,
2829
+ "grad_norm": 2.640625,
2830
+ "learning_rate": 1.9365192582025676e-05,
2831
+ "loss": 0.66096244,
2832
+ "memory(GiB)": 16.34,
2833
+ "step": 1295,
2834
+ "train_speed(iter/s)": 0.139995
2835
+ },
2836
+ {
2837
+ "acc": 0.82709446,
2838
+ "epoch": 3.5230352303523036,
2839
+ "grad_norm": 2.140625,
2840
+ "learning_rate": 1.8830242510699e-05,
2841
+ "loss": 0.58278303,
2842
+ "memory(GiB)": 16.34,
2843
+ "step": 1300,
2844
+ "train_speed(iter/s)": 0.140068
2845
+ },
2846
+ {
2847
+ "epoch": 3.5230352303523036,
2848
+ "eval_acc": 0.603276473568654,
2849
+ "eval_loss": 1.8986655473709106,
2850
+ "eval_runtime": 44.3091,
2851
+ "eval_samples_per_second": 0.858,
2852
+ "eval_steps_per_second": 0.858,
2853
+ "step": 1300
2854
+ },
2855
+ {
2856
+ "acc": 0.83689537,
2857
+ "epoch": 3.5365853658536586,
2858
+ "grad_norm": 1.9453125,
2859
+ "learning_rate": 1.8295292439372323e-05,
2860
+ "loss": 0.55435977,
2861
+ "memory(GiB)": 16.34,
2862
+ "step": 1305,
2863
+ "train_speed(iter/s)": 0.139476
2864
+ },
2865
+ {
2866
+ "acc": 0.8525279,
2867
+ "epoch": 3.5501355013550135,
2868
+ "grad_norm": 2.21875,
2869
+ "learning_rate": 1.776034236804565e-05,
2870
+ "loss": 0.51831579,
2871
+ "memory(GiB)": 16.34,
2872
+ "step": 1310,
2873
+ "train_speed(iter/s)": 0.139549
2874
+ },
2875
+ {
2876
+ "acc": 0.83099527,
2877
+ "epoch": 3.5636856368563685,
2878
+ "grad_norm": 1.8828125,
2879
+ "learning_rate": 1.7225392296718973e-05,
2880
+ "loss": 0.61766572,
2881
+ "memory(GiB)": 16.34,
2882
+ "step": 1315,
2883
+ "train_speed(iter/s)": 0.139617
2884
+ },
2885
+ {
2886
+ "acc": 0.82115297,
2887
+ "epoch": 3.5772357723577235,
2888
+ "grad_norm": 2.765625,
2889
+ "learning_rate": 1.6690442225392296e-05,
2890
+ "loss": 0.62109137,
2891
+ "memory(GiB)": 16.34,
2892
+ "step": 1320,
2893
+ "train_speed(iter/s)": 0.13969
2894
+ },
2895
+ {
2896
+ "acc": 0.85132771,
2897
+ "epoch": 3.5907859078590785,
2898
+ "grad_norm": 2.421875,
2899
+ "learning_rate": 1.615549215406562e-05,
2900
+ "loss": 0.55242,
2901
+ "memory(GiB)": 16.34,
2902
+ "step": 1325,
2903
+ "train_speed(iter/s)": 0.139763
2904
+ },
2905
+ {
2906
+ "acc": 0.82085381,
2907
+ "epoch": 3.6043360433604335,
2908
+ "grad_norm": 2.8125,
2909
+ "learning_rate": 1.5620542082738943e-05,
2910
+ "loss": 0.62965093,
2911
+ "memory(GiB)": 16.34,
2912
+ "step": 1330,
2913
+ "train_speed(iter/s)": 0.139837
2914
+ },
2915
+ {
2916
+ "acc": 0.85592861,
2917
+ "epoch": 3.617886178861789,
2918
+ "grad_norm": 1.875,
2919
+ "learning_rate": 1.5085592011412266e-05,
2920
+ "loss": 0.54980545,
2921
+ "memory(GiB)": 16.34,
2922
+ "step": 1335,
2923
+ "train_speed(iter/s)": 0.139907
2924
+ },
2925
+ {
2926
+ "acc": 0.80786858,
2927
+ "epoch": 3.6314363143631434,
2928
+ "grad_norm": 2.328125,
2929
+ "learning_rate": 1.4550641940085591e-05,
2930
+ "loss": 0.67223382,
2931
+ "memory(GiB)": 16.34,
2932
+ "step": 1340,
2933
+ "train_speed(iter/s)": 0.139974
2934
+ },
2935
+ {
2936
+ "acc": 0.85045481,
2937
+ "epoch": 3.644986449864499,
2938
+ "grad_norm": 2.078125,
2939
+ "learning_rate": 1.4015691868758914e-05,
2940
+ "loss": 0.50758219,
2941
+ "memory(GiB)": 16.34,
2942
+ "step": 1345,
2943
+ "train_speed(iter/s)": 0.140054
2944
+ },
2945
+ {
2946
+ "acc": 0.85536547,
2947
+ "epoch": 3.658536585365854,
2948
+ "grad_norm": 1.7734375,
2949
+ "learning_rate": 1.348074179743224e-05,
2950
+ "loss": 0.53356686,
2951
+ "memory(GiB)": 16.34,
2952
+ "step": 1350,
2953
+ "train_speed(iter/s)": 0.140124
2954
+ },
2955
+ {
2956
+ "epoch": 3.658536585365854,
2957
+ "eval_acc": 0.6010808984968755,
2958
+ "eval_loss": 1.8972593545913696,
2959
+ "eval_runtime": 44.2915,
2960
+ "eval_samples_per_second": 0.858,
2961
+ "eval_steps_per_second": 0.858,
2962
+ "step": 1350
2963
+ },
2964
+ {
2965
+ "acc": 0.8181016,
2966
+ "epoch": 3.6720867208672088,
2967
+ "grad_norm": 2.3125,
2968
+ "learning_rate": 1.2945791726105563e-05,
2969
+ "loss": 0.59544349,
2970
+ "memory(GiB)": 16.34,
2971
+ "step": 1355,
2972
+ "train_speed(iter/s)": 0.139555
2973
+ },
2974
+ {
2975
+ "acc": 0.87799835,
2976
+ "epoch": 3.6856368563685638,
2977
+ "grad_norm": 2.03125,
2978
+ "learning_rate": 1.2410841654778886e-05,
2979
+ "loss": 0.42857742,
2980
+ "memory(GiB)": 16.34,
2981
+ "step": 1360,
2982
+ "train_speed(iter/s)": 0.139631
2983
+ },
2984
+ {
2985
+ "acc": 0.81485729,
2986
+ "epoch": 3.6991869918699187,
2987
+ "grad_norm": 1.65625,
2988
+ "learning_rate": 1.187589158345221e-05,
2989
+ "loss": 0.64030995,
2990
+ "memory(GiB)": 16.34,
2991
+ "step": 1365,
2992
+ "train_speed(iter/s)": 0.139703
2993
+ },
2994
+ {
2995
+ "acc": 0.82011805,
2996
+ "epoch": 3.7127371273712737,
2997
+ "grad_norm": 2.125,
2998
+ "learning_rate": 1.1340941512125534e-05,
2999
+ "loss": 0.63428659,
3000
+ "memory(GiB)": 16.34,
3001
+ "step": 1370,
3002
+ "train_speed(iter/s)": 0.139771
3003
+ },
3004
+ {
3005
+ "acc": 0.78548884,
3006
+ "epoch": 3.7262872628726287,
3007
+ "grad_norm": 3.984375,
3008
+ "learning_rate": 1.0805991440798858e-05,
3009
+ "loss": 0.74558139,
3010
+ "memory(GiB)": 16.34,
3011
+ "step": 1375,
3012
+ "train_speed(iter/s)": 0.139836
3013
+ },
3014
+ {
3015
+ "acc": 0.85822716,
3016
+ "epoch": 3.7398373983739837,
3017
+ "grad_norm": 2.328125,
3018
+ "learning_rate": 1.0271041369472183e-05,
3019
+ "loss": 0.47486768,
3020
+ "memory(GiB)": 16.34,
3021
+ "step": 1380,
3022
+ "train_speed(iter/s)": 0.139902
3023
+ },
3024
+ {
3025
+ "acc": 0.83469105,
3026
+ "epoch": 3.7533875338753386,
3027
+ "grad_norm": 2.53125,
3028
+ "learning_rate": 9.736091298145506e-06,
3029
+ "loss": 0.56590233,
3030
+ "memory(GiB)": 16.34,
3031
+ "step": 1385,
3032
+ "train_speed(iter/s)": 0.139971
3033
+ },
3034
+ {
3035
+ "acc": 0.80486965,
3036
+ "epoch": 3.7669376693766936,
3037
+ "grad_norm": 3.28125,
3038
+ "learning_rate": 9.20114122681883e-06,
3039
+ "loss": 0.68021841,
3040
+ "memory(GiB)": 16.34,
3041
+ "step": 1390,
3042
+ "train_speed(iter/s)": 0.140036
3043
+ },
3044
+ {
3045
+ "acc": 0.84560328,
3046
+ "epoch": 3.7804878048780486,
3047
+ "grad_norm": 2.421875,
3048
+ "learning_rate": 8.666191155492154e-06,
3049
+ "loss": 0.52926989,
3050
+ "memory(GiB)": 16.34,
3051
+ "step": 1395,
3052
+ "train_speed(iter/s)": 0.140102
3053
+ },
3054
+ {
3055
+ "acc": 0.83154497,
3056
+ "epoch": 3.794037940379404,
3057
+ "grad_norm": 2.671875,
3058
+ "learning_rate": 8.131241084165478e-06,
3059
+ "loss": 0.5866107,
3060
+ "memory(GiB)": 16.34,
3061
+ "step": 1400,
3062
+ "train_speed(iter/s)": 0.14017
3063
+ },
3064
+ {
3065
+ "epoch": 3.794037940379404,
3066
+ "eval_acc": 0.6036142543489276,
3067
+ "eval_loss": 1.8983793258666992,
3068
+ "eval_runtime": 44.4268,
3069
+ "eval_samples_per_second": 0.855,
3070
+ "eval_steps_per_second": 0.855,
3071
+ "step": 1400
3072
+ },
3073
+ {
3074
+ "acc": 0.84980259,
3075
+ "epoch": 3.8075880758807585,
3076
+ "grad_norm": 2.859375,
3077
+ "learning_rate": 7.596291012838801e-06,
3078
+ "loss": 0.56505013,
3079
+ "memory(GiB)": 16.34,
3080
+ "step": 1405,
3081
+ "train_speed(iter/s)": 0.139621
3082
+ },
3083
+ {
3084
+ "acc": 0.80126667,
3085
+ "epoch": 3.821138211382114,
3086
+ "grad_norm": 2.21875,
3087
+ "learning_rate": 7.061340941512125e-06,
3088
+ "loss": 0.67650762,
3089
+ "memory(GiB)": 16.34,
3090
+ "step": 1410,
3091
+ "train_speed(iter/s)": 0.139688
3092
+ },
3093
+ {
3094
+ "acc": 0.79974804,
3095
+ "epoch": 3.834688346883469,
3096
+ "grad_norm": 2.3125,
3097
+ "learning_rate": 6.5263908701854486e-06,
3098
+ "loss": 0.743855,
3099
+ "memory(GiB)": 16.34,
3100
+ "step": 1415,
3101
+ "train_speed(iter/s)": 0.139749
3102
+ },
3103
+ {
3104
+ "acc": 0.84023552,
3105
+ "epoch": 3.848238482384824,
3106
+ "grad_norm": 2.234375,
3107
+ "learning_rate": 5.991440798858773e-06,
3108
+ "loss": 0.57722979,
3109
+ "memory(GiB)": 16.34,
3110
+ "step": 1420,
3111
+ "train_speed(iter/s)": 0.139813
3112
+ },
3113
+ {
3114
+ "acc": 0.82472792,
3115
+ "epoch": 3.861788617886179,
3116
+ "grad_norm": 2.484375,
3117
+ "learning_rate": 5.456490727532097e-06,
3118
+ "loss": 0.56691737,
3119
+ "memory(GiB)": 16.34,
3120
+ "step": 1425,
3121
+ "train_speed(iter/s)": 0.139877
3122
+ },
3123
+ {
3124
+ "acc": 0.83573132,
3125
+ "epoch": 3.875338753387534,
3126
+ "grad_norm": 1.9765625,
3127
+ "learning_rate": 4.92154065620542e-06,
3128
+ "loss": 0.60298834,
3129
+ "memory(GiB)": 16.34,
3130
+ "step": 1430,
3131
+ "train_speed(iter/s)": 0.13994
3132
+ },
3133
+ {
3134
+ "acc": 0.83143454,
3135
+ "epoch": 3.888888888888889,
3136
+ "grad_norm": 1.9453125,
3137
+ "learning_rate": 4.386590584878744e-06,
3138
+ "loss": 0.61832671,
3139
+ "memory(GiB)": 16.34,
3140
+ "step": 1435,
3141
+ "train_speed(iter/s)": 0.140003
3142
+ },
3143
+ {
3144
+ "acc": 0.83209105,
3145
+ "epoch": 3.902439024390244,
3146
+ "grad_norm": 3.265625,
3147
+ "learning_rate": 3.851640513552068e-06,
3148
+ "loss": 0.6038147,
3149
+ "memory(GiB)": 16.34,
3150
+ "step": 1440,
3151
+ "train_speed(iter/s)": 0.140065
3152
+ },
3153
+ {
3154
+ "acc": 0.79715924,
3155
+ "epoch": 3.915989159891599,
3156
+ "grad_norm": 2.015625,
3157
+ "learning_rate": 3.316690442225392e-06,
3158
+ "loss": 0.70867634,
3159
+ "memory(GiB)": 16.34,
3160
+ "step": 1445,
3161
+ "train_speed(iter/s)": 0.140125
3162
+ },
3163
+ {
3164
+ "acc": 0.85556803,
3165
+ "epoch": 3.9295392953929538,
3166
+ "grad_norm": 1.7734375,
3167
+ "learning_rate": 2.781740370898716e-06,
3168
+ "loss": 0.52723417,
3169
+ "memory(GiB)": 16.34,
3170
+ "step": 1450,
3171
+ "train_speed(iter/s)": 0.140182
3172
+ },
3173
+ {
3174
+ "epoch": 3.9295392953929538,
3175
+ "eval_acc": 0.6029386927883803,
3176
+ "eval_loss": 1.8965932130813599,
3177
+ "eval_runtime": 44.4125,
3178
+ "eval_samples_per_second": 0.856,
3179
+ "eval_steps_per_second": 0.856,
3180
+ "step": 1450
3181
+ },
3182
+ {
3183
+ "acc": 0.83306837,
3184
+ "epoch": 3.943089430894309,
3185
+ "grad_norm": 2.25,
3186
+ "learning_rate": 2.2467902995720398e-06,
3187
+ "loss": 0.55906692,
3188
+ "memory(GiB)": 16.34,
3189
+ "step": 1455,
3190
+ "train_speed(iter/s)": 0.139645
3191
+ },
3192
+ {
3193
+ "acc": 0.81574478,
3194
+ "epoch": 3.9566395663956637,
3195
+ "grad_norm": 2.6875,
3196
+ "learning_rate": 1.7118402282453637e-06,
3197
+ "loss": 0.62746248,
3198
+ "memory(GiB)": 16.34,
3199
+ "step": 1460,
3200
+ "train_speed(iter/s)": 0.139708
3201
+ },
3202
+ {
3203
+ "acc": 0.83905201,
3204
+ "epoch": 3.970189701897019,
3205
+ "grad_norm": 2.46875,
3206
+ "learning_rate": 1.1768901569186875e-06,
3207
+ "loss": 0.56978045,
3208
+ "memory(GiB)": 16.34,
3209
+ "step": 1465,
3210
+ "train_speed(iter/s)": 0.139771
3211
+ },
3212
+ {
3213
+ "acc": 0.85569115,
3214
+ "epoch": 3.983739837398374,
3215
+ "grad_norm": 1.90625,
3216
+ "learning_rate": 6.419400855920114e-07,
3217
+ "loss": 0.48882861,
3218
+ "memory(GiB)": 16.34,
3219
+ "step": 1470,
3220
+ "train_speed(iter/s)": 0.139834
3221
+ },
3222
+ {
3223
+ "acc": 0.82850809,
3224
+ "epoch": 3.997289972899729,
3225
+ "grad_norm": 1.828125,
3226
+ "learning_rate": 1.0699001426533523e-07,
3227
+ "loss": 0.56869435,
3228
+ "memory(GiB)": 16.34,
3229
+ "step": 1475,
3230
+ "train_speed(iter/s)": 0.139901
3231
+ },
3232
+ {
3233
+ "epoch": 4.0,
3234
+ "eval_acc": 0.6037831447390644,
3235
+ "eval_loss": 1.8978389501571655,
3236
+ "eval_runtime": 44.2826,
3237
+ "eval_samples_per_second": 0.858,
3238
+ "eval_steps_per_second": 0.858,
3239
+ "step": 1476
3240
+ }
3241
+ ],
3242
+ "logging_steps": 5,
3243
+ "max_steps": 1476,
3244
+ "num_input_tokens_seen": 0,
3245
+ "num_train_epochs": 4,
3246
+ "save_steps": 369,
3247
+ "total_flos": 1.991427194376192e+17,
3248
+ "train_batch_size": 1,
3249
+ "trial_name": null,
3250
+ "trial_params": null
3251
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e164dfb4f067ebefb257c06a7010f5749952eef991c3155070cfd3ae753eb94f
3
+ size 7032