yhyu13 committed on
Commit
edc5215
1 Parent(s): 405aa43
Predict_20/all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "predict_bleu-4": 74.196885,
+     "predict_rouge-1": 80.372465,
+     "predict_rouge-2": 70.340295,
+     "predict_rouge-l": 77.72346,
+     "predict_runtime": 29.4113,
+     "predict_samples_per_second": 0.68,
+     "predict_steps_per_second": 0.68
+ }
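
Note: these metrics are plain JSON, so they can be read back for run-to-run comparison. A minimal sketch in Python (only the file path from this commit is assumed):

    import json

    # Load the metrics written by the predict run above.
    with open("Predict_20/all_results.json") as f:
        metrics = json.load(f)

    # BLEU-4 and ROUGE are reported on a 0-100 scale here.
    print(f"BLEU-4 : {metrics['predict_bleu-4']:.2f}")
    print(f"ROUGE-L: {metrics['predict_rouge-l']:.2f}")
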
Predict_20/generated_predictions.jsonl ADDED
@@ -0,0 +1,20 @@
+ {"label": "I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!", "predict": "I'm sorry, but I'm unable to perform tasks outside of my defined functions. As of now, I can only help with tasks like getting the exchange rate between two currencies."}
+ {"label": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"United States\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"United States\"}'} </functioncall>"}
+ {"label": "Here are the latest news headlines for the United States:\n1. Biden announces new vaccine mandates\n2. Hurricane Ida devastates Louisiana\n3. Apple unveils new iPhone\n4. NASA's Perseverance rover collects first Mars rock sample", "predict": "The latest news headlines for the United States are: Biden announces new vaccine mandates, Hurricane Ida devastates Louisiana, Apple unveils new iPhone, and NASA's Perseverance rover collects first Mars rock sample."}
+ {"label": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"France\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"France\"}'} </functioncall>"}
+ {"label": "Here are the latest news headlines for France:\n1. France recalls ambassadors to US and Australia\n2. French election: Macron's party braces for tough fight\n3. Louvre Museum to undergo major overhaul\n4. France to offer free birth control to all women under 25", "predict": "The latest news headlines for France are:\n1. France recalls ambassadors to US and Australia\n2. French election: Macron's party braces for tough fight\n3. Louvre Museum to undergo major overhaul\n4. France to offer free birth control to all women under 25"}
+ {"label": "Of course. How long would you like your password to be? And would you like it to include symbols?", "predict": "Of course, I can help with that. How long would you like your password to be?"}
+ {"label": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_symbols\": true}'} </functioncall>", "predict": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_symbols\": true}'} </functioncall>"}
+ {"label": "Here is your new password: 4&7j#9@1Q6*. Please make sure to save it in a secure location.", "predict": "Here is your new password. It is 4&7j#9@1Q6* and is 12 characters long. It includes symbols as per your request."}
+ {"label": "Of course! How long would you like your password to be? And do you want it to include numbers and special characters?", "predict": "Of course! I can help with that. How long would you like the password to be? And do you want it to include numbers or special characters?"}
+ {"label": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_numbers\": true, \"include_special_characters\": true}'} </functioncall>", "predict": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_numbers\": true, \"include_special_characters\": true}'} </functioncall>"}
+ {"label": "Here is your new password: 4#7gB6&9L1!0. Please make sure to save it in a secure place.", "predict": "Here is your new password: 4#7gB6&9L1!0"}
+ {"label": "Of course, I can help you with that. Please provide me with the list of numbers.", "predict": "Of course, I can help you with that. Please provide me with the list of numbers."}
+ {"label": "<functioncall> {\"name\": \"calculate_median\", \"arguments\": '{\"numbers\": [5, 2, 9, 1, 7, 4, 6, 3, 8]}'} </functioncall>", "predict": "<functioncall> {\"name\": \"calculate_median\", \"arguments\": '{\"numbers\": [5, 2, 9, 1, 7, 4, 6, 3, 8]}'} </functioncall>"}
+ {"label": "The median of the provided list of numbers is 5.", "predict": "The median of the list of numbers [5, 2, 9, 1, 7, 4, 6, 3, 8] is 5."}
+ {"label": "Of course, I can help you with that. Let me calculate it for you.\n<functioncall> {\"name\": \"calculate_loan_payment\", \"arguments\": '{\"principal\": 50000, \"interest_rate\": 5, \"loan_term\": 10}'} </functioncall>", "predict": "<functioncall> {\"name\": \"calculate_loan_payment\", \"arguments\": '{\"principal\": 50000, \"interest_rate\": 5, \"loan_term\": 10}'} </functioncall>"}
+ {"label": "Based on the information provided, your monthly loan payment would be $530.33.", "predict": "According to the calculations, your monthly loan payment would be $530.33."}
+ {"label": "I'm sorry, but as an AI, I don't have the capability to book flights or perform external tasks. My current function is to calculate loan payments. I recommend using a travel booking website or app for your flight booking needs.", "predict": "I'm sorry, but I'm unable to assist with booking flights. My current capabilities are limited to calculating loan payments."}
+ {"label": "<functioncall> {\"name\": \"convert_temperature\", \"arguments\": '{\"temperature\": 30, \"from_unit\": \"Celsius\", \"to_unit\": \"Fahrenheit\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"convert_temperature\", \"arguments\": '{\"temperature\": 30, \"from_unit\": \"Celsius\", \"to_unit\": \"Fahrenheit\"}'} </functioncall>"}
+ {"label": "The converted temperature from 30 degrees Celsius to Fahrenheit is 86 degrees Fahrenheit.", "predict": "The temperature 30 degrees Celsius is equivalent to 86 degrees Fahrenheit."}
+ {"label": "I'm sorry, but I can't assist with that. My current capabilities allow me to provide information about movies based on their titles. I don't have the ability to place orders or perform external tasks.", "predict": "I'm sorry, but I'm unable to perform external tasks like ordering a pizza. My current capabilities are limited to providing information and answering questions. If you have any other questions or need assistance with something else, feel free to ask."}
Predict_20/predict_log.txt ADDED
@@ -0,0 +1,286 @@
+ 01/04/2024 10:04:05 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
+ [INFO|training_args.py:1838] 2024-01-04 10:04:05,581 >> PyTorch: setting up devices
+ /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
+ warnings.warn(
+ 01/04/2024 10:04:05 - INFO - llmtuner.model.parser - Process rank: 0, device: cuda:0, n_gpu: 1
+ distributed training: True, compute dtype: None
+ 01/04/2024 10:04:05 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
+ _n_gpu=1,
+ adafactor=False,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ auto_find_batch_size=False,
+ bf16=False,
+ bf16_full_eval=False,
+ data_seed=None,
+ dataloader_drop_last=False,
+ dataloader_num_workers=0,
+ dataloader_persistent_workers=False,
+ dataloader_pin_memory=True,
+ ddp_backend=None,
+ ddp_broadcast_buffers=None,
+ ddp_bucket_cap_mb=None,
+ ddp_find_unused_parameters=False,
+ ddp_timeout=1800,
+ debug=[],
+ deepspeed=None,
+ disable_tqdm=False,
+ dispatch_batches=None,
+ do_eval=False,
+ do_predict=True,
+ do_train=False,
+ eval_accumulation_steps=None,
+ eval_delay=0,
+ eval_steps=None,
+ evaluation_strategy=IntervalStrategy.NO,
+ fp16=False,
+ fp16_backend=auto,
+ fp16_full_eval=False,
+ fp16_opt_level=O1,
+ fsdp=[],
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
+ fsdp_min_num_params=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ full_determinism=False,
+ generation_config=None,
+ generation_max_length=None,
+ generation_num_beams=None,
+ gradient_accumulation_steps=1,
+ gradient_checkpointing=False,
+ gradient_checkpointing_kwargs=None,
+ greater_is_better=None,
+ group_by_length=False,
+ half_precision_backend=auto,
+ hub_always_push=False,
+ hub_model_id=None,
+ hub_private_repo=False,
+ hub_strategy=HubStrategy.EVERY_SAVE,
+ hub_token=<HUB_TOKEN>,
+ ignore_data_skip=False,
+ include_inputs_for_metrics=False,
+ include_num_input_tokens_seen=False,
+ include_tokens_per_second=False,
+ jit_mode_eval=False,
+ label_names=None,
+ label_smoothing_factor=0.0,
+ learning_rate=5e-05,
+ length_column_name=length,
+ load_best_model_at_end=False,
+ local_rank=0,
+ log_level=passive,
+ log_level_replica=warning,
+ log_on_each_node=True,
+ logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20/runs/Jan04_10-04-05_yhyu13fuwuqi,
+ logging_first_step=False,
+ logging_nan_inf_filter=True,
+ logging_steps=500,
+ logging_strategy=IntervalStrategy.STEPS,
+ lr_scheduler_kwargs={},
+ lr_scheduler_type=SchedulerType.LINEAR,
+ max_grad_norm=1.0,
+ max_steps=-1,
+ metric_for_best_model=None,
+ mp_parameters=,
+ neftune_noise_alpha=None,
+ no_cuda=False,
+ num_train_epochs=3.0,
+ optim=OptimizerNames.ADAMW_TORCH,
+ optim_args=None,
+ output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20,
+ overwrite_output_dir=False,
+ past_index=-1,
+ per_device_eval_batch_size=1,
+ per_device_train_batch_size=8,
+ predict_with_generate=True,
+ prediction_loss_only=False,
+ push_to_hub=False,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ ray_scope=last,
+ remove_unused_columns=True,
+ report_to=['tensorboard'],
+ resume_from_checkpoint=None,
+ run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20,
+ save_on_each_node=False,
+ save_only_model=False,
+ save_safetensors=True,
+ save_steps=500,
+ save_strategy=IntervalStrategy.STEPS,
+ save_total_limit=None,
+ seed=42,
+ skip_memory_metrics=True,
+ sortish_sampler=False,
+ split_batches=False,
+ tf32=None,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ torchdynamo=None,
+ tpu_metrics_debug=False,
+ tpu_num_cores=None,
+ use_cpu=False,
+ use_ipex=False,
+ use_legacy_prediction_loop=False,
+ use_mps_device=False,
+ warmup_ratio=0.0,
+ warmup_steps=0,
+ weight_decay=0.0,
+ )
+ 01/04/2024 10:04:05 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
+ 01/04/2024 10:04:05 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+ Using custom data configuration default-b024aadef2a1493c
+ Loading Dataset Infos from /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/datasets/packaged_modules/json
+ Overwrite dataset info from restored data version if exists.
+ Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
+ Found cached dataset json (/home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
+ Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file vocab.json
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file merges.txt
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file added_tokens.json
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file special_tokens_map.json
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file tokenizer_config.json
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file tokenizer.json
+ [WARNING|logging.py:314] 2024-01-04 10:04:06,448 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+ [INFO|configuration_utils.py:737] 2024-01-04 10:04:06,448 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
+ [INFO|configuration_utils.py:737] 2024-01-04 10:04:06,449 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
+ [INFO|configuration_utils.py:802] 2024-01-04 10:04:06,450 >> Model config PhiConfig {
+   "_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "PhiForCausalLM"
+   ],
+   "attn_pdrop": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_phi.PhiConfig",
+     "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
+   },
+   "embd_pdrop": 0.0,
+   "flash_attn": false,
+   "flash_rotary": false,
+   "fused_dense": false,
+   "img_processor": null,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "phi-msft",
+   "n_embd": 2560,
+   "n_head": 32,
+   "n_head_kv": null,
+   "n_inner": null,
+   "n_layer": 32,
+   "n_positions": 2048,
+   "resid_pdrop": 0.1,
+   "rotary_dim": 32,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.36.2",
+   "use_cache": false,
+   "vocab_size": 51200
+ }
+ 
+ [INFO|modeling_utils.py:3341] 2024-01-04 10:04:06,482 >> loading weights file cognitivecomputations/dolphin-2_6-phi-2/model.safetensors.index.json
+ [INFO|configuration_utils.py:826] 2024-01-04 10:04:06,483 >> Generate config GenerationConfig {
+   "use_cache": false
+ }
+ 
+ [INFO|configuration_utils.py:826] 2024-01-04 10:04:06,483 >> Generate config GenerationConfig {
+   "use_cache": false
+ }
+ 
+ 
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
+ Loading checkpoint shards: 50%|█████ | 1/2 [00:00<00:00, 1.41it/s]
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 2.34it/s]
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 2.13it/s]
+ [WARNING|modeling_utils.py:4175] 2024-01-04 10:04:07,704 >> Some weights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
+ - This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+ - This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+ [INFO|modeling_utils.py:4193] 2024-01-04 10:04:07,704 >> All the weights of PhiForCausalLM were initialized from the model checkpoint at ./models/dolphin-2_6-phi-2.
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use PhiForCausalLM for predictions without further training.
+ [INFO|configuration_utils.py:779] 2024-01-04 10:04:07,707 >> loading configuration file ./models/dolphin-2_6-phi-2/generation_config.json
+ [INFO|configuration_utils.py:826] 2024-01-04 10:04:07,707 >> Generate config GenerationConfig {}
+ 
+ 01/04/2024 10:04:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+ 01/04/2024 10:04:09 - INFO - llmtuner.model.adapter - Merged 1 adapter(s).
+ 01/04/2024 10:04:09 - INFO - llmtuner.model.adapter - Loaded adapter(s): ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora
+ 01/04/2024 10:04:09 - INFO - llmtuner.model.loader - trainable params: 0 || all params: 2779683840 || trainable%: 0.0000
+ 01/04/2024 10:04:09 - INFO - llmtuner.model.loader - This IS expected that the trainable params is 0 if you are using model for inference only.
+ 
+ Running tokenizer on dataset: 0%| | 0/20 [00:00<?, ? examples/s]Caching processed dataset at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c7eb5697298b6539.arrow
+ 
+ Running tokenizer on dataset: 100%|██████████| 20/20 [00:00<00:00, 360.26 examples/s]
+ [INFO|training_args.py:1838] 2024-01-04 10:04:09,995 >> PyTorch: setting up devices
+ [INFO|trainer.py:3166] 2024-01-04 10:04:10,639 >> ***** Running Prediction *****
+ [INFO|trainer.py:3168] 2024-01-04 10:04:10,639 >> Num examples = 20
+ [INFO|trainer.py:3171] 2024-01-04 10:04:10,639 >> Batch size = 1
+ [INFO|configuration_utils.py:826] 2024-01-04 10:04:10,651 >> Generate config GenerationConfig {
+   "use_cache": false
+ }
+ 
+ /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/generation/utils.py:1518: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )
+ warnings.warn(
+ input_ids:
+ [32, 8537, 1022, 257, 11040, 2836, 290, 281, 11666, 4430, 8796, 13, 383, 8796, 3607, 7613, 11, 6496, 11, 290, 23507, 7429, 284, 262, 2836, 338, 2683, 13, 198, 20490, 25, 36230, 25, 921, 389, 257, 7613, 8796, 351, 1895, 284, 262, 1708, 5499, 13, 5765, 606, 611, 2672, 532, 198, 90, 198, 50284, 1, 3672, 1298, 366, 1136, 62, 1069, 3803, 62, 4873, 1600, 198, 50284, 1, 11213, 1298, 366, 3855, 262, 5163, 2494, 1022, 734, 19247, 1600, 198, 50284, 1, 17143, 7307, 1298, 1391, 198, 50280, 1, 4906, 1298, 366, 15252, 1600, 198, 50280, 1, 48310, 1298, 1391, 198, 50276, 1, 8692, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 422, 1, 198, 50276, 5512, 198, 50276, 1, 16793, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 284, 1, 198, 50276, 92, 198, 50280, 5512, 198, 50280, 1, 35827, 1298, 685, 198, 50276, 1, 8692, 62, 34415, 1600, 198, 50276, 1, 16793, 62, 34415, 1, 198, 50280, 60, 198, 50284, 92, 198, 92, 198, 198, 6090, 345, 1492, 257, 5474, 329, 502, 422, 968, 1971, 284, 3576, 30, 198, 48902, 25]
+ inputs:
+ A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+ Human: SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -
+ {
+     "name": "get_exchange_rate",
+     "description": "Get the exchange rate between two currencies",
+     "parameters": {
+         "type": "object",
+         "properties": {
+             "base_currency": {
+                 "type": "string",
+                 "description": "The currency to convert from"
+             },
+             "target_currency": {
+                 "type": "string",
+                 "description": "The currency to convert to"
+             }
+         },
+         "required": [
+             "base_currency",
+             "target_currency"
+         ]
+     }
+ }
+ 
+ Can you book a flight for me from New York to London?
+ Assistant:
+ 
+ 0%| | 0/20 [00:00<?, ?it/s]
+ 10%|█ | 2/20 [00:01<00:11, 1.56it/s]
+ 15%|█▌ | 3/20 [00:03<00:19, 1.15s/it]
+ 20%|██ | 4/20 [00:04<00:20, 1.26s/it]
+ 25%|██▌ | 5/20 [00:07<00:25, 1.67s/it]
+ 30%|███ | 6/20 [00:07<00:19, 1.38s/it]
+ 35%|███▌ | 7/20 [00:09<00:18, 1.45s/it]
+ 40%|████ | 8/20 [00:10<00:17, 1.47s/it]
+ 45%|████▌ | 9/20 [00:12<00:15, 1.39s/it]
+ 50%|█████ | 10/20 [00:13<00:15, 1.51s/it]
+ 55%|█████▌ | 11/20 [00:14<00:11, 1.29s/it]
+ 60%|██████ | 12/20 [00:15<00:09, 1.13s/it]
+ 65%|██████▌ | 13/20 [00:17<00:09, 1.35s/it]
+ 70%|███████ | 14/20 [00:18<00:07, 1.29s/it]
+ 75%|███████▌ | 15/20 [00:20<00:07, 1.49s/it]
+ 80%|████████ | 16/20 [00:21<00:04, 1.25s/it]
+ 85%|████████▌ | 17/20 [00:22<00:03, 1.17s/it]
+ 90%|█████████ | 18/20 [00:24<00:02, 1.40s/it]
+ 95%|█████████▌| 19/20 [00:24<00:01, 1.14s/it]
+ 100%|██████████| 20/20 [00:26<00:00, 1.34s/it]Building prefix dict from the default dictionary ...
+ Loading model from cache /tmp/jieba.cache
+ Loading model cost 0.697 seconds.
+ Prefix dict has been built successfully.
+ 
+ 100%|██████████| 20/20 [00:27<00:00, 1.36s/it]
+ ***** predict metrics *****
+ predict_bleu-4 = 74.1969
+ predict_rouge-1 = 80.3725
+ predict_rouge-2 = 70.3403
+ predict_rouge-l = 77.7235
+ predict_runtime = 0:00:29.41
+ predict_samples_per_second = 0.68
+ predict_steps_per_second = 0.68
+ 01/04/2024 10:04:40 - INFO - llmtuner.train.sft.trainer - Saving prediction results to ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20/generated_predictions.jsonl
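
Note: the `input_ids` dump above is just the tokenized prompt, so it can be round-tripped with the tokenizer files committed here. A minimal sketch (loading from this repo's root is an assumption):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")  # vocab.json / merges.txt / tokenizer_config.json from this commit
    ids = [32, 8537, 1022, 257, 11040, 2836]  # first few ids from the log above
    print(tok.decode(ids))                    # -> "A chat between a curious user"
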
Predict_20/predict_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "predict_bleu-4": 74.196885,
+     "predict_rouge-1": 80.372465,
+     "predict_rouge-2": 70.340295,
+     "predict_rouge-l": 77.72346,
+     "predict_runtime": 29.4113,
+     "predict_samples_per_second": 0.68,
+     "predict_steps_per_second": 0.68
+ }
adapter_config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "Wqkv"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3068eae74f7a7a2dcd863a7d976b25c9a4a719bf7cef9e3ae42edd45c5c482fb
+ size 10493896
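
Note: together, adapter_config.json and adapter_model.safetensors form a standard PEFT LoRA adapter (r=8, alpha=16, targeting the fused `Wqkv` projection). One way to consume them is through the peft library; a minimal sketch, assuming the adapter is loaded from this repo's root:

    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(
        "cognitivecomputations/dolphin-2_6-phi-2", trust_remote_code=True
    )
    model = PeftModel.from_pretrained(base, ".")  # reads adapter_config.json + adapter_model.safetensors
    model = model.merge_and_unload()              # optionally fold the LoRA deltas into the base weights
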
added_tokens.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "\t\t": 50294,
+   "\t\t\t": 50293,
+   "\t\t\t\t": 50292,
+   "\t\t\t\t\t": 50291,
+   "\t\t\t\t\t\t": 50290,
+   "\t\t\t\t\t\t\t": 50289,
+   "\t\t\t\t\t\t\t\t": 50288,
+   "\t\t\t\t\t\t\t\t\t": 50287,
+   "  ": 50286,
+   "   ": 50285,
+   "    ": 50284,
+   "     ": 50283,
+   "      ": 50282,
+   "       ": 50281,
+   "        ": 50280,
+   "         ": 50279,
+   "          ": 50278,
+   "           ": 50277,
+   "            ": 50276,
+   "             ": 50275,
+   "              ": 50274,
+   "               ": 50273,
+   "                ": 50272,
+   "                 ": 50271,
+   "                  ": 50270,
+   "                   ": 50269,
+   "                    ": 50268,
+   "                     ": 50267,
+   "                      ": 50266,
+   "                       ": 50265,
+   "                        ": 50264,
+   "                         ": 50263,
+   "                          ": 50262,
+   "                           ": 50261,
+   "                            ": 50260,
+   "                             ": 50259,
+   "                              ": 50258,
+   "                               ": 50257,
+   "<|im_end|>": 50295,
+   "<|im_start|>": 50296
+ }
all_results.json ADDED
@@ -0,0 +1,11 @@
+ {
+     "epoch": 1.0,
+     "eval_loss": 0.35242682695388794,
+     "eval_runtime": 28.2489,
+     "eval_samples_per_second": 11.859,
+     "eval_steps_per_second": 5.947,
+     "train_loss": 0.4441075046011742,
+     "train_runtime": 553.4721,
+     "train_samples_per_second": 5.44,
+     "train_steps_per_second": 0.679
+ }
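
Note: as a quick sanity check, the eval loss converts directly to perplexity: ppl = exp(eval_loss) = exp(0.35243) ≈ 1.42 on the held-out split, consistent with the near-saturated BLEU/ROUGE numbers in Predict_20.
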
eval_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "epoch": 1.0,
+     "eval_loss": 0.35242682695388794,
+     "eval_runtime": 28.2489,
+     "eval_samples_per_second": 11.859,
+     "eval_steps_per_second": 5.947
+ }
merges.txt ADDED
The diff for this file is too large to render.
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,345 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50257": {
+       "content": "                               ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50258": {
+       "content": "                              ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50259": {
+       "content": "                             ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50260": {
+       "content": "                            ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50261": {
+       "content": "                           ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50262": {
+       "content": "                          ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50263": {
+       "content": "                         ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50264": {
+       "content": "                        ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50265": {
+       "content": "                       ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50266": {
+       "content": "                      ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50267": {
+       "content": "                     ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50268": {
+       "content": "                    ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50269": {
+       "content": "                   ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50270": {
+       "content": "                  ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50271": {
+       "content": "                 ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50272": {
+       "content": "                ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50273": {
+       "content": "               ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50274": {
+       "content": "              ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50275": {
+       "content": "             ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50276": {
+       "content": "            ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50277": {
+       "content": "           ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50278": {
+       "content": "          ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50279": {
+       "content": "         ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50280": {
+       "content": "        ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50281": {
+       "content": "       ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50282": {
+       "content": "      ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50283": {
+       "content": "     ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50284": {
+       "content": "    ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50285": {
+       "content": "   ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50286": {
+       "content": "  ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50287": {
+       "content": "\t\t\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50288": {
+       "content": "\t\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50289": {
+       "content": "\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50290": {
+       "content": "\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50291": {
+       "content": "\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50292": {
+       "content": "\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50293": {
+       "content": "\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50294": {
+       "content": "\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50295": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50296": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "model_max_length": 2048,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "CodeGenTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
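
Note: the `chat_template` above is the ChatML format (`<|im_start|>role ... <|im_end|>`), and transformers can render it directly. A minimal sketch, assuming the tokenizer is loaded from this repo's root:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")
    messages = [{"role": "user", "content": "What is the median of [5, 2, 9]?"}]
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)
    # <|im_start|>user
    # What is the median of [5, 2, 9]?<|im_end|>
    # <|im_start|>assistant
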
train_eval_log.txt ADDED
@@ -0,0 +1,1479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Using RTX 3090 or 4000 series which doesn't support faster communication speedups. Ensuring P2P and IB communications are disabled.
2
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable `upcast_layernorm` in quantized training.
3
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable mixed precision training.
4
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
5
+ [INFO|training_args.py:1838] 2024-01-04 09:53:50,866 >> PyTorch: setting up devices
6
+ /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
7
+ warnings.warn(
8
+ 01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Process rank: 0, device: cuda:0, n_gpu: 1
9
+ distributed training: True, compute dtype: None
10
+ 01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
11
+ _n_gpu=1,
12
+ adafactor=False,
13
+ adam_beta1=0.9,
14
+ adam_beta2=0.999,
15
+ adam_epsilon=1e-08,
16
+ auto_find_batch_size=False,
17
+ bf16=False,
18
+ bf16_full_eval=False,
19
+ data_seed=None,
20
+ dataloader_drop_last=False,
21
+ dataloader_num_workers=0,
22
+ dataloader_persistent_workers=False,
23
+ dataloader_pin_memory=True,
24
+ ddp_backend=None,
25
+ ddp_broadcast_buffers=None,
26
+ ddp_bucket_cap_mb=None,
27
+ ddp_find_unused_parameters=False,
28
+ ddp_timeout=1800,
29
+ debug=[],
30
+ deepspeed=None,
31
+ disable_tqdm=False,
32
+ dispatch_batches=None,
33
+ do_eval=True,
34
+ do_predict=False,
35
+ do_train=True,
36
+ eval_accumulation_steps=None,
37
+ eval_delay=0,
38
+ eval_steps=None,
39
+ evaluation_strategy=IntervalStrategy.EPOCH,
40
+ fp16=False,
41
+ fp16_backend=auto,
42
+ fp16_full_eval=False,
43
+ fp16_opt_level=O1,
44
+ fsdp=[],
45
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
46
+ fsdp_min_num_params=0,
47
+ fsdp_transformer_layer_cls_to_wrap=None,
48
+ full_determinism=False,
49
+ generation_config=None,
50
+ generation_max_length=None,
51
+ generation_num_beams=None,
52
+ gradient_accumulation_steps=4,
53
+ gradient_checkpointing=False,
54
+ gradient_checkpointing_kwargs=None,
55
+ greater_is_better=None,
56
+ group_by_length=False,
57
+ half_precision_backend=auto,
58
+ hub_always_push=False,
59
+ hub_model_id=None,
60
+ hub_private_repo=False,
61
+ hub_strategy=HubStrategy.EVERY_SAVE,
62
+ hub_token=<HUB_TOKEN>,
63
+ ignore_data_skip=False,
64
+ include_inputs_for_metrics=False,
65
+ include_num_input_tokens_seen=False,
66
+ include_tokens_per_second=False,
67
+ jit_mode_eval=False,
68
+ label_names=None,
69
+ label_smoothing_factor=0.0,
70
+ learning_rate=5e-05,
71
+ length_column_name=length,
72
+ load_best_model_at_end=False,
73
+ local_rank=0,
74
+ log_level=passive,
75
+ log_level_replica=warning,
76
+ log_on_each_node=True,
77
+ logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/runs/Jan04_09-53-50_yhyu13fuwuqi,
78
+ logging_first_step=False,
79
+ logging_nan_inf_filter=True,
80
+ logging_steps=10,
81
+ logging_strategy=IntervalStrategy.STEPS,
82
+ lr_scheduler_kwargs={},
83
+ lr_scheduler_type=SchedulerType.COSINE,
84
+ max_grad_norm=1.0,
85
+ max_steps=-1,
86
+ metric_for_best_model=None,
87
+ mp_parameters=,
88
+ neftune_noise_alpha=None,
89
+ no_cuda=False,
90
+ num_train_epochs=1.0,
91
+ optim=OptimizerNames.ADAMW_TORCH,
92
+ optim_args=None,
93
+ output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
94
+ overwrite_output_dir=True,
95
+ past_index=-1,
96
+ per_device_eval_batch_size=1,
97
+ per_device_train_batch_size=1,
98
+ predict_with_generate=False,
99
+ prediction_loss_only=True,
100
+ push_to_hub=False,
101
+ push_to_hub_model_id=None,
102
+ push_to_hub_organization=None,
103
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
104
+ ray_scope=last,
105
+ remove_unused_columns=True,
106
+ report_to=['tensorboard'],
107
+ resume_from_checkpoint=None,
108
+ run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
109
+ save_on_each_node=False,
110
+ save_only_model=False,
111
+ save_safetensors=True,
112
+ save_steps=1000,
113
+ save_strategy=IntervalStrategy.STEPS,
114
+ save_total_limit=None,
115
+ seed=42,
116
+ skip_memory_metrics=True,
117
+ sortish_sampler=False,
118
+ split_batches=False,
119
+ tf32=None,
120
+ torch_compile=False,
121
+ torch_compile_backend=None,
122
+ torch_compile_mode=None,
123
+ torchdynamo=None,
124
+ tpu_metrics_debug=False,
125
+ tpu_num_cores=None,
126
+ use_cpu=False,
127
+ use_ipex=False,
128
+ use_legacy_prediction_loop=False,
129
+ use_mps_device=False,
130
+ warmup_ratio=0.0,
131
+ warmup_steps=0,
132
+ weight_decay=0.0,
133
+ )
134
+ 01/04/2024 09:53:50 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
135
+ 01/04/2024 09:53:50 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
136
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable `upcast_layernorm` in quantized training.
137
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable mixed precision training.
138
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
139
+ /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
140
+ warnings.warn(
141
+ 01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Process rank: 1, device: cuda:1, n_gpu: 1
142
+ distributed training: True, compute dtype: None
143
+ 01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
144
+ _n_gpu=1,
145
+ adafactor=False,
146
+ adam_beta1=0.9,
147
+ adam_beta2=0.999,
148
+ adam_epsilon=1e-08,
149
+ auto_find_batch_size=False,
150
+ bf16=False,
151
+ bf16_full_eval=False,
152
+ data_seed=None,
153
+ dataloader_drop_last=False,
154
+ dataloader_num_workers=0,
155
+ dataloader_persistent_workers=False,
156
+ dataloader_pin_memory=True,
157
+ ddp_backend=None,
158
+ ddp_broadcast_buffers=None,
159
+ ddp_bucket_cap_mb=None,
160
+ ddp_find_unused_parameters=False,
161
+ ddp_timeout=1800,
162
+ debug=[],
163
+ deepspeed=None,
164
+ disable_tqdm=False,
165
+ dispatch_batches=None,
166
+ do_eval=True,
167
+ do_predict=False,
168
+ do_train=True,
169
+ eval_accumulation_steps=None,
170
+ eval_delay=0,
171
+ eval_steps=None,
172
+ evaluation_strategy=IntervalStrategy.EPOCH,
173
+ fp16=False,
174
+ fp16_backend=auto,
175
+ fp16_full_eval=False,
176
+ fp16_opt_level=O1,
177
+ fsdp=[],
178
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
179
+ fsdp_min_num_params=0,
180
+ fsdp_transformer_layer_cls_to_wrap=None,
181
+ full_determinism=False,
182
+ generation_config=None,
183
+ generation_max_length=None,
184
+ generation_num_beams=None,
185
+ gradient_accumulation_steps=4,
186
+ gradient_checkpointing=False,
187
+ gradient_checkpointing_kwargs=None,
188
+ greater_is_better=None,
189
+ group_by_length=False,
190
+ half_precision_backend=auto,
191
+ hub_always_push=False,
192
+ hub_model_id=None,
193
+ hub_private_repo=False,
194
+ hub_strategy=HubStrategy.EVERY_SAVE,
195
+ hub_token=<HUB_TOKEN>,
196
+ ignore_data_skip=False,
197
+ include_inputs_for_metrics=False,
198
+ include_num_input_tokens_seen=False,
199
+ include_tokens_per_second=False,
200
+ jit_mode_eval=False,
201
+ label_names=None,
202
+ label_smoothing_factor=0.0,
203
+ learning_rate=5e-05,
204
+ length_column_name=length,
205
+ load_best_model_at_end=False,
206
+ local_rank=1,
207
+ log_level=passive,
208
+ log_level_replica=warning,
209
+ log_on_each_node=True,
210
+ logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/runs/Jan04_09-53-50_yhyu13fuwuqi,
211
+ logging_first_step=False,
212
+ logging_nan_inf_filter=True,
213
+ logging_steps=10,
214
+ logging_strategy=IntervalStrategy.STEPS,
215
+ lr_scheduler_kwargs={},
216
+ lr_scheduler_type=SchedulerType.COSINE,
217
+ max_grad_norm=1.0,
218
+ max_steps=-1,
219
+ metric_for_best_model=None,
220
+ mp_parameters=,
221
+ neftune_noise_alpha=None,
222
+ no_cuda=False,
223
+ num_train_epochs=1.0,
224
+ optim=OptimizerNames.ADAMW_TORCH,
225
+ optim_args=None,
226
+ output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
227
+ overwrite_output_dir=True,
228
+ past_index=-1,
229
+ per_device_eval_batch_size=1,
230
+ per_device_train_batch_size=1,
231
+ predict_with_generate=False,
232
+ prediction_loss_only=True,
233
+ push_to_hub=False,
234
+ push_to_hub_model_id=None,
235
+ push_to_hub_organization=None,
236
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
237
+ ray_scope=last,
238
+ remove_unused_columns=True,
239
+ report_to=['tensorboard'],
240
+ resume_from_checkpoint=None,
241
+ run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
242
+ save_on_each_node=False,
243
+ save_only_model=False,
244
+ save_safetensors=True,
245
+ save_steps=1000,
246
+ save_strategy=IntervalStrategy.STEPS,
247
+ save_total_limit=None,
248
+ seed=42,
249
+ skip_memory_metrics=True,
250
+ sortish_sampler=False,
251
+ split_batches=False,
252
+ tf32=None,
253
+ torch_compile=False,
254
+ torch_compile_backend=None,
255
+ torch_compile_mode=None,
256
+ torchdynamo=None,
257
+ tpu_metrics_debug=False,
258
+ tpu_num_cores=None,
259
+ use_cpu=False,
260
+ use_ipex=False,
261
+ use_legacy_prediction_loop=False,
262
+ use_mps_device=False,
263
+ warmup_ratio=0.0,
264
+ warmup_steps=0,
265
+ weight_decay=0.0,
266
+ )
267
+ 01/04/2024 09:53:50 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
268
+ 01/04/2024 09:53:50 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
269
+ Using custom data configuration default-b024aadef2a1493c
270
+ Loading Dataset Infos from /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/datasets/packaged_modules/json
271
+ Overwrite dataset info from restored data version if exists.
272
+ Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
273
+ Found cached dataset json (/home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
274
+ Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
275
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file vocab.json
276
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file merges.txt
277
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file added_tokens.json
278
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file special_tokens_map.json
279
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file tokenizer_config.json
280
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file tokenizer.json
281
+ [WARNING|logging.py:314] 2024-01-04 09:53:51,743 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
282
+ [INFO|configuration_utils.py:737] 2024-01-04 09:53:51,744 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
283
+ [INFO|configuration_utils.py:737] 2024-01-04 09:53:51,749 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
284
+ [INFO|configuration_utils.py:802] 2024-01-04 09:53:51,750 >> Model config PhiConfig {
285
+ "_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
286
+ "activation_function": "gelu_new",
287
+ "architectures": [
288
+ "PhiForCausalLM"
289
+ ],
290
+ "attn_pdrop": 0.0,
291
+ "auto_map": {
292
+ "AutoConfig": "configuration_phi.PhiConfig",
293
+ "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
294
+ },
295
+ "embd_pdrop": 0.0,
296
+ "flash_attn": false,
297
+ "flash_rotary": false,
298
+ "fused_dense": false,
299
+ "img_processor": null,
300
+ "initializer_range": 0.02,
301
+ "layer_norm_epsilon": 1e-05,
302
+ "model_type": "phi-msft",
303
+ "n_embd": 2560,
304
+ "n_head": 32,
305
+ "n_head_kv": null,
306
+ "n_inner": null,
307
+ "n_layer": 32,
308
+ "n_positions": 2048,
309
+ "resid_pdrop": 0.1,
310
+ "rotary_dim": 32,
311
+ "tie_word_embeddings": false,
312
+ "torch_dtype": "float16",
313
+ "transformers_version": "4.36.2",
314
+ "use_cache": false,
315
+ "vocab_size": 51200
316
+ }
317
+
318
+ 01/04/2024 09:53:51 - INFO - llmtuner.model.patcher - Quantizing model to 4 bit.
319
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
320
+ 01/04/2024 09:53:51 - INFO - llmtuner.model.patcher - Quantizing model to 4 bit.
321
+ [INFO|modeling_utils.py:2907] 2024-01-04 09:53:51,820 >> Overriding torch_dtype=None with `torch_dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass torch_dtype=torch.float16 to remove this warning.
322
+ [INFO|modeling_utils.py:3341] 2024-01-04 09:53:51,820 >> loading weights file cognitivecomputations/dolphin-2_6-phi-2/model.safetensors.index.json
323
+ [INFO|modeling_utils.py:1341] 2024-01-04 09:53:51,821 >> Instantiating PhiForCausalLM model under default dtype torch.float16.
324
+ [INFO|configuration_utils.py:826] 2024-01-04 09:53:51,821 >> Generate config GenerationConfig {
325
+ "use_cache": false
326
+ }
327
+
328
+ [INFO|configuration_utils.py:826] 2024-01-04 09:53:51,822 >> Generate config GenerationConfig {
329
+ "use_cache": false
330
+ }
331
+
332
+ [INFO|modeling_utils.py:3483] 2024-01-04 09:53:51,875 >> Detected 4-bit loading: activating 4-bit loading for this model
333
+
334
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]cognitivecomputations/dolphin-2_6-phi-2
335
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
336
+ Loading checkpoint shards: 50%|█████ | 1/2 [00:01<00:01, 1.28s/it]
337
+ Loading checkpoint shards: 50%|█████ | 1/2 [00:01<00:01, 1.27s/it]cognitivecomputations/dolphin-2_6-phi-2
338
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.46it/s]
339
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.29it/s]
340
+ [WARNING|modeling_utils.py:4175] 2024-01cognitivecomputations/dolphin-2_6-phi-2eights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
341
+ - This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
342
+ - This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
343
+ [INFO|modeling_utils.py:4193] 2024-01-04 09:53:53,730 >> All the weights of PhiForCausalcognitivecomputations/dolphin-2_6-phi-2he model checkpoint at ./models/dolphin-2_6-phi-2.
344
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use PhiForCausalLM for predictions without further training.
345
+
346
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.47it/s]
347
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.30it/s]
348
+ Some weights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
349
+ - This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
350
+ - This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
351
+ [INFO|configuration_utils.py:779] 2024-01-04 09:53:53,733 >> loading configuration file ./models/dolphin-2_6-phi-2/generation_config.json
352
+ [INFO|configuration_utils.py:826] 2024-01-04 09:53:53,733 >> Generate config GenerationConfig {}
353
+
354
+ [WARNING|modeling_utils.py:2045] 2024-01-04 09:53:53,816 >> You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
355
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
356
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
357
+ You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
358
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
359
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
360
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.loader - trainable params: 2621440 || all params: 2782305280 || trainable%: 0.0942
361
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.loader - trainable params: 2621440 || all params: 2782305280 || trainable%: 0.0942
362
+
363
+ Running tokenizer on dataset: 0%| | 0/3347 [00:00<?, ? examples/s][WARNING|tokenization_utils_base.py:3835] 2024-01-04 09:53:55,217 >> Token indices sequence length is longer than the specified maximum sequence length for this model (2217 > 2048). Running this sequence through the model will result in indexing errors
364
+ Caching processed dataset at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c64b6c6785bc1929.arrow
365
+
366
+ Running tokenizer on dataset: 30%|██▉ | 1000/3347 [00:02<00:06, 372.68 examples/s]
367
+ Running tokenizer on dataset: 60%|█████▉ | 2000/3347 [00:05<00:03, 387.09 examples/s]
368
+ Running tokenizer on dataset: 90%|████████▉ | 3000/3347 [00:07<00:00, 395.52 examples/s]
369
+ Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 396.84 examples/s]
370
+ Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 392.48 examples/s]
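Editor's note: the tokenizer warning above fires because at least one concatenated prompt+response (2217 tokens) exceeds the model's 2048-token limit; fed through unclipped, it would raise indexing errors. Truncating at tokenization time is the usual remedy (llmtuner exposes this as cutoff_len); a minimal sketch with plain transformers, with the model path taken from this log:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./models/dolphin-2_6-phi-2")
    text = "A chat between a curious user and an artificial intelligence assistant. " * 400
    ids = tok(text, truncation=True, max_length=2048)["input_ids"]  # clamp to the model limit
    assert len(ids) <= 2048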
371
+ input_ids:
372
+ [32, 8537, 1022, 257, 11040, 2836, 290, 281, 11666, 4430, 8796, 13, 383, 8796, 3607, 7613, 11, 6496, 11, 290, 23507, 7429, 284, 262, 2836, 338, 2683, 13, 198, 20490, 25, 36230, 25, 921, 389, 257, 7613, 8796, 351, 1895, 284, 262, 1708, 5499, 13, 5765, 606, 611, 2672, 532, 198, 90, 198, 50284, 1, 3672, 1298, 366, 1136, 62, 1069, 3803, 62, 4873, 1600, 198, 50284, 1, 11213, 1298, 366, 3855, 262, 5163, 2494, 1022, 734, 19247, 1600, 198, 50284, 1, 17143, 7307, 1298, 1391, 198, 50280, 1, 4906, 1298, 366, 15252, 1600, 198, 50280, 1, 48310, 1298, 1391, 198, 50276, 1, 8692, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 422, 1, 198, 50276, 5512, 198, 50276, 1, 16793, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 284, 1, 198, 50276, 92, 198, 50280, 5512, 198, 50280, 1, 35827, 1298, 685, 198, 50276, 1, 8692, 62, 34415, 1600, 198, 50276, 1, 16793, 62, 34415, 1, 198, 50280, 60, 198, 50284, 92, 198, 92, 198, 198, 6090, 345, 1492, 257, 5474, 329, 502, 422, 968, 1971, 284, 3576, 30, 198, 48902, 25, 40, 1101, 7926, 11, 475, 314, 836, 470, 423, 262, 12971, 284, 1492, 13956, 13, 2011, 1459, 2163, 3578, 502, 284, 651, 262, 5163, 2494, 1022, 734, 19247, 13, 1002, 345, 761, 1037, 351, 326, 11, 1254, 1479, 284, 1265, 0, 50295]
373
+ inputs:
374
+ A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
375
+ Human: SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -
376
+ {
377
+ "name": "get_exchange_rate",
378
+ "description": "Get the exchange rate between two currencies",
379
+ "parameters": {
380
+ "type": "object",
381
+ "properties": {
382
+ "base_currency": {
383
+ "type": "string",
384
+ "description": "The currency to convert from"
385
+ },
386
+ "target_currency": {
387
+ "type": "string",
388
+ "description": "The currency to convert to"
389
+ }
390
+ },
391
+ "required": [
392
+ "base_currency",
393
+ "target_currency"
394
+ ]
395
+ }
396
+ }
397
+
398
+ Can you book a flight for me from New York to London?
399
+ Assistant:I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!<|im_end|>
400
+ label_ids:
401
+ [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 40, 1101, 7926, 11, 475, 314, 836, 470, 423, 262, 12971, 284, 1492, 13956, 13, 2011, 1459, 2163, 3578, 502, 284, 651, 262, 5163, 2494, 1022, 734, 19247, 13, 1002, 345, 761, 1037, 351, 326, 11, 1254, 1479, 284, 1265, 0, 50295]
402
+ labels:
403
+ I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!<|im_end|>
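Editor's note: the label_ids dump above shows the standard SFT masking: every prompt position is set to -100, PyTorch's cross-entropy ignore_index, so the loss only scores the assistant reply while input_ids still carry the full sequence. A minimal sketch of how such labels are built (the helper name and the three-token slices are illustrative, not llmtuner internals):

    IGNORE_INDEX = -100  # positions with this label are skipped by the loss

    def build_labels(prompt_ids, response_ids):
        # Mask the prompt; supervise only the response tokens.
        return [IGNORE_INDEX] * len(prompt_ids) + list(response_ids)

    prompt_ids = [32, 8537, 1022]     # first prompt tokens from the dump above
    response_ids = [40, 1101, 7926]   # first response tokens ("I'm sorry...")
    input_ids = prompt_ids + response_ids
    labels = build_labels(prompt_ids, response_ids)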
404
+ [INFO|training_args.py:1838] 2024-01-04 09:54:03,936 >> PyTorch: setting up devices
405
+ Caching indices mapping at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-2d738e000d25696c.arrow
406
+ Caching indices mapping at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-fe95a5c264c6067e.arrow
407
+
408
+ Running tokenizer on dataset: 0%| | 0/3347 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2217 > 2048). Running this sequence through the model will result in indexing errors
409
+
410
+ Running tokenizer on dataset: 30%|██▉ | 1000/3347 [00:02<00:06, 375.58 examples/s]
411
+ Running tokenizer on dataset: 60%|█████▉ | 2000/3347 [00:05<00:03, 389.75 examples/s]
412
+ Running tokenizer on dataset: 90%|████████▉ | 3000/3347 [00:07<00:00, 396.16 examples/s]
413
+ Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 395.57 examples/s]
414
+ Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 392.61 examples/s]
415
+ [INFO|trainer.py:1706] 2024-01-04 09:54:13,452 >> ***** Running training *****
416
+ [INFO|trainer.py:1707] 2024-01-04 09:54:13,452 >> Num examples = 3,011
417
+ [INFO|trainer.py:1708] 2024-01-04 09:54:13,452 >> Num Epochs = 1
418
+ [INFO|trainer.py:1709] 2024-01-04 09:54:13,452 >> Instantaneous batch size per device = 1
419
+ [INFO|trainer.py:1712] 2024-01-04 09:54:13,452 >> Total train batch size (w. parallel, distributed & accumulation) = 8
420
+ [INFO|trainer.py:1713] 2024-01-04 09:54:13,452 >> Gradient Accumulation steps = 4
421
+ [INFO|trainer.py:1714] 2024-01-04 09:54:13,452 >> Total optimization steps = 376
422
+ [INFO|trainer.py:1715] 2024-01-04 09:54:13,454 >> Number of trainable parameters = 2,621,440
423
+
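Editor's note: the header above pins down the run's shape: a per-device batch of 1 with gradient accumulation 4 reaching a total batch of 8 implies 2 devices (the world size is an inference; it is not printed here), and 3,011 examples at batch 8 floors to the 376 optimization steps reported. As a quick check:

    examples, per_device, grad_accum, world = 3011, 1, 4, 2  # world size inferred from total batch 8
    total_batch = per_device * grad_accum * world            # 8, as logged
    print(total_batch, examples // total_batch)              # 8 376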
424
+ 0%| | 0/376 [00:00<?, ?it/s]
425
+ 0%| | 1/376 [00:02<13:10, 2.11s/it]
426
+ 1%| | 2/376 [00:03<09:45, 1.56s/it]
427
+ 1%| | 3/376 [00:04<09:09, 1.47s/it]
428
+ 1%| | 4/376 [00:06<08:49, 1.42s/it]
429
+ 1%|▏ | 5/376 [00:07<08:34, 1.39s/it]
430
+ 2%|▏ | 6/376 [00:09<09:48, 1.59s/it]
431
+ 2%|▏ | 7/376 [00:10<09:20, 1.52s/it]
432
+ 2%|▏ | 8/376 [00:11<08:46, 1.43s/it]
433
+ 2%|▏ | 9/376 [00:13<08:32, 1.40s/it]
434
+ 3%|▎ | 10/376 [00:14<08:10, 1.34s/it]
435
+
436
+ {'loss': 1.0017, 'learning_rate': 4.991278696516879e-05, 'epoch': 0.03}
437
+
438
+ 3%|▎ | 10/376 [00:14<08:10, 1.34s/it]
439
+ 3%|▎ | 11/376 [00:15<08:11, 1.35s/it]
440
+ 3%|▎ | 12/376 [00:17<08:19, 1.37s/it]
441
+ 3%|▎ | 13/376 [00:18<08:30, 1.41s/it]
442
+ 4%|▎ | 14/376 [00:20<08:30, 1.41s/it]
443
+ 4%|▍ | 15/376 [00:21<08:03, 1.34s/it]
444
+ 4%|▍ | 16/376 [00:22<08:26, 1.41s/it]
445
+ 5%|▍ | 17/376 [00:24<07:54, 1.32s/it]
446
+ 5%|▍ | 18/376 [00:25<07:42, 1.29s/it]
447
+ 5%|▌ | 19/376 [00:27<08:59, 1.51s/it]
448
+ 5%|▌ | 20/376 [00:28<08:13, 1.39s/it]
449
+
450
+ {'loss': 0.881, 'learning_rate': 4.9651756349750716e-05, 'epoch': 0.05}
451
+
452
+ 5%|▌ | 20/376 [00:28<08:13, 1.39s/it]
453
+ 6%|▌ | 21/376 [00:29<08:05, 1.37s/it]
454
+ 6%|▌ | 22/376 [00:31<09:07, 1.55s/it]
455
+ 6%|▌ | 23/376 [00:32<08:33, 1.46s/it]
456
+ 6%|▋ | 24/376 [00:34<08:40, 1.48s/it]
457
+ 7%|▋ | 25/376 [00:35<08:38, 1.48s/it]
458
+ 7%|▋ | 26/376 [00:37<08:25, 1.44s/it]
459
+ 7%|▋ | 27/376 [00:38<08:02, 1.38s/it]
460
+ 7%|▋ | 28/376 [00:40<08:37, 1.49s/it]
461
+ 8%|▊ | 29/376 [00:41<08:31, 1.47s/it]
462
+ 8%|▊ | 30/376 [00:43<08:23, 1.46s/it]
463
+
464
+ {'loss': 0.7979, 'learning_rate': 4.9218729375518135e-05, 'epoch': 0.08}
465
+
466
+ 8%|▊ | 30/376 [00:43<08:23, 1.46s/it]
467
+ 8%|▊ | 31/376 [00:44<08:11, 1.43s/it]
468
+ 9%|▊ | 32/376 [00:45<08:12, 1.43s/it]
469
+ 9%|▉ | 33/376 [00:47<08:13, 1.44s/it]
470
+ 9%|▉ | 34/376 [00:48<08:23, 1.47s/it]
471
+ 9%|▉ | 35/376 [00:50<08:44, 1.54s/it]
472
+ 10%|▉ | 36/376 [00:51<08:28, 1.50s/it]
473
+ 10%|▉ | 37/376 [00:53<08:19, 1.47s/it]
474
+ 10%|█ | 38/376 [00:54<08:00, 1.42s/it]
475
+ 10%|█ | 39/376 [00:56<08:26, 1.50s/it]
476
+ 11%|█ | 40/376 [00:57<08:09, 1.46s/it]
477
+
478
+ {'loss': 0.7022, 'learning_rate': 4.861672729019797e-05, 'epoch': 0.11}
479
+
480
+ 11%|█ | 40/376 [00:57<08:09, 1.46s/it]
481
+ 11%|█ | 41/376 [00:59<07:51, 1.41s/it]
482
+ 11%|█ | 42/376 [01:00<07:44, 1.39s/it]
483
+ 11%|█▏ | 43/376 [01:01<07:24, 1.34s/it]
484
+ 12%|█▏ | 44/376 [01:03<08:38, 1.56s/it]
485
+ 12%|█▏ | 45/376 [01:04<08:07, 1.47s/it]
486
+ 12%|█▏ | 46/376 [01:06<08:11, 1.49s/it]
487
+ 12%|█▎ | 47/376 [01:07<07:35, 1.38s/it]
488
+ 13%|█▎ | 48/376 [01:08<07:08, 1.31s/it]
489
+ 13%|█▎ | 49/376 [01:09<07:01, 1.29s/it]
490
+ 13%|█▎ | 50/376 [01:11<07:07, 1.31s/it]
491
+
492
+ {'loss': 0.5844, 'learning_rate': 4.784995028809707e-05, 'epoch': 0.13}
493
+
494
+ 13%|█▎ | 50/376 [01:11<07:07, 1.31s/it]
495
+ 14%|█▎ | 51/376 [01:12<06:59, 1.29s/it]
496
+ 14%|█▍ | 52/376 [01:13<06:54, 1.28s/it]
497
+ 14%|█▍ | 53/376 [01:14<06:26, 1.20s/it]
498
+ 14%|█▍ | 54/376 [01:16<06:46, 1.26s/it]
499
+ 15%|█▍ | 55/376 [01:17<06:54, 1.29s/it]
500
+ 15%|█▍ | 56/376 [01:18<06:21, 1.19s/it]
501
+ 15%|█▌ | 57/376 [01:19<06:25, 1.21s/it]
502
+ 15%|█▌ | 58/376 [01:21<07:51, 1.48s/it]
503
+ 16%|█▌ | 59/376 [01:23<07:14, 1.37s/it]
504
+ 16%|█▌ | 60/376 [01:24<07:11, 1.36s/it]
505
+
506
+ {'loss': 0.4454, 'learning_rate': 4.692374820516679e-05, 'epoch': 0.16}
507
+
508
+ 16%|█▌ | 60/376 [01:24<07:11, 1.36s/it]
509
+ 16%|█▌ | 61/376 [01:26<07:36, 1.45s/it]
510
+ 16%|█▋ | 62/376 [01:27<07:34, 1.45s/it]
511
+ 17%|█▋ | 63/376 [01:29<08:09, 1.56s/it]
512
+ 17%|█▋ | 64/376 [01:30<07:41, 1.48s/it]
513
+ 17%|█▋ | 65/376 [01:31<07:23, 1.43s/it]
514
+ 18%|█▊ | 66/376 [01:33<08:05, 1.57s/it]
515
+ 18%|█▊ | 67/376 [01:35<07:37, 1.48s/it]
516
+ 18%|█▊ | 68/376 [01:36<07:15, 1.42s/it]
517
+ 18%|█▊ | 69/376 [01:37<06:37, 1.29s/it]
518
+ 19%|█▊ | 70/376 [01:38<06:52, 1.35s/it]
519
+
520
+ {'loss': 0.4076, 'learning_rate': 4.584458319296868e-05, 'epoch': 0.19}
521
+
522
+ 19%|█▊ | 70/376 [01:38<06:52, 1.35s/it]
523
+ 19%|█▉ | 71/376 [01:40<06:42, 1.32s/it]
524
+ 19%|█▉ | 72/376 [01:41<06:59, 1.38s/it]
525
+ 19%|█▉ | 73/376 [01:42<06:56, 1.37s/it]
526
+ 20%|█▉ | 74/376 [01:44<06:24, 1.27s/it]
527
+ 20%|█▉ | 75/376 [01:45<06:42, 1.34s/it]
528
+ 20%|██ | 76/376 [01:46<06:05, 1.22s/it]
529
+ 20%|██ | 77/376 [01:48<06:59, 1.40s/it]
530
+ 21%|██ | 78/376 [01:49<07:24, 1.49s/it]
531
+ 21%|██ | 79/376 [01:51<06:56, 1.40s/it]
532
+ 21%|██▏ | 80/376 [01:52<06:48, 1.38s/it]
533
+
534
+ {'loss': 0.4111, 'learning_rate': 4.4619984631966524e-05, 'epoch': 0.21}
535
+
536
+ 21%|██▏ | 80/376 [01:52<06:48, 1.38s/it]
537
+ 22%|██▏ | 81/376 [01:54<07:02, 1.43s/it]
538
+ 22%|██▏ | 82/376 [01:55<06:47, 1.39s/it]
539
+ 22%|██▏ | 83/376 [01:56<06:31, 1.34s/it]
540
+ 22%|██▏ | 84/376 [01:58<07:21, 1.51s/it]
541
+ 23%|██▎ | 85/376 [01:59<06:58, 1.44s/it]
542
+ 23%|██▎ | 86/376 [02:01<07:04, 1.47s/it]
543
+ 23%|██▎ | 87/376 [02:02<06:35, 1.37s/it]
544
+ 23%|██▎ | 88/376 [02:03<06:43, 1.40s/it]
545
+ 24%|██▎ | 89/376 [02:05<06:29, 1.36s/it]
546
+ 24%|██▍ | 90/376 [02:07<07:18, 1.53s/it]
547
+
548
+ {'loss': 0.4115, 'learning_rate': 4.3258496598716736e-05, 'epoch': 0.24}
549
+
550
+ 24%|██▍ | 90/376 [02:07<07:18, 1.53s/it]
551
+ 24%|██▍ | 91/376 [02:08<07:15, 1.53s/it]
552
+ 24%|██▍ | 92/376 [02:09<06:47, 1.44s/it]
553
+ 25%|██▍ | 93/376 [02:10<06:11, 1.31s/it]
554
+ 25%|██▌ | 94/376 [02:11<05:35, 1.19s/it]
555
+ 25%|██▌ | 95/376 [02:13<05:53, 1.26s/it]
556
+ 26%|██▌ | 96/376 [02:14<05:38, 1.21s/it]
557
+ 26%|██▌ | 97/376 [02:15<05:32, 1.19s/it]
558
+ 26%|██▌ | 98/376 [02:16<05:54, 1.27s/it]
559
+ 26%|██▋ | 99/376 [02:18<06:20, 1.37s/it]
560
+ 27%|██▋ | 100/376 [02:19<06:01, 1.31s/it]
561
+
562
+ {'loss': 0.3566, 'learning_rate': 4.176961825348059e-05, 'epoch': 0.27}
563
+
564
+ 27%|██▋ | 100/376 [02:19<06:01, 1.31s/it]
565
+ 27%|██▋ | 101/376 [02:21<06:14, 1.36s/it]
566
+ 27%|██▋ | 102/376 [02:22<06:44, 1.48s/it]
567
+ 27%|██▋ | 103/376 [02:24<06:32, 1.44s/it]
568
+ 28%|██▊ | 104/376 [02:25<05:49, 1.28s/it]
569
+ 28%|██▊ | 105/376 [02:26<06:19, 1.40s/it]
570
+ 28%|██▊ | 106/376 [02:28<06:09, 1.37s/it]
571
+ 28%|██▊ | 107/376 [02:29<06:05, 1.36s/it]
572
+ 29%|██▊ | 108/376 [02:30<05:38, 1.26s/it]
573
+ 29%|██▉ | 109/376 [02:32<06:20, 1.43s/it]
574
+ 29%|██▉ | 110/376 [02:33<05:56, 1.34s/it]
575
+
576
+ {'loss': 0.4302, 'learning_rate': 4.016373756417669e-05, 'epoch': 0.29}
577
+
578
+ 29%|██▉ | 110/376 [02:33<05:56, 1.34s/it]
579
+ 30%|██▉ | 111/376 [02:34<05:40, 1.29s/it]
580
+ 30%|██▉ | 112/376 [02:36<06:18, 1.44s/it]
581
+ 30%|███ | 113/376 [02:38<06:33, 1.50s/it]
582
+ 30%|███ | 114/376 [02:39<06:25, 1.47s/it]
583
+ 31%|███ | 115/376 [02:40<06:25, 1.48s/it]
584
+ 31%|███ | 116/376 [02:42<06:47, 1.57s/it]
585
+ 31%|███ | 117/376 [02:44<06:34, 1.52s/it]
586
+ 31%|███▏ | 118/376 [02:45<05:55, 1.38s/it]
587
+ 32%|███▏ | 119/376 [02:46<05:38, 1.32s/it]
588
+ 32%|███▏ | 120/376 [02:47<05:55, 1.39s/it]
589
+
590
+ {'loss': 0.4271, 'learning_rate': 3.845205882908432e-05, 'epoch': 0.32}
591
+
592
+ 32%|███▏ | 120/376 [02:47<05:55, 1.39s/it]
593
+ 32%|███▏ | 121/376 [02:49<05:40, 1.33s/it]
594
+ 32%|███▏ | 122/376 [02:50<05:38, 1.33s/it]
595
+ 33%|███▎ | 123/376 [02:51<05:13, 1.24s/it]
596
+ 33%|███▎ | 124/376 [02:52<05:25, 1.29s/it]
597
+ 33%|███▎ | 125/376 [02:54<05:48, 1.39s/it]
598
+ 34%|███▎ | 126/376 [02:55<05:43, 1.37s/it]
599
+ 34%|███▍ | 127/376 [02:57<06:29, 1.57s/it]
600
+ 34%|███▍ | 128/376 [02:59<06:08, 1.49s/it]
601
+ 34%|███▍ | 129/376 [03:00<06:32, 1.59s/it]
602
+ 35%|███▍ | 130/376 [03:02<06:31, 1.59s/it]
603
+
604
+ {'loss': 0.4625, 'learning_rate': 3.6646524503974955e-05, 'epoch': 0.35}
605
+
606
+ 35%|███▍ | 130/376 [03:02<06:31, 1.59s/it]
607
+ 35%|███▍ | 131/376 [03:04<06:39, 1.63s/it]
608
+ 35%|███▌ | 132/376 [03:05<06:34, 1.61s/it]
609
+ 35%|███▌ | 133/376 [03:06<05:56, 1.47s/it]
610
+ 36%|███▌ | 134/376 [03:08<05:40, 1.41s/it]
611
+ 36%|███▌ | 135/376 [03:09<05:27, 1.36s/it]
612
+ 36%|███▌ | 136/376 [03:11<06:03, 1.52s/it]
613
+ 36%|███▋ | 137/376 [03:12<05:38, 1.42s/it]
614
+ 37%|███▋ | 138/376 [03:13<05:37, 1.42s/it]
615
+ 37%|███▋ | 139/376 [03:15<05:34, 1.41s/it]
616
+ 37%|███▋ | 140/376 [03:17<05:54, 1.50s/it]
617
+
618
+ {'loss': 0.5066, 'learning_rate': 3.475973187908737e-05, 'epoch': 0.37}
619
+
620
+ 37%|███▋ | 140/376 [03:17<05:54, 1.50s/it]
621
+ 38%|███▊ | 141/376 [03:18<05:46, 1.47s/it]
622
+ 38%|███▊ | 142/376 [03:20<05:55, 1.52s/it]
623
+ 38%|███▊ | 143/376 [03:21<06:18, 1.62s/it]
624
+ 38%|███▊ | 144/376 [03:23<05:49, 1.51s/it]
625
+ 39%|███▊ | 145/376 [03:24<05:39, 1.47s/it]
626
+ 39%|███▉ | 146/376 [03:25<05:32, 1.44s/it]
627
+ 39%|███▉ | 147/376 [03:27<05:25, 1.42s/it]
628
+ 39%|███▉ | 148/376 [03:28<05:30, 1.45s/it]
629
+ 40%|███▉ | 149/376 [03:30<05:54, 1.56s/it]
630
+ 40%|███▉ | 150/376 [03:32<05:37, 1.49s/it]
631
+
632
+ {'loss': 0.3887, 'learning_rate': 3.280484518729466e-05, 'epoch': 0.4}
633
+
634
+ 40%|███▉ | 150/376 [03:32<05:37, 1.49s/it]
635
+ 40%|████ | 151/376 [03:33<05:14, 1.40s/it]
636
+ 40%|████ | 152/376 [03:34<04:52, 1.30s/it]
637
+ 41%|████ | 153/376 [03:35<04:54, 1.32s/it]
638
+ 41%|████ | 154/376 [03:37<04:56, 1.34s/it]
639
+ 41%|████ | 155/376 [03:38<05:14, 1.42s/it]
640
+ 41%|████▏ | 156/376 [03:40<05:08, 1.40s/it]
641
+ 42%|████▏ | 157/376 [03:41<04:45, 1.30s/it]
642
+ 42%|████▏ | 158/376 [03:42<04:29, 1.24s/it]
643
+ 42%|████▏ | 159/376 [03:43<04:41, 1.30s/it]
644
+ 43%|████▎ | 160/376 [03:44<04:41, 1.30s/it]
645
+
646
+ {'loss': 0.3675, 'learning_rate': 3.079550375668821e-05, 'epoch': 0.42}
647
+
648
+ 43%|████▎ | 160/376 [03:44<04:41, 1.30s/it]
649
+ 43%|████▎ | 161/376 [03:46<04:59, 1.39s/it]
650
+ 43%|████▎ | 162/376 [03:47<04:35, 1.29s/it]
651
+ 43%|████▎ | 163/376 [03:48<04:36, 1.30s/it]
652
+ 44%|████▎ | 164/376 [03:50<04:34, 1.30s/it]
653
+ 44%|████▍ | 165/376 [03:51<04:41, 1.33s/it]
654
+ 44%|████▍ | 166/376 [03:53<05:04, 1.45s/it]
655
+ 44%|████▍ | 167/376 [03:54<05:04, 1.46s/it]
656
+ 45%|████▍ | 168/376 [03:56<05:02, 1.45s/it]
657
+ 45%|████▍ | 169/376 [03:57<05:08, 1.49s/it]
658
+ 45%|████▌ | 170/376 [03:59<05:00, 1.46s/it]
659
+
660
+ {'loss': 0.4095, 'learning_rate': 2.8745726848402036e-05, 'epoch': 0.45}
661
+
662
+ 45%|████▌ | 170/376 [03:59<05:00, 1.46s/it]
663
+ 45%|████▌ | 171/376 [04:00<04:57, 1.45s/it]
664
+ 46%|████▌ | 172/376 [04:01<04:26, 1.31s/it]
665
+ 46%|████▌ | 173/376 [04:02<04:12, 1.24s/it]
666
+ 46%|████▋ | 174/376 [04:04<04:28, 1.33s/it]
667
+ 47%|████▋ | 175/376 [04:05<04:35, 1.37s/it]
668
+ 47%|████▋ | 176/376 [04:07<04:50, 1.45s/it]
669
+ 47%|████▋ | 177/376 [04:09<05:17, 1.60s/it]
670
+ 47%|████▋ | 178/376 [04:10<04:52, 1.48s/it]
671
+ 48%|████▊ | 179/376 [04:11<04:47, 1.46s/it]
672
+ 48%|████▊ | 180/376 [04:13<04:29, 1.37s/it]
673
+
674
+ {'loss': 0.3782, 'learning_rate': 2.6669815843628042e-05, 'epoch': 0.48}
675
+
676
+ 48%|████▊ | 180/376 [04:13<04:29, 1.37s/it]
677
+ 48%|████▊ | 181/376 [04:14<04:25, 1.36s/it]
678
+ 48%|████▊ | 182/376 [04:15<04:18, 1.33s/it]
679
+ 49%|████▊ | 183/376 [04:17<04:42, 1.46s/it]
680
+ 49%|████▉ | 184/376 [04:18<04:11, 1.31s/it]
681
+ 49%|████▉ | 185/376 [04:19<04:02, 1.27s/it]
682
+ 49%|████▉ | 186/376 [04:21<04:45, 1.50s/it]
683
+ 50%|████▉ | 187/376 [04:22<04:26, 1.41s/it]
684
+ 50%|█████ | 188/376 [04:24<04:14, 1.35s/it]
685
+ 50%|█████ | 189/376 [04:26<05:00, 1.61s/it]
686
+ 51%|█████ | 190/376 [04:28<05:18, 1.71s/it]
687
+
688
+ {'loss': 0.4195, 'learning_rate': 2.4582254462267476e-05, 'epoch': 0.5}
689
+
690
+ 51%|█████ | 190/376 [04:28<05:18, 1.71s/it]
691
+ 51%|█████ | 191/376 [04:29<04:55, 1.60s/it]
692
+ 51%|█████ | 192/376 [04:30<04:48, 1.57s/it]
693
+ 51%|█████▏ | 193/376 [04:32<04:46, 1.56s/it]
694
+ 52%|█████▏ | 194/376 [04:33<04:34, 1.51s/it]
695
+ 52%|█████▏ | 195/376 [04:36<05:04, 1.68s/it]
696
+ 52%|█████▏ | 196/376 [04:37<04:39, 1.55s/it]
697
+ 52%|█████▏ | 197/376 [04:38<04:32, 1.52s/it]
698
+ 53%|█████▎ | 198/376 [04:40<04:39, 1.57s/it]
699
+ 53%|█████▎ | 199/376 [04:42<04:42, 1.59s/it]
700
+ 53%|█████▎ | 200/376 [04:43<04:25, 1.51s/it]
701
+
702
+ {'loss': 0.3392, 'learning_rate': 2.2497607709397543e-05, 'epoch': 0.53}
703
+
704
+ 53%|█████▎ | 200/376 [04:43<04:25, 1.51s/it]
705
+ 53%|█████▎ | 201/376 [04:44<04:13, 1.45s/it]
706
+ 54%|█████▎ | 202/376 [04:45<03:58, 1.37s/it]
707
+ 54%|█████▍ | 203/376 [04:47<03:53, 1.35s/it]
708
+ 54%|█████▍ | 204/376 [04:48<03:38, 1.27s/it]
709
+ 55%|█████▍ | 205/376 [04:49<03:46, 1.33s/it]
710
+ 55%|█████▍ | 206/376 [04:51<03:55, 1.38s/it]
711
+ 55%|█████▌ | 207/376 [04:52<04:11, 1.49s/it]
712
+ 55%|█████▌ | 208/376 [04:54<04:02, 1.44s/it]
713
+ 56%|█████▌ | 209/376 [04:55<03:40, 1.32s/it]
714
+ 56%|█████▌ | 210/376 [04:56<03:21, 1.21s/it]
715
+
716
+ {'loss': 0.3347, 'learning_rate': 2.0430420254607748e-05, 'epoch': 0.56}
717
+
718
+ 56%|█████▌ | 210/376 [04:56<03:21, 1.21s/it]
719
+ 56%|█████▌ | 211/376 [04:57<03:18, 1.20s/it]
720
+ 56%|█████▋ | 212/376 [04:59<03:42, 1.36s/it]
721
+ 57%|█████▋ | 213/376 [05:00<03:41, 1.36s/it]
722
+ 57%|█████▋ | 214/376 [05:01<03:44, 1.39s/it]
723
+ 57%|█████▋ | 215/376 [05:03<03:28, 1.30s/it]
724
+ 57%|█████▋ | 216/376 [05:04<03:34, 1.34s/it]
725
+ 58%|█████▊ | 217/376 [05:05<03:38, 1.37s/it]
726
+ 58%|█████▊ | 218/376 [05:07<03:38, 1.38s/it]
727
+ 58%|█████▊ | 219/376 [05:08<03:30, 1.34s/it]
728
+ 59%|█████▊ | 220/376 [05:09<03:21, 1.29s/it]
729
+
730
+ {'loss': 0.4117, 'learning_rate': 1.8395114953217852e-05, 'epoch': 0.58}
731
+
732
+ 59%|█████▊ | 220/376 [05:09<03:21, 1.29s/it]
733
+ 59%|█████▉ | 221/376 [05:11<03:19, 1.29s/it]
734
+ 59%|█████▉ | 222/376 [05:12<03:20, 1.30s/it]
735
+ 59%|█████▉ | 223/376 [05:13<03:03, 1.20s/it]
736
+ 60%|█████▉ | 224/376 [05:14<02:50, 1.12s/it]
737
+ 60%|█████▉ | 225/376 [05:15<02:47, 1.11s/it]
738
+ 60%|██████ | 226/376 [05:16<03:02, 1.21s/it]
739
+ 60%|██████ | 227/376 [05:18<03:26, 1.39s/it]
740
+ 61%|██████ | 228/376 [05:20<03:41, 1.50s/it]
741
+ 61%|██████ | 229/376 [05:21<03:29, 1.43s/it]
742
+ 61%|██████ | 230/376 [05:23<03:37, 1.49s/it]
743
+
744
+ {'loss': 0.3772, 'learning_rate': 1.640589221739926e-05, 'epoch': 0.61}
745
+
746
+ 61%|██████ | 230/376 [05:23<03:37, 1.49s/it]
747
+ 61%|██████▏ | 231/376 [05:24<03:26, 1.43s/it]
748
+ 62%|██████▏ | 232/376 [05:25<03:21, 1.40s/it]
749
+ 62%|██████▏ | 233/376 [05:27<03:09, 1.33s/it]
750
+ 62%|██████▏ | 234/376 [05:28<03:33, 1.51s/it]
751
+ 62%|██████▎ | 235/376 [05:30<03:20, 1.42s/it]
752
+ 63%|██████▎ | 236/376 [05:31<03:32, 1.52s/it]
753
+ 63%|██████▎ | 237/376 [05:33<03:17, 1.42s/it]
754
+ 63%|██████▎ | 238/376 [05:34<03:00, 1.31s/it]
755
+ 64%|██████▎ | 239/376 [05:35<03:00, 1.32s/it]
756
+ 64%|██████▍ | 240/376 [05:36<03:04, 1.35s/it]
757
+
758
+ {'loss': 0.4403, 'learning_rate': 1.447663093929163e-05, 'epoch': 0.64}
759
+
760
+ 64%|██████▍ | 240/376 [05:36<03:04, 1.35s/it]
761
+ 64%|██████▍ | 241/376 [05:38<03:02, 1.35s/it]
762
+ 64%|██████▍ | 242/376 [05:39<02:59, 1.34s/it]
763
+ 65%|██████▍ | 243/376 [05:41<03:03, 1.38s/it]
764
+ 65%|██████▍ | 244/376 [05:42<03:05, 1.41s/it]
765
+ 65%|██████▌ | 245/376 [05:43<03:00, 1.38s/it]
766
+ 65%|██████▌ | 246/376 [05:45<02:53, 1.33s/it]
767
+ 66%|██████▌ | 247/376 [05:46<02:41, 1.25s/it]
768
+ 66%|██████▌ | 248/376 [05:47<02:51, 1.34s/it]
769
+ 66%|██████▌ | 249/376 [05:48<02:46, 1.31s/it]
770
+ 66%|██████▋ | 250/376 [05:50<03:02, 1.45s/it]
771
+
772
+ {'loss': 0.3867, 'learning_rate': 1.2620791657378664e-05, 'epoch': 0.66}
773
+
774
+ 66%|██████▋ | 250/376 [05:50<03:02, 1.45s/it]
775
+ 67%|██████▋ | 251/376 [05:51<02:51, 1.37s/it]
776
+ 67%|██████▋ | 252/376 [05:53<02:52, 1.39s/it]
777
+ 67%|██████▋ | 253/376 [05:54<02:50, 1.39s/it]
778
+ 68%|██████▊ | 254/376 [05:56<02:44, 1.35s/it]
779
+ 68%|██████▊ | 255/376 [05:57<02:54, 1.44s/it]
780
+ 68%|██████▊ | 256/376 [05:59<02:54, 1.45s/it]
781
+ 68%|██████▊ | 257/376 [06:00<02:47, 1.41s/it]
782
+ 69%|██████▊ | 258/376 [06:01<02:35, 1.31s/it]
783
+ 69%|██████▉ | 259/376 [06:03<02:42, 1.39s/it]
784
+ 69%|██████▉ | 260/376 [06:04<02:28, 1.28s/it]
785
+
786
+ {'loss': 0.3688, 'learning_rate': 1.0851322641735118e-05, 'epoch': 0.69}
787
+
788
+ 69%|██████▉ | 260/376 [06:04<02:28, 1.28s/it]
789
+ 69%|██████▉ | 261/376 [06:05<02:29, 1.30s/it]
790
+ 70%|██████▉ | 262/376 [06:07<02:51, 1.50s/it]
791
+ 70%|██████▉ | 263/376 [06:09<02:53, 1.54s/it]
792
+ 70%|███████ | 264/376 [06:10<02:53, 1.55s/it]
793
+ 70%|███████ | 265/376 [06:11<02:43, 1.47s/it]
794
+ 71%|███████ | 266/376 [06:13<02:32, 1.38s/it]
795
+ 71%|███████ | 267/376 [06:14<02:40, 1.47s/it]
796
+ 71%|███████▏ | 268/376 [06:16<02:35, 1.44s/it]
797
+ 72%|███████▏ | 269/376 [06:17<02:44, 1.54s/it]
798
+ 72%|███████▏ | 270/376 [06:19<02:40, 1.51s/it]
799
+
800
+ {'loss': 0.3655, 'learning_rate': 9.180569553392535e-06, 'epoch': 0.72}
801
+
802
+ 72%|███████▏ | 270/376 [06:19<02:40, 1.51s/it]
803
+ 72%|███████▏ | 271/376 [06:20<02:34, 1.47s/it]
804
+ 72%|███████▏ | 272/376 [06:21<02:25, 1.40s/it]
805
+ 73%|███████▎ | 273/376 [06:23<02:24, 1.40s/it]
806
+ 73%|███████▎ | 274/376 [06:25<02:29, 1.47s/it]
807
+ 73%|███████▎ | 275/376 [06:27<02:45, 1.64s/it]
808
+ 73%|███████▎ | 276/376 [06:28<02:32, 1.52s/it]
809
+ 74%|███████▎ | 277/376 [06:29<02:27, 1.49s/it]
810
+ 74%|███████▍ | 278/376 [06:31<02:29, 1.53s/it]
811
+ 74%|███████▍ | 279/376 [06:32<02:21, 1.46s/it]
812
+ 74%|███████▍ | 280/376 [06:33<02:15, 1.41s/it]
813
+
814
+ {'loss': 0.4144, 'learning_rate': 7.620189308133943e-06, 'epoch': 0.74}
815
+
816
+ 74%|███████▍ | 280/376 [06:33<02:15, 1.41s/it]
817
+ 75%|███████▍ | 281/376 [06:35<02:14, 1.41s/it]
818
+ 75%|███████▌ | 282/376 [06:36<02:17, 1.46s/it]
819
+ 75%|███████▌ | 283/376 [06:38<02:16, 1.47s/it]
820
+ 76%|███████▌ | 284/376 [06:39<02:03, 1.34s/it]
821
+ 76%|███████▌ | 285/376 [06:40<01:53, 1.25s/it]
822
+ 76%|███████▌ | 286/376 [06:41<01:49, 1.21s/it]
823
+ 76%|███████▋ | 287/376 [06:43<02:11, 1.48s/it]
824
+ 77%|███████▋ | 288/376 [06:45<02:24, 1.64s/it]
825
+ 77%|███████▋ | 289/376 [06:46<02:06, 1.45s/it]
826
+ 77%|███████▋ | 290/376 [06:47<01:56, 1.35s/it]
827
+
828
+ {'loss': 0.3298, 'learning_rate': 6.181068745693716e-06, 'epoch': 0.77}
829
+
830
+ 77%|███████▋ | 290/376 [06:47<01:56, 1.35s/it]
831
+ 77%|███████▋ | 291/376 [06:48<01:47, 1.27s/it]
832
+ 78%|███████▊ | 292/376 [06:50<01:51, 1.33s/it]
833
+ 78%|███████▊ | 293/376 [06:51<01:42, 1.24s/it]
834
+ 78%|███████▊ | 294/376 [06:52<01:46, 1.29s/it]
835
+ 78%|███████▊ | 295/376 [06:54<01:50, 1.37s/it]
836
+ 79%|███████▊ | 296/376 [06:55<01:46, 1.33s/it]
837
+ 79%|███████▉ | 297/376 [06:56<01:37, 1.24s/it]
838
+ 79%|███████▉ | 298/376 [06:58<01:39, 1.28s/it]
839
+ 80%|███████▉ | 299/376 [06:59<01:34, 1.23s/it]
840
+ 80%|███████▉ | 300/376 [07:00<01:31, 1.20s/it]
841
+
842
+ {'loss': 0.3337, 'learning_rate': 4.873248671810928e-06, 'epoch': 0.8}
843
+
844
+ 80%|███████▉ | 300/376 [07:00<01:31, 1.20s/it]
845
+ 80%|████████ | 301/376 [07:01<01:28, 1.17s/it]
846
+ 80%|████████ | 302/376 [07:03<01:36, 1.31s/it]
847
+ 81%|████████ | 303/376 [07:04<01:30, 1.23s/it]
848
+ 81%|████████ | 304/376 [07:05<01:26, 1.20s/it]
849
+ 81%|████████ | 305/376 [07:06<01:23, 1.18s/it]
850
+ 81%|████████▏ | 306/376 [07:07<01:25, 1.22s/it]
851
+ 82%|████████▏ | 307/376 [07:08<01:20, 1.17s/it]
852
+ 82%|████████▏ | 308/376 [07:10<01:22, 1.22s/it]
853
+ 82%|████████▏ | 309/376 [07:11<01:22, 1.24s/it]
854
+ 82%|████████▏ | 310/376 [07:13<01:35, 1.44s/it]
855
+
856
+ {'loss': 0.3217, 'learning_rate': 3.7058538030980942e-06, 'epoch': 0.82}
857
+
858
+ 82%|████████▏ | 310/376 [07:13<01:35, 1.44s/it]
859
+ 83%|████████▎ | 311/376 [07:14<01:31, 1.41s/it]
860
+ 83%|████████▎ | 312/376 [07:15<01:22, 1.29s/it]
861
+ 83%|████████▎ | 313/376 [07:16<01:23, 1.33s/it]
862
+ 84%|████████▎ | 314/376 [07:18<01:28, 1.43s/it]
863
+ 84%|████████▍ | 315/376 [07:19<01:25, 1.40s/it]
864
+ 84%|████████▍ | 316/376 [07:21<01:17, 1.29s/it]
865
+ 84%|████████▍ | 317/376 [07:22<01:11, 1.21s/it]
866
+ 85%|████████▍ | 318/376 [07:24<01:24, 1.46s/it]
867
+ 85%|████████▍ | 319/376 [07:25<01:19, 1.40s/it]
868
+ 85%|████████▌ | 320/376 [07:26<01:16, 1.36s/it]
869
+
870
+ {'loss': 0.3222, 'learning_rate': 2.687029103502972e-06, 'epoch': 0.85}
871
+
872
+ 85%|████████▌ | 320/376 [07:26<01:16, 1.36s/it]
873
+ 85%|████████▌ | 321/376 [07:27<01:09, 1.26s/it]
874
+ 86%|████████▌ | 322/376 [07:28<01:09, 1.28s/it]
875
+ 86%|████████▌ | 323/376 [07:30<01:12, 1.38s/it]
876
+ 86%|████████▌ | 324/376 [07:32<01:12, 1.40s/it]
877
+ 86%|████████▋ | 325/376 [07:33<01:11, 1.41s/it]
878
+ 87%|████████▋ | 326/376 [07:35<01:15, 1.50s/it]
879
+ 87%|████████▋ | 327/376 [07:36<01:12, 1.49s/it]
880
+ 87%|████████▋ | 328/376 [07:38<01:12, 1.50s/it]
881
+ 88%|████████▊ | 329/376 [07:39<01:10, 1.50s/it]
882
+ 88%|████████▊ | 330/376 [07:41<01:10, 1.53s/it]
883
+
884
+ {'loss': 0.3989, 'learning_rate': 1.823882956546566e-06, 'epoch': 0.88}
885
+
886
+ 88%|████████▊ | 330/376 [07:41<01:10, 1.53s/it]
887
+ 88%|████████▊ | 331/376 [07:42<01:10, 1.57s/it]
888
+ 88%|████████▊ | 332/376 [07:44<01:09, 1.59s/it]
889
+ 89%|████████▊ | 333/376 [07:45<01:06, 1.54s/it]
890
+ 89%|████████▉ | 334/376 [07:47<01:02, 1.48s/it]
891
+ 89%|████████▉ | 335/376 [07:48<01:00, 1.48s/it]
892
+ 89%|████████▉ | 336/376 [07:49<00:53, 1.33s/it]
893
+ 90%|████████▉ | 337/376 [07:50<00:50, 1.29s/it]
894
+ 90%|████████▉ | 338/376 [07:52<00:48, 1.29s/it]
895
+ 90%|█████████ | 339/376 [07:53<00:49, 1.34s/it]
896
+ 90%|█████████ | 340/376 [07:55<00:48, 1.34s/it]
897
+
898
+ {'loss': 0.3805, 'learning_rate': 1.1224375698271894e-06, 'epoch': 0.9}
899
+
900
+ 90%|█████████ | 340/376 [07:55<00:48, 1.34s/it]
901
+ 91%|█████████ | 341/376 [07:56<00:48, 1.38s/it]
902
+ 91%|█████████ | 342/376 [07:57<00:43, 1.28s/it]
903
+ 91%|█████████ | 343/376 [07:59<00:47, 1.45s/it]
904
+ 91%|█████████▏| 344/376 [08:00<00:47, 1.49s/it]
905
+ 92%|█████████▏| 345/376 [08:02<00:45, 1.46s/it]
906
+ 92%|█████████▏| 346/376 [08:03<00:40, 1.34s/it]
907
+ 92%|█████████▏| 347/376 [08:04<00:40, 1.39s/it]
908
+ 93%|█████████▎| 348/376 [08:06<00:36, 1.30s/it]
909
+ 93%|█████████▎| 349/376 [08:07<00:34, 1.28s/it]
910
+ 93%|█████████▎| 350/376 [08:08<00:34, 1.31s/it]
911
+
912
+ {'loss': 0.4108, 'learning_rate': 5.875869578203824e-07, 'epoch': 0.93}
913
+
914
+ 93%|█████████▎| 350/376 [08:08<00:34, 1.31s/it]
915
+ 93%|█████████▎| 351/376 [08:10<00:34, 1.38s/it]
916
+ 94%|█████████▎| 352/376 [08:11<00:31, 1.32s/it]
917
+ 94%|█████████▍| 353/376 [08:12<00:28, 1.26s/it]
918
+ 94%|█████████▍| 354/376 [08:13<00:28, 1.29s/it]
919
+ 94%|█████████▍| 355/376 [08:14<00:25, 1.23s/it]
920
+ 95%|█████████▍| 356/376 [08:16<00:23, 1.19s/it]
921
+ 95%|█████████▍| 357/376 [08:17<00:25, 1.35s/it]
922
+ 95%|█████████▌| 358/376 [08:19<00:26, 1.47s/it]
923
+ 95%|█████████▌| 359/376 [08:20<00:22, 1.32s/it]
924
+ 96%|█████████▌| 360/376 [08:21<00:21, 1.37s/it]
925
+
926
+ {'loss': 0.3578, 'learning_rate': 2.230627961304993e-07, 'epoch': 0.96}
927
+
928
+ 96%|█████████▌| 360/376 [08:22<00:21, 1.37s/it]
929
+ 96%|█████████▌| 361/376 [08:23<00:20, 1.34s/it]
930
+ 96%|█████████▋| 362/376 [08:24<00:19, 1.39s/it]
931
+ 97%|█████████▋| 363/376 [08:26<00:18, 1.40s/it]
932
+ 97%|█████████▋| 364/376 [08:27<00:16, 1.36s/it]
933
+ 97%|█████████▋| 365/376 [08:29<00:16, 1.46s/it]
934
+ 97%|█████████▋| 366/376 [08:30<00:14, 1.45s/it]
935
+ 98%|█████████▊| 367/376 [08:32<00:13, 1.46s/it]
936
+ 98%|█████████▊| 368/376 [08:33<00:11, 1.46s/it]
937
+ 98%|█████████▊| 369/376 [08:35<00:10, 1.48s/it]
938
+ 98%|█████████▊| 370/376 [08:36<00:08, 1.39s/it]
939
+
940
+ {'loss': 0.3453, 'learning_rate': 3.1408385430356516e-08, 'epoch': 0.98}
941
+
942
+ 98%|█████████▊| 370/376 [08:36<00:08, 1.39s/it]
943
+ 99%|█████████▊| 371/376 [08:37<00:06, 1.31s/it]
944
+ 99%|█████████▉| 372/376 [08:38<00:05, 1.35s/it]
945
+ 99%|█████████▉| 373/376 [08:40<00:04, 1.35s/it]
946
+ 99%|█████████▉| 374/376 [08:41<00:02, 1.41s/it]
947
+ 100%|█████████▉| 375/376 [08:43<00:01, 1.47s/it]
948
+ 100%|██████████| 376/376 [08:44<00:00, 1.47s/it][INFO|trainer.py:3166] 2024-01-04 10:02:58,683 >> ***** Running Evaluation *****
949
+ [INFO|trainer.py:3168] 2024-01-04 10:02:58,683 >> Num examples = 335
950
+ [INFO|trainer.py:3171] 2024-01-04 10:02:58,683 >> Batch size = 1
951
+
952
+
953
+ 0%| | 0/168 [00:00<?, ?it/s]
954
+
955
+ 1%| | 2/168 [00:00<00:16, 10.08it/s]
956
+
957
+ 2%|▏ | 4/168 [00:00<00:21, 7.51it/s]
958
+
959
+ 3%|▎ | 5/168 [00:00<00:27, 5.84it/s]
960
+
961
+ 4%|▎ | 6/168 [00:00<00:24, 6.53it/s]
962
+
963
+ 4%|▍ | 7/168 [00:01<00:28, 5.68it/s]
964
+
965
+ 5%|▍ | 8/168 [00:01<00:27, 5.80it/s]
966
+
967
+ 5%|▌ | 9/168 [00:01<00:31, 5.02it/s]
968
+
969
+ 6%|▌ | 10/168 [00:01<00:33, 4.74it/s]
970
+
971
+ 7%|▋ | 11/168 [00:01<00:29, 5.33it/s]
972
+
973
+ 7%|▋ | 12/168 [00:02<00:32, 4.81it/s]
974
+
975
+ 8%|▊ | 13/168 [00:02<00:28, 5.35it/s]
976
+
977
+ 8%|▊ | 14/168 [00:02<00:26, 5.85it/s]
978
+
979
+ 9%|▉ | 15/168 [00:02<00:29, 5.24it/s]
980
+
981
+ 10%|▉ | 16/168 [00:02<00:27, 5.51it/s]
982
+
983
+ 10%|█ | 17/168 [00:02<00:25, 5.91it/s]
984
+
985
+ 11%|█ | 18/168 [00:03<00:24, 6.22it/s]
986
+
987
+ 11%|█▏ | 19/168 [00:03<00:23, 6.44it/s]
988
+
989
+ 12%|█▏ | 20/168 [00:03<00:24, 6.13it/s]
990
+
991
+ 12%|█▎ | 21/168 [00:03<00:28, 5.17it/s]
992
+
993
+ 13%|█▎ | 22/168 [00:03<00:26, 5.44it/s]
994
+
995
+ 14%|█▍ | 24/168 [00:04<00:21, 6.73it/s]
996
+
997
+ 15%|█▍ | 25/168 [00:04<00:20, 7.15it/s]
998
+
999
+ 15%|█▌ | 26/168 [00:04<00:20, 6.95it/s]
1000
+
1001
+ 16%|█▌ | 27/168 [00:04<00:24, 5.74it/s]
1002
+
1003
+ 17%|█▋ | 28/168 [00:04<00:23, 6.08it/s]
1004
+
1005
+ 17%|█▋ | 29/168 [00:04<00:21, 6.38it/s]
1006
+
1007
+ 18%|█▊ | 30/168 [00:05<00:21, 6.48it/s]
1008
+
1009
+ 18%|█▊ | 31/168 [00:05<00:20, 6.71it/s]
1010
+
1011
+ 19%|█▉ | 32/168 [00:05<00:20, 6.76it/s]
1012
+
1013
+ 20%|██ | 34/168 [00:05<00:18, 7.39it/s]
1014
+
1015
+ 21%|██ | 35/168 [00:05<00:18, 7.31it/s]
1016
+
1017
+ 21%|██▏ | 36/168 [00:05<00:17, 7.68it/s]
1018
+
1019
+ 22%|██▏ | 37/168 [00:05<00:17, 7.58it/s]
1020
+
1021
+ 23%|██▎ | 38/168 [00:06<00:17, 7.44it/s]
1022
+
1023
+ 23%|██▎ | 39/168 [00:06<00:20, 6.29it/s]
1024
+
1025
+ 24%|██▍ | 40/168 [00:06<00:20, 6.31it/s]
1026
+
1027
+ 24%|██▍ | 41/168 [00:06<00:20, 6.31it/s]
1028
+
1029
+ 25%|██▌ | 42/168 [00:06<00:18, 6.93it/s]
1030
+
1031
+ 26%|██▌ | 43/168 [00:06<00:20, 5.98it/s]
1032
+
1033
+ 26%|██▌ | 44/168 [00:07<00:19, 6.44it/s]
1034
+
1035
+ 27%|██▋ | 45/168 [00:07<00:21, 5.70it/s]
1036
+
1037
+ 27%|██▋ | 46/168 [00:07<00:23, 5.26it/s]
1038
+
1039
+ 28%|██▊ | 47/168 [00:07<00:25, 4.74it/s]
1040
+
1041
+ 29%|██▊ | 48/168 [00:07<00:22, 5.29it/s]
1042
+
1043
+ 29%|██▉ | 49/168 [00:08<00:21, 5.65it/s]
1044
+
1045
+ 30%|██▉ | 50/168 [00:08<00:19, 6.09it/s]
1046
+
1047
+ 30%|███ | 51/168 [00:08<00:21, 5.46it/s]
1048
+
1049
+ 31%|███ | 52/168 [00:08<00:20, 5.70it/s]
1050
+
1051
+ 32%|███▏ | 53/168 [00:08<00:22, 5.02it/s]
1052
+
1053
+ 32%|███▏ | 54/168 [00:09<00:22, 5.11it/s]
1054
+
1055
+ 33%|███▎ | 55/168 [00:09<00:21, 5.22it/s]
1056
+
1057
+ 33%|███▎ | 56/168 [00:09<00:19, 5.75it/s]
1058
+
1059
+ 34%|███▍ | 57/168 [00:09<00:17, 6.50it/s]
1060
+
1061
+ 35%|███▍ | 58/168 [00:09<00:18, 5.82it/s]
1062
+
1063
+ 35%|███▌ | 59/168 [00:09<00:21, 5.01it/s]
1064
+
1065
+ 36%|███▌ | 60/168 [00:10<00:19, 5.44it/s]
1066
+
1067
+ 36%|███▋ | 61/168 [00:10<00:17, 6.24it/s]
1068
+
1069
+ 37%|███▋ | 62/168 [00:10<00:15, 6.80it/s]
1070
+
1071
+ 38%|███▊ | 63/168 [00:10<00:15, 6.99it/s]
1072
+
1073
+ 38%|███▊ | 64/168 [00:10<00:14, 7.14it/s]
1074
+
1075
+ 39%|███▊ | 65/168 [00:10<00:14, 7.02it/s]
1076
+
1077
+ 39%|███▉ | 66/168 [00:10<00:13, 7.43it/s]
1078
+
1079
+ 40%|███▉ | 67/168 [00:11<00:17, 5.85it/s]
1080
+
1081
+ 40%|████ | 68/168 [00:11<00:19, 5.09it/s]
1082
+
1083
+ 41%|████ | 69/168 [00:11<00:17, 5.57it/s]
1084
+
1085
+ 42%|████▏ | 70/168 [00:11<00:15, 6.14it/s]
1086
+
1087
+ 42%|████▏ | 71/168 [00:11<00:14, 6.50it/s]
1088
+
1089
+ 43%|████▎ | 72/168 [00:11<00:16, 5.78it/s]
1090
+
1091
+ 43%|████▎ | 73/168 [00:12<00:19, 5.00it/s]
1092
+
1093
+ 44%|████▍ | 74/168 [00:12<00:19, 4.95it/s]
1094
+
1095
+ 45%|████▍ | 75/168 [00:12<00:18, 5.12it/s]
1096
+
1097
+ 46%|████▌ | 77/168 [00:12<00:13, 6.52it/s]
1098
+
1099
+ 46%|████▋ | 78/168 [00:12<00:12, 7.01it/s]
1100
+
1101
+ 47%|████▋ | 79/168 [00:13<00:12, 7.09it/s]
1102
+
1103
+ 48%|████▊ | 80/168 [00:13<00:13, 6.46it/s]
1104
+
1105
+ 48%|████▊ | 81/168 [00:13<00:13, 6.39it/s]
1106
+
1107
+ 49%|████▉ | 82/168 [00:13<00:13, 6.33it/s]
1108
+
1109
+ 49%|████▉ | 83/168 [00:13<00:13, 6.39it/s]
1110
+
1111
+ 50%|█████ | 84/168 [00:13<00:14, 5.64it/s]
1112
+
1113
+ 51%|█████ | 85/168 [00:14<00:16, 5.00it/s]
1114
+
1115
+ 51%|█████ | 86/168 [00:14<00:14, 5.55it/s]
1116
+
1117
+ 52%|█████▏ | 87/168 [00:14<00:13, 6.03it/s]
1118
+
1119
+ 52%|█████▏ | 88/168 [00:14<00:12, 6.65it/s]
1120
+
1121
+ 53%|█████▎ | 89/168 [00:14<00:12, 6.47it/s]
1122
+
1123
+ 54%|█████▎ | 90/168 [00:14<00:11, 6.53it/s]
1124
+
1125
+ 54%|█████▍ | 91/168 [00:15<00:11, 6.55it/s]
1126
+
1127
+ 55%|█████▍ | 92/168 [00:15<00:10, 7.11it/s]
1128
+
1129
+ 55%|█████▌ | 93/168 [00:15<00:10, 7.24it/s]
1130
+
1131
+ 56%|█████▌ | 94/168 [00:15<00:10, 7.32it/s]
1132
+
1133
+ 57%|█████▋ | 95/168 [00:15<00:09, 7.37it/s]
1134
+
1135
+ 57%|█████▋ | 96/168 [00:15<00:12, 5.76it/s]
1136
+
1137
+ 58%|█████▊ | 97/168 [00:16<00:13, 5.12it/s]
1138
+
1139
+ 58%|█████▊ | 98/168 [00:16<00:14, 4.86it/s]
1140
+
1141
+ 59%|█████▉ | 99/168 [00:16<00:12, 5.42it/s]
1142
+
1143
+ 60%|█████▉ | 100/168 [00:16<00:11, 5.69it/s]
1144
+
1145
+ 60%|██████ | 101/168 [00:16<00:10, 6.38it/s]
1146
+
1147
+ 61%|██████ | 102/168 [00:16<00:10, 6.58it/s]
1148
+
1149
+ 61%|██████▏ | 103/168 [00:16<00:09, 7.16it/s]
1150
+
1151
+ 62%|██████▏ | 104/168 [00:17<00:11, 5.74it/s]
1152
+
1153
+ 62%|██████▎ | 105/168 [00:17<00:10, 6.08it/s]
1154
+
1155
+ 63%|██████▎ | 106/168 [00:17<00:09, 6.21it/s]
1156
+
1157
+ 64%|██████▎ | 107/168 [00:17<00:09, 6.45it/s]
1158
+
1159
+ 64%|██████▍ | 108/168 [00:17<00:09, 6.56it/s]
1160
+
1161
+ 65%|██████▍ | 109/168 [00:17<00:09, 6.31it/s]
1162
+
1163
+ 65%|██████▌ | 110/168 [00:18<00:08, 6.89it/s]
1164
+
1165
+ 66%|██████▌ | 111/168 [00:18<00:08, 6.36it/s]
1166
+
1167
+ 67%|██████▋ | 112/168 [00:18<00:08, 6.27it/s]
1168
+
1169
+ 67%|██████▋ | 113/168 [00:18<00:09, 6.00it/s]
1170
+
1171
+ 68%|██████▊ | 114/168 [00:18<00:08, 6.33it/s]
1172
+
1173
+ 68%|██████▊ | 115/168 [00:18<00:07, 6.64it/s]
1174
+
1175
+ 69%|██████▉ | 116/168 [00:19<00:08, 5.88it/s]
1176
+
1177
+ 70%|██████▉ | 117/168 [00:19<00:08, 6.22it/s]
1178
+
1179
+ 70%|███████ | 118/168 [00:19<00:08, 6.16it/s]
1180
+
1181
+ 71%|███████ | 119/168 [00:19<00:08, 5.46it/s]
1182
+
1183
+ 71%|███████▏ | 120/168 [00:19<00:09, 4.83it/s]
1184
+
1185
+ 72%|███████▏ | 121/168 [00:20<00:09, 5.00it/s]
1186
+
1187
+ 73%|███████▎ | 122/168 [00:20<00:09, 4.75it/s]
1188
+
1189
+ 73%|███████▎ | 123/168 [00:20<00:08, 5.45it/s]
1190
+
1191
+ 74%|███████▍ | 124/168 [00:20<00:08, 5.04it/s]
1192
+
1193
+ 74%|███████▍ | 125/168 [00:20<00:07, 5.50it/s]
1194
+
1195
+ 75%|███████▌ | 126/168 [00:20<00:06, 6.18it/s]
1196
+
1197
+ 76%|███████▌ | 127/168 [00:21<00:06, 6.57it/s]
1198
+
1199
+ 76%|███████▌ | 128/168 [00:21<00:07, 5.37it/s]
1200
+
1201
+ 77%|███████▋ | 129/168 [00:21<00:07, 5.42it/s]
1202
+
1203
+ 77%|███████▋ | 130/168 [00:21<00:07, 4.82it/s]
1204
+
1205
+ 78%|███████▊ | 131/168 [00:21<00:06, 5.58it/s]
1206
+
1207
+ 79%|███████▊ | 132/168 [00:22<00:06, 5.73it/s]
1208
+
1209
+ 79%|███████▉ | 133/168 [00:22<00:06, 5.66it/s]
1210
+
1211
+ 80%|███████▉ | 134/168 [00:22<00:05, 6.34it/s]
1212
+
1213
+ 80%|████████ | 135/168 [00:22<00:05, 6.50it/s]
1214
+
1215
+ 81%|████████ | 136/168 [00:22<00:04, 7.06it/s]
1216
+
1217
+ 82%|████████▏ | 137/168 [00:22<00:05, 5.69it/s]
1218
+
1219
+ 82%|████████▏ | 138/168 [00:23<00:05, 5.86it/s]
1220
+
1221
+ 83%|████████▎ | 139/168 [00:23<00:04, 6.27it/s]
1222
+
1223
+ 83%|████████▎ | 140/168 [00:23<00:04, 6.96it/s]
1224
+
1225
+ 84%|████████▍ | 141/168 [00:23<00:03, 7.29it/s]
1226
+
1227
+ 85%|████████▍ | 142/168 [00:23<00:03, 7.25it/s]
1228
+
1229
+ 85%|████████▌ | 143/168 [00:23<00:04, 5.77it/s]
1230
+
1231
+ 86%|████████▌ | 144/168 [00:24<00:04, 5.05it/s]
1232
+
1233
+ 86%|████████▋ | 145/168 [00:24<00:04, 5.33it/s]
1234
+
1235
+ 87%|████████▋ | 146/168 [00:24<00:03, 5.79it/s]
1236
+
1237
+ 88%|████████▊ | 147/168 [00:24<00:03, 6.15it/s]
1238
+
1239
+ 88%|████████▊ | 148/168 [00:24<00:02, 6.81it/s]
1240
+
1241
+ 89%|████████▊ | 149/168 [00:24<00:03, 5.49it/s]
1242
+
1243
+ 90%|████████▉ | 151/168 [00:25<00:03, 5.66it/s]
1244
+
1245
+ 91%|█████████ | 153/168 [00:25<00:02, 5.77it/s]
1246
+
1247
+ 92%|█████████▏| 154/168 [00:25<00:02, 6.05it/s]
1248
+
1249
+ 92%|█████████▏| 155/168 [00:25<00:02, 5.80it/s]
1250
+
1251
+ 93%|█████████▎| 156/168 [00:26<00:02, 5.39it/s]
1252
+
1253
+ 93%|█████████▎| 157/168 [00:26<00:02, 5.36it/s]
1254
+
1255
+ 94%|█████████▍| 158/168 [00:26<00:01, 6.07it/s]
1256
+
1257
+ 95%|█████████▍| 159/168 [00:26<00:01, 6.11it/s]
1258
+
1259
+ 95%|█████████▌| 160/168 [00:26<00:01, 5.82it/s]
1260
+
1261
+ 96%|█████████▌| 161/168 [00:26<00:01, 5.74it/s]
1262
+
1263
+ 96%|█████████▋| 162/168 [00:27<00:01, 5.98it/s]
1264
+
1265
+ 97%|█████████▋| 163/168 [00:27<00:00, 5.17it/s]
1266
+
1267
+ 98%|█████████▊| 164/168 [00:27<00:00, 5.61it/s]
1268
+
1269
+ 98%|█████████▊| 165/168 [00:27<00:00, 6.31it/s]
1270
+
1271
+ 99%|█████████▉| 166/168 [00:27<00:00, 6.78it/s]
1272
+
1273
+ 100%|██████████| 168/168 [00:28<00:00, 6.50it/s]
1274
+
1275
+
1276
+
1277
+ {'eval_loss': 0.35242682695388794, 'eval_runtime': 28.2403, 'eval_samples_per_second': 11.862, 'eval_steps_per_second': 5.949, 'epoch': 1.0}
1278
+
1279
+ 100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
1280
+
1281
+ 100%|██████████| 168/168 [00:28<00:00, 6.50it/s]
1282
+
1283
+ [INFO|trainer.py:1947] 2024-01-04 10:03:26,926 >>
1284
+
1285
+ Training completed. Do not forget to share your model on huggingface.co/models =)
1286
+
1287
+
1288
+
1289
+
1290
+ {'train_runtime': 553.4721, 'train_samples_per_second': 5.44, 'train_steps_per_second': 0.679, 'train_loss': 0.4441075046011742, 'epoch': 1.0}
1291
+
1292
+ 100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
1293
+ 100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
1294
+ [INFO|trainer.py:2889] 2024-01-04 10:03:26,930 >> Saving model checkpoint to ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora
1295
+ [INFO|tokenization_utils_base.py:2432] 2024-01-04 10:03:26,973 >> tokenizer config file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/tokenizer_config.json
1296
+ [INFO|tokenization_utils_base.py:2441] 2024-01-04 10:03:26,974 >> Special tokens file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/special_tokens_map.json
1297
+ [INFO|tokenization_utils_base.py:2492] 2024-01-04 10:03:26,974 >> added tokens file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/added_tokens.json
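Editor's note: with the adapter and tokenizer saved, the checkpoint can be reloaded for inference by stacking the LoRA weights on the base model. A minimal sketch with peft, using the paths from this log (trust_remote_code and the optional merge are assumptions about how you want to serve it):

    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    adapter = "./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora"
    base = AutoModelForCausalLM.from_pretrained("./models/dolphin-2_6-phi-2", trust_remote_code=True)
    model = PeftModel.from_pretrained(base, adapter)
    model = model.merge_and_unload()  # optional: fold the LoRA deltas into the base weights
    tok = AutoTokenizer.from_pretrained(adapter)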
1298
+ ***** train metrics *****
1299
+ epoch = 1.0
1300
+ train_loss = 0.4441
1301
+ train_runtime = 0:09:13.47
1302
+ train_samples_per_second = 5.44
1303
+ train_steps_per_second = 0.679
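Editor's note: the throughput rows follow directly from the counts above, 3,011 samples and 376 steps over the 553.47 s run:

    runtime, samples, steps = 553.4721, 3011, 376
    print(round(samples / runtime, 2))  # 5.44  train_samples_per_second
    print(round(steps / runtime, 3))    # 0.679 train_steps_per_second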
1304
+ Figure saved: ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/training_loss.png
1305
+ Figure saved: ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/training_eval_loss.png
1306
+ [INFO|trainer.py:3166] 2024-01-04 10:03:27,895 >> ***** Running Evaluation *****
1307
+ [INFO|trainer.py:3168] 2024-01-04 10:03:27,895 >> Num examples = 335
1308
+ [INFO|trainer.py:3171] 2024-01-04 10:03:27,895 >> Batch size = 1
1309
+
1310
+ 0%| | 0/168 [00:00<?, ?it/s]
1311
+ 1%| | 2/168 [00:00<00:16, 10.31it/s]
1312
+ 2%|▏ | 4/168 [00:00<00:21, 7.58it/s]
1313
+ 3%|▎ | 5/168 [00:00<00:27, 5.85it/s]
1314
+ 4%|▎ | 6/168 [00:00<00:24, 6.54it/s]
1315
+ 4%|▍ | 7/168 [00:01<00:28, 5.68it/s]
1316
+ 5%|▍ | 8/168 [00:01<00:27, 5.80it/s]
1317
+ 5%|▌ | 9/168 [00:01<00:31, 5.01it/s]
1318
+ 6%|▌ | 10/168 [00:01<00:33, 4.73it/s]
1319
+ 7%|▋ | 11/168 [00:01<00:29, 5.32it/s]
1320
+ 7%|▋ | 12/168 [00:02<00:32, 4.81it/s]
1321
+ 8%|▊ | 13/168 [00:02<00:29, 5.34it/s]
1322
+ 8%|▊ | 14/168 [00:02<00:26, 5.85it/s]
1323
+ 9%|▉ | 15/168 [00:02<00:29, 5.23it/s]
1324
+ 10%|▉ | 16/168 [00:02<00:27, 5.50it/s]
1325
+ 10%|█ | 17/168 [00:02<00:25, 5.90it/s]
1326
+ 11%|█ | 18/168 [00:03<00:24, 6.21it/s]
1327
+ 11%|█▏ | 19/168 [00:03<00:23, 6.45it/s]
1328
+ 12%|█▏ | 20/168 [00:03<00:24, 6.14it/s]
1329
+ 12%|█▎ | 21/168 [00:03<00:28, 5.19it/s]
1330
+ 13%|█▎ | 22/168 [00:03<00:26, 5.46it/s]
1331
+ 14%|█▍ | 24/168 [00:04<00:21, 6.74it/s]
1332
+ 15%|█▍ | 25/168 [00:04<00:19, 7.15it/s]
1333
+ 15%|█▌ | 26/168 [00:04<00:20, 6.95it/s]
1334
+ 16%|█▌ | 27/168 [00:04<00:24, 5.74it/s]
1335
+ 17%|█▋ | 28/168 [00:04<00:23, 6.08it/s]
1336
+ 17%|█▋ | 29/168 [00:04<00:21, 6.38it/s]
1337
+ 18%|█▊ | 30/168 [00:05<00:21, 6.49it/s]
1338
+ 18%|█▊ | 31/168 [00:05<00:20, 6.69it/s]
1339
+ 19%|█▉ | 32/168 [00:05<00:20, 6.76it/s]
1340
+ 20%|██ | 34/168 [00:05<00:18, 7.39it/s]
1341
+ 21%|██ | 35/168 [00:05<00:18, 7.31it/s]
1342
+ 21%|██▏ | 36/168 [00:05<00:17, 7.68it/s]
1343
+ 22%|██▏ | 37/168 [00:05<00:17, 7.57it/s]
1344
+ 23%|██▎ | 38/168 [00:06<00:17, 7.43it/s]
1345
+ 23%|██▎ | 39/168 [00:06<00:20, 6.28it/s]
1346
+ 24%|██▍ | 40/168 [00:06<00:20, 6.31it/s]
1347
+ 24%|██▍ | 41/168 [00:06<00:20, 6.30it/s]
1348
+ 25%|██▌ | 42/168 [00:06<00:18, 6.89it/s]
1349
+ 26%|██▌ | 43/168 [00:06<00:20, 5.96it/s]
1350
+ 26%|██▌ | 44/168 [00:07<00:19, 6.46it/s]
1351
+ 27%|██▋ | 45/168 [00:07<00:21, 5.72it/s]
1352
+ 27%|██▋ | 46/168 [00:07<00:23, 5.28it/s]
1353
+ 28%|██▊ | 47/168 [00:07<00:25, 4.73it/s]
1354
+ 29%|██▊ | 48/168 [00:07<00:22, 5.28it/s]
1355
+ 29%|██▉ | 49/168 [00:08<00:21, 5.64it/s]
1356
+ 30%|██▉ | 50/168 [00:08<00:19, 6.08it/s]
1357
+ 30%|███ | 51/168 [00:08<00:21, 5.42it/s]
1358
+ 31%|███ | 52/168 [00:08<00:20, 5.68it/s]
1359
+ 32%|███▏ | 53/168 [00:08<00:22, 5.00it/s]
1360
+ 32%|███▏ | 54/168 [00:09<00:22, 5.10it/s]
1361
+ 33%|███▎ | 55/168 [00:09<00:21, 5.22it/s]
1362
+ 33%|███▎ | 56/168 [00:09<00:19, 5.75it/s]
1363
+ 34%|███▍ | 57/168 [00:09<00:17, 6.50it/s]
1364
+ 35%|███▍ | 58/168 [00:09<00:18, 5.82it/s]
1365
+ 35%|███▌ | 59/168 [00:09<00:21, 5.01it/s]
1366
+ 36%|███▌ | 60/168 [00:10<00:19, 5.43it/s]
1367
+ 36%|███▋ | 61/168 [00:10<00:17, 6.23it/s]
1368
+ 37%|███▋ | 62/168 [00:10<00:15, 6.80it/s]
1369
+ 38%|███▊ | 63/168 [00:10<00:15, 6.98it/s]
1370
+ 38%|███▊ | 64/168 [00:10<00:14, 7.13it/s]
1371
+ 39%|███▊ | 65/168 [00:10<00:14, 6.99it/s]
1372
+ 39%|███▉ | 66/168 [00:10<00:13, 7.40it/s]
1373
+ 40%|███▉ | 67/168 [00:11<00:17, 5.83it/s]
1374
+ 40%|████ | 68/168 [00:11<00:19, 5.08it/s]
1375
+ 41%|████ | 69/168 [00:11<00:17, 5.55it/s]
1376
+ 42%|████▏ | 70/168 [00:11<00:15, 6.15it/s]
1377
+ 42%|████▏ | 71/168 [00:11<00:14, 6.50it/s]
1378
+ 43%|████▎ | 72/168 [00:11<00:16, 5.78it/s]
1379
+ 43%|████▎ | 73/168 [00:12<00:19, 4.99it/s]
1380
+ 44%|████▍ | 74/168 [00:12<00:19, 4.94it/s]
1381
+ 45%|████▍ | 75/168 [00:12<00:18, 5.11it/s]
1382
+ 46%|████▌ | 77/168 [00:12<00:13, 6.51it/s]
1383
+ 46%|████▋ | 78/168 [00:12<00:12, 7.02it/s]
1384
+ 47%|████▋ | 79/168 [00:13<00:12, 7.07it/s]
1385
+ 48%|████▊ | 80/168 [00:13<00:13, 6.42it/s]
1386
+ 48%|████▊ | 81/168 [00:13<00:13, 6.36it/s]
1387
+ 49%|████▉ | 82/168 [00:13<00:13, 6.30it/s]
1388
+ 49%|████▉ | 83/168 [00:13<00:13, 6.38it/s]
1389
+ 50%|█████ | 84/168 [00:13<00:14, 5.63it/s]
1390
+ 51%|█████ | 85/168 [00:14<00:16, 4.99it/s]
1391
+ 51%|█████ | 86/168 [00:14<00:14, 5.53it/s]
1392
+ 52%|█████▏ | 87/168 [00:14<00:13, 6.01it/s]
1393
+ 52%|█████▏ | 88/168 [00:14<00:12, 6.64it/s]
1394
+ 53%|█████▎ | 89/168 [00:14<00:12, 6.46it/s]
1395
+ 54%|█████▎ | 90/168 [00:14<00:11, 6.51it/s]
1396
+ 54%|█████▍ | 91/168 [00:15<00:11, 6.53it/s]
1397
+ 55%|█████▍ | 92/168 [00:15<00:10, 7.09it/s]
1398
+ 55%|█████▌ | 93/168 [00:15<00:10, 7.20it/s]
1399
+ 56%|█████▌ | 94/168 [00:15<00:10, 7.29it/s]
1400
+ 57%|█████▋ | 95/168 [00:15<00:09, 7.34it/s]
1401
+ 57%|█████▋ | 96/168 [00:15<00:12, 5.72it/s]
1402
+ 58%|█████▊ | 97/168 [00:16<00:13, 5.09it/s]
1403
+ 58%|█████▊ | 98/168 [00:16<00:14, 4.85it/s]
1404
+ 59%|█████▉ | 99/168 [00:16<00:12, 5.42it/s]
1405
+ 60%|█████▉ | 100/168 [00:16<00:11, 5.69it/s]
1406
+ 60%|██████ | 101/168 [00:16<00:10, 6.38it/s]
1407
+ 61%|██████ | 102/168 [00:16<00:10, 6.57it/s]
1408
+ 61%|██████▏ | 103/168 [00:16<00:09, 7.15it/s]
1409
+ 62%|██████▏ | 104/168 [00:17<00:11, 5.73it/s]
1410
+ 62%|██████▎ | 105/168 [00:17<00:10, 6.08it/s]
1411
+ 63%|██████▎ | 106/168 [00:17<00:09, 6.21it/s]
1412
+ 64%|██████▎ | 107/168 [00:17<00:09, 6.46it/s]
1413
+ 64%|██████▍ | 108/168 [00:17<00:09, 6.60it/s]
1414
+ 65%|██████▍ | 109/168 [00:17<00:09, 6.36it/s]
1415
+ 65%|██████▌ | 110/168 [00:18<00:08, 6.97it/s]
1416
+ 66%|██████▌ | 111/168 [00:18<00:08, 6.43it/s]
1417
+ 67%|██████▋ | 112/168 [00:18<00:08, 6.29it/s]
1418
+ 67%|██████▋ | 113/168 [00:18<00:09, 6.02it/s]
1419
+ 68%|██████▊ | 114/168 [00:18<00:08, 6.34it/s]
1420
+ 68%|██████▊ | 115/168 [00:18<00:07, 6.66it/s]
1421
+ 69%|██████▉ | 116/168 [00:19<00:08, 5.85it/s]
1422
+ 70%|██████▉ | 117/168 [00:19<00:08, 6.21it/s]
1423
+ 70%|███████ | 118/168 [00:19<00:08, 6.16it/s]
1424
+ 71%|███████ | 119/168 [00:19<00:08, 5.47it/s]
1425
+ 71%|███████▏ | 120/168 [00:19<00:09, 4.83it/s]
1426
+ 72%|███████▏ | 121/168 [00:20<00:09, 4.99it/s]
1427
+ 73%|███████▎ | 122/168 [00:20<00:09, 4.75it/s]
1428
+ 73%|███████▎ | 123/168 [00:20<00:08, 5.47it/s]
1429
+ 74%|███████▍ | 124/168 [00:20<00:08, 5.03it/s]
1430
+ 74%|███████▍ | 125/168 [00:20<00:07, 5.49it/s]
1431
+ 75%|███████▌ | 126/168 [00:20<00:06, 6.18it/s]
1432
+ 76%|███████▌ | 127/168 [00:21<00:06, 6.57it/s]
1433
+ 76%|███████▌ | 128/168 [00:21<00:07, 5.39it/s]
1434
+ 77%|███████▋ | 129/168 [00:21<00:07, 5.44it/s]
1435
+ 77%|███████▋ | 130/168 [00:21<00:07, 4.82it/s]
1436
+ 78%|███████▊ | 131/168 [00:21<00:06, 5.59it/s]
1437
+ 79%|███████▊ | 132/168 [00:22<00:06, 5.74it/s]
1438
+ 79%|███████▉ | 133/168 [00:22<00:06, 5.67it/s]
1439
+ 80%|███████▉ | 134/168 [00:22<00:05, 6.34it/s]
1440
+ 80%|████████ | 135/168 [00:22<00:05, 6.52it/s]
1441
+ 81%|████████ | 136/168 [00:22<00:04, 7.07it/s]
1442
+ 82%|████████▏ | 137/168 [00:22<00:05, 5.69it/s]
1443
+ 82%|████████▏ | 138/168 [00:23<00:05, 5.85it/s]
1444
+ 83%|████████▎ | 139/168 [00:23<00:04, 6.27it/s]
1445
+ 83%|████████▎ | 140/168 [00:23<00:04, 6.96it/s]
1446
+ 84%|████████▍ | 141/168 [00:23<00:03, 7.33it/s]
1447
+ 85%|████████▍ | 142/168 [00:23<00:03, 7.28it/s]
1448
+ 85%|████████▌ | 143/168 [00:23<00:04, 5.78it/s]
1449
+ 86%|████████▌ | 144/168 [00:24<00:04, 5.05it/s]
1450
+ 86%|████████▋ | 145/168 [00:24<00:04, 5.33it/s]
1451
+ 87%|████████▋ | 146/168 [00:24<00:03, 5.79it/s]
1452
+ 88%|████████▊ | 147/168 [00:24<00:03, 6.16it/s]
1453
+ 88%|████████▊ | 148/168 [00:24<00:02, 6.82it/s]
1454
+ 89%|████████▊ | 149/168 [00:24<00:03, 5.53it/s]
1455
+ 90%|████████▉ | 151/168 [00:25<00:02, 5.67it/s]
1456
+ 91%|█████████ | 153/168 [00:25<00:02, 5.77it/s]
1457
+ 92%|█████████▏| 154/168 [00:25<00:02, 6.06it/s]
1458
+ 92%|█████████▏| 155/168 [00:25<00:02, 5.79it/s]
1459
+ 93%|█████████▎| 156/168 [00:26<00:02, 5.38it/s]
1460
+ 93%|█████████▎| 157/168 [00:26<00:02, 5.35it/s]
1461
+ 94%|█████████▍| 158/168 [00:26<00:01, 6.06it/s]
1462
+ 95%|█████████▍| 159/168 [00:26<00:01, 6.10it/s]
1463
+ 95%|█████████▌| 160/168 [00:26<00:01, 5.81it/s]
1464
+ 96%|█████████▌| 161/168 [00:26<00:01, 5.73it/s]
1465
+ 96%|█████████▋| 162/168 [00:27<00:01, 5.98it/s]
1466
+ 97%|█████████▋| 163/168 [00:27<00:00, 5.16it/s]
1467
+ 98%|█████████▊| 164/168 [00:27<00:00, 5.61it/s]
1468
+ 98%|█████████▊| 165/168 [00:27<00:00, 6.31it/s]
1469
+ 99%|█████████▉| 166/168 [00:27<00:00, 6.79it/s]
1470
+ 100%|██████████| 168/168 [00:28<00:00, 6.51it/s]
1471
+ 100%|██████████| 168/168 [00:28<00:00, 5.99it/s]
1472
+ ***** eval metrics *****
1473
+ epoch = 1.0
1474
+ eval_loss = 0.3524
1475
+ eval_runtime = 0:00:28.24
1476
+ eval_samples_per_second = 11.859
1477
+ eval_steps_per_second = 5.947
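Editor's note: the evaluation numbers check out the same way: with batch size 1 split over 2 devices (the same inferred world size as above), 335 examples become ceil(335/2) = 168 steps per device, matching the progress bar, and dividing by the ~28.24 s runtime reproduces the throughput:

    import math
    examples, devices, runtime = 335, 2, 28.24
    steps = math.ceil(examples / devices)  # 168, as in the progress bar
    print(round(examples / runtime, 2))    # ~11.86 eval_samples_per_second
    print(round(steps / runtime, 2))       # ~5.95  eval_steps_per_second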
1478
+ [INFO|modelcard.py:452] 2024-01-04 10:03:56,150 >> Dropping the following result as it does not have all the necessary fields:
1479
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.4441075046011742,
4
+ "train_runtime": 553.4721,
5
+ "train_samples_per_second": 5.44,
6
+ "train_steps_per_second": 0.679
7
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 376, "loss": 1.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.991278696516879e-05, "epoch": 0.03, "percentage": 2.66, "elapsed_time": "0:00:14", "remaining_time": "0:08:49"}
2
+ {"current_steps": 20, "total_steps": 376, "loss": 0.881, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9651756349750716e-05, "epoch": 0.05, "percentage": 5.32, "elapsed_time": "0:00:28", "remaining_time": "0:08:24"}
3
+ {"current_steps": 30, "total_steps": 376, "loss": 0.7979, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9218729375518135e-05, "epoch": 0.08, "percentage": 7.98, "elapsed_time": "0:00:43", "remaining_time": "0:08:16"}
4
+ {"current_steps": 40, "total_steps": 376, "loss": 0.7022, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.861672729019797e-05, "epoch": 0.11, "percentage": 10.64, "elapsed_time": "0:00:57", "remaining_time": "0:08:05"}
5
+ {"current_steps": 50, "total_steps": 376, "loss": 0.5844, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.784995028809707e-05, "epoch": 0.13, "percentage": 13.3, "elapsed_time": "0:01:11", "remaining_time": "0:07:45"}
6
+ {"current_steps": 60, "total_steps": 376, "loss": 0.4454, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.692374820516679e-05, "epoch": 0.16, "percentage": 15.96, "elapsed_time": "0:01:24", "remaining_time": "0:07:24"}
7
+ {"current_steps": 70, "total_steps": 376, "loss": 0.4076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.584458319296868e-05, "epoch": 0.19, "percentage": 18.62, "elapsed_time": "0:01:38", "remaining_time": "0:07:12"}
8
+ {"current_steps": 80, "total_steps": 376, "loss": 0.4111, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4619984631966524e-05, "epoch": 0.21, "percentage": 21.28, "elapsed_time": "0:01:52", "remaining_time": "0:06:56"}
9
+ {"current_steps": 90, "total_steps": 376, "loss": 0.4115, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3258496598716736e-05, "epoch": 0.24, "percentage": 23.94, "elapsed_time": "0:02:07", "remaining_time": "0:06:43"}
10
+ {"current_steps": 100, "total_steps": 376, "loss": 0.3566, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.176961825348059e-05, "epoch": 0.27, "percentage": 26.6, "elapsed_time": "0:02:19", "remaining_time": "0:06:25"}
11
+ {"current_steps": 110, "total_steps": 376, "loss": 0.4302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.016373756417669e-05, "epoch": 0.29, "percentage": 29.26, "elapsed_time": "0:02:33", "remaining_time": "0:06:11"}
12
+ {"current_steps": 120, "total_steps": 376, "loss": 0.4271, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.845205882908432e-05, "epoch": 0.32, "percentage": 31.91, "elapsed_time": "0:02:47", "remaining_time": "0:05:58"}
13
+ {"current_steps": 130, "total_steps": 376, "loss": 0.4625, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6646524503974955e-05, "epoch": 0.35, "percentage": 34.57, "elapsed_time": "0:03:02", "remaining_time": "0:05:45"}
14
+ {"current_steps": 140, "total_steps": 376, "loss": 0.5066, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.475973187908737e-05, "epoch": 0.37, "percentage": 37.23, "elapsed_time": "0:03:17", "remaining_time": "0:05:32"}
15
+ {"current_steps": 150, "total_steps": 376, "loss": 0.3887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.280484518729466e-05, "epoch": 0.4, "percentage": 39.89, "elapsed_time": "0:03:32", "remaining_time": "0:05:19"}
16
+ {"current_steps": 160, "total_steps": 376, "loss": 0.3675, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.079550375668821e-05, "epoch": 0.42, "percentage": 42.55, "elapsed_time": "0:03:44", "remaining_time": "0:05:03"}
17
+ {"current_steps": 170, "total_steps": 376, "loss": 0.4095, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8745726848402036e-05, "epoch": 0.45, "percentage": 45.21, "elapsed_time": "0:03:59", "remaining_time": "0:04:49"}
18
+ {"current_steps": 180, "total_steps": 376, "loss": 0.3782, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6669815843628042e-05, "epoch": 0.48, "percentage": 47.87, "elapsed_time": "0:04:13", "remaining_time": "0:04:35"}
19
+ {"current_steps": 190, "total_steps": 376, "loss": 0.4195, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4582254462267476e-05, "epoch": 0.5, "percentage": 50.53, "elapsed_time": "0:04:28", "remaining_time": "0:04:22"}
20
+ {"current_steps": 200, "total_steps": 376, "loss": 0.3392, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2497607709397543e-05, "epoch": 0.53, "percentage": 53.19, "elapsed_time": "0:04:43", "remaining_time": "0:04:09"}
21
+ {"current_steps": 210, "total_steps": 376, "loss": 0.3347, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0430420254607748e-05, "epoch": 0.56, "percentage": 55.85, "elapsed_time": "0:04:56", "remaining_time": "0:03:54"}
22
+ {"current_steps": 220, "total_steps": 376, "loss": 0.4117, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8395114953217852e-05, "epoch": 0.58, "percentage": 58.51, "elapsed_time": "0:05:09", "remaining_time": "0:03:39"}
23
+ {"current_steps": 230, "total_steps": 376, "loss": 0.3772, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.640589221739926e-05, "epoch": 0.61, "percentage": 61.17, "elapsed_time": "0:05:23", "remaining_time": "0:03:25"}
+ {"current_steps": 240, "total_steps": 376, "loss": 0.4403, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.447663093929163e-05, "epoch": 0.64, "percentage": 63.83, "elapsed_time": "0:05:36", "remaining_time": "0:03:10"}
+ {"current_steps": 250, "total_steps": 376, "loss": 0.3867, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2620791657378664e-05, "epoch": 0.66, "percentage": 66.49, "elapsed_time": "0:05:50", "remaining_time": "0:02:56"}
+ {"current_steps": 260, "total_steps": 376, "loss": 0.3688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0851322641735118e-05, "epoch": 0.69, "percentage": 69.15, "elapsed_time": "0:06:04", "remaining_time": "0:02:42"}
+ {"current_steps": 270, "total_steps": 376, "loss": 0.3655, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.180569553392535e-06, "epoch": 0.72, "percentage": 71.81, "elapsed_time": "0:06:19", "remaining_time": "0:02:28"}
+ {"current_steps": 280, "total_steps": 376, "loss": 0.4144, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.620189308133943e-06, "epoch": 0.74, "percentage": 74.47, "elapsed_time": "0:06:33", "remaining_time": "0:02:15"}
+ {"current_steps": 290, "total_steps": 376, "loss": 0.3298, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.181068745693716e-06, "epoch": 0.77, "percentage": 77.13, "elapsed_time": "0:06:47", "remaining_time": "0:02:00"}
+ {"current_steps": 300, "total_steps": 376, "loss": 0.3337, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.873248671810928e-06, "epoch": 0.8, "percentage": 79.79, "elapsed_time": "0:07:00", "remaining_time": "0:01:46"}
+ {"current_steps": 310, "total_steps": 376, "loss": 0.3217, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7058538030980942e-06, "epoch": 0.82, "percentage": 82.45, "elapsed_time": "0:07:13", "remaining_time": "0:01:32"}
+ {"current_steps": 320, "total_steps": 376, "loss": 0.3222, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.687029103502972e-06, "epoch": 0.85, "percentage": 85.11, "elapsed_time": "0:07:26", "remaining_time": "0:01:18"}
+ {"current_steps": 330, "total_steps": 376, "loss": 0.3989, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.823882956546566e-06, "epoch": 0.88, "percentage": 87.77, "elapsed_time": "0:07:41", "remaining_time": "0:01:04"}
+ {"current_steps": 340, "total_steps": 376, "loss": 0.3805, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1224375698271894e-06, "epoch": 0.9, "percentage": 90.43, "elapsed_time": "0:07:55", "remaining_time": "0:00:50"}
+ {"current_steps": 350, "total_steps": 376, "loss": 0.4108, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.875869578203824e-07, "epoch": 0.93, "percentage": 93.09, "elapsed_time": "0:08:08", "remaining_time": "0:00:36"}
+ {"current_steps": 360, "total_steps": 376, "loss": 0.3578, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.230627961304993e-07, "epoch": 0.96, "percentage": 95.74, "elapsed_time": "0:08:21", "remaining_time": "0:00:22"}
+ {"current_steps": 370, "total_steps": 376, "loss": 0.3453, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1408385430356516e-08, "epoch": 0.98, "percentage": 98.4, "elapsed_time": "0:08:36", "remaining_time": "0:00:08"}
+ {"current_steps": 376, "total_steps": 376, "loss": null, "eval_loss": 0.35242682695388794, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:08:44", "remaining_time": "0:00:00"}
+ {"current_steps": 376, "total_steps": 376, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:08:44", "remaining_time": "0:00:00"}
+ {"current_steps": 168, "total_steps": 168, "loss": null, "eval_loss": 0.35242682695388794, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:09:42", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,260 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.99867197875166,
+   "eval_steps": 500,
+   "global_step": 376,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.03, "learning_rate": 4.991278696516879e-05, "loss": 1.0017, "step": 10},
+     {"epoch": 0.05, "learning_rate": 4.9651756349750716e-05, "loss": 0.881, "step": 20},
+     {"epoch": 0.08, "learning_rate": 4.9218729375518135e-05, "loss": 0.7979, "step": 30},
+     {"epoch": 0.11, "learning_rate": 4.861672729019797e-05, "loss": 0.7022, "step": 40},
+     {"epoch": 0.13, "learning_rate": 4.784995028809707e-05, "loss": 0.5844, "step": 50},
+     {"epoch": 0.16, "learning_rate": 4.692374820516679e-05, "loss": 0.4454, "step": 60},
+     {"epoch": 0.19, "learning_rate": 4.584458319296868e-05, "loss": 0.4076, "step": 70},
+     {"epoch": 0.21, "learning_rate": 4.4619984631966524e-05, "loss": 0.4111, "step": 80},
+     {"epoch": 0.24, "learning_rate": 4.3258496598716736e-05, "loss": 0.4115, "step": 90},
+     {"epoch": 0.27, "learning_rate": 4.176961825348059e-05, "loss": 0.3566, "step": 100},
+     {"epoch": 0.29, "learning_rate": 4.016373756417669e-05, "loss": 0.4302, "step": 110},
+     {"epoch": 0.32, "learning_rate": 3.845205882908432e-05, "loss": 0.4271, "step": 120},
+     {"epoch": 0.35, "learning_rate": 3.6646524503974955e-05, "loss": 0.4625, "step": 130},
+     {"epoch": 0.37, "learning_rate": 3.475973187908737e-05, "loss": 0.5066, "step": 140},
+     {"epoch": 0.4, "learning_rate": 3.280484518729466e-05, "loss": 0.3887, "step": 150},
+     {"epoch": 0.42, "learning_rate": 3.079550375668821e-05, "loss": 0.3675, "step": 160},
+     {"epoch": 0.45, "learning_rate": 2.8745726848402036e-05, "loss": 0.4095, "step": 170},
+     {"epoch": 0.48, "learning_rate": 2.6669815843628042e-05, "loss": 0.3782, "step": 180},
+     {"epoch": 0.5, "learning_rate": 2.4582254462267476e-05, "loss": 0.4195, "step": 190},
+     {"epoch": 0.53, "learning_rate": 2.2497607709397543e-05, "loss": 0.3392, "step": 200},
+     {"epoch": 0.56, "learning_rate": 2.0430420254607748e-05, "loss": 0.3347, "step": 210},
+     {"epoch": 0.58, "learning_rate": 1.8395114953217852e-05, "loss": 0.4117, "step": 220},
+     {"epoch": 0.61, "learning_rate": 1.640589221739926e-05, "loss": 0.3772, "step": 230},
+     {"epoch": 0.64, "learning_rate": 1.447663093929163e-05, "loss": 0.4403, "step": 240},
+     {"epoch": 0.66, "learning_rate": 1.2620791657378664e-05, "loss": 0.3867, "step": 250},
+     {"epoch": 0.69, "learning_rate": 1.0851322641735118e-05, "loss": 0.3688, "step": 260},
+     {"epoch": 0.72, "learning_rate": 9.180569553392535e-06, "loss": 0.3655, "step": 270},
+     {"epoch": 0.74, "learning_rate": 7.620189308133943e-06, "loss": 0.4144, "step": 280},
+     {"epoch": 0.77, "learning_rate": 6.181068745693716e-06, "loss": 0.3298, "step": 290},
+     {"epoch": 0.8, "learning_rate": 4.873248671810928e-06, "loss": 0.3337, "step": 300},
+     {"epoch": 0.82, "learning_rate": 3.7058538030980942e-06, "loss": 0.3217, "step": 310},
+     {"epoch": 0.85, "learning_rate": 2.687029103502972e-06, "loss": 0.3222, "step": 320},
+     {"epoch": 0.88, "learning_rate": 1.823882956546566e-06, "loss": 0.3989, "step": 330},
+     {"epoch": 0.9, "learning_rate": 1.1224375698271894e-06, "loss": 0.3805, "step": 340},
+     {"epoch": 0.93, "learning_rate": 5.875869578203824e-07, "loss": 0.4108, "step": 350},
+     {"epoch": 0.96, "learning_rate": 2.230627961304993e-07, "loss": 0.3578, "step": 360},
+     {"epoch": 0.98, "learning_rate": 3.1408385430356516e-08, "loss": 0.3453, "step": 370},
+     {"epoch": 1.0, "eval_loss": 0.35242682695388794, "eval_runtime": 28.2403, "eval_samples_per_second": 11.862, "eval_steps_per_second": 5.949, "step": 376},
+     {"epoch": 1.0, "step": 376, "total_flos": 2.22435081191424e+16, "train_loss": 0.4441075046011742, "train_runtime": 553.4721, "train_samples_per_second": 5.44, "train_steps_per_second": 0.679}
+   ],
+   "logging_steps": 10,
+   "max_steps": 376,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 1000,
+   "total_flos": 2.22435081191424e+16,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
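The learning_rate values in log_history decay smoothly to ~3e-08 by step 370 and are consistent with a half-cosine schedule from a base rate of 5e-5 with no warmup; both the base rate and the absence of warmup are inferred from the logged numbers, not stated in this file. A small consistency check under that assumption:

import json
import math

BASE_LR = 5e-5  # assumed base learning rate, inferred from the logged values

with open("trainer_state.json") as f:
    state = json.load(f)

total = state["max_steps"]  # 376
for rec in state["log_history"]:
    if "learning_rate" not in rec:
        continue  # skip the eval and train-summary entries
    expected = 0.5 * BASE_LR * (1 + math.cos(math.pi * rec["step"] / total))
    # tolerate small floating-point differences between logger and re-computation
    assert math.isclose(expected, rec["learning_rate"], rel_tol=1e-4), rec["step"]

For example, step 190 gives 0.5 * 5e-5 * (1 + cos(pi * 190/376)) ≈ 2.458e-05, matching the logged 2.4582254462267476e-05.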
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43631845fad65fe78dc251bc687da889571e404213e9b9c084870732e93c38ea
+ size 4984
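training_args.bin is stored via Git LFS, so only the pointer appears in the diff. transformers writes this file with torch.save, so the pickled TrainingArguments can be loaded back for inspection; a sketch (recent PyTorch versions need weights_only=False to unpickle arbitrary objects, and the printed fields assume a standard TrainingArguments):

import torch

# Load the pickled TrainingArguments saved alongside the checkpoint.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)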
training_eval_loss.png ADDED
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff