yhyu13 committed on
Commit
edc5215
1 Parent(s): 405aa43
Predict_20/all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "predict_bleu-4": 74.196885,
+     "predict_rouge-1": 80.372465,
+     "predict_rouge-2": 70.340295,
+     "predict_rouge-l": 77.72346,
+     "predict_runtime": 29.4113,
+     "predict_samples_per_second": 0.68,
+     "predict_steps_per_second": 0.68
+ }
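
Note: these metrics are plain JSON, so they can be read back for run-to-run comparison. A minimal sketch in Python (only the file path from this commit is assumed):

    import json

    # Load the metrics written by the predict run above.
    with open("Predict_20/all_results.json") as f:
        metrics = json.load(f)

    # BLEU-4 and ROUGE are reported on a 0-100 scale here.
    print(f"BLEU-4 : {metrics['predict_bleu-4']:.2f}")
    print(f"ROUGE-L: {metrics['predict_rouge-l']:.2f}")
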
Predict_20/generated_predictions.jsonl ADDED
@@ -0,0 +1,20 @@
+ {"label": "I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!", "predict": "I'm sorry, but I'm unable to perform tasks outside of my defined functions. As of now, I can only help with tasks like getting the exchange rate between two currencies."}
+ {"label": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"United States\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"United States\"}'} </functioncall>"}
+ {"label": "Here are the latest news headlines for the United States:\n1. Biden announces new vaccine mandates\n2. Hurricane Ida devastates Louisiana\n3. Apple unveils new iPhone\n4. NASA's Perseverance rover collects first Mars rock sample", "predict": "The latest news headlines for the United States are: Biden announces new vaccine mandates, Hurricane Ida devastates Louisiana, Apple unveils new iPhone, and NASA's Perseverance rover collects first Mars rock sample."}
+ {"label": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"France\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"France\"}'} </functioncall>"}
+ {"label": "Here are the latest news headlines for France:\n1. France recalls ambassadors to US and Australia\n2. French election: Macron's party braces for tough fight\n3. Louvre Museum to undergo major overhaul\n4. France to offer free birth control to all women under 25", "predict": "The latest news headlines for France are:\n1. France recalls ambassadors to US and Australia\n2. French election: Macron's party braces for tough fight\n3. Louvre Museum to undergo major overhaul\n4. France to offer free birth control to all women under 25"}
+ {"label": "Of course. How long would you like your password to be? And would you like it to include symbols?", "predict": "Of course, I can help with that. How long would you like your password to be?"}
+ {"label": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_symbols\": true}'} </functioncall>", "predict": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_symbols\": true}'} </functioncall>"}
+ {"label": "Here is your new password: 4&7j#9@1Q6*. Please make sure to save it in a secure location.", "predict": "Here is your new password. It is 4&7j#9@1Q6* and is 12 characters long. It includes symbols as per your request."}
+ {"label": "Of course! How long would you like your password to be? And do you want it to include numbers and special characters?", "predict": "Of course! I can help with that. How long would you like the password to be? And do you want it to include numbers or special characters?"}
+ {"label": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_numbers\": true, \"include_special_characters\": true}'} </functioncall>", "predict": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_numbers\": true, \"include_special_characters\": true}'} </functioncall>"}
+ {"label": "Here is your new password: 4#7gB6&9L1!0. Please make sure to save it in a secure place.", "predict": "Here is your new password: 4#7gB6&9L1!0"}
+ {"label": "Of course, I can help you with that. Please provide me with the list of numbers.", "predict": "Of course, I can help you with that. Please provide me with the list of numbers."}
+ {"label": "<functioncall> {\"name\": \"calculate_median\", \"arguments\": '{\"numbers\": [5, 2, 9, 1, 7, 4, 6, 3, 8]}'} </functioncall>", "predict": "<functioncall> {\"name\": \"calculate_median\", \"arguments\": '{\"numbers\": [5, 2, 9, 1, 7, 4, 6, 3, 8]}'} </functioncall>"}
+ {"label": "The median of the provided list of numbers is 5.", "predict": "The median of the list of numbers [5, 2, 9, 1, 7, 4, 6, 3, 8] is 5."}
+ {"label": "Of course, I can help you with that. Let me calculate it for you.\n<functioncall> {\"name\": \"calculate_loan_payment\", \"arguments\": '{\"principal\": 50000, \"interest_rate\": 5, \"loan_term\": 10}'} </functioncall>", "predict": "<functioncall> {\"name\": \"calculate_loan_payment\", \"arguments\": '{\"principal\": 50000, \"interest_rate\": 5, \"loan_term\": 10}'} </functioncall>"}
+ {"label": "Based on the information provided, your monthly loan payment would be $530.33.", "predict": "According to the calculations, your monthly loan payment would be $530.33."}
+ {"label": "I'm sorry, but as an AI, I don't have the capability to book flights or perform external tasks. My current function is to calculate loan payments. I recommend using a travel booking website or app for your flight booking needs.", "predict": "I'm sorry, but I'm unable to assist with booking flights. My current capabilities are limited to calculating loan payments."}
+ {"label": "<functioncall> {\"name\": \"convert_temperature\", \"arguments\": '{\"temperature\": 30, \"from_unit\": \"Celsius\", \"to_unit\": \"Fahrenheit\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"convert_temperature\", \"arguments\": '{\"temperature\": 30, \"from_unit\": \"Celsius\", \"to_unit\": \"Fahrenheit\"}'} </functioncall>"}
+ {"label": "The converted temperature from 30 degrees Celsius to Fahrenheit is 86 degrees Fahrenheit.", "predict": "The temperature 30 degrees Celsius is equivalent to 86 degrees Fahrenheit."}
+ {"label": "I'm sorry, but I can't assist with that. My current capabilities allow me to provide information about movies based on their titles. I don't have the ability to place orders or perform external tasks.", "predict": "I'm sorry, but I'm unable to perform external tasks like ordering a pizza. My current capabilities are limited to providing information and answering questions. If you have any other questions or need assistance with something else, feel free to ask."}
Predict_20/predict_log.txt ADDED
@@ -0,0 +1,286 @@
+ 01/04/2024 10:04:05 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
+ [INFO|training_args.py:1838] 2024-01-04 10:04:05,581 >> PyTorch: setting up devices
+ /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
+ warnings.warn(
+ 01/04/2024 10:04:05 - INFO - llmtuner.model.parser - Process rank: 0, device: cuda:0, n_gpu: 1
+ distributed training: True, compute dtype: None
+ 01/04/2024 10:04:05 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
+ _n_gpu=1,
+ adafactor=False,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ auto_find_batch_size=False,
+ bf16=False,
+ bf16_full_eval=False,
+ data_seed=None,
+ dataloader_drop_last=False,
+ dataloader_num_workers=0,
+ dataloader_persistent_workers=False,
+ dataloader_pin_memory=True,
+ ddp_backend=None,
+ ddp_broadcast_buffers=None,
+ ddp_bucket_cap_mb=None,
+ ddp_find_unused_parameters=False,
+ ddp_timeout=1800,
+ debug=[],
+ deepspeed=None,
+ disable_tqdm=False,
+ dispatch_batches=None,
+ do_eval=False,
+ do_predict=True,
+ do_train=False,
+ eval_accumulation_steps=None,
+ eval_delay=0,
+ eval_steps=None,
+ evaluation_strategy=IntervalStrategy.NO,
+ fp16=False,
+ fp16_backend=auto,
+ fp16_full_eval=False,
+ fp16_opt_level=O1,
+ fsdp=[],
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
+ fsdp_min_num_params=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ full_determinism=False,
+ generation_config=None,
+ generation_max_length=None,
+ generation_num_beams=None,
+ gradient_accumulation_steps=1,
+ gradient_checkpointing=False,
+ gradient_checkpointing_kwargs=None,
+ greater_is_better=None,
+ group_by_length=False,
+ half_precision_backend=auto,
+ hub_always_push=False,
+ hub_model_id=None,
+ hub_private_repo=False,
+ hub_strategy=HubStrategy.EVERY_SAVE,
+ hub_token=<HUB_TOKEN>,
+ ignore_data_skip=False,
+ include_inputs_for_metrics=False,
+ include_num_input_tokens_seen=False,
+ include_tokens_per_second=False,
+ jit_mode_eval=False,
+ label_names=None,
+ label_smoothing_factor=0.0,
+ learning_rate=5e-05,
+ length_column_name=length,
+ load_best_model_at_end=False,
+ local_rank=0,
+ log_level=passive,
+ log_level_replica=warning,
+ log_on_each_node=True,
+ logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20/runs/Jan04_10-04-05_yhyu13fuwuqi,
+ logging_first_step=False,
+ logging_nan_inf_filter=True,
+ logging_steps=500,
+ logging_strategy=IntervalStrategy.STEPS,
+ lr_scheduler_kwargs={},
+ lr_scheduler_type=SchedulerType.LINEAR,
+ max_grad_norm=1.0,
+ max_steps=-1,
+ metric_for_best_model=None,
+ mp_parameters=,
+ neftune_noise_alpha=None,
+ no_cuda=False,
+ num_train_epochs=3.0,
+ optim=OptimizerNames.ADAMW_TORCH,
+ optim_args=None,
+ output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20,
+ overwrite_output_dir=False,
+ past_index=-1,
+ per_device_eval_batch_size=1,
+ per_device_train_batch_size=8,
+ predict_with_generate=True,
+ prediction_loss_only=False,
+ push_to_hub=False,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ ray_scope=last,
+ remove_unused_columns=True,
+ report_to=['tensorboard'],
+ resume_from_checkpoint=None,
+ run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20,
+ save_on_each_node=False,
+ save_only_model=False,
+ save_safetensors=True,
+ save_steps=500,
+ save_strategy=IntervalStrategy.STEPS,
+ save_total_limit=None,
+ seed=42,
+ skip_memory_metrics=True,
+ sortish_sampler=False,
+ split_batches=False,
+ tf32=None,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ torchdynamo=None,
+ tpu_metrics_debug=False,
+ tpu_num_cores=None,
+ use_cpu=False,
+ use_ipex=False,
+ use_legacy_prediction_loop=False,
+ use_mps_device=False,
+ warmup_ratio=0.0,
+ warmup_steps=0,
+ weight_decay=0.0,
+ )
+ 01/04/2024 10:04:05 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
+ 01/04/2024 10:04:05 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+ Using custom data configuration default-b024aadef2a1493c
+ Loading Dataset Infos from /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/datasets/packaged_modules/json
+ Overwrite dataset info from restored data version if exists.
+ Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
+ Found cached dataset json (/home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
+ Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file vocab.json
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file merges.txt
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file added_tokens.json
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file special_tokens_map.json
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file tokenizer_config.json
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file tokenizer.json
+ [WARNING|logging.py:314] 2024-01-04 10:04:06,448 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+ [INFO|configuration_utils.py:737] 2024-01-04 10:04:06,448 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
+ [INFO|configuration_utils.py:737] 2024-01-04 10:04:06,449 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
+ [INFO|configuration_utils.py:802] 2024-01-04 10:04:06,450 >> Model config PhiConfig {
+   "_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "PhiForCausalLM"
+   ],
+   "attn_pdrop": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_phi.PhiConfig",
+     "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
+   },
+   "embd_pdrop": 0.0,
+   "flash_attn": false,
+   "flash_rotary": false,
+   "fused_dense": false,
+   "img_processor": null,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "phi-msft",
+   "n_embd": 2560,
+   "n_head": 32,
+   "n_head_kv": null,
+   "n_inner": null,
+   "n_layer": 32,
+   "n_positions": 2048,
+   "resid_pdrop": 0.1,
+   "rotary_dim": 32,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.36.2",
+   "use_cache": false,
+   "vocab_size": 51200
+ }
+ 
+ [INFO|modeling_utils.py:3341] 2024-01-04 10:04:06,482 >> loading weights file cognitivecomputations/dolphin-2_6-phi-2/model.safetensors.index.json
+ [INFO|configuration_utils.py:826] 2024-01-04 10:04:06,483 >> Generate config GenerationConfig {
+   "use_cache": false
+ }
+ 
+ [INFO|configuration_utils.py:826] 2024-01-04 10:04:06,483 >> Generate config GenerationConfig {
+   "use_cache": false
+ }
+ 
+ 
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
+ Loading checkpoint shards: 50%|█████ | 1/2 [00:00<00:00, 1.41it/s]
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 2.34it/s]
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 2.13it/s]
+ [WARNING|modeling_utils.py:4175] 2024-01-04 10:04:07,704 >> Some weights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
+ - This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+ - This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+ [INFO|modeling_utils.py:4193] 2024-01-04 10:04:07,704 >> All the weights of PhiForCausalLM were initialized from the model checkpoint at ./models/dolphin-2_6-phi-2.
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use PhiForCausalLM for predictions without further training.
+ [INFO|configuration_utils.py:779] 2024-01-04 10:04:07,707 >> loading configuration file ./models/dolphin-2_6-phi-2/generation_config.json
+ [INFO|configuration_utils.py:826] 2024-01-04 10:04:07,707 >> Generate config GenerationConfig {}
+ 
+ 01/04/2024 10:04:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+ 01/04/2024 10:04:09 - INFO - llmtuner.model.adapter - Merged 1 adapter(s).
+ 01/04/2024 10:04:09 - INFO - llmtuner.model.adapter - Loaded adapter(s): ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora
+ 01/04/2024 10:04:09 - INFO - llmtuner.model.loader - trainable params: 0 || all params: 2779683840 || trainable%: 0.0000
+ 01/04/2024 10:04:09 - INFO - llmtuner.model.loader - This IS expected that the trainable params is 0 if you are using model for inference only.
+ 
+ Running tokenizer on dataset: 0%| | 0/20 [00:00<?, ? examples/s]Caching processed dataset at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c7eb5697298b6539.arrow
+ 
+ Running tokenizer on dataset: 100%|██████████| 20/20 [00:00<00:00, 360.26 examples/s]
+ [INFO|training_args.py:1838] 2024-01-04 10:04:09,995 >> PyTorch: setting up devices
+ [INFO|trainer.py:3166] 2024-01-04 10:04:10,639 >> ***** Running Prediction *****
+ [INFO|trainer.py:3168] 2024-01-04 10:04:10,639 >> Num examples = 20
+ [INFO|trainer.py:3171] 2024-01-04 10:04:10,639 >> Batch size = 1
+ [INFO|configuration_utils.py:826] 2024-01-04 10:04:10,651 >> Generate config GenerationConfig {
+   "use_cache": false
+ }
+ 
+ /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/generation/utils.py:1518: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )
+ warnings.warn(
+ input_ids:
+ [32, 8537, 1022, 257, 11040, 2836, 290, 281, 11666, 4430, 8796, 13, 383, 8796, 3607, 7613, 11, 6496, 11, 290, 23507, 7429, 284, 262, 2836, 338, 2683, 13, 198, 20490, 25, 36230, 25, 921, 389, 257, 7613, 8796, 351, 1895, 284, 262, 1708, 5499, 13, 5765, 606, 611, 2672, 532, 198, 90, 198, 50284, 1, 3672, 1298, 366, 1136, 62, 1069, 3803, 62, 4873, 1600, 198, 50284, 1, 11213, 1298, 366, 3855, 262, 5163, 2494, 1022, 734, 19247, 1600, 198, 50284, 1, 17143, 7307, 1298, 1391, 198, 50280, 1, 4906, 1298, 366, 15252, 1600, 198, 50280, 1, 48310, 1298, 1391, 198, 50276, 1, 8692, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 422, 1, 198, 50276, 5512, 198, 50276, 1, 16793, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 284, 1, 198, 50276, 92, 198, 50280, 5512, 198, 50280, 1, 35827, 1298, 685, 198, 50276, 1, 8692, 62, 34415, 1600, 198, 50276, 1, 16793, 62, 34415, 1, 198, 50280, 60, 198, 50284, 92, 198, 92, 198, 198, 6090, 345, 1492, 257, 5474, 329, 502, 422, 968, 1971, 284, 3576, 30, 198, 48902, 25]
+ inputs:
+ A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+ Human: SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -
+ {
+     "name": "get_exchange_rate",
+     "description": "Get the exchange rate between two currencies",
+     "parameters": {
+         "type": "object",
+         "properties": {
+             "base_currency": {
+                 "type": "string",
+                 "description": "The currency to convert from"
+             },
+             "target_currency": {
+                 "type": "string",
+                 "description": "The currency to convert to"
+             }
+         },
+         "required": [
+             "base_currency",
+             "target_currency"
+         ]
+     }
+ }
+ 
+ Can you book a flight for me from New York to London?
+ Assistant:
+ 
+ 0%| | 0/20 [00:00<?, ?it/s]
+ 10%|█ | 2/20 [00:01<00:11, 1.56it/s]
+ 15%|█▌ | 3/20 [00:03<00:19, 1.15s/it]
+ 20%|██ | 4/20 [00:04<00:20, 1.26s/it]
+ 25%|██▌ | 5/20 [00:07<00:25, 1.67s/it]
+ 30%|███ | 6/20 [00:07<00:19, 1.38s/it]
+ 35%|███▌ | 7/20 [00:09<00:18, 1.45s/it]
+ 40%|████ | 8/20 [00:10<00:17, 1.47s/it]
+ 45%|████▌ | 9/20 [00:12<00:15, 1.39s/it]
+ 50%|█████ | 10/20 [00:13<00:15, 1.51s/it]
+ 55%|█████▌ | 11/20 [00:14<00:11, 1.29s/it]
+ 60%|██████ | 12/20 [00:15<00:09, 1.13s/it]
+ 65%|██████▌ | 13/20 [00:17<00:09, 1.35s/it]
+ 70%|███████ | 14/20 [00:18<00:07, 1.29s/it]
+ 75%|███████▌ | 15/20 [00:20<00:07, 1.49s/it]
+ 80%|████████ | 16/20 [00:21<00:04, 1.25s/it]
+ 85%|████████▌ | 17/20 [00:22<00:03, 1.17s/it]
+ 90%|█████████ | 18/20 [00:24<00:02, 1.40s/it]
+ 95%|█████████▌| 19/20 [00:24<00:01, 1.14s/it]
+ 100%|██████████| 20/20 [00:26<00:00, 1.34s/it]Building prefix dict from the default dictionary ...
+ Loading model from cache /tmp/jieba.cache
+ Loading model cost 0.697 seconds.
+ Prefix dict has been built successfully.
+ 
+ 100%|██████████| 20/20 [00:27<00:00, 1.36s/it]
+ ***** predict metrics *****
+ predict_bleu-4 = 74.1969
+ predict_rouge-1 = 80.3725
+ predict_rouge-2 = 70.3403
+ predict_rouge-l = 77.7235
+ predict_runtime = 0:00:29.41
+ predict_samples_per_second = 0.68
+ predict_steps_per_second = 0.68
+ 01/04/2024 10:04:40 - INFO - llmtuner.train.sft.trainer - Saving prediction results to ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20/generated_predictions.jsonl
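
Note: the `input_ids` dump above is just the tokenized prompt, so it can be round-tripped with the tokenizer files committed here. A minimal sketch (loading from this repo's root is an assumption):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")  # vocab.json / merges.txt / tokenizer_config.json from this commit
    ids = [32, 8537, 1022, 257, 11040, 2836]  # first few ids from the log above
    print(tok.decode(ids))                    # -> "A chat between a curious user"
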
Predict_20/predict_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "predict_bleu-4": 74.196885,
+     "predict_rouge-1": 80.372465,
+     "predict_rouge-2": 70.340295,
+     "predict_rouge-l": 77.72346,
+     "predict_runtime": 29.4113,
+     "predict_samples_per_second": 0.68,
+     "predict_steps_per_second": 0.68
+ }
adapter_config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "Wqkv"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3068eae74f7a7a2dcd863a7d976b25c9a4a719bf7cef9e3ae42edd45c5c482fb
+ size 10493896
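
Note: together, adapter_config.json and adapter_model.safetensors form a standard PEFT LoRA adapter (r=8, alpha=16, targeting the fused `Wqkv` projection). One way to consume them is through the peft library; a minimal sketch, assuming the adapter is loaded from this repo's root:

    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(
        "cognitivecomputations/dolphin-2_6-phi-2", trust_remote_code=True
    )
    model = PeftModel.from_pretrained(base, ".")  # reads adapter_config.json + adapter_model.safetensors
    model = model.merge_and_unload()              # optionally fold the LoRA deltas into the base weights
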
added_tokens.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "\t\t": 50294,
+   "\t\t\t": 50293,
+   "\t\t\t\t": 50292,
+   "\t\t\t\t\t": 50291,
+   "\t\t\t\t\t\t": 50290,
+   "\t\t\t\t\t\t\t": 50289,
+   "\t\t\t\t\t\t\t\t": 50288,
+   "\t\t\t\t\t\t\t\t\t": 50287,
+   "  ": 50286,
+   "   ": 50285,
+   "    ": 50284,
+   "     ": 50283,
+   "      ": 50282,
+   "       ": 50281,
+   "        ": 50280,
+   "         ": 50279,
+   "          ": 50278,
+   "           ": 50277,
+   "            ": 50276,
+   "             ": 50275,
+   "              ": 50274,
+   "               ": 50273,
+   "                ": 50272,
+   "                 ": 50271,
+   "                  ": 50270,
+   "                   ": 50269,
+   "                    ": 50268,
+   "                     ": 50267,
+   "                      ": 50266,
+   "                       ": 50265,
+   "                        ": 50264,
+   "                         ": 50263,
+   "                          ": 50262,
+   "                           ": 50261,
+   "                            ": 50260,
+   "                             ": 50259,
+   "                              ": 50258,
+   "                               ": 50257,
+   "<|im_end|>": 50295,
+   "<|im_start|>": 50296
+ }
all_results.json ADDED
@@ -0,0 +1,11 @@
+ {
+     "epoch": 1.0,
+     "eval_loss": 0.35242682695388794,
+     "eval_runtime": 28.2489,
+     "eval_samples_per_second": 11.859,
+     "eval_steps_per_second": 5.947,
+     "train_loss": 0.4441075046011742,
+     "train_runtime": 553.4721,
+     "train_samples_per_second": 5.44,
+     "train_steps_per_second": 0.679
+ }
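
Note: as a quick sanity check, the eval loss converts directly to perplexity: ppl = exp(eval_loss) = exp(0.35243) ≈ 1.42 on the held-out split, consistent with the near-saturated BLEU/ROUGE numbers in Predict_20.
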
eval_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "epoch": 1.0,
+     "eval_loss": 0.35242682695388794,
+     "eval_runtime": 28.2489,
+     "eval_samples_per_second": 11.859,
+     "eval_steps_per_second": 5.947
+ }
merges.txt ADDED
The diff for this file is too large to render.
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,345 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50257": {
+       "content": "                               ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50258": {
+       "content": "                              ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50259": {
+       "content": "                             ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50260": {
+       "content": "                            ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50261": {
+       "content": "                           ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50262": {
+       "content": "                          ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50263": {
+       "content": "                         ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50264": {
+       "content": "                        ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50265": {
+       "content": "                       ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50266": {
+       "content": "                      ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50267": {
+       "content": "                     ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50268": {
+       "content": "                    ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50269": {
+       "content": "                   ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50270": {
+       "content": "                  ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50271": {
+       "content": "                 ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50272": {
+       "content": "                ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50273": {
+       "content": "               ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50274": {
+       "content": "              ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50275": {
+       "content": "             ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50276": {
+       "content": "            ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50277": {
+       "content": "           ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50278": {
+       "content": "          ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50279": {
+       "content": "         ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50280": {
+       "content": "        ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50281": {
+       "content": "       ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50282": {
+       "content": "      ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50283": {
+       "content": "     ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50284": {
+       "content": "    ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50285": {
+       "content": "   ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50286": {
+       "content": "  ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50287": {
+       "content": "\t\t\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50288": {
+       "content": "\t\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50289": {
+       "content": "\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50290": {
+       "content": "\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50291": {
+       "content": "\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50292": {
+       "content": "\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50293": {
+       "content": "\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50294": {
+       "content": "\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50295": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50296": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "model_max_length": 2048,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "CodeGenTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
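
Note: the `chat_template` above is the ChatML format (`<|im_start|>role ... <|im_end|>`), and transformers can render it directly. A minimal sketch, assuming the tokenizer is loaded from this repo's root:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")
    messages = [{"role": "user", "content": "What is the median of [5, 2, 9]?"}]
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)
    # <|im_start|>user
    # What is the median of [5, 2, 9]?<|im_end|>
    # <|im_start|>assistant
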
train_eval_log.txt ADDED
@@ -0,0 +1,1479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Using RTX 3090 or 4000 series which doesn't support faster communication speedups. Ensuring P2P and IB communications are disabled.
2
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable `upcast_layernorm` in quantized training.
3
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable mixed precision training.
4
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
5
+ [INFO|training_args.py:1838] 2024-01-04 09:53:50,866 >> PyTorch: setting up devices
6
+ /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
7
+ warnings.warn(
8
+ 01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Process rank: 0, device: cuda:0, n_gpu: 1
9
+ distributed training: True, compute dtype: None
10
+ 01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
11
+ _n_gpu=1,
12
+ adafactor=False,
13
+ adam_beta1=0.9,
14
+ adam_beta2=0.999,
15
+ adam_epsilon=1e-08,
16
+ auto_find_batch_size=False,
17
+ bf16=False,
18
+ bf16_full_eval=False,
19
+ data_seed=None,
20
+ dataloader_drop_last=False,
21
+ dataloader_num_workers=0,
22
+ dataloader_persistent_workers=False,
23
+ dataloader_pin_memory=True,
24
+ ddp_backend=None,
25
+ ddp_broadcast_buffers=None,
26
+ ddp_bucket_cap_mb=None,
27
+ ddp_find_unused_parameters=False,
28
+ ddp_timeout=1800,
29
+ debug=[],
30
+ deepspeed=None,
31
+ disable_tqdm=False,
32
+ dispatch_batches=None,
33
+ do_eval=True,
34
+ do_predict=False,
35
+ do_train=True,
36
+ eval_accumulation_steps=None,
37
+ eval_delay=0,
38
+ eval_steps=None,
39
+ evaluation_strategy=IntervalStrategy.EPOCH,
40
+ fp16=False,
41
+ fp16_backend=auto,
42
+ fp16_full_eval=False,
43
+ fp16_opt_level=O1,
44
+ fsdp=[],
45
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
46
+ fsdp_min_num_params=0,
47
+ fsdp_transformer_layer_cls_to_wrap=None,
48
+ full_determinism=False,
49
+ generation_config=None,
50
+ generation_max_length=None,
51
+ generation_num_beams=None,
52
+ gradient_accumulation_steps=4,
53
+ gradient_checkpointing=False,
54
+ gradient_checkpointing_kwargs=None,
55
+ greater_is_better=None,
56
+ group_by_length=False,
57
+ half_precision_backend=auto,
58
+ hub_always_push=False,
59
+ hub_model_id=None,
60
+ hub_private_repo=False,
61
+ hub_strategy=HubStrategy.EVERY_SAVE,
62
+ hub_token=<HUB_TOKEN>,
63
+ ignore_data_skip=False,
64
+ include_inputs_for_metrics=False,
65
+ include_num_input_tokens_seen=False,
66
+ include_tokens_per_second=False,
67
+ jit_mode_eval=False,
68
+ label_names=None,
69
+ label_smoothing_factor=0.0,
70
+ learning_rate=5e-05,
71
+ length_column_name=length,
72
+ load_best_model_at_end=False,
73
+ local_rank=0,
74
+ log_level=passive,
75
+ log_level_replica=warning,
76
+ log_on_each_node=True,
77
+ logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/runs/Jan04_09-53-50_yhyu13fuwuqi,
78
+ logging_first_step=False,
79
+ logging_nan_inf_filter=True,
80
+ logging_steps=10,
81
+ logging_strategy=IntervalStrategy.STEPS,
82
+ lr_scheduler_kwargs={},
83
+ lr_scheduler_type=SchedulerType.COSINE,
84
+ max_grad_norm=1.0,
85
+ max_steps=-1,
86
+ metric_for_best_model=None,
87
+ mp_parameters=,
88
+ neftune_noise_alpha=None,
89
+ no_cuda=False,
90
+ num_train_epochs=1.0,
91
+ optim=OptimizerNames.ADAMW_TORCH,
92
+ optim_args=None,
93
+ output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
94
+ overwrite_output_dir=True,
95
+ past_index=-1,
96
+ per_device_eval_batch_size=1,
97
+ per_device_train_batch_size=1,
98
+ predict_with_generate=False,
99
+ prediction_loss_only=True,
100
+ push_to_hub=False,
101
+ push_to_hub_model_id=None,
102
+ push_to_hub_organization=None,
103
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
104
+ ray_scope=last,
105
+ remove_unused_columns=True,
106
+ report_to=['tensorboard'],
107
+ resume_from_checkpoint=None,
108
+ run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
109
+ save_on_each_node=False,
110
+ save_only_model=False,
111
+ save_safetensors=True,
112
+ save_steps=1000,
113
+ save_strategy=IntervalStrategy.STEPS,
114
+ save_total_limit=None,
115
+ seed=42,
116
+ skip_memory_metrics=True,
117
+ sortish_sampler=False,
118
+ split_batches=False,
119
+ tf32=None,
120
+ torch_compile=False,
121
+ torch_compile_backend=None,
122
+ torch_compile_mode=None,
123
+ torchdynamo=None,
124
+ tpu_metrics_debug=False,
125
+ tpu_num_cores=None,
126
+ use_cpu=False,
127
+ use_ipex=False,
128
+ use_legacy_prediction_loop=False,
129
+ use_mps_device=False,
130
+ warmup_ratio=0.0,
131
+ warmup_steps=0,
132
+ weight_decay=0.0,
133
+ )
134
+ 01/04/2024 09:53:50 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
135
+ 01/04/2024 09:53:50 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
136
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable `upcast_layernorm` in quantized training.
137
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable mixed precision training.
138
+ 01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
139
+ /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
140
+ warnings.warn(
141
+ 01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Process rank: 1, device: cuda:1, n_gpu: 1
142
+ distributed training: True, compute dtype: None
143
+ 01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
144
+ _n_gpu=1,
145
+ adafactor=False,
146
+ adam_beta1=0.9,
147
+ adam_beta2=0.999,
148
+ adam_epsilon=1e-08,
149
+ auto_find_batch_size=False,
150
+ bf16=False,
151
+ bf16_full_eval=False,
152
+ data_seed=None,
153
+ dataloader_drop_last=False,
154
+ dataloader_num_workers=0,
155
+ dataloader_persistent_workers=False,
156
+ dataloader_pin_memory=True,
157
+ ddp_backend=None,
158
+ ddp_broadcast_buffers=None,
159
+ ddp_bucket_cap_mb=None,
160
+ ddp_find_unused_parameters=False,
161
+ ddp_timeout=1800,
162
+ debug=[],
163
+ deepspeed=None,
164
+ disable_tqdm=False,
165
+ dispatch_batches=None,
166
+ do_eval=True,
167
+ do_predict=False,
168
+ do_train=True,
169
+ eval_accumulation_steps=None,
170
+ eval_delay=0,
171
+ eval_steps=None,
172
+ evaluation_strategy=IntervalStrategy.EPOCH,
173
+ fp16=False,
174
+ fp16_backend=auto,
175
+ fp16_full_eval=False,
176
+ fp16_opt_level=O1,
177
+ fsdp=[],
178
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
179
+ fsdp_min_num_params=0,
180
+ fsdp_transformer_layer_cls_to_wrap=None,
181
+ full_determinism=False,
182
+ generation_config=None,
183
+ generation_max_length=None,
184
+ generation_num_beams=None,
185
+ gradient_accumulation_steps=4,
186
+ gradient_checkpointing=False,
187
+ gradient_checkpointing_kwargs=None,
188
+ greater_is_better=None,
189
+ group_by_length=False,
190
+ half_precision_backend=auto,
191
+ hub_always_push=False,
192
+ hub_model_id=None,
193
+ hub_private_repo=False,
194
+ hub_strategy=HubStrategy.EVERY_SAVE,
195
+ hub_token=<HUB_TOKEN>,
196
+ ignore_data_skip=False,
197
+ include_inputs_for_metrics=False,
198
+ include_num_input_tokens_seen=False,
199
+ include_tokens_per_second=False,
200
+ jit_mode_eval=False,
201
+ label_names=None,
202
+ label_smoothing_factor=0.0,
203
+ learning_rate=5e-05,
204
+ length_column_name=length,
205
+ load_best_model_at_end=False,
206
+ local_rank=1,
207
+ log_level=passive,
208
+ log_level_replica=warning,
209
+ log_on_each_node=True,
210
+ logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/runs/Jan04_09-53-50_yhyu13fuwuqi,
211
+ logging_first_step=False,
212
+ logging_nan_inf_filter=True,
213
+ logging_steps=10,
214
+ logging_strategy=IntervalStrategy.STEPS,
215
+ lr_scheduler_kwargs={},
216
+ lr_scheduler_type=SchedulerType.COSINE,
217
+ max_grad_norm=1.0,
218
+ max_steps=-1,
219
+ metric_for_best_model=None,
220
+ mp_parameters=,
221
+ neftune_noise_alpha=None,
222
+ no_cuda=False,
223
+ num_train_epochs=1.0,
224
+ optim=OptimizerNames.ADAMW_TORCH,
225
+ optim_args=None,
226
+ output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
227
+ overwrite_output_dir=True,
228
+ past_index=-1,
229
+ per_device_eval_batch_size=1,
230
+ per_device_train_batch_size=1,
231
+ predict_with_generate=False,
232
+ prediction_loss_only=True,
233
+ push_to_hub=False,
234
+ push_to_hub_model_id=None,
235
+ push_to_hub_organization=None,
236
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
237
+ ray_scope=last,
238
+ remove_unused_columns=True,
239
+ report_to=['tensorboard'],
240
+ resume_from_checkpoint=None,
241
+ run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
242
+ save_on_each_node=False,
243
+ save_only_model=False,
244
+ save_safetensors=True,
245
+ save_steps=1000,
246
+ save_strategy=IntervalStrategy.STEPS,
247
+ save_total_limit=None,
248
+ seed=42,
249
+ skip_memory_metrics=True,
250
+ sortish_sampler=False,
251
+ split_batches=False,
252
+ tf32=None,
253
+ torch_compile=False,
254
+ torch_compile_backend=None,
255
+ torch_compile_mode=None,
256
+ torchdynamo=None,
257
+ tpu_metrics_debug=False,
258
+ tpu_num_cores=None,
259
+ use_cpu=False,
260
+ use_ipex=False,
261
+ use_legacy_prediction_loop=False,
262
+ use_mps_device=False,
263
+ warmup_ratio=0.0,
264
+ warmup_steps=0,
265
+ weight_decay=0.0,
266
+ )
267
+ 01/04/2024 09:53:50 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
268
+ 01/04/2024 09:53:50 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
269
+ Using custom data configuration default-b024aadef2a1493c
270
+ Loading Dataset Infos from /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/datasets/packaged_modules/json
271
+ Overwrite dataset info from restored data version if exists.
272
+ Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
273
+ Found cached dataset json (/home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
274
+ Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
275
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file vocab.json
276
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file merges.txt
277
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file added_tokens.json
278
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file special_tokens_map.json
279
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file tokenizer_config.json
280
+ [INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file tokenizer.json
281
+ [WARNING|logging.py:314] 2024-01-04 09:53:51,743 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
282
+ [INFO|configuration_utils.py:737] 2024-01-04 09:53:51,744 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
283
+ [INFO|configuration_utils.py:737] 2024-01-04 09:53:51,749 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
284
+ [INFO|configuration_utils.py:802] 2024-01-04 09:53:51,750 >> Model config PhiConfig {
285
+ "_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
286
+ "activation_function": "gelu_new",
287
+ "architectures": [
288
+ "PhiForCausalLM"
289
+ ],
290
+ "attn_pdrop": 0.0,
291
+ "auto_map": {
292
+ "AutoConfig": "configuration_phi.PhiConfig",
293
+ "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
294
+ },
295
+ "embd_pdrop": 0.0,
296
+ "flash_attn": false,
297
+ "flash_rotary": false,
298
+ "fused_dense": false,
299
+ "img_processor": null,
300
+ "initializer_range": 0.02,
301
+ "layer_norm_epsilon": 1e-05,
302
+ "model_type": "phi-msft",
303
+ "n_embd": 2560,
304
+ "n_head": 32,
305
+ "n_head_kv": null,
306
+ "n_inner": null,
307
+ "n_layer": 32,
308
+ "n_positions": 2048,
309
+ "resid_pdrop": 0.1,
310
+ "rotary_dim": 32,
311
+ "tie_word_embeddings": false,
312
+ "torch_dtype": "float16",
313
+ "transformers_version": "4.36.2",
314
+ "use_cache": false,
315
+ "vocab_size": 51200
316
+ }
317
+
318
+ 01/04/2024 09:53:51 - INFO - llmtuner.model.patcher - Quantizing model to 4 bit.
319
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
320
+ 01/04/2024 09:53:51 - INFO - llmtuner.model.patcher - Quantizing model to 4 bit.
321
+ [INFO|modeling_utils.py:2907] 2024-01-04 09:53:51,820 >> Overriding torch_dtype=None with `torch_dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass torch_dtype=torch.float16 to remove this warning.
322
+ [INFO|modeling_utils.py:3341] 2024-01-04 09:53:51,820 >> loading weights file cognitivecomputations/dolphin-2_6-phi-2/model.safetensors.index.json
323
+ [INFO|modeling_utils.py:1341] 2024-01-04 09:53:51,821 >> Instantiating PhiForCausalLM model under default dtype torch.float16.
324
+ [INFO|configuration_utils.py:826] 2024-01-04 09:53:51,821 >> Generate config GenerationConfig {
325
+ "use_cache": false
326
+ }
327
+
328
+ [INFO|configuration_utils.py:826] 2024-01-04 09:53:51,822 >> Generate config GenerationConfig {
329
+ "use_cache": false
330
+ }
331
+
332
+ [INFO|modeling_utils.py:3483] 2024-01-04 09:53:51,875 >> Detected 4-bit loading: activating 4-bit loading for this model
333
+
334
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]cognitivecomputations/dolphin-2_6-phi-2
335
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
336
+ Loading checkpoint shards: 50%|█████ | 1/2 [00:01<00:01, 1.28s/it]
337
+ Loading checkpoint shards: 50%|█████ | 1/2 [00:01<00:01, 1.27s/it]cognitivecomputations/dolphin-2_6-phi-2
338
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.46it/s]
339
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.29it/s]
340
+ [WARNING|modeling_utils.py:4175] 2024-01cognitivecomputations/dolphin-2_6-phi-2eights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
341
+ - This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
342
+ - This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
343
+ [INFO|modeling_utils.py:4193] 2024-01-04 09:53:53,730 >> All the weights of PhiForCausalcognitivecomputations/dolphin-2_6-phi-2he model checkpoint at ./models/dolphin-2_6-phi-2.
344
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use PhiForCausalLM for predictions without further training.
345
+
346
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.47it/s]
347
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.30it/s]
348
+ Some weights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
349
+ - This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
350
+ - This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
351
+ [INFO|configuration_utils.py:779] 2024-01-04 09:53:53,733 >> loading configuration file ./models/dolphin-2_6-phi-2/generation_config.json
352
+ [INFO|configuration_utils.py:826] 2024-01-04 09:53:53,733 >> Generate config GenerationConfig {}
353
+
354
+ [WARNING|modeling_utils.py:2045] 2024-01-04 09:53:53,816 >> You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
355
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
356
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
357
+ You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
358
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
359
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
360
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.loader - trainable params: 2621440 || all params: 2782305280 || trainable%: 0.0942
361
+ 01/04/2024 09:53:53 - INFO - llmtuner.model.loader - trainable params: 2621440 || all params: 2782305280 || trainable%: 0.0942
362
+
363
+ Running tokenizer on dataset: 0%| | 0/3347 [00:00<?, ? examples/s][WARNING|tokenization_utils_base.py:3835] 2024-01-04 09:53:55,217 >> Token indices sequence length is longer than the specified maximum sequence length for this model (2217 > 2048). Running this sequence through the model will result in indexing errors
364
+ Caching processed dataset at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c64b6c6785bc1929.arrow
365
+
366
+ Running tokenizer on dataset: 30%|██▉ | 1000/3347 [00:02<00:06, 372.68 examples/s]
367
+ Running tokenizer on dataset: 60%|█████▉ | 2000/3347 [00:05<00:03, 387.09 examples/s]
368
+ Running tokenizer on dataset: 90%|████████▉ | 3000/3347 [00:07<00:00, 395.52 examples/s]
369
+ Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 396.84 examples/s]
370
+ Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 392.48 examples/s]
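Editor's note: the tokenizer warning above fires because at least one concatenated prompt+response (2217 tokens) exceeds the model's 2048-token limit; fed through unclipped, it would raise indexing errors. Truncating at tokenization time is the usual remedy (llmtuner exposes this as cutoff_len); a minimal sketch with plain transformers, with the model path taken from this log:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./models/dolphin-2_6-phi-2")
    text = "A chat between a curious user and an artificial intelligence assistant. " * 400
    ids = tok(text, truncation=True, max_length=2048)["input_ids"]  # clamp to the model limit
    assert len(ids) <= 2048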
371
+ input_ids:
372
+ [32, 8537, 1022, 257, 11040, 2836, 290, 281, 11666, 4430, 8796, 13, 383, 8796, 3607, 7613, 11, 6496, 11, 290, 23507, 7429, 284, 262, 2836, 338, 2683, 13, 198, 20490, 25, 36230, 25, 921, 389, 257, 7613, 8796, 351, 1895, 284, 262, 1708, 5499, 13, 5765, 606, 611, 2672, 532, 198, 90, 198, 50284, 1, 3672, 1298, 366, 1136, 62, 1069, 3803, 62, 4873, 1600, 198, 50284, 1, 11213, 1298, 366, 3855, 262, 5163, 2494, 1022, 734, 19247, 1600, 198, 50284, 1, 17143, 7307, 1298, 1391, 198, 50280, 1, 4906, 1298, 366, 15252, 1600, 198, 50280, 1, 48310, 1298, 1391, 198, 50276, 1, 8692, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 422, 1, 198, 50276, 5512, 198, 50276, 1, 16793, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 284, 1, 198, 50276, 92, 198, 50280, 5512, 198, 50280, 1, 35827, 1298, 685, 198, 50276, 1, 8692, 62, 34415, 1600, 198, 50276, 1, 16793, 62, 34415, 1, 198, 50280, 60, 198, 50284, 92, 198, 92, 198, 198, 6090, 345, 1492, 257, 5474, 329, 502, 422, 968, 1971, 284, 3576, 30, 198, 48902, 25, 40, 1101, 7926, 11, 475, 314, 836, 470, 423, 262, 12971, 284, 1492, 13956, 13, 2011, 1459, 2163, 3578, 502, 284, 651, 262, 5163, 2494, 1022, 734, 19247, 13, 1002, 345, 761, 1037, 351, 326, 11, 1254, 1479, 284, 1265, 0, 50295]
373
+ inputs:
374
+ A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
375
+ Human: SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -
376
+ {
377
+ "name": "get_exchange_rate",
378
+ "description": "Get the exchange rate between two currencies",
379
+ "parameters": {
380
+ "type": "object",
381
+ "properties": {
382
+ "base_currency": {
383
+ "type": "string",
384
+ "description": "The currency to convert from"
385
+ },
386
+ "target_currency": {
387
+ "type": "string",
388
+ "description": "The currency to convert to"
389
+ }
390
+ },
391
+ "required": [
392
+ "base_currency",
393
+ "target_currency"
394
+ ]
395
+ }
396
+ }
397
+
398
+ Can you book a flight for me from New York to London?
399
+ Assistant:I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!<|im_end|>
400
+ label_ids:
401
+ [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 40, 1101, 7926, 11, 475, 314, 836, 470, 423, 262, 12971, 284, 1492, 13956, 13, 2011, 1459, 2163, 3578, 502, 284, 651, 262, 5163, 2494, 1022, 734, 19247, 13, 1002, 345, 761, 1037, 351, 326, 11, 1254, 1479, 284, 1265, 0, 50295]
402
+ labels:
403
+ I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!<|im_end|>
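Editor's note: the label_ids dump above shows the standard SFT masking: every prompt position is set to -100, PyTorch's cross-entropy ignore_index, so the loss only scores the assistant reply while input_ids still carry the full sequence. A minimal sketch of how such labels are built (the helper name and the three-token slices are illustrative, not llmtuner internals):

    IGNORE_INDEX = -100  # positions with this label are skipped by the loss

    def build_labels(prompt_ids, response_ids):
        # Mask the prompt; supervise only the response tokens.
        return [IGNORE_INDEX] * len(prompt_ids) + list(response_ids)

    prompt_ids = [32, 8537, 1022]     # first prompt tokens from the dump above
    response_ids = [40, 1101, 7926]   # first response tokens ("I'm sorry...")
    input_ids = prompt_ids + response_ids
    labels = build_labels(prompt_ids, response_ids)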
404
+ [INFO|training_args.py:1838] 2024-01-04 09:54:03,936 >> PyTorch: setting up devices
405
+ Caching indices mapping at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-2d738e000d25696c.arrow
406
+ Caching indices mapping at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-fe95a5c264c6067e.arrow
407
+
408
+ Running tokenizer on dataset: 0%| | 0/3347 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2217 > 2048). Running this sequence through the model will result in indexing errors
409
+
410
+ Running tokenizer on dataset: 30%|██▉ | 1000/3347 [00:02<00:06, 375.58 examples/s]
411
+ Running tokenizer on dataset: 60%|█████▉ | 2000/3347 [00:05<00:03, 389.75 examples/s]
412
+ Running tokenizer on dataset: 90%|████████▉ | 3000/3347 [00:07<00:00, 396.16 examples/s]
413
+ Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 395.57 examples/s]
414
+ Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 392.61 examples/s]
415
+ [INFO|trainer.py:1706] 2024-01-04 09:54:13,452 >> ***** Running training *****
416
+ [INFO|trainer.py:1707] 2024-01-04 09:54:13,452 >> Num examples = 3,011
417
+ [INFO|trainer.py:1708] 2024-01-04 09:54:13,452 >> Num Epochs = 1
418
+ [INFO|trainer.py:1709] 2024-01-04 09:54:13,452 >> Instantaneous batch size per device = 1
419
+ [INFO|trainer.py:1712] 2024-01-04 09:54:13,452 >> Total train batch size (w. parallel, distributed & accumulation) = 8
420
+ [INFO|trainer.py:1713] 2024-01-04 09:54:13,452 >> Gradient Accumulation steps = 4
421
+ [INFO|trainer.py:1714] 2024-01-04 09:54:13,452 >> Total optimization steps = 376
422
+ [INFO|trainer.py:1715] 2024-01-04 09:54:13,454 >> Number of trainable parameters = 2,621,440
423
+
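Editor's note: the header above pins down the run's shape: a per-device batch of 1 with gradient accumulation 4 reaching a total batch of 8 implies 2 devices (the world size is an inference; it is not printed here), and 3,011 examples at batch 8 floors to the 376 optimization steps reported. As a quick check:

    examples, per_device, grad_accum, world = 3011, 1, 4, 2  # world size inferred from total batch 8
    total_batch = per_device * grad_accum * world            # 8, as logged
    print(total_batch, examples // total_batch)              # 8 376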
424
+ 0%| | 0/376 [00:00<?, ?it/s]
425
+ 0%| | 1/376 [00:02<13:10, 2.11s/it]
426
+ 1%| | 2/376 [00:03<09:45, 1.56s/it]
427
+ 1%| | 3/376 [00:04<09:09, 1.47s/it]
428
+ 1%| | 4/376 [00:06<08:49, 1.42s/it]
429
+ 1%|▏ | 5/376 [00:07<08:34, 1.39s/it]
430
+ 2%|▏ | 6/376 [00:09<09:48, 1.59s/it]
431
+ 2%|▏ | 7/376 [00:10<09:20, 1.52s/it]
432
+ 2%|▏ | 8/376 [00:11<08:46, 1.43s/it]
433
+ 2%|▏ | 9/376 [00:13<08:32, 1.40s/it]
434
+ 3%|▎ | 10/376 [00:14<08:10, 1.34s/it]
435
+
436
+ {'loss': 1.0017, 'learning_rate': 4.991278696516879e-05, 'epoch': 0.03}
437
+
438
+ 3%|▎ | 10/376 [00:14<08:10, 1.34s/it]
439
+ 3%|▎ | 11/376 [00:15<08:11, 1.35s/it]
440
+ 3%|▎ | 12/376 [00:17<08:19, 1.37s/it]
441
+ 3%|▎ | 13/376 [00:18<08:30, 1.41s/it]
442
+ 4%|▎ | 14/376 [00:20<08:30, 1.41s/it]
443
+ 4%|▍ | 15/376 [00:21<08:03, 1.34s/it]
444
+ 4%|▍ | 16/376 [00:22<08:26, 1.41s/it]
445
+ 5%|▍ | 17/376 [00:24<07:54, 1.32s/it]
446
+ 5%|▍ | 18/376 [00:25<07:42, 1.29s/it]
447
+ 5%|▌ | 19/376 [00:27<08:59, 1.51s/it]
448
+ 5%|▌ | 20/376 [00:28<08:13, 1.39s/it]
449
+
450
+ {'loss': 0.881, 'learning_rate': 4.9651756349750716e-05, 'epoch': 0.05}
451
+
452
+ 5%|▌ | 20/376 [00:28<08:13, 1.39s/it]
453
+ 6%|▌ | 21/376 [00:29<08:05, 1.37s/it]
454
+ 6%|▌ | 22/376 [00:31<09:07, 1.55s/it]
455
+ 6%|▌ | 23/376 [00:32<08:33, 1.46s/it]
456
+ 6%|▋ | 24/376 [00:34<08:40, 1.48s/it]
457
+ 7%|▋ | 25/376 [00:35<08:38, 1.48s/it]
458
+ 7%|▋ | 26/376 [00:37<08:25, 1.44s/it]
459
+ 7%|▋ | 27/376 [00:38<08:02, 1.38s/it]
460
+ 7%|▋ | 28/376 [00:40<08:37, 1.49s/it]
461
+ 8%|▊ | 29/376 [00:41<08:31, 1.47s/it]
462
+ 8%|▊ | 30/376 [00:43<08:23, 1.46s/it]
463
+
464
+ {'loss': 0.7979, 'learning_rate': 4.9218729375518135e-05, 'epoch': 0.08}
465
+
466
+ 8%|▊ | 30/376 [00:43<08:23, 1.46s/it]
467
+ 8%|▊ | 31/376 [00:44<08:11, 1.43s/it]
468
+ 9%|▊ | 32/376 [00:45<08:12, 1.43s/it]
469
+ 9%|▉ | 33/376 [00:47<08:13, 1.44s/it]
470
+ 9%|▉ | 34/376 [00:48<08:23, 1.47s/it]
471
+ 9%|▉ | 35/376 [00:50<08:44, 1.54s/it]
472
+ 10%|▉ | 36/376 [00:51<08:28, 1.50s/it]
473
+ 10%|▉ | 37/376 [00:53<08:19, 1.47s/it]
474
+ 10%|█ | 38/376 [00:54<08:00, 1.42s/it]
475
+ 10%|█ | 39/376 [00:56<08:26, 1.50s/it]
476
+ 11%|█ | 40/376 [00:57<08:09, 1.46s/it]
477
+
478
+ {'loss': 0.7022, 'learning_rate': 4.861672729019797e-05, 'epoch': 0.11}
479
+
480
+ 11%|█ | 40/376 [00:57<08:09, 1.46s/it]
481
+ 11%|█ | 41/376 [00:59<07:51, 1.41s/it]
482
+ 11%|█ | 42/376 [01:00<07:44, 1.39s/it]
483
+ 11%|█▏ | 43/376 [01:01<07:24, 1.34s/it]
484
+ 12%|█▏ | 44/376 [01:03<08:38, 1.56s/it]
485
+ 12%|█▏ | 45/376 [01:04<08:07, 1.47s/it]
486
+ 12%|█▏ | 46/376 [01:06<08:11, 1.49s/it]
487
+ 12%|█▎ | 47/376 [01:07<07:35, 1.38s/it]
488
+ 13%|█▎ | 48/376 [01:08<07:08, 1.31s/it]
489
+ 13%|█▎ | 49/376 [01:09<07:01, 1.29s/it]
490
+ 13%|█▎ | 50/376 [01:11<07:07, 1.31s/it]
491
+
492
+ {'loss': 0.5844, 'learning_rate': 4.784995028809707e-05, 'epoch': 0.13}
493
+
494
+ 13%|█▎ | 50/376 [01:11<07:07, 1.31s/it]
495
+ 14%|█▎ | 51/376 [01:12<06:59, 1.29s/it]
496
+ 14%|█▍ | 52/376 [01:13<06:54, 1.28s/it]
497
+ 14%|█▍ | 53/376 [01:14<06:26, 1.20s/it]
498
+ 14%|█▍ | 54/376 [01:16<06:46, 1.26s/it]
499
+ 15%|█▍ | 55/376 [01:17<06:54, 1.29s/it]
500
+ 15%|█▍ | 56/376 [01:18<06:21, 1.19s/it]
501
+ 15%|█▌ | 57/376 [01:19<06:25, 1.21s/it]
502
+ 15%|█▌ | 58/376 [01:21<07:51, 1.48s/it]
503
+ 16%|█▌ | 59/376 [01:23<07:14, 1.37s/it]
504
+ 16%|█▌ | 60/376 [01:24<07:11, 1.36s/it]
505
+
506
+ {'loss': 0.4454, 'learning_rate': 4.692374820516679e-05, 'epoch': 0.16}
507
+
508
+ 16%|█▌ | 60/376 [01:24<07:11, 1.36s/it]
509
+ 16%|█▌ | 61/376 [01:26<07:36, 1.45s/it]
510
+ 16%|█▋ | 62/376 [01:27<07:34, 1.45s/it]
511
+ 17%|█▋ | 63/376 [01:29<08:09, 1.56s/it]
512
+ 17%|█▋ | 64/376 [01:30<07:41, 1.48s/it]
513
+ 17%|█▋ | 65/376 [01:31<07:23, 1.43s/it]
514
+ 18%|█▊ | 66/376 [01:33<08:05, 1.57s/it]
515
+ 18%|█▊ | 67/376 [01:35<07:37, 1.48s/it]
516
+ 18%|█▊ | 68/376 [01:36<07:15, 1.42s/it]
517
+ 18%|█▊ | 69/376 [01:37<06:37, 1.29s/it]
518
+ 19%|█▊ | 70/376 [01:38<06:52, 1.35s/it]
519
+
520
+ {'loss': 0.4076, 'learning_rate': 4.584458319296868e-05, 'epoch': 0.19}
521
+
522
+ 19%|█▊ | 70/376 [01:38<06:52, 1.35s/it]
523
+ 19%|█▉ | 71/376 [01:40<06:42, 1.32s/it]
524
+ 19%|█▉ | 72/376 [01:41<06:59, 1.38s/it]
525
+ 19%|█▉ | 73/376 [01:42<06:56, 1.37s/it]
526
+ 20%|█▉ | 74/376 [01:44<06:24, 1.27s/it]
527
+ 20%|█▉ | 75/376 [01:45<06:42, 1.34s/it]
528
+ 20%|██ | 76/376 [01:46<06:05, 1.22s/it]
529
+ 20%|██ | 77/376 [01:48<06:59, 1.40s/it]
530
+ 21%|██ | 78/376 [01:49<07:24, 1.49s/it]
531
+ 21%|██ | 79/376 [01:51<06:56, 1.40s/it]
532
+ 21%|██▏ | 80/376 [01:52<06:48, 1.38s/it]
533
+
534
+ {'loss': 0.4111, 'learning_rate': 4.4619984631966524e-05, 'epoch': 0.21}
535
+
536
+ 21%|██▏ | 80/376 [01:52<06:48, 1.38s/it]
537
+ 22%|██▏ | 81/376 [01:54<07:02, 1.43s/it]
538
+ 22%|██▏ | 82/376 [01:55<06:47, 1.39s/it]
539
+ 22%|██▏ | 83/376 [01:56<06:31, 1.34s/it]
540
+ 22%|██▏ | 84/376 [01:58<07:21, 1.51s/it]
541
+ 23%|██▎ | 85/376 [01:59<06:58, 1.44s/it]
542
+ 23%|██▎ | 86/376 [02:01<07:04, 1.47s/it]
543
+ 23%|██▎ | 87/376 [02:02<06:35, 1.37s/it]
544
+ 23%|██▎ | 88/376 [02:03<06:43, 1.40s/it]
545
+ 24%|██▎ | 89/376 [02:05<06:29, 1.36s/it]
546
+ 24%|██▍ | 90/376 [02:07<07:18, 1.53s/it]
547
+
548
+ {'loss': 0.4115, 'learning_rate': 4.3258496598716736e-05, 'epoch': 0.24}
549
+
550
+ 24%|██▍ | 90/376 [02:07<07:18, 1.53s/it]
551
+ 24%|██▍ | 91/376 [02:08<07:15, 1.53s/it]
552
+ 24%|██▍ | 92/376 [02:09<06:47, 1.44s/it]
553
+ 25%|██▍ | 93/376 [02:10<06:11, 1.31s/it]
554
+ 25%|██▌ | 94/376 [02:11<05:35, 1.19s/it]
555
+ 25%|██▌ | 95/376 [02:13<05:53, 1.26s/it]
556
+ 26%|██▌ | 96/376 [02:14<05:38, 1.21s/it]
557
+ 26%|██▌ | 97/376 [02:15<05:32, 1.19s/it]
558
+ 26%|██▌ | 98/376 [02:16<05:54, 1.27s/it]
559
+ 26%|██▋ | 99/376 [02:18<06:20, 1.37s/it]
560
+ 27%|██▋ | 100/376 [02:19<06:01, 1.31s/it]
561
+
562
+ {'loss': 0.3566, 'learning_rate': 4.176961825348059e-05, 'epoch': 0.27}
563
+
564
+ 27%|██▋ | 100/376 [02:19<06:01, 1.31s/it]
565
+ 27%|██▋ | 101/376 [02:21<06:14, 1.36s/it]
566
+ 27%|██▋ | 102/376 [02:22<06:44, 1.48s/it]
567
+ 27%|██▋ | 103/376 [02:24<06:32, 1.44s/it]
568
+ 28%|██▊ | 104/376 [02:25<05:49, 1.28s/it]
569
+ 28%|██▊ | 105/376 [02:26<06:19, 1.40s/it]
570
+ 28%|██▊ | 106/376 [02:28<06:09, 1.37s/it]
571
+ 28%|██▊ | 107/376 [02:29<06:05, 1.36s/it]
572
+ 29%|██▊ | 108/376 [02:30<05:38, 1.26s/it]
573
+ 29%|██▉ | 109/376 [02:32<06:20, 1.43s/it]
574
+ 29%|██▉ | 110/376 [02:33<05:56, 1.34s/it]
575
+
576
+ {'loss': 0.4302, 'learning_rate': 4.016373756417669e-05, 'epoch': 0.29}
577
+
578
+ 29%|██▉ | 110/376 [02:33<05:56, 1.34s/it]
579
+ 30%|██▉ | 111/376 [02:34<05:40, 1.29s/it]
580
+ 30%|██▉ | 112/376 [02:36<06:18, 1.44s/it]
581
+ 30%|███ | 113/376 [02:38<06:33, 1.50s/it]
582
+ 30%|███ | 114/376 [02:39<06:25, 1.47s/it]
583
+ 31%|███ | 115/376 [02:40<06:25, 1.48s/it]
584
+ 31%|███ | 116/376 [02:42<06:47, 1.57s/it]
585
+ 31%|███ | 117/376 [02:44<06:34, 1.52s/it]
586
+ 31%|███▏ | 118/376 [02:45<05:55, 1.38s/it]
587
+ 32%|███▏ | 119/376 [02:46<05:38, 1.32s/it]
588
+ 32%|███▏ | 120/376 [02:47<05:55, 1.39s/it]
589
+
590
+ {'loss': 0.4271, 'learning_rate': 3.845205882908432e-05, 'epoch': 0.32}
591
+
592
+ 32%|███▏ | 120/376 [02:47<05:55, 1.39s/it]
593
+ 32%|███▏ | 121/376 [02:49<05:40, 1.33s/it]
594
+ 32%|███▏ | 122/376 [02:50<05:38, 1.33s/it]
595
+ 33%|███▎ | 123/376 [02:51<05:13, 1.24s/it]
596
+ 33%|███▎ | 124/376 [02:52<05:25, 1.29s/it]
597
+ 33%|███▎ | 125/376 [02:54<05:48, 1.39s/it]
598
+ 34%|███▎ | 126/376 [02:55<05:43, 1.37s/it]
599
+ 34%|███▍ | 127/376 [02:57<06:29, 1.57s/it]
600
+ 34%|███▍ | 128/376 [02:59<06:08, 1.49s/it]
601
+ 34%|███▍ | 129/376 [03:00<06:32, 1.59s/it]
602
+ 35%|███▍ | 130/376 [03:02<06:31, 1.59s/it]
603
+
604
+ {'loss': 0.4625, 'learning_rate': 3.6646524503974955e-05, 'epoch': 0.35}
605
+
606
+ 35%|███▍ | 130/376 [03:02<06:31, 1.59s/it]
607
+ 35%|███▍ | 131/376 [03:04<06:39, 1.63s/it]
608
+ 35%|███▌ | 132/376 [03:05<06:34, 1.61s/it]
609
+ 35%|███▌ | 133/376 [03:06<05:56, 1.47s/it]
610
+ 36%|███▌ | 134/376 [03:08<05:40, 1.41s/it]
611
+ 36%|███▌ | 135/376 [03:09<05:27, 1.36s/it]
612
+ 36%|███▌ | 136/376 [03:11<06:03, 1.52s/it]
613
+ 36%|███▋ | 137/376 [03:12<05:38, 1.42s/it]
614
+ 37%|███▋ | 138/376 [03:13<05:37, 1.42s/it]
615
+ 37%|███▋ | 139/376 [03:15<05:34, 1.41s/it]
616
+ 37%|███▋ | 140/376 [03:17<05:54, 1.50s/it]
617
+
618
+ {'loss': 0.5066, 'learning_rate': 3.475973187908737e-05, 'epoch': 0.37}
619
+
620
+ 37%|███▋ | 140/376 [03:17<05:54, 1.50s/it]
621
+ 38%|███▊ | 141/376 [03:18<05:46, 1.47s/it]
622
+ 38%|███▊ | 142/376 [03:20<05:55, 1.52s/it]
623
+ 38%|███▊ | 143/376 [03:21<06:18, 1.62s/it]
624
+ 38%|███▊ | 144/376 [03:23<05:49, 1.51s/it]
625
+ 39%|███▊ | 145/376 [03:24<05:39, 1.47s/it]
626
+ 39%|███▉ | 146/376 [03:25<05:32, 1.44s/it]
627
+ 39%|███▉ | 147/376 [03:27<05:25, 1.42s/it]
628
+ 39%|███▉ | 148/376 [03:28<05:30, 1.45s/it]
629
+ 40%|███▉ | 149/376 [03:30<05:54, 1.56s/it]
630
+ 40%|███▉ | 150/376 [03:32<05:37, 1.49s/it]
631
+
632
+ {'loss': 0.3887, 'learning_rate': 3.280484518729466e-05, 'epoch': 0.4}
633
+
634
+ 40%|███▉ | 150/376 [03:32<05:37, 1.49s/it]
635
+ 40%|████ | 151/376 [03:33<05:14, 1.40s/it]
636
+ 40%|████ | 152/376 [03:34<04:52, 1.30s/it]
637
+ 41%|████ | 153/376 [03:35<04:54, 1.32s/it]
638
+ 41%|████ | 154/376 [03:37<04:56, 1.34s/it]
639
+ 41%|████ | 155/376 [03:38<05:14, 1.42s/it]
640
+ 41%|████▏ | 156/376 [03:40<05:08, 1.40s/it]
641
+ 42%|████▏ | 157/376 [03:41<04:45, 1.30s/it]
642
+ 42%|████▏ | 158/376 [03:42<04:29, 1.24s/it]
643
+ 42%|████▏ | 159/376 [03:43<04:41, 1.30s/it]
644
+ 43%|████▎ | 160/376 [03:44<04:41, 1.30s/it]
645
+
646
+ {'loss': 0.3675, 'learning_rate': 3.079550375668821e-05, 'epoch': 0.42}
647
+
648
+ 43%|████▎ | 160/376 [03:44<04:41, 1.30s/it]
649
+ 43%|████▎ | 161/376 [03:46<04:59, 1.39s/it]
650
+ 43%|████▎ | 162/376 [03:47<04:35, 1.29s/it]
651
+ 43%|████▎ | 163/376 [03:48<04:36, 1.30s/it]
652
+ 44%|████▎ | 164/376 [03:50<04:34, 1.30s/it]
653
+ 44%|████▍ | 165/376 [03:51<04:41, 1.33s/it]
654
+ 44%|████▍ | 166/376 [03:53<05:04, 1.45s/it]
655
+ 44%|████▍ | 167/376 [03:54<05:04, 1.46s/it]
656
+ 45%|████▍ | 168/376 [03:56<05:02, 1.45s/it]
657
+ 45%|████▍ | 169/376 [03:57<05:08, 1.49s/it]
658
+ 45%|████▌ | 170/376 [03:59<05:00, 1.46s/it]
659
+
660
+ {'loss': 0.4095, 'learning_rate': 2.8745726848402036e-05, 'epoch': 0.45}
661
+
662
+ 45%|████▌ | 170/376 [03:59<05:00, 1.46s/it]
663
+ 45%|████▌ | 171/376 [04:00<04:57, 1.45s/it]
664
+ 46%|████▌ | 172/376 [04:01<04:26, 1.31s/it]
665
+ 46%|████▌ | 173/376 [04:02<04:12, 1.24s/it]
666
+ 46%|████▋ | 174/376 [04:04<04:28, 1.33s/it]
667
+ 47%|████▋ | 175/376 [04:05<04:35, 1.37s/it]
668
+ 47%|████▋ | 176/376 [04:07<04:50, 1.45s/it]
669
+ 47%|████▋ | 177/376 [04:09<05:17, 1.60s/it]
670
+ 47%|████▋ | 178/376 [04:10<04:52, 1.48s/it]
671
+ 48%|████▊ | 179/376 [04:11<04:47, 1.46s/it]
672
+ 48%|████▊ | 180/376 [04:13<04:29, 1.37s/it]
673
+
674
+ {'loss': 0.3782, 'learning_rate': 2.6669815843628042e-05, 'epoch': 0.48}
675
+
676
+ 48%|████▊ | 180/376 [04:13<04:29, 1.37s/it]
677
+ 48%|████▊ | 181/376 [04:14<04:25, 1.36s/it]
678
+ 48%|████▊ | 182/376 [04:15<04:18, 1.33s/it]
679
+ 49%|████▊ | 183/376 [04:17<04:42, 1.46s/it]
680
+ 49%|████▉ | 184/376 [04:18<04:11, 1.31s/it]
681
+ 49%|████▉ | 185/376 [04:19<04:02, 1.27s/it]
682
+ 49%|████▉ | 186/376 [04:21<04:45, 1.50s/it]
683
+ 50%|████▉ | 187/376 [04:22<04:26, 1.41s/it]
684
+ 50%|█████ | 188/376 [04:24<04:14, 1.35s/it]
685
+ 50%|█████ | 189/376 [04:26<05:00, 1.61s/it]
686
+ 51%|█████ | 190/376 [04:28<05:18, 1.71s/it]
687
+
688
+ {'loss': 0.4195, 'learning_rate': 2.4582254462267476e-05, 'epoch': 0.5}
689
+
690
+ 51%|█████ | 190/376 [04:28<05:18, 1.71s/it]
691
+ 51%|█████ | 191/376 [04:29<04:55, 1.60s/it]
692
+ 51%|█████ | 192/376 [04:30<04:48, 1.57s/it]
693
+ 51%|█████▏ | 193/376 [04:32<04:46, 1.56s/it]
694
+ 52%|█████▏ | 194/376 [04:33<04:34, 1.51s/it]
695
+ 52%|█████▏ | 195/376 [04:36<05:04, 1.68s/it]
696
+ 52%|█████▏ | 196/376 [04:37<04:39, 1.55s/it]
697
+ 52%|█████▏ | 197/376 [04:38<04:32, 1.52s/it]
698
+ 53%|█████▎ | 198/376 [04:40<04:39, 1.57s/it]
699
+ 53%|█████▎ | 199/376 [04:42<04:42, 1.59s/it]
700
+ 53%|█████▎ | 200/376 [04:43<04:25, 1.51s/it]
701
+
702
+ {'loss': 0.3392, 'learning_rate': 2.2497607709397543e-05, 'epoch': 0.53}
703
+
704
+ 53%|█████▎ | 200/376 [04:43<04:25, 1.51s/it]
705
+ 53%|█████▎ | 201/376 [04:44<04:13, 1.45s/it]
706
+ 54%|█████▎ | 202/376 [04:45<03:58, 1.37s/it]
707
+ 54%|█████▍ | 203/376 [04:47<03:53, 1.35s/it]
708
+ 54%|█████▍ | 204/376 [04:48<03:38, 1.27s/it]
709
+ 55%|█████▍ | 205/376 [04:49<03:46, 1.33s/it]
710
+ 55%|█████▍ | 206/376 [04:51<03:55, 1.38s/it]
711
+ 55%|█████▌ | 207/376 [04:52<04:11, 1.49s/it]
712
+ 55%|█████▌ | 208/376 [04:54<04:02, 1.44s/it]
713
+ 56%|█████▌ | 209/376 [04:55<03:40, 1.32s/it]
714
+ 56%|█████▌ | 210/376 [04:56<03:21, 1.21s/it]
715
+
716
+ {'loss': 0.3347, 'learning_rate': 2.0430420254607748e-05, 'epoch': 0.56}
717
+
718
+ 56%|█████▌ | 210/376 [04:56<03:21, 1.21s/it]
719
+ 56%|█████▌ | 211/376 [04:57<03:18, 1.20s/it]
720
+ 56%|█████▋ | 212/376 [04:59<03:42, 1.36s/it]
721
+ 57%|█████▋ | 213/376 [05:00<03:41, 1.36s/it]
722
+ 57%|█████▋ | 214/376 [05:01<03:44, 1.39s/it]
723
+ 57%|█████▋ | 215/376 [05:03<03:28, 1.30s/it]
724
+ 57%|█████▋ | 216/376 [05:04<03:34, 1.34s/it]
725
+ 58%|█████▊ | 217/376 [05:05<03:38, 1.37s/it]
726
+ 58%|█████▊ | 218/376 [05:07<03:38, 1.38s/it]
727
+ 58%|█████▊ | 219/376 [05:08<03:30, 1.34s/it]
728
+ 59%|█████▊ | 220/376 [05:09<03:21, 1.29s/it]
729
+
730
+ {'loss': 0.4117, 'learning_rate': 1.8395114953217852e-05, 'epoch': 0.58}
731
+
732
+ 59%|█████▊ | 220/376 [05:09<03:21, 1.29s/it]
733
+ 59%|█████▉ | 221/376 [05:11<03:19, 1.29s/it]
734
+ 59%|█████▉ | 222/376 [05:12<03:20, 1.30s/it]
735
+ 59%|█████▉ | 223/376 [05:13<03:03, 1.20s/it]
736
+ 60%|█████▉ | 224/376 [05:14<02:50, 1.12s/it]
737
+ 60%|█████▉ | 225/376 [05:15<02:47, 1.11s/it]
738
+ 60%|██████ | 226/376 [05:16<03:02, 1.21s/it]
739
+ 60%|██████ | 227/376 [05:18<03:26, 1.39s/it]
740
+ 61%|██████ | 228/376 [05:20<03:41, 1.50s/it]
741
+ 61%|██████ | 229/376 [05:21<03:29, 1.43s/it]
742
+ 61%|██████ | 230/376 [05:23<03:37, 1.49s/it]
743
+
744
+ {'loss': 0.3772, 'learning_rate': 1.640589221739926e-05, 'epoch': 0.61}
745
+
746
+ 61%|██████ | 230/376 [05:23<03:37, 1.49s/it]
747
+ 61%|██████▏ | 231/376 [05:24<03:26, 1.43s/it]
748
+ 62%|██████▏ | 232/376 [05:25<03:21, 1.40s/it]
749
+ 62%|██████▏ | 233/376 [05:27<03:09, 1.33s/it]
750
+ 62%|██████▏ | 234/376 [05:28<03:33, 1.51s/it]
751
+ 62%|██████▎ | 235/376 [05:30<03:20, 1.42s/it]
752
+ 63%|██████▎ | 236/376 [05:31<03:32, 1.52s/it]
753
+ 63%|██████▎ | 237/376 [05:33<03:17, 1.42s/it]
754
+ 63%|██████▎ | 238/376 [05:34<03:00, 1.31s/it]
755
+ 64%|██████▎ | 239/376 [05:35<03:00, 1.32s/it]
756
+ 64%|██████▍ | 240/376 [05:36<03:04, 1.35s/it]
757
+
758
+ {'loss': 0.4403, 'learning_rate': 1.447663093929163e-05, 'epoch': 0.64}
759
+
760
+ 64%|██████▍ | 240/376 [05:36<03:04, 1.35s/it]
761
+ 64%|██████▍ | 241/376 [05:38<03:02, 1.35s/it]
762
+ 64%|██████▍ | 242/376 [05:39<02:59, 1.34s/it]
763
+ 65%|██████▍ | 243/376 [05:41<03:03, 1.38s/it]
764
+ 65%|██████▍ | 244/376 [05:42<03:05, 1.41s/it]
765
+ 65%|██████▌ | 245/376 [05:43<03:00, 1.38s/it]
766
+ 65%|██████▌ | 246/376 [05:45<02:53, 1.33s/it]
767
+ 66%|██████▌ | 247/376 [05:46<02:41, 1.25s/it]
768
+ 66%|██████▌ | 248/376 [05:47<02:51, 1.34s/it]
769
+ 66%|██████▌ | 249/376 [05:48<02:46, 1.31s/it]
770
+ 66%|██████▋ | 250/376 [05:50<03:02, 1.45s/it]
771
+
772
+ {'loss': 0.3867, 'learning_rate': 1.2620791657378664e-05, 'epoch': 0.66}
773
+
774
+ 66%|██████▋ | 250/376 [05:50<03:02, 1.45s/it]
775
+ 67%|██████▋ | 251/376 [05:51<02:51, 1.37s/it]
776
+ 67%|██████▋ | 252/376 [05:53<02:52, 1.39s/it]
777
+ 67%|██████▋ | 253/376 [05:54<02:50, 1.39s/it]
778
+ 68%|██████▊ | 254/376 [05:56<02:44, 1.35s/it]
779
+ 68%|██████▊ | 255/376 [05:57<02:54, 1.44s/it]
780
+ 68%|██████▊ | 256/376 [05:59<02:54, 1.45s/it]
781
+ 68%|██████▊ | 257/376 [06:00<02:47, 1.41s/it]
782
+ 69%|██████▊ | 258/376 [06:01<02:35, 1.31s/it]
783
+ 69%|██████▉ | 259/376 [06:03<02:42, 1.39s/it]
784
+ 69%|██████▉ | 260/376 [06:04<02:28, 1.28s/it]
785
+
786
+ {'loss': 0.3688, 'learning_rate': 1.0851322641735118e-05, 'epoch': 0.69}
787
+
788
+ 69%|██████▉ | 260/376 [06:04<02:28, 1.28s/it]
789
+ 69%|██████▉ | 261/376 [06:05<02:29, 1.30s/it]
790
+ 70%|██████▉ | 262/376 [06:07<02:51, 1.50s/it]
791
+ 70%|██████▉ | 263/376 [06:09<02:53, 1.54s/it]
792
+ 70%|███████ | 264/376 [06:10<02:53, 1.55s/it]
793
+ 70%|███████ | 265/376 [06:11<02:43, 1.47s/it]
794
+ 71%|███████ | 266/376 [06:13<02:32, 1.38s/it]
795
+ 71%|███████ | 267/376 [06:14<02:40, 1.47s/it]
796
+ 71%|███████▏ | 268/376 [06:16<02:35, 1.44s/it]
797
+ 72%|███████▏ | 269/376 [06:17<02:44, 1.54s/it]
798
+ 72%|███████▏ | 270/376 [06:19<02:40, 1.51s/it]
799
+
800
+ {'loss': 0.3655, 'learning_rate': 9.180569553392535e-06, 'epoch': 0.72}
801
+
802
+ 72%|███████▏ | 270/376 [06:19<02:40, 1.51s/it]
803
+ 72%|███████▏ | 271/376 [06:20<02:34, 1.47s/it]
804
+ 72%|███████▏ | 272/376 [06:21<02:25, 1.40s/it]
805
+ 73%|███████▎ | 273/376 [06:23<02:24, 1.40s/it]
806
+ 73%|███████▎ | 274/376 [06:25<02:29, 1.47s/it]
807
+ 73%|███████▎ | 275/376 [06:27<02:45, 1.64s/it]
808
+ 73%|███████▎ | 276/376 [06:28<02:32, 1.52s/it]
809
+ 74%|███████▎ | 277/376 [06:29<02:27, 1.49s/it]
810
+ 74%|███████▍ | 278/376 [06:31<02:29, 1.53s/it]
811
+ 74%|███████▍ | 279/376 [06:32<02:21, 1.46s/it]
812
+ 74%|███████▍ | 280/376 [06:33<02:15, 1.41s/it]
813
+
814
+ {'loss': 0.4144, 'learning_rate': 7.620189308133943e-06, 'epoch': 0.74}
815
+
816
+ 74%|███████▍ | 280/376 [06:33<02:15, 1.41s/it]
817
+ 75%|███████▍ | 281/376 [06:35<02:14, 1.41s/it]
818
+ 75%|███████▌ | 282/376 [06:36<02:17, 1.46s/it]
819
+ 75%|███████▌ | 283/376 [06:38<02:16, 1.47s/it]
820
+ 76%|███████▌ | 284/376 [06:39<02:03, 1.34s/it]
821
+ 76%|███████▌ | 285/376 [06:40<01:53, 1.25s/it]
822
+ 76%|███████▌ | 286/376 [06:41<01:49, 1.21s/it]
823
+ 76%|███████▋ | 287/376 [06:43<02:11, 1.48s/it]
824
+ 77%|███████▋ | 288/376 [06:45<02:24, 1.64s/it]
825
+ 77%|███████▋ | 289/376 [06:46<02:06, 1.45s/it]
826
+ 77%|███████▋ | 290/376 [06:47<01:56, 1.35s/it]
827
+
828
+ {'loss': 0.3298, 'learning_rate': 6.181068745693716e-06, 'epoch': 0.77}
829
+
830
+ 77%|███████▋ | 290/376 [06:47<01:56, 1.35s/it]
831
+ 77%|███████▋ | 291/376 [06:48<01:47, 1.27s/it]
832
+ 78%|███████▊ | 292/376 [06:50<01:51, 1.33s/it]
833
+ 78%|███████▊ | 293/376 [06:51<01:42, 1.24s/it]
834
+ 78%|███████▊ | 294/376 [06:52<01:46, 1.29s/it]
835
+ 78%|███████▊ | 295/376 [06:54<01:50, 1.37s/it]
836
+ 79%|███████▊ | 296/376 [06:55<01:46, 1.33s/it]
837
+ 79%|███████▉ | 297/376 [06:56<01:37, 1.24s/it]
838
+ 79%|███████▉ | 298/376 [06:58<01:39, 1.28s/it]
839
+ 80%|███████▉ | 299/376 [06:59<01:34, 1.23s/it]
840
+ 80%|███████▉ | 300/376 [07:00<01:31, 1.20s/it]
841
+
842
+ {'loss': 0.3337, 'learning_rate': 4.873248671810928e-06, 'epoch': 0.8}
843
+
844
+ 80%|███████▉ | 300/376 [07:00<01:31, 1.20s/it]
845
+ 80%|████████ | 301/376 [07:01<01:28, 1.17s/it]
846
+ 80%|████████ | 302/376 [07:03<01:36, 1.31s/it]
847
+ 81%|████████ | 303/376 [07:04<01:30, 1.23s/it]
848
+ 81%|████████ | 304/376 [07:05<01:26, 1.20s/it]
849
+ 81%|████████ | 305/376 [07:06<01:23, 1.18s/it]
850
+ 81%|████████▏ | 306/376 [07:07<01:25, 1.22s/it]
851
+ 82%|████████▏ | 307/376 [07:08<01:20, 1.17s/it]
852
+ 82%|████████▏ | 308/376 [07:10<01:22, 1.22s/it]
853
+ 82%|████████▏ | 309/376 [07:11<01:22, 1.24s/it]
854
+ 82%|████████▏ | 310/376 [07:13<01:35, 1.44s/it]
855
+
856
+ {'loss': 0.3217, 'learning_rate': 3.7058538030980942e-06, 'epoch': 0.82}
857
+
858
+ 82%|████████▏ | 310/376 [07:13<01:35, 1.44s/it]
859
+ 83%|████████▎ | 311/376 [07:14<01:31, 1.41s/it]
860
+ 83%|████████▎ | 312/376 [07:15<01:22, 1.29s/it]
861
+ 83%|████████▎ | 313/376 [07:16<01:23, 1.33s/it]
862
+ 84%|████████▎ | 314/376 [07:18<01:28, 1.43s/it]
863
+ 84%|████████▍ | 315/376 [07:19<01:25, 1.40s/it]
864
+ 84%|████████▍ | 316/376 [07:21<01:17, 1.29s/it]
865
+ 84%|████████▍ | 317/376 [07:22<01:11, 1.21s/it]
866
+ 85%|████████▍ | 318/376 [07:24<01:24, 1.46s/it]
867
+ 85%|████████▍ | 319/376 [07:25<01:19, 1.40s/it]
868
+ 85%|████████▌ | 320/376 [07:26<01:16, 1.36s/it]
869
+
870
+ {'loss': 0.3222, 'learning_rate': 2.687029103502972e-06, 'epoch': 0.85}
871
+
872
+ 85%|████████▌ | 320/376 [07:26<01:16, 1.36s/it]
873
+ 85%|████████▌ | 321/376 [07:27<01:09, 1.26s/it]
874
+ 86%|████████▌ | 322/376 [07:28<01:09, 1.28s/it]
875
+ 86%|████████▌ | 323/376 [07:30<01:12, 1.38s/it]
876
+ 86%|████████▌ | 324/376 [07:32<01:12, 1.40s/it]
877
+ 86%|████████▋ | 325/376 [07:33<01:11, 1.41s/it]
878
+ 87%|████████▋ | 326/376 [07:35<01:15, 1.50s/it]
879
+ 87%|████████▋ | 327/376 [07:36<01:12, 1.49s/it]
880
+ 87%|████████▋ | 328/376 [07:38<01:12, 1.50s/it]
881
+ 88%|████████▊ | 329/376 [07:39<01:10, 1.50s/it]
882
+ 88%|████████▊ | 330/376 [07:41<01:10, 1.53s/it]
883
+
884
+ {'loss': 0.3989, 'learning_rate': 1.823882956546566e-06, 'epoch': 0.88}
885
+
886
+ 88%|████████▊ | 330/376 [07:41<01:10, 1.53s/it]
887
+ 88%|████████▊ | 331/376 [07:42<01:10, 1.57s/it]
888
+ 88%|████████▊ | 332/376 [07:44<01:09, 1.59s/it]
889
+ 89%|████████▊ | 333/376 [07:45<01:06, 1.54s/it]
890
+ 89%|████████▉ | 334/376 [07:47<01:02, 1.48s/it]
891
+ 89%|████████▉ | 335/376 [07:48<01:00, 1.48s/it]
892
+ 89%|████████▉ | 336/376 [07:49<00:53, 1.33s/it]
893
+ 90%|████████▉ | 337/376 [07:50<00:50, 1.29s/it]
894
+ 90%|████████▉ | 338/376 [07:52<00:48, 1.29s/it]
895
+ 90%|█████████ | 339/376 [07:53<00:49, 1.34s/it]
896
+ 90%|█████████ | 340/376 [07:55<00:48, 1.34s/it]
897
+
898
+ {'loss': 0.3805, 'learning_rate': 1.1224375698271894e-06, 'epoch': 0.9}
899
+
900
+ 90%|█████████ | 340/376 [07:55<00:48, 1.34s/it]
901
+ 91%|█████████ | 341/376 [07:56<00:48, 1.38s/it]
902
+ 91%|█████████ | 342/376 [07:57<00:43, 1.28s/it]
903
+ 91%|█████████ | 343/376 [07:59<00:47, 1.45s/it]
904
+ 91%|█████████▏| 344/376 [08:00<00:47, 1.49s/it]
905
+ 92%|█████████▏| 345/376 [08:02<00:45, 1.46s/it]
906
+ 92%|█████████▏| 346/376 [08:03<00:40, 1.34s/it]
907
+ 92%|█████████▏| 347/376 [08:04<00:40, 1.39s/it]
908
+ 93%|█████████▎| 348/376 [08:06<00:36, 1.30s/it]
909
+ 93%|█████████▎| 349/376 [08:07<00:34, 1.28s/it]
910
+ 93%|█████████▎| 350/376 [08:08<00:34, 1.31s/it]
911
+
912
+ {'loss': 0.4108, 'learning_rate': 5.875869578203824e-07, 'epoch': 0.93}
913
+
914
+ 93%|█████████▎| 350/376 [08:08<00:34, 1.31s/it]
915
+ 93%|█████████▎| 351/376 [08:10<00:34, 1.38s/it]
916
+ 94%|█████████▎| 352/376 [08:11<00:31, 1.32s/it]
917
+ 94%|█████████▍| 353/376 [08:12<00:28, 1.26s/it]
918
+ 94%|█████████▍| 354/376 [08:13<00:28, 1.29s/it]
919
+ 94%|█████████▍| 355/376 [08:14<00:25, 1.23s/it]
920
+ 95%|█████████▍| 356/376 [08:16<00:23, 1.19s/it]
921
+ 95%|█████████▍| 357/376 [08:17<00:25, 1.35s/it]
922
+ 95%|█████████▌| 358/376 [08:19<00:26, 1.47s/it]
923
+ 95%|█████████▌| 359/376 [08:20<00:22, 1.32s/it]
924
+ 96%|█████████▌| 360/376 [08:21<00:21, 1.37s/it]
925
+
926
+ {'loss': 0.3578, 'learning_rate': 2.230627961304993e-07, 'epoch': 0.96}
927
+
928
+ 96%|█████████▌| 360/376 [08:22<00:21, 1.37s/it]
929
+ 96%|█████████▌| 361/376 [08:23<00:20, 1.34s/it]
930
+ 96%|█████████▋| 362/376 [08:24<00:19, 1.39s/it]
931
+ 97%|█████████▋| 363/376 [08:26<00:18, 1.40s/it]
932
+ 97%|█████████▋| 364/376 [08:27<00:16, 1.36s/it]
933
+ 97%|█████████▋| 365/376 [08:29<00:16, 1.46s/it]
934
+ 97%|█████████▋| 366/376 [08:30<00:14, 1.45s/it]
935
+ 98%|█████████▊| 367/376 [08:32<00:13, 1.46s/it]
936
+ 98%|█████████▊| 368/376 [08:33<00:11, 1.46s/it]
937
+ 98%|█████████▊| 369/376 [08:35<00:10, 1.48s/it]
938
+ 98%|█████████▊| 370/376 [08:36<00:08, 1.39s/it]
939
+
940
+ {'loss': 0.3453, 'learning_rate': 3.1408385430356516e-08, 'epoch': 0.98}
941
+
942
+ 98%|█████████▊| 370/376 [08:36<00:08, 1.39s/it]
943
+ 99%|█████████▊| 371/376 [08:37<00:06, 1.31s/it]
944
+ 99%|█████████▉| 372/376 [08:38<00:05, 1.35s/it]
945
+ 99%|█████████▉| 373/376 [08:40<00:04, 1.35s/it]
946
+ 99%|█████████▉| 374/376 [08:41<00:02, 1.41s/it]
947
+ 100%|█████████▉| 375/376 [08:43<00:01, 1.47s/it]
948
+ 100%|██████████| 376/376 [08:44<00:00, 1.47s/it][INFO|trainer.py:3166] 2024-01-04 10:02:58,683 >> ***** Running Evaluation *****
949
+ [INFO|trainer.py:3168] 2024-01-04 10:02:58,683 >> Num examples = 335
950
+ [INFO|trainer.py:3171] 2024-01-04 10:02:58,683 >> Batch size = 1
951
+
952
+
953
+ 0%| | 0/168 [00:00<?, ?it/s]
954
+
955
+ 1%| | 2/168 [00:00<00:16, 10.08it/s]
956
+
957
+ 2%|▏ | 4/168 [00:00<00:21, 7.51it/s]
958
+
959
+ 3%|▎ | 5/168 [00:00<00:27, 5.84it/s]
960
+
961
+ 4%|▎ | 6/168 [00:00<00:24, 6.53it/s]
962
+
963
+ 4%|▍ | 7/168 [00:01<00:28, 5.68it/s]
964
+
965
+ 5%|▍ | 8/168 [00:01<00:27, 5.80it/s]
966
+
967
+ 5%|▌ | 9/168 [00:01<00:31, 5.02it/s]
968
+
969
+ 6%|▌ | 10/168 [00:01<00:33, 4.74it/s]
970
+
971
+ 7%|▋ | 11/168 [00:01<00:29, 5.33it/s]
972
+
973
+ 7%|▋ | 12/168 [00:02<00:32, 4.81it/s]
974
+
975
+ 8%|▊ | 13/168 [00:02<00:28, 5.35it/s]
976
+
977
+ 8%|▊ | 14/168 [00:02<00:26, 5.85it/s]
978
+
979
+ 9%|▉ | 15/168 [00:02<00:29, 5.24it/s]
980
+
981
+ 10%|▉ | 16/168 [00:02<00:27, 5.51it/s]
982
+
983
+ 10%|█ | 17/168 [00:02<00:25, 5.91it/s]
984
+
985
+ 11%|█ | 18/168 [00:03<00:24, 6.22it/s]
986
+
987
+ 11%|█▏ | 19/168 [00:03<00:23, 6.44it/s]
988
+
989
+ 12%|█▏ | 20/168 [00:03<00:24, 6.13it/s]
990
+
991
+ 12%|█▎ | 21/168 [00:03<00:28, 5.17it/s]
992
+
993
+ 13%|█▎ | 22/168 [00:03<00:26, 5.44it/s]
994
+
995
+ 14%|█▍ | 24/168 [00:04<00:21, 6.73it/s]
996
+
997
+ 15%|█▍ | 25/168 [00:04<00:20, 7.15it/s]
998
+
999
+ 15%|█▌ | 26/168 [00:04<00:20, 6.95it/s]
1000
+
1001
+ 16%|█▌ | 27/168 [00:04<00:24, 5.74it/s]
1002
+
1003
+ 17%|█▋ | 28/168 [00:04<00:23, 6.08it/s]
1004
+
1005
+ 17%|█▋ | 29/168 [00:04<00:21, 6.38it/s]
1006
+
1007
+ 18%|█▊ | 30/168 [00:05<00:21, 6.48it/s]
1008
+
1009
+ 18%|█▊ | 31/168 [00:05<00:20, 6.71it/s]
1010
+
1011
+ 19%|█▉ | 32/168 [00:05<00:20, 6.76it/s]
1012
+
1013
+ 20%|██ | 34/168 [00:05<00:18, 7.39it/s]
1014
+
1015
+ 21%|██ | 35/168 [00:05<00:18, 7.31it/s]
1016
+
1017
+ 21%|██▏ | 36/168 [00:05<00:17, 7.68it/s]
1018
+
1019
+ 22%|██▏ | 37/168 [00:05<00:17, 7.58it/s]
1020
+
1021
+ 23%|██▎ | 38/168 [00:06<00:17, 7.44it/s]
1022
+
1023
+ 23%|██▎ | 39/168 [00:06<00:20, 6.29it/s]
1024
+
1025
+ 24%|██▍ | 40/168 [00:06<00:20, 6.31it/s]
1026
+
1027
+ 24%|██▍ | 41/168 [00:06<00:20, 6.31it/s]
1028
+
1029
+ 25%|██▌ | 42/168 [00:06<00:18, 6.93it/s]
1030
+
1031
+ 26%|██▌ | 43/168 [00:06<00:20, 5.98it/s]
1032
+
1033
+ 26%|██▌ | 44/168 [00:07<00:19, 6.44it/s]
1034
+
1035
+ 27%|██▋ | 45/168 [00:07<00:21, 5.70it/s]
1036
+
1037
+ 27%|██▋ | 46/168 [00:07<00:23, 5.26it/s]
1038
+
1039
+ 28%|██▊ | 47/168 [00:07<00:25, 4.74it/s]
1040
+
1041
+ 29%|██▊ | 48/168 [00:07<00:22, 5.29it/s]
1042
+
1043
+ 29%|██▉ | 49/168 [00:08<00:21, 5.65it/s]
1044
+
1045
+ 30%|██▉ | 50/168 [00:08<00:19, 6.09it/s]
1046
+
1047
+ 30%|███ | 51/168 [00:08<00:21, 5.46it/s]
1048
+
1049
+ 31%|███ | 52/168 [00:08<00:20, 5.70it/s]
1050
+
1051
+ 32%|███▏ | 53/168 [00:08<00:22, 5.02it/s]
1052
+
1053
+ 32%|███▏ | 54/168 [00:09<00:22, 5.11it/s]
1054
+
1055
+ 33%|███▎ | 55/168 [00:09<00:21, 5.22it/s]
1056
+
1057
+ 33%|███▎ | 56/168 [00:09<00:19, 5.75it/s]
1058
+
1059
+ 34%|███▍ | 57/168 [00:09<00:17, 6.50it/s]
1060
+
1061
+ 35%|███▍ | 58/168 [00:09<00:18, 5.82it/s]
1062
+
1063
+ 35%|███▌ | 59/168 [00:09<00:21, 5.01it/s]
1064
+
1065
+ 36%|███▌ | 60/168 [00:10<00:19, 5.44it/s]
1066
+
1067
+ 36%|███▋ | 61/168 [00:10<00:17, 6.24it/s]
1068
+
1069
+ 37%|███▋ | 62/168 [00:10<00:15, 6.80it/s]
1070
+
1071
+ 38%|███▊ | 63/168 [00:10<00:15, 6.99it/s]
1072
+
1073
+ 38%|███▊ | 64/168 [00:10<00:14, 7.14it/s]
1074
+
1075
+ 39%|███▊ | 65/168 [00:10<00:14, 7.02it/s]
1076
+
1077
+ 39%|███▉ | 66/168 [00:10<00:13, 7.43it/s]
1078
+
1079
+ 40%|███▉ | 67/168 [00:11<00:17, 5.85it/s]
1080
+
1081
+ 40%|████ | 68/168 [00:11<00:19, 5.09it/s]
1082
+
1083
+ 41%|████ | 69/168 [00:11<00:17, 5.57it/s]
1084
+
1085
+ 42%|████▏ | 70/168 [00:11<00:15, 6.14it/s]
1086
+
1087
+ 42%|████▏ | 71/168 [00:11<00:14, 6.50it/s]
1088
+
1089
+ 43%|████▎ | 72/168 [00:11<00:16, 5.78it/s]
1090
+
1091
+ 43%|████▎ | 73/168 [00:12<00:19, 5.00it/s]
1092
+
1093
+ 44%|████▍ | 74/168 [00:12<00:19, 4.95it/s]
1094
+
1095
+ 45%|████▍ | 75/168 [00:12<00:18, 5.12it/s]
1096
+
1097
+ 46%|████▌ | 77/168 [00:12<00:13, 6.52it/s]
1098
+
1099
+ 46%|████▋ | 78/168 [00:12<00:12, 7.01it/s]
1100
+
1101
+ 47%|████▋ | 79/168 [00:13<00:12, 7.09it/s]
1102
+
1103
+ 48%|████▊ | 80/168 [00:13<00:13, 6.46it/s]
1104
+
1105
+ 48%|████▊ | 81/168 [00:13<00:13, 6.39it/s]
1106
+
1107
+ 49%|████▉ | 82/168 [00:13<00:13, 6.33it/s]
1108
+
1109
+ 49%|████▉ | 83/168 [00:13<00:13, 6.39it/s]
1110
+
1111
+ 50%|█████ | 84/168 [00:13<00:14, 5.64it/s]
1112
+
1113
+ 51%|█████ | 85/168 [00:14<00:16, 5.00it/s]
1114
+
1115
+ 51%|█████ | 86/168 [00:14<00:14, 5.55it/s]
1116
+
1117
+ 52%|█████▏ | 87/168 [00:14<00:13, 6.03it/s]
1118
+
1119
+ 52%|█████▏ | 88/168 [00:14<00:12, 6.65it/s]
1120
+
1121
+ 53%|█████▎ | 89/168 [00:14<00:12, 6.47it/s]
1122
+
1123
+ 54%|█████▎ | 90/168 [00:14<00:11, 6.53it/s]
1124
+
1125
+ 54%|█████▍ | 91/168 [00:15<00:11, 6.55it/s]
1126
+
1127
+ 55%|█████▍ | 92/168 [00:15<00:10, 7.11it/s]
1128
+
1129
+ 55%|█████▌ | 93/168 [00:15<00:10, 7.24it/s]
1130
+
1131
+ 56%|█████▌ | 94/168 [00:15<00:10, 7.32it/s]
1132
+
1133
+ 57%|█████▋ | 95/168 [00:15<00:09, 7.37it/s]
1134
+
1135
+ 57%|█████▋ | 96/168 [00:15<00:12, 5.76it/s]
1136
+
1137
+ 58%|█████▊ | 97/168 [00:16<00:13, 5.12it/s]
1138
+
1139
+ 58%|█████▊ | 98/168 [00:16<00:14, 4.86it/s]
1140
+
1141
+ 59%|█████▉ | 99/168 [00:16<00:12, 5.42it/s]
1142
+
1143
+ 60%|█████▉ | 100/168 [00:16<00:11, 5.69it/s]
1144
+
1145
+ 60%|██████ | 101/168 [00:16<00:10, 6.38it/s]
1146
+
1147
+ 61%|██████ | 102/168 [00:16<00:10, 6.58it/s]
1148
+
1149
+ 61%|██████▏ | 103/168 [00:16<00:09, 7.16it/s]
1150
+
1151
+ 62%|██████▏ | 104/168 [00:17<00:11, 5.74it/s]
1152
+
1153
+ 62%|██████▎ | 105/168 [00:17<00:10, 6.08it/s]
1154
+
1155
+ 63%|██████▎ | 106/168 [00:17<00:09, 6.21it/s]
1156
+
1157
+ 64%|██████▎ | 107/168 [00:17<00:09, 6.45it/s]
1158
+
1159
+ 64%|██████▍ | 108/168 [00:17<00:09, 6.56it/s]
1160
+
1161
+ 65%|██████▍ | 109/168 [00:17<00:09, 6.31it/s]
1162
+
1163
+ 65%|██████▌ | 110/168 [00:18<00:08, 6.89it/s]
1164
+
1165
+ 66%|██████▌ | 111/168 [00:18<00:08, 6.36it/s]
1166
+
1167
+ 67%|██████▋ | 112/168 [00:18<00:08, 6.27it/s]
1168
+
1169
+ 67%|██████▋ | 113/168 [00:18<00:09, 6.00it/s]
1170
+
1171
+ 68%|██████▊ | 114/168 [00:18<00:08, 6.33it/s]
1172
+
1173
+ 68%|██████▊ | 115/168 [00:18<00:07, 6.64it/s]
1174
+
1175
+ 69%|██████▉ | 116/168 [00:19<00:08, 5.88it/s]
1176
+
1177
+ 70%|██████▉ | 117/168 [00:19<00:08, 6.22it/s]
1178
+
1179
+ 70%|███████ | 118/168 [00:19<00:08, 6.16it/s]
1180
+
1181
+ 71%|███████ | 119/168 [00:19<00:08, 5.46it/s]
1182
+
1183
+ 71%|███████▏ | 120/168 [00:19<00:09, 4.83it/s]
1184
+
1185
+ 72%|███████▏ | 121/168 [00:20<00:09, 5.00it/s]
1186
+
1187
+ 73%|███████▎ | 122/168 [00:20<00:09, 4.75it/s]
1188
+
1189
+ 73%|███████▎ | 123/168 [00:20<00:08, 5.45it/s]
1190
+
1191
+ 74%|███████▍ | 124/168 [00:20<00:08, 5.04it/s]
1192
+
1193
+ 74%|███████▍ | 125/168 [00:20<00:07, 5.50it/s]
1194
+
1195
+ 75%|███████▌ | 126/168 [00:20<00:06, 6.18it/s]
1196
+
1197
+ 76%|███████▌ | 127/168 [00:21<00:06, 6.57it/s]
1198
+
1199
+ 76%|███████▌ | 128/168 [00:21<00:07, 5.37it/s]
1200
+
1201
+ 77%|███████▋ | 129/168 [00:21<00:07, 5.42it/s]
1202
+
1203
+ 77%|███████▋ | 130/168 [00:21<00:07, 4.82it/s]
1204
+
1205
+ 78%|███████▊ | 131/168 [00:21<00:06, 5.58it/s]
1206
+
1207
+ 79%|███████▊ | 132/168 [00:22<00:06, 5.73it/s]
1208
+
1209
+ 79%|███████▉ | 133/168 [00:22<00:06, 5.66it/s]
1210
+
1211
+ 80%|███████▉ | 134/168 [00:22<00:05, 6.34it/s]
1212
+
1213
+ 80%|████████ | 135/168 [00:22<00:05, 6.50it/s]
1214
+
1215
+ 81%|████████ | 136/168 [00:22<00:04, 7.06it/s]
1216
+
1217
+ 82%|████████▏ | 137/168 [00:22<00:05, 5.69it/s]
1218
+
1219
+ 82%|████████▏ | 138/168 [00:23<00:05, 5.86it/s]
1220
+
1221
+ 83%|████████▎ | 139/168 [00:23<00:04, 6.27it/s]
1222
+
1223
+ 83%|████████▎ | 140/168 [00:23<00:04, 6.96it/s]
1224
+
1225
+ 84%|████████▍ | 141/168 [00:23<00:03, 7.29it/s]
1226
+
1227
+ 85%|████████▍ | 142/168 [00:23<00:03, 7.25it/s]
1228
+
1229
+ 85%|████████▌ | 143/168 [00:23<00:04, 5.77it/s]
1230
+
1231
+ 86%|████████▌ | 144/168 [00:24<00:04, 5.05it/s]
1232
+
1233
+ 86%|████████▋ | 145/168 [00:24<00:04, 5.33it/s]
1234
+
1235
+ 87%|████████▋ | 146/168 [00:24<00:03, 5.79it/s]
1236
+
1237
+ 88%|████████▊ | 147/168 [00:24<00:03, 6.15it/s]
1238
+
1239
+ 88%|████████▊ | 148/168 [00:24<00:02, 6.81it/s]
1240
+
1241
+ 89%|████████▊ | 149/168 [00:24<00:03, 5.49it/s]
1242
+
1243
+ 90%|████████▉ | 151/168 [00:25<00:03, 5.66it/s]
1244
+
1245
+ 91%|█████████ | 153/168 [00:25<00:02, 5.77it/s]
1246
+
1247
+ 92%|█████████▏| 154/168 [00:25<00:02, 6.05it/s]
1248
+
1249
+ 92%|█████████▏| 155/168 [00:25<00:02, 5.80it/s]
1250
+
1251
+ 93%|█████████▎| 156/168 [00:26<00:02, 5.39it/s]
1252
+
1253
+ 93%|█████████▎| 157/168 [00:26<00:02, 5.36it/s]
1254
+
1255
+ 94%|█████████▍| 158/168 [00:26<00:01, 6.07it/s]
1256
+
1257
+ 95%|█████████▍| 159/168 [00:26<00:01, 6.11it/s]
1258
+
1259
+ 95%|█████████▌| 160/168 [00:26<00:01, 5.82it/s]
1260
+
1261
+ 96%|█████████▌| 161/168 [00:26<00:01, 5.74it/s]
1262
+
1263
+ 96%|█████████▋| 162/168 [00:27<00:01, 5.98it/s]
1264
+
1265
+ 97%|█████████▋| 163/168 [00:27<00:00, 5.17it/s]
1266
+
1267
+ 98%|█████████▊| 164/168 [00:27<00:00, 5.61it/s]
1268
+
1269
+ 98%|█████████▊| 165/168 [00:27<00:00, 6.31it/s]
1270
+
1271
+ 99%|█████████▉| 166/168 [00:27<00:00, 6.78it/s]
1272
+
1273
+ 100%|██████████| 168/168 [00:28<00:00, 6.50it/s]
1274
+
1275
+
1276
+
1277
+ {'eval_loss': 0.35242682695388794, 'eval_runtime': 28.2403, 'eval_samples_per_second': 11.862, 'eval_steps_per_second': 5.949, 'epoch': 1.0}
1278
+
1279
+ 100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
1280
+
1281
+ 100%|██████████| 168/168 [00:28<00:00, 6.50it/s]
1282
+
1283
+ [INFO|trainer.py:1947] 2024-01-04 10:03:26,926 >>
1284
+
1285
+ Training completed. Do not forget to share your model on huggingface.co/models =)
1286
+
1287
+
1288
+
1289
+
1290
+ {'train_runtime': 553.4721, 'train_samples_per_second': 5.44, 'train_steps_per_second': 0.679, 'train_loss': 0.4441075046011742, 'epoch': 1.0}
1291
+
1292
+ 100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
1293
+ 100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
1294
+ [INFO|trainer.py:2889] 2024-01-04 10:03:26,930 >> Saving model checkpoint to ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora
1295
+ [INFO|tokenization_utils_base.py:2432] 2024-01-04 10:03:26,973 >> tokenizer config file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/tokenizer_config.json
1296
+ [INFO|tokenization_utils_base.py:2441] 2024-01-04 10:03:26,974 >> Special tokens file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/special_tokens_map.json
1297
+ [INFO|tokenization_utils_base.py:2492] 2024-01-04 10:03:26,974 >> added tokens file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/added_tokens.json
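Editor's note: with the adapter and tokenizer saved, the checkpoint can be reloaded for inference by stacking the LoRA weights on the base model. A minimal sketch with peft, using the paths from this log (trust_remote_code and the optional merge are assumptions about how you want to serve it):

    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    adapter = "./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora"
    base = AutoModelForCausalLM.from_pretrained("./models/dolphin-2_6-phi-2", trust_remote_code=True)
    model = PeftModel.from_pretrained(base, adapter)
    model = model.merge_and_unload()  # optional: fold the LoRA deltas into the base weights
    tok = AutoTokenizer.from_pretrained(adapter)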
1298
+ ***** train metrics *****
1299
+ epoch = 1.0
1300
+ train_loss = 0.4441
1301
+ train_runtime = 0:09:13.47
1302
+ train_samples_per_second = 5.44
1303
+ train_steps_per_second = 0.679
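Editor's note: the throughput rows follow directly from the counts above, 3,011 samples and 376 steps over the 553.47 s run:

    runtime, samples, steps = 553.4721, 3011, 376
    print(round(samples / runtime, 2))  # 5.44  train_samples_per_second
    print(round(steps / runtime, 3))    # 0.679 train_steps_per_second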
1304
+ Figure saved: ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/training_loss.png
1305
+ Figure saved: ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/training_eval_loss.png
1306
+ [INFO|trainer.py:3166] 2024-01-04 10:03:27,895 >> ***** Running Evaluation *****
1307
+ [INFO|trainer.py:3168] 2024-01-04 10:03:27,895 >> Num examples = 335
1308
+ [INFO|trainer.py:3171] 2024-01-04 10:03:27,895 >> Batch size = 1
1309
+
1310
+ 0%| | 0/168 [00:00<?, ?it/s]
1311
+ 1%| | 2/168 [00:00<00:16, 10.31it/s]
1312
+ 2%|▏ | 4/168 [00:00<00:21, 7.58it/s]
1313
+ 3%|▎ | 5/168 [00:00<00:27, 5.85it/s]
1314
+ 4%|▎ | 6/168 [00:00<00:24, 6.54it/s]
1315
+ 4%|▍ | 7/168 [00:01<00:28, 5.68it/s]
1316
+ 5%|▍ | 8/168 [00:01<00:27, 5.80it/s]
1317
+ 5%|▌ | 9/168 [00:01<00:31, 5.01it/s]
1318
+ 6%|▌ | 10/168 [00:01<00:33, 4.73it/s]
1319
+ 7%|▋ | 11/168 [00:01<00:29, 5.32it/s]
1320
+ 7%|▋ | 12/168 [00:02<00:32, 4.81it/s]
1321
+ 8%|▊ | 13/168 [00:02<00:29, 5.34it/s]
1322
+ 8%|▊ | 14/168 [00:02<00:26, 5.85it/s]
1323
+ 9%|▉ | 15/168 [00:02<00:29, 5.23it/s]
1324
+ 10%|▉ | 16/168 [00:02<00:27, 5.50it/s]
1325
+ 10%|█ | 17/168 [00:02<00:25, 5.90it/s]
1326
+ 11%|█ | 18/168 [00:03<00:24, 6.21it/s]
1327
+ 11%|█▏ | 19/168 [00:03<00:23, 6.45it/s]
1328
+ 12%|█▏ | 20/168 [00:03<00:24, 6.14it/s]
1329
+ 12%|█▎ | 21/168 [00:03<00:28, 5.19it/s]
1330
+ 13%|█▎ | 22/168 [00:03<00:26, 5.46it/s]
1331
+ 14%|█▍ | 24/168 [00:04<00:21, 6.74it/s]
1332
+ 15%|█▍ | 25/168 [00:04<00:19, 7.15it/s]
1333
+ 15%|█▌ | 26/168 [00:04<00:20, 6.95it/s]
1334
+ 16%|█▌ | 27/168 [00:04<00:24, 5.74it/s]
1335
+ 17%|█▋ | 28/168 [00:04<00:23, 6.08it/s]
1336
+ 17%|█▋ | 29/168 [00:04<00:21, 6.38it/s]
1337
+ 18%|█▊ | 30/168 [00:05<00:21, 6.49it/s]
1338
+ 18%|█▊ | 31/168 [00:05<00:20, 6.69it/s]
1339
+ 19%|█▉ | 32/168 [00:05<00:20, 6.76it/s]
1340
+ 20%|██ | 34/168 [00:05<00:18, 7.39it/s]
1341
+ 21%|██ | 35/168 [00:05<00:18, 7.31it/s]
1342
+ 21%|██▏ | 36/168 [00:05<00:17, 7.68it/s]
1343
+ 22%|██▏ | 37/168 [00:05<00:17, 7.57it/s]
1344
+ 23%|██▎ | 38/168 [00:06<00:17, 7.43it/s]
1345
+ 23%|██▎ | 39/168 [00:06<00:20, 6.28it/s]
1346
+ 24%|██▍ | 40/168 [00:06<00:20, 6.31it/s]
1347
+ 24%|██▍ | 41/168 [00:06<00:20, 6.30it/s]
1348
+ 25%|██▌ | 42/168 [00:06<00:18, 6.89it/s]
1349
+ 26%|██▌ | 43/168 [00:06<00:20, 5.96it/s]
1350
+ 26%|██▌ | 44/168 [00:07<00:19, 6.46it/s]
1351
+ 27%|██▋ | 45/168 [00:07<00:21, 5.72it/s]
1352
+ 27%|██▋ | 46/168 [00:07<00:23, 5.28it/s]
1353
+ 28%|██▊ | 47/168 [00:07<00:25, 4.73it/s]
1354
+ 29%|██▊ | 48/168 [00:07<00:22, 5.28it/s]
1355
+ 29%|██▉ | 49/168 [00:08<00:21, 5.64it/s]
1356
+ 30%|██▉ | 50/168 [00:08<00:19, 6.08it/s]
1357
+ 30%|███ | 51/168 [00:08<00:21, 5.42it/s]
1358
+ 31%|███ | 52/168 [00:08<00:20, 5.68it/s]
1359
+ 32%|███▏ | 53/168 [00:08<00:22, 5.00it/s]
1360
+ 32%|███▏ | 54/168 [00:09<00:22, 5.10it/s]
1361
+ 33%|███▎ | 55/168 [00:09<00:21, 5.22it/s]
1362
+ 33%|███▎ | 56/168 [00:09<00:19, 5.75it/s]
1363
+ 34%|███▍ | 57/168 [00:09<00:17, 6.50it/s]
1364
+ 35%|███▍ | 58/168 [00:09<00:18, 5.82it/s]
1365
+ 35%|███▌ | 59/168 [00:09<00:21, 5.01it/s]
1366
+ 36%|███▌ | 60/168 [00:10<00:19, 5.43it/s]
1367
+ 36%|███▋ | 61/168 [00:10<00:17, 6.23it/s]
1368
+ 37%|███▋ | 62/168 [00:10<00:15, 6.80it/s]
1369
+ 38%|███▊ | 63/168 [00:10<00:15, 6.98it/s]
1370
+ 38%|███▊ | 64/168 [00:10<00:14, 7.13it/s]
1371
+ 39%|███▊ | 65/168 [00:10<00:14, 6.99it/s]
1372
+ 39%|███▉ | 66/168 [00:10<00:13, 7.40it/s]
1373
+ 40%|███▉ | 67/168 [00:11<00:17, 5.83it/s]
1374
+ 40%|████ | 68/168 [00:11<00:19, 5.08it/s]
1375
+ 41%|████ | 69/168 [00:11<00:17, 5.55it/s]
1376
+ 42%|████▏ | 70/168 [00:11<00:15, 6.15it/s]
1377
+ 42%|████▏ | 71/168 [00:11<00:14, 6.50it/s]
1378
+ 43%|████▎ | 72/168 [00:11<00:16, 5.78it/s]
1379
+ 43%|████▎ | 73/168 [00:12<00:19, 4.99it/s]
1380
+ 44%|████▍ | 74/168 [00:12<00:19, 4.94it/s]
1381
+ 45%|████▍ | 75/168 [00:12<00:18, 5.11it/s]
1382
+ 46%|████▌ | 77/168 [00:12<00:13, 6.51it/s]
1383
+ 46%|████▋ | 78/168 [00:12<00:12, 7.02it/s]
1384
+ 47%|████▋ | 79/168 [00:13<00:12, 7.07it/s]
1385
+ 48%|████▊ | 80/168 [00:13<00:13, 6.42it/s]
1386
+ 48%|████▊ | 81/168 [00:13<00:13, 6.36it/s]
1387
+ 49%|████▉ | 82/168 [00:13<00:13, 6.30it/s]
1388
+ 49%|████▉ | 83/168 [00:13<00:13, 6.38it/s]
1389
+ 50%|█████ | 84/168 [00:13<00:14, 5.63it/s]
1390
+ 51%|█████ | 85/168 [00:14<00:16, 4.99it/s]
1391
+ 51%|█████ | 86/168 [00:14<00:14, 5.53it/s]
1392
+ 52%|█████▏ | 87/168 [00:14<00:13, 6.01it/s]
1393
+ 52%|█████▏ | 88/168 [00:14<00:12, 6.64it/s]
1394
+ 53%|█████▎ | 89/168 [00:14<00:12, 6.46it/s]
1395
+ 54%|█████▎ | 90/168 [00:14<00:11, 6.51it/s]
1396
+ 54%|█████▍ | 91/168 [00:15<00:11, 6.53it/s]
1397
+ 55%|█████▍ | 92/168 [00:15<00:10, 7.09it/s]
1398
+ 55%|█████▌ | 93/168 [00:15<00:10, 7.20it/s]
1399
+ 56%|█████▌ | 94/168 [00:15<00:10, 7.29it/s]
1400
+ 57%|█████▋ | 95/168 [00:15<00:09, 7.34it/s]
1401
+ 57%|█████▋ | 96/168 [00:15<00:12, 5.72it/s]
1402
+ 58%|█████▊ | 97/168 [00:16<00:13, 5.09it/s]
1403
+ 58%|█████▊ | 98/168 [00:16<00:14, 4.85it/s]
1404
+ 59%|█████▉ | 99/168 [00:16<00:12, 5.42it/s]
1405
+ 60%|█████▉ | 100/168 [00:16<00:11, 5.69it/s]
1406
+ 60%|██████ | 101/168 [00:16<00:10, 6.38it/s]
1407
+ 61%|██████ | 102/168 [00:16<00:10, 6.57it/s]
1408
+ 61%|██████▏ | 103/168 [00:16<00:09, 7.15it/s]
1409
+ 62%|██████▏ | 104/168 [00:17<00:11, 5.73it/s]
1410
+ 62%|██████▎ | 105/168 [00:17<00:10, 6.08it/s]
1411
+ 63%|██████▎ | 106/168 [00:17<00:09, 6.21it/s]
1412
+ 64%|██████▎ | 107/168 [00:17<00:09, 6.46it/s]
1413
+ 64%|██████▍ | 108/168 [00:17<00:09, 6.60it/s]
1414
+ 65%|██████▍ | 109/168 [00:17<00:09, 6.36it/s]
1415
+ 65%|██████▌ | 110/168 [00:18<00:08, 6.97it/s]
1416
+ 66%|██████▌ | 111/168 [00:18<00:08, 6.43it/s]
1417
+ 67%|██████▋ | 112/168 [00:18<00:08, 6.29it/s]
1418
+ 67%|██████▋ | 113/168 [00:18<00:09, 6.02it/s]
1419
+ 68%|██████▊ | 114/168 [00:18<00:08, 6.34it/s]
1420
+ 68%|██████▊ | 115/168 [00:18<00:07, 6.66it/s]
1421
+ 69%|██████▉ | 116/168 [00:19<00:08, 5.85it/s]
1422
+ 70%|██████▉ | 117/168 [00:19<00:08, 6.21it/s]
1423
+ 70%|███████ | 118/168 [00:19<00:08, 6.16it/s]
1424
+ 71%|███████ | 119/168 [00:19<00:08, 5.47it/s]
1425
+ 71%|███████▏ | 120/168 [00:19<00:09, 4.83it/s]
1426
+ 72%|███████▏ | 121/168 [00:20<00:09, 4.99it/s]
1427
+ 73%|███████▎ | 122/168 [00:20<00:09, 4.75it/s]
1428
+ 73%|███████▎ | 123/168 [00:20<00:08, 5.47it/s]
1429
+ 74%|███████▍ | 124/168 [00:20<00:08, 5.03it/s]
1430
+ 74%|███████▍ | 125/168 [00:20<00:07, 5.49it/s]
1431
+ 75%|███████▌ | 126/168 [00:20<00:06, 6.18it/s]
1432
+ 76%|███████▌ | 127/168 [00:21<00:06, 6.57it/s]
1433
+ 76%|███████▌ | 128/168 [00:21<00:07, 5.39it/s]
1434
+ 77%|███████▋ | 129/168 [00:21<00:07, 5.44it/s]
1435
+ 77%|███████▋ | 130/168 [00:21<00:07, 4.82it/s]
1436
+ 78%|███████▊ | 131/168 [00:21<00:06, 5.59it/s]
1437
+ 79%|███████▊ | 132/168 [00:22<00:06, 5.74it/s]
1438
+ 79%|███████▉ | 133/168 [00:22<00:06, 5.67it/s]
1439
+ 80%|███████▉ | 134/168 [00:22<00:05, 6.34it/s]
1440
+ 80%|████████ | 135/168 [00:22<00:05, 6.52it/s]
1441
+ 81%|████████ | 136/168 [00:22<00:04, 7.07it/s]
1442
+ 82%|████████▏ | 137/168 [00:22<00:05, 5.69it/s]
1443
+ 82%|████████▏ | 138/168 [00:23<00:05, 5.85it/s]
1444
+ 83%|████████▎ | 139/168 [00:23<00:04, 6.27it/s]
1445
+ 83%|████████▎ | 140/168 [00:23<00:04, 6.96it/s]
1446
+ 84%|████████▍ | 141/168 [00:23<00:03, 7.33it/s]
1447
+ 85%|████████▍ | 142/168 [00:23<00:03, 7.28it/s]
1448
+ 85%|████████▌ | 143/168 [00:23<00:04, 5.78it/s]
1449
+ 86%|████████▌ | 144/168 [00:24<00:04, 5.05it/s]
1450
+ 86%|████████▋ | 145/168 [00:24<00:04, 5.33it/s]
1451
+ 87%|████████▋ | 146/168 [00:24<00:03, 5.79it/s]
1452
+ 88%|████████▊ | 147/168 [00:24<00:03, 6.16it/s]
1453
+ 88%|████████▊ | 148/168 [00:24<00:02, 6.82it/s]
1454
+ 89%|████████▊ | 149/168 [00:24<00:03, 5.53it/s]
1455
+ 90%|████████▉ | 151/168 [00:25<00:02, 5.67it/s]
1456
+ 91%|█████████ | 153/168 [00:25<00:02, 5.77it/s]
1457
+ 92%|█████████▏| 154/168 [00:25<00:02, 6.06it/s]
1458
+ 92%|█████████▏| 155/168 [00:25<00:02, 5.79it/s]
1459
+ 93%|█████████▎| 156/168 [00:26<00:02, 5.38it/s]
1460
+ 93%|█████████▎| 157/168 [00:26<00:02, 5.35it/s]
1461
+ 94%|█████████▍| 158/168 [00:26<00:01, 6.06it/s]
1462
+ 95%|█████████▍| 159/168 [00:26<00:01, 6.10it/s]
1463
+ 95%|█████████▌| 160/168 [00:26<00:01, 5.81it/s]
1464
+ 96%|█████████▌| 161/168 [00:26<00:01, 5.73it/s]
1465
+ 96%|█████████▋| 162/168 [00:27<00:01, 5.98it/s]
1466
+ 97%|█████████▋| 163/168 [00:27<00:00, 5.16it/s]
1467
+ 98%|█████████▊| 164/168 [00:27<00:00, 5.61it/s]
1468
+ 98%|█████████▊| 165/168 [00:27<00:00, 6.31it/s]
1469
+ 99%|█████████▉| 166/168 [00:27<00:00, 6.79it/s]
1470
+ 100%|██████████| 168/168 [00:28<00:00, 6.51it/s]
1471
+ 100%|██████████| 168/168 [00:28<00:00, 5.99it/s]
1472
+ ***** eval metrics *****
1473
+ epoch = 1.0
1474
+ eval_loss = 0.3524
1475
+ eval_runtime = 0:00:28.24
1476
+ eval_samples_per_second = 11.859
1477
+ eval_steps_per_second = 5.947
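Editor's note: the evaluation numbers check out the same way: with batch size 1 split over 2 devices (the same inferred world size as above), 335 examples become ceil(335/2) = 168 steps per device, matching the progress bar, and dividing by the ~28.24 s runtime reproduces the throughput:

    import math
    examples, devices, runtime = 335, 2, 28.24
    steps = math.ceil(examples / devices)  # 168, as in the progress bar
    print(round(examples / runtime, 2))    # ~11.86 eval_samples_per_second
    print(round(steps / runtime, 2))       # ~5.95  eval_steps_per_second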
1478
+ [INFO|modelcard.py:452] 2024-01-04 10:03:56,150 >> Dropping the following result as it does not have all the necessary fields:
1479
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.4441075046011742,
4
+ "train_runtime": 553.4721,
5
+ "train_samples_per_second": 5.44,
6
+ "train_steps_per_second": 0.679
7
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 376, "loss": 1.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.991278696516879e-05, "epoch": 0.03, "percentage": 2.66, "elapsed_time": "0:00:14", "remaining_time": "0:08:49"}
2
+ {"current_steps": 20, "total_steps": 376, "loss": 0.881, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9651756349750716e-05, "epoch": 0.05, "percentage": 5.32, "elapsed_time": "0:00:28", "remaining_time": "0:08:24"}
3
+ {"current_steps": 30, "total_steps": 376, "loss": 0.7979, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9218729375518135e-05, "epoch": 0.08, "percentage": 7.98, "elapsed_time": "0:00:43", "remaining_time": "0:08:16"}
4
+ {"current_steps": 40, "total_steps": 376, "loss": 0.7022, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.861672729019797e-05, "epoch": 0.11, "percentage": 10.64, "elapsed_time": "0:00:57", "remaining_time": "0:08:05"}
5
+ {"current_steps": 50, "total_steps": 376, "loss": 0.5844, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.784995028809707e-05, "epoch": 0.13, "percentage": 13.3, "elapsed_time": "0:01:11", "remaining_time": "0:07:45"}
6
+ {"current_steps": 60, "total_steps": 376, "loss": 0.4454, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.692374820516679e-05, "epoch": 0.16, "percentage": 15.96, "elapsed_time": "0:01:24", "remaining_time": "0:07:24"}
7
+ {"current_steps": 70, "total_steps": 376, "loss": 0.4076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.584458319296868e-05, "epoch": 0.19, "percentage": 18.62, "elapsed_time": "0:01:38", "remaining_time": "0:07:12"}
8
+ {"current_steps": 80, "total_steps": 376, "loss": 0.4111, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4619984631966524e-05, "epoch": 0.21, "percentage": 21.28, "elapsed_time": "0:01:52", "remaining_time": "0:06:56"}
9
+ {"current_steps": 90, "total_steps": 376, "loss": 0.4115, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3258496598716736e-05, "epoch": 0.24, "percentage": 23.94, "elapsed_time": "0:02:07", "remaining_time": "0:06:43"}
10
+ {"current_steps": 100, "total_steps": 376, "loss": 0.3566, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.176961825348059e-05, "epoch": 0.27, "percentage": 26.6, "elapsed_time": "0:02:19", "remaining_time": "0:06:25"}
11
+ {"current_steps": 110, "total_steps": 376, "loss": 0.4302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.016373756417669e-05, "epoch": 0.29, "percentage": 29.26, "elapsed_time": "0:02:33", "remaining_time": "0:06:11"}
12
+ {"current_steps": 120, "total_steps": 376, "loss": 0.4271, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.845205882908432e-05, "epoch": 0.32, "percentage": 31.91, "elapsed_time": "0:02:47", "remaining_time": "0:05:58"}
13
+ {"current_steps": 130, "total_steps": 376, "loss": 0.4625, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6646524503974955e-05, "epoch": 0.35, "percentage": 34.57, "elapsed_time": "0:03:02", "remaining_time": "0:05:45"}
14
+ {"current_steps": 140, "total_steps": 376, "loss": 0.5066, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.475973187908737e-05, "epoch": 0.37, "percentage": 37.23, "elapsed_time": "0:03:17", "remaining_time": "0:05:32"}
15
+ {"current_steps": 150, "total_steps": 376, "loss": 0.3887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.280484518729466e-05, "epoch": 0.4, "percentage": 39.89, "elapsed_time": "0:03:32", "remaining_time": "0:05:19"}
16
+ {"current_steps": 160, "total_steps": 376, "loss": 0.3675, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.079550375668821e-05, "epoch": 0.42, "percentage": 42.55, "elapsed_time": "0:03:44", "remaining_time": "0:05:03"}
17
+ {"current_steps": 170, "total_steps": 376, "loss": 0.4095, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8745726848402036e-05, "epoch": 0.45, "percentage": 45.21, "elapsed_time": "0:03:59", "remaining_time": "0:04:49"}
18
+ {"current_steps": 180, "total_steps": 376, "loss": 0.3782, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6669815843628042e-05, "epoch": 0.48, "percentage": 47.87, "elapsed_time": "0:04:13", "remaining_time": "0:04:35"}
19
+ {"current_steps": 190, "total_steps": 376, "loss": 0.4195, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4582254462267476e-05, "epoch": 0.5, "percentage": 50.53, "elapsed_time": "0:04:28", "remaining_time": "0:04:22"}
20
+ {"current_steps": 200, "total_steps": 376, "loss": 0.3392, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2497607709397543e-05, "epoch": 0.53, "percentage": 53.19, "elapsed_time": "0:04:43", "remaining_time": "0:04:09"}
21
+ {"current_steps": 210, "total_steps": 376, "loss": 0.3347, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0430420254607748e-05, "epoch": 0.56, "percentage": 55.85, "elapsed_time": "0:04:56", "remaining_time": "0:03:54"}
22
+ {"current_steps": 220, "total_steps": 376, "loss": 0.4117, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8395114953217852e-05, "epoch": 0.58, "percentage": 58.51, "elapsed_time": "0:05:09", "remaining_time": "0:03:39"}
23
+ {"current_steps": 230, "total_steps": 376, "loss": 0.3772, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.640589221739926e-05, "epoch": 0.61, "percentage": 61.17, "elapsed_time": "0:05:23", "remaining_time": "0:03:25"}
+ {"current_steps": 240, "total_steps": 376, "loss": 0.4403, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.447663093929163e-05, "epoch": 0.64, "percentage": 63.83, "elapsed_time": "0:05:36", "remaining_time": "0:03:10"}
+ {"current_steps": 250, "total_steps": 376, "loss": 0.3867, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2620791657378664e-05, "epoch": 0.66, "percentage": 66.49, "elapsed_time": "0:05:50", "remaining_time": "0:02:56"}
+ {"current_steps": 260, "total_steps": 376, "loss": 0.3688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0851322641735118e-05, "epoch": 0.69, "percentage": 69.15, "elapsed_time": "0:06:04", "remaining_time": "0:02:42"}
+ {"current_steps": 270, "total_steps": 376, "loss": 0.3655, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.180569553392535e-06, "epoch": 0.72, "percentage": 71.81, "elapsed_time": "0:06:19", "remaining_time": "0:02:28"}
+ {"current_steps": 280, "total_steps": 376, "loss": 0.4144, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.620189308133943e-06, "epoch": 0.74, "percentage": 74.47, "elapsed_time": "0:06:33", "remaining_time": "0:02:15"}
+ {"current_steps": 290, "total_steps": 376, "loss": 0.3298, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.181068745693716e-06, "epoch": 0.77, "percentage": 77.13, "elapsed_time": "0:06:47", "remaining_time": "0:02:00"}
+ {"current_steps": 300, "total_steps": 376, "loss": 0.3337, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.873248671810928e-06, "epoch": 0.8, "percentage": 79.79, "elapsed_time": "0:07:00", "remaining_time": "0:01:46"}
+ {"current_steps": 310, "total_steps": 376, "loss": 0.3217, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7058538030980942e-06, "epoch": 0.82, "percentage": 82.45, "elapsed_time": "0:07:13", "remaining_time": "0:01:32"}
+ {"current_steps": 320, "total_steps": 376, "loss": 0.3222, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.687029103502972e-06, "epoch": 0.85, "percentage": 85.11, "elapsed_time": "0:07:26", "remaining_time": "0:01:18"}
+ {"current_steps": 330, "total_steps": 376, "loss": 0.3989, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.823882956546566e-06, "epoch": 0.88, "percentage": 87.77, "elapsed_time": "0:07:41", "remaining_time": "0:01:04"}
+ {"current_steps": 340, "total_steps": 376, "loss": 0.3805, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1224375698271894e-06, "epoch": 0.9, "percentage": 90.43, "elapsed_time": "0:07:55", "remaining_time": "0:00:50"}
+ {"current_steps": 350, "total_steps": 376, "loss": 0.4108, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.875869578203824e-07, "epoch": 0.93, "percentage": 93.09, "elapsed_time": "0:08:08", "remaining_time": "0:00:36"}
+ {"current_steps": 360, "total_steps": 376, "loss": 0.3578, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.230627961304993e-07, "epoch": 0.96, "percentage": 95.74, "elapsed_time": "0:08:21", "remaining_time": "0:00:22"}
+ {"current_steps": 370, "total_steps": 376, "loss": 0.3453, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1408385430356516e-08, "epoch": 0.98, "percentage": 98.4, "elapsed_time": "0:08:36", "remaining_time": "0:00:08"}
+ {"current_steps": 376, "total_steps": 376, "loss": null, "eval_loss": 0.35242682695388794, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:08:44", "remaining_time": "0:00:00"}
+ {"current_steps": 376, "total_steps": 376, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:08:44", "remaining_time": "0:00:00"}
+ {"current_steps": 168, "total_steps": 168, "loss": null, "eval_loss": 0.35242682695388794, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:09:42", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,260 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.99867197875166,
+   "eval_steps": 500,
+   "global_step": 376,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.03, "learning_rate": 4.991278696516879e-05, "loss": 1.0017, "step": 10},
+     {"epoch": 0.05, "learning_rate": 4.9651756349750716e-05, "loss": 0.881, "step": 20},
+     {"epoch": 0.08, "learning_rate": 4.9218729375518135e-05, "loss": 0.7979, "step": 30},
+     {"epoch": 0.11, "learning_rate": 4.861672729019797e-05, "loss": 0.7022, "step": 40},
+     {"epoch": 0.13, "learning_rate": 4.784995028809707e-05, "loss": 0.5844, "step": 50},
+     {"epoch": 0.16, "learning_rate": 4.692374820516679e-05, "loss": 0.4454, "step": 60},
+     {"epoch": 0.19, "learning_rate": 4.584458319296868e-05, "loss": 0.4076, "step": 70},
+     {"epoch": 0.21, "learning_rate": 4.4619984631966524e-05, "loss": 0.4111, "step": 80},
+     {"epoch": 0.24, "learning_rate": 4.3258496598716736e-05, "loss": 0.4115, "step": 90},
+     {"epoch": 0.27, "learning_rate": 4.176961825348059e-05, "loss": 0.3566, "step": 100},
+     {"epoch": 0.29, "learning_rate": 4.016373756417669e-05, "loss": 0.4302, "step": 110},
+     {"epoch": 0.32, "learning_rate": 3.845205882908432e-05, "loss": 0.4271, "step": 120},
+     {"epoch": 0.35, "learning_rate": 3.6646524503974955e-05, "loss": 0.4625, "step": 130},
+     {"epoch": 0.37, "learning_rate": 3.475973187908737e-05, "loss": 0.5066, "step": 140},
+     {"epoch": 0.4, "learning_rate": 3.280484518729466e-05, "loss": 0.3887, "step": 150},
+     {"epoch": 0.42, "learning_rate": 3.079550375668821e-05, "loss": 0.3675, "step": 160},
+     {"epoch": 0.45, "learning_rate": 2.8745726848402036e-05, "loss": 0.4095, "step": 170},
+     {"epoch": 0.48, "learning_rate": 2.6669815843628042e-05, "loss": 0.3782, "step": 180},
+     {"epoch": 0.5, "learning_rate": 2.4582254462267476e-05, "loss": 0.4195, "step": 190},
+     {"epoch": 0.53, "learning_rate": 2.2497607709397543e-05, "loss": 0.3392, "step": 200},
+     {"epoch": 0.56, "learning_rate": 2.0430420254607748e-05, "loss": 0.3347, "step": 210},
+     {"epoch": 0.58, "learning_rate": 1.8395114953217852e-05, "loss": 0.4117, "step": 220},
+     {"epoch": 0.61, "learning_rate": 1.640589221739926e-05, "loss": 0.3772, "step": 230},
+     {"epoch": 0.64, "learning_rate": 1.447663093929163e-05, "loss": 0.4403, "step": 240},
+     {"epoch": 0.66, "learning_rate": 1.2620791657378664e-05, "loss": 0.3867, "step": 250},
+     {"epoch": 0.69, "learning_rate": 1.0851322641735118e-05, "loss": 0.3688, "step": 260},
+     {"epoch": 0.72, "learning_rate": 9.180569553392535e-06, "loss": 0.3655, "step": 270},
+     {"epoch": 0.74, "learning_rate": 7.620189308133943e-06, "loss": 0.4144, "step": 280},
+     {"epoch": 0.77, "learning_rate": 6.181068745693716e-06, "loss": 0.3298, "step": 290},
+     {"epoch": 0.8, "learning_rate": 4.873248671810928e-06, "loss": 0.3337, "step": 300},
+     {"epoch": 0.82, "learning_rate": 3.7058538030980942e-06, "loss": 0.3217, "step": 310},
+     {"epoch": 0.85, "learning_rate": 2.687029103502972e-06, "loss": 0.3222, "step": 320},
+     {"epoch": 0.88, "learning_rate": 1.823882956546566e-06, "loss": 0.3989, "step": 330},
+     {"epoch": 0.9, "learning_rate": 1.1224375698271894e-06, "loss": 0.3805, "step": 340},
+     {"epoch": 0.93, "learning_rate": 5.875869578203824e-07, "loss": 0.4108, "step": 350},
+     {"epoch": 0.96, "learning_rate": 2.230627961304993e-07, "loss": 0.3578, "step": 360},
+     {"epoch": 0.98, "learning_rate": 3.1408385430356516e-08, "loss": 0.3453, "step": 370},
+     {"epoch": 1.0, "eval_loss": 0.35242682695388794, "eval_runtime": 28.2403, "eval_samples_per_second": 11.862, "eval_steps_per_second": 5.949, "step": 376},
+     {"epoch": 1.0, "step": 376, "total_flos": 2.22435081191424e+16, "train_loss": 0.4441075046011742, "train_runtime": 553.4721, "train_samples_per_second": 5.44, "train_steps_per_second": 0.679}
+   ],
+   "logging_steps": 10,
+   "max_steps": 376,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 1000,
+   "total_flos": 2.22435081191424e+16,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
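The learning_rate values in log_history decay smoothly to ~3e-08 by step 370 and are consistent with a half-cosine schedule from a base rate of 5e-5 with no warmup; both the base rate and the absence of warmup are inferred from the logged numbers, not stated in this file. A small consistency check under that assumption:

import json
import math

BASE_LR = 5e-5  # assumed base learning rate, inferred from the logged values

with open("trainer_state.json") as f:
    state = json.load(f)

total = state["max_steps"]  # 376
for rec in state["log_history"]:
    if "learning_rate" not in rec:
        continue  # skip the eval and train-summary entries
    expected = 0.5 * BASE_LR * (1 + math.cos(math.pi * rec["step"] / total))
    # tolerate small floating-point differences between logger and re-computation
    assert math.isclose(expected, rec["learning_rate"], rel_tol=1e-4), rec["step"]

For example, step 190 gives 0.5 * 5e-5 * (1 + cos(pi * 190/376)) ≈ 2.458e-05, matching the logged 2.4582254462267476e-05.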
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43631845fad65fe78dc251bc687da889571e404213e9b9c084870732e93c38ea
+ size 4984
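training_args.bin is stored via Git LFS, so only the pointer appears in the diff. transformers writes this file with torch.save, so the pickled TrainingArguments can be loaded back for inspection; a sketch (recent PyTorch versions need weights_only=False to unpickle arbitrary objects, and the printed fields assume a standard TrainingArguments):

import torch

# Load the pickled TrainingArguments saved alongside the checkpoint.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)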
training_eval_loss.png ADDED
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff