diff --git a/README.md b/README.md index 154df8298fab5ecf322016157858e08cd1bccbe1..bd6cf4fd9a9be6e372b435a19ee0572855d8507e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,20 @@ --- -license: apache-2.0 +library_name: peft --- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d1627b2357785cc7ae0c5ba8da5cf93ecd0e8781 --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c97a3bd6b22c34728f2f3fab2f809d43641be7294bd9b98909fc4f7fc889a64 +size 792353 diff --git a/checkpoint-12360/README.md b/checkpoint-12360/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12360/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12360/adapter_config.json b/checkpoint-12360/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12360/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12360/adapter_model.bin b/checkpoint-12360/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..109cd09dec337c4fdc95d7c114af701e6c5bba5c --- /dev/null +++ b/checkpoint-12360/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3bd39cae0272bd5fae91b640061201e810518807bc8c427109022a1099d8aac +size 792353 diff 
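The README and `adapter_config.json` added above summarize the fine-tuning setup: a 4-bit (fp4) quantized TinyStories-33M base with rank-16 LoRA adapters on the attention `q_proj`/`v_proj` projections. As a reading aid, the sketch below shows one way those same settings could be expressed with `transformers`, `peft`, and `bitsandbytes`. It is inferred from the config values in this diff rather than taken from the repository, and the local path `models/TinyStories-33M` simply mirrors `base_model_name_or_path`.

```python
# Sketch (not from this repo): reproduce the quantization and LoRA settings
# recorded in README.md and adapter_config.json above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# bitsandbytes settings listed under "Training procedure"; the llm_int8_*
# entries in the README are left at their library defaults here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float32,
)

# LoRA settings matching adapter_config.json
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    lora_dropout=0.0,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

tokenizer = AutoTokenizer.from_pretrained("models/TinyStories-33M")
base_model = AutoModelForCausalLM.from_pretrained(
    "models/TinyStories-33M",
    quantization_config=bnb_config,
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # only the q_proj/v_proj LoRA weights are trainable
```

At inference time, the `adapter_config.json`/`adapter_model.bin` pair stored at the repository root (or in any of the `checkpoint-*` directories below) can be attached to a freshly loaded base model with `peft.PeftModel.from_pretrained(base_model, adapter_path)`.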
--git a/checkpoint-12360/optimizer.pt b/checkpoint-12360/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5817a62665772e0478ed05f79c4562070c8ba3c --- /dev/null +++ b/checkpoint-12360/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed27b4cc5ad7e5f3014aa32fc8c3cf435e1a448088270805ce3e38e082a5cd9 +size 1586873 diff --git a/checkpoint-12360/rng_state.pth b/checkpoint-12360/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12360/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12360/scheduler.pt b/checkpoint-12360/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cce1aa8929d3edb5a511f3511074fc1329da03eb --- /dev/null +++ b/checkpoint-12360/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845c1f57b7b77abd8b8e3a8652626f29e87a4177c0fdccb7d2b228b260242406 +size 627 diff --git a/checkpoint-12360/trainer_state.json b/checkpoint-12360/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d32a09de21bc7464dd536b35a66da42c481d974c --- /dev/null +++ b/checkpoint-12360/trainer_state.json @@ -0,0 +1,754 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.65625, + "global_step": 12360, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + 
"learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + 
"loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, 
+ { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9246417995335680.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12360/training_args.bin b/checkpoint-12360/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12360/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12390/README.md b/checkpoint-12390/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12390/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12390/adapter_config.json b/checkpoint-12390/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12390/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git 
a/checkpoint-12390/adapter_model.bin b/checkpoint-12390/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9a00cb2e0eb772c46a6cbed74714802f2fd4bea --- /dev/null +++ b/checkpoint-12390/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2638e3a4999749d4f6a329f2d948a026f296cc0606de56f66eb59a7a5435ae82 +size 792353 diff --git a/checkpoint-12390/optimizer.pt b/checkpoint-12390/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..62bf222403c0c8687afbffee8234999adbb37e61 --- /dev/null +++ b/checkpoint-12390/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:611d9b387269a4b7f63640b47040166c4f161581bf62376c80b680134454b7c6 +size 1586873 diff --git a/checkpoint-12390/rng_state.pth b/checkpoint-12390/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12390/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12390/scheduler.pt b/checkpoint-12390/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..502a8adc7e5b55f4d074a1092607f3cdd7e2f1b3 --- /dev/null +++ b/checkpoint-12390/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd03d1ae0c09c9520cd92e5bf2331a87151cb6edff37a2a3de420ef62b3a176f +size 627 diff --git a/checkpoint-12390/trainer_state.json b/checkpoint-12390/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b9fd1143f864aeadb70dd73014312d16b1e5756b --- /dev/null +++ b/checkpoint-12390/trainer_state.json @@ -0,0 +1,754 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.6796875, + "global_step": 12390, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, 
+ "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 
4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + 
"loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9270435050588160.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12390/training_args.bin b/checkpoint-12390/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12390/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12420/README.md b/checkpoint-12420/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12420/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12420/adapter_config.json b/checkpoint-12420/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12420/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", 
+ "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12420/adapter_model.bin b/checkpoint-12420/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..fba50f575f3ddedc60c5ff24a4e9f8b34d0fd35d --- /dev/null +++ b/checkpoint-12420/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:209158f1892bccba2617213b558a39d11cddd1e5136016bb669fc187b741f152 +size 792353 diff --git a/checkpoint-12420/optimizer.pt b/checkpoint-12420/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b27a89f9bd58cc8bd518fa553cff4de7574ebe4 --- /dev/null +++ b/checkpoint-12420/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:306b66f63a98bc0870cc880ebaf5b767253138b2e0f8e2badd78119ae6ed0284 +size 1586873 diff --git a/checkpoint-12420/rng_state.pth b/checkpoint-12420/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12420/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12420/scheduler.pt b/checkpoint-12420/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5917369b9ef800d16a2c1c7822d3f351cdc6368 --- /dev/null +++ b/checkpoint-12420/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67effef7e1527b34cecc0682500e658ff0a9d2cce8252039f87084b77aabfa5d +size 627 diff --git a/checkpoint-12420/trainer_state.json b/checkpoint-12420/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..98176eefe7cf02c027d3bf01fcea7fc0e17245b6 --- /dev/null +++ b/checkpoint-12420/trainer_state.json @@ -0,0 +1,760 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.703125, + "global_step": 12420, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + 
"learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 
4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + 
"epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9292082437785600.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12420/training_args.bin b/checkpoint-12420/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12420/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12450/README.md b/checkpoint-12450/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12450/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- 
bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12450/adapter_config.json b/checkpoint-12450/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12450/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12450/adapter_model.bin b/checkpoint-12450/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..38c8a0f5f5e33866c36b84056333dde37f2d107c --- /dev/null +++ b/checkpoint-12450/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31656b0458422251ad42a99d942ee864bd478e658dc330893b815d1bb5af7fb4 +size 792353 diff --git a/checkpoint-12450/optimizer.pt b/checkpoint-12450/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..95a54f31a6c71e49c024573497068b4a856da364 --- /dev/null +++ b/checkpoint-12450/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad54a640ed5e53747f52caa10179f60dd135cc0ae48257cdf042352a7a6e6ee0 +size 1586873 diff --git a/checkpoint-12450/rng_state.pth b/checkpoint-12450/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12450/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12450/scheduler.pt b/checkpoint-12450/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..313d616236442f6bbb038f01639f47598eec3dab --- /dev/null +++ b/checkpoint-12450/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3d27439df90a9a7352314b9b8e368610f9dd675c06874da737b447cd1bc3435 +size 627 diff --git a/checkpoint-12450/trainer_state.json b/checkpoint-12450/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eb0fb2f0942b8398beb4b374ecfb4a18867c8d56 --- /dev/null +++ b/checkpoint-12450/trainer_state.json @@ -0,0 +1,760 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.7265625, + "global_step": 12450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + 
"loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { 
+ "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 
0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9314514190632960.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12450/training_args.bin b/checkpoint-12450/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12450/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12480/README.md b/checkpoint-12480/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12480/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12480/adapter_config.json b/checkpoint-12480/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12480/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12480/adapter_model.bin b/checkpoint-12480/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..31b6e43a24e366ca94928fcb4917c8aae065c42d --- /dev/null +++ b/checkpoint-12480/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e59da02aba0ecff94c22170bd5085bbb8e5bac87f5689bde7d13318ccaa683d4 +size 792353 diff --git a/checkpoint-12480/optimizer.pt b/checkpoint-12480/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9b57922f2d9e7c437908e4b620e4d34196dd1be --- /dev/null +++ b/checkpoint-12480/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78af0f47d42de1be9f4882eb373ce9f1a2130d3396b4ad2d8ee94886ef73ad3a +size 1586873 diff --git a/checkpoint-12480/rng_state.pth b/checkpoint-12480/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12480/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12480/scheduler.pt b/checkpoint-12480/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ced23d101006d6aafbb582fc6be45f611ec9594 --- /dev/null +++ b/checkpoint-12480/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f07aed61c9e1af3e6872c4649dbcbdb2c02eaa0deb0ce84f1b9db4bec9454439 +size 627 diff --git a/checkpoint-12480/trainer_state.json b/checkpoint-12480/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a6f2c64b4ed1b0cd0eeb9d908a599c713ab884c0 --- /dev/null +++ b/checkpoint-12480/trainer_state.json @@ -0,0 +1,760 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.75, + "global_step": 12480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + 
"step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + 
"learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 
4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9336653187287040.0, + "trial_name": null, + "trial_params": null +} diff --git 
a/checkpoint-12480/training_args.bin b/checkpoint-12480/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12480/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12510/README.md b/checkpoint-12510/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12510/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12510/adapter_config.json b/checkpoint-12510/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12510/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12510/adapter_model.bin b/checkpoint-12510/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..69a3913b8521c6fe896b82cf19db3f502cd752be --- /dev/null +++ b/checkpoint-12510/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b948a1d9a656951f8d37c2a23f3fef23ec4359f73d94eb2c5083218d521774ba +size 792353 diff --git a/checkpoint-12510/optimizer.pt b/checkpoint-12510/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..752f82bf886379af1b48b4f8caaa46643f82c7c7 --- /dev/null +++ b/checkpoint-12510/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c4677e1bd62cddb8d9dda555f90a9ba7f55b880e4cba14ceac1778354d9796f +size 1586873 diff --git a/checkpoint-12510/rng_state.pth b/checkpoint-12510/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12510/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12510/scheduler.pt b/checkpoint-12510/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..da34fb72b96e06bd2b1cc92e885d4e1b1aa43a1c --- /dev/null +++ b/checkpoint-12510/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226c5739a6beb840fe019bb17102960eab6abdb587f0b62865030f81daa87392 +size 627 diff --git a/checkpoint-12510/trainer_state.json b/checkpoint-12510/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..2fd57e728d9b8f86400abbd6d769ee2bc0ea34a6 --- /dev/null +++ b/checkpoint-12510/trainer_state.json @@ -0,0 +1,766 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.7734375, + "global_step": 12510, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + 
"epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 
0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 
3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9357159930071040.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12510/training_args.bin b/checkpoint-12510/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12510/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12540/README.md b/checkpoint-12540/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12540/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12540/adapter_config.json b/checkpoint-12540/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12540/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12540/adapter_model.bin b/checkpoint-12540/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..72221450ddfe125628e699c8a00ff030b8d63870 --- /dev/null +++ b/checkpoint-12540/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955ff7ca5402929f905c9ce0cc6ccf12674919835a74a06eb62de196677b8a93 +size 792353 diff --git a/checkpoint-12540/optimizer.pt b/checkpoint-12540/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb120ab903ebb64d891bfc56e35d2ad58f6667bf --- /dev/null +++ b/checkpoint-12540/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c16f76e13fa79ac7abe1eec93d50b18c9ccd46ded1f51cc980765a7b96a3cb2 +size 1586873 diff --git a/checkpoint-12540/rng_state.pth b/checkpoint-12540/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12540/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12540/scheduler.pt b/checkpoint-12540/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..24ec4e0af448aa81d7dbe59d4b301b1915c8bf8d --- /dev/null +++ b/checkpoint-12540/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8023ff8d248d83f9a08b282630cb6e9d5e06b86c31e71173fd08aa8157312a0 +size 627 diff --git a/checkpoint-12540/trainer_state.json b/checkpoint-12540/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97af0add821881f0f57e82bb4372889b4848e892 --- /dev/null +++ b/checkpoint-12540/trainer_state.json @@ -0,0 +1,766 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.796875, + "global_step": 12540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + 
"learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 
4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + 
}, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9380392619673600.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12540/training_args.bin b/checkpoint-12540/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12540/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12570/README.md b/checkpoint-12570/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12570/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12570/adapter_config.json b/checkpoint-12570/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12570/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12570/adapter_model.bin b/checkpoint-12570/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..00dcd4b35aa6dfb5123a24fdae2d8f85566c427c --- /dev/null +++ b/checkpoint-12570/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df8c7c38db0f83cb6f00f14513ed71709834c9081514e1c06d81f0e726489124 +size 792353 diff --git a/checkpoint-12570/optimizer.pt b/checkpoint-12570/optimizer.pt new file mode 100644 index 
0000000000000000000000000000000000000000..336e049a46df70d7a65d3836e2a00bb240c3121b --- /dev/null +++ b/checkpoint-12570/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b58340ac201251a4ac38dd17c89eb4a84c996e63ce1dae378660610efda9a96f +size 1586873 diff --git a/checkpoint-12570/rng_state.pth b/checkpoint-12570/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12570/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12570/scheduler.pt b/checkpoint-12570/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..338f58484c786ee0ad3406940b1cd944192edb47 --- /dev/null +++ b/checkpoint-12570/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:950433b35d41989a6b3f63aed09ad88bca311f5c384eb91cc61ee54e5c7a2369 +size 627 diff --git a/checkpoint-12570/trainer_state.json b/checkpoint-12570/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad4b0cd5b590ba0c5399cffde8237e5e1b119c86 --- /dev/null +++ b/checkpoint-12570/trainer_state.json @@ -0,0 +1,766 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.8203125, + "global_step": 12570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 
0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + 
"step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 
8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9402225050880000.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12570/training_args.bin b/checkpoint-12570/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12570/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12600/README.md b/checkpoint-12600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12600/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12600/adapter_config.json b/checkpoint-12600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12600/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], 
+ "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12600/adapter_model.bin b/checkpoint-12600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..63002350e86d9d79f324d01a3b3a0a2e36c4469d --- /dev/null +++ b/checkpoint-12600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acb74509279dc802c936678e269b4fc49b960c6539f894d10529f6702065ec62 +size 792353 diff --git a/checkpoint-12600/optimizer.pt b/checkpoint-12600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdf866641c857152a436314bcdbbc5274c0c65f8 --- /dev/null +++ b/checkpoint-12600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cbf0adbf51eeebca198eca16759ff6d3b2da63b06f40d9f13e59215d08d3832 +size 1586873 diff --git a/checkpoint-12600/rng_state.pth b/checkpoint-12600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12600/scheduler.pt b/checkpoint-12600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4300129601b52bbac1085fdb1788dee930425498 --- /dev/null +++ b/checkpoint-12600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f365ae999339b9fc5dc591e2b698f80f819d47fcaa80a46f306bfede567f39f3 +size 627 diff --git a/checkpoint-12600/trainer_state.json b/checkpoint-12600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..939d6653645b64d3f484b4cab975e5798f720f16 --- /dev/null +++ b/checkpoint-12600/trainer_state.json @@ -0,0 +1,772 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.84375, + "global_step": 12600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, 
+ { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + 
"learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 
4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + }, + { + "epoch": 9.84, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 12600 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9426363627571200.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12600/training_args.bin b/checkpoint-12600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12630/README.md b/checkpoint-12630/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12630/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git 
a/checkpoint-12630/adapter_config.json b/checkpoint-12630/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12630/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12630/adapter_model.bin b/checkpoint-12630/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e45a145ba8e0b6765b319495a804a8d08c9115ab --- /dev/null +++ b/checkpoint-12630/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fc22f462356f8fec57832115e22f3dbc1ba6554c07d6ce511f7e6f44d608809 +size 792353 diff --git a/checkpoint-12630/optimizer.pt b/checkpoint-12630/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..94db8b3efbd1884aa5774dc024f265487a54330f --- /dev/null +++ b/checkpoint-12630/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e7badbbf0c3b6cd16d78e3b6742c5e41afc805eae5895f244597e0f5f4e353 +size 1586873 diff --git a/checkpoint-12630/rng_state.pth b/checkpoint-12630/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12630/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12630/scheduler.pt b/checkpoint-12630/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9794858ab23f7bd4934a2bc31f077027502da96b --- /dev/null +++ b/checkpoint-12630/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfcc9e36e53e9f81c2c3b662e2b0794cd33666ab7d8c8f842527d08bf25b68ef +size 627 diff --git a/checkpoint-12630/trainer_state.json b/checkpoint-12630/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..adef428b4a916f49d1c3d8ae6a09fff736722c0e --- /dev/null +++ b/checkpoint-12630/trainer_state.json @@ -0,0 +1,772 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.8671875, + "global_step": 12630, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + 
"epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 
0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 
+ }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + }, + { + "epoch": 9.84, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 12600 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9449869740410880.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12630/training_args.bin b/checkpoint-12630/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12630/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12660/README.md b/checkpoint-12660/README.md new 
file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12660/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12660/adapter_config.json b/checkpoint-12660/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12660/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12660/adapter_model.bin b/checkpoint-12660/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..9d81b0ce39e357d38359c2c55945b8018200a449 --- /dev/null +++ b/checkpoint-12660/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:904f887678da3f2b45202831dde920494e45f975eb9031f1617f3fe4aaf57a42 +size 792353 diff --git a/checkpoint-12660/optimizer.pt b/checkpoint-12660/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..87411ff569d72c2ef752910958edf774fbb6e793 --- /dev/null +++ b/checkpoint-12660/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a829a216c0fe644b301ce1b181cb886dbe628234e2c1ad522d9ca9cce6088c1 +size 1586873 diff --git a/checkpoint-12660/rng_state.pth b/checkpoint-12660/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12660/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12660/scheduler.pt b/checkpoint-12660/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c8ef4c0a31f3e99cf5f52f61f6800dcdd9b1fb7 --- /dev/null +++ b/checkpoint-12660/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6d2be081ff393eca9a9a681bc61143b2daaa0bb6d868e8af9e7b2f5065d6937 +size 627 diff --git a/checkpoint-12660/trainer_state.json b/checkpoint-12660/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..074a55e2205a492a0f0ee663df159ce82a71c4fa --- /dev/null +++ b/checkpoint-12660/trainer_state.json @@ -0,0 +1,772 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.890625, + "global_step": 12660, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, 
+ "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + 
"epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 
0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + }, + { + "epoch": 9.84, + "learning_rate": 0.0001, + 
"loss": 3.9545, + "step": 12600 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9473555373557760.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12660/training_args.bin b/checkpoint-12660/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12660/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12690/README.md b/checkpoint-12690/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12690/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12690/adapter_config.json b/checkpoint-12690/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12690/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12690/adapter_model.bin b/checkpoint-12690/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a99877c59a26ef2825880af0289f69322e731f --- /dev/null +++ b/checkpoint-12690/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:543e3bc1c42c2c57c96773ed5ab6b81a782b41904167bee91a026da2c40f34ca +size 792353 diff --git a/checkpoint-12690/optimizer.pt b/checkpoint-12690/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d24d487f5f44ffd06f976a4fb64cf48ea9135d3 --- /dev/null +++ b/checkpoint-12690/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54cc0cf523941490c7b0bc533409eee3a6cdd7d1655e9c21e5819320f906cb7f +size 1586873 diff --git a/checkpoint-12690/rng_state.pth b/checkpoint-12690/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12690/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12690/scheduler.pt b/checkpoint-12690/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..69dcc451ded9ddfad681f72338a658e4fb4ede72 --- /dev/null +++ b/checkpoint-12690/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:987483390a812aafdf8cf606a3c9578ed228b627430de2f56b1b136a47379e40 +size 627 diff --git a/checkpoint-12690/trainer_state.json b/checkpoint-12690/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f0b34e52ccf6d313625d1d62a248b95fd81cfce --- /dev/null +++ b/checkpoint-12690/trainer_state.json @@ -0,0 +1,772 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.9140625, + "global_step": 12690, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + 
"step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 
6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + 
"learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + }, + { + "epoch": 9.84, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 12600 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9495992650106880.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12690/training_args.bin b/checkpoint-12690/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12690/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12720/README.md b/checkpoint-12720/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12720/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12720/adapter_config.json b/checkpoint-12720/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12720/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12720/adapter_model.bin b/checkpoint-12720/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0e586b867329c8021ef527871a4c06f7b2c98fa9 --- /dev/null +++ b/checkpoint-12720/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c134026673c618737724b7171f9988f8af4550398046bc9d169b856603e7e71 +size 792353 diff --git a/checkpoint-12720/optimizer.pt b/checkpoint-12720/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9fac7c43d324bc8181a75605cb9e60041d96d6b --- /dev/null +++ b/checkpoint-12720/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:813ce6dc1e870451f0268e9fe17eea056f47a126d30251798e759408bdffc885 +size 1586873 diff --git 
a/checkpoint-12720/rng_state.pth b/checkpoint-12720/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12720/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12720/scheduler.pt b/checkpoint-12720/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b17edf7d1217a7723df3396a271ec9343d91ce7b --- /dev/null +++ b/checkpoint-12720/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:116b2a3568b9c4374845c3a4268c5e50a26a378f9b3a2acdcbc0535addb844ac +size 627 diff --git a/checkpoint-12720/trainer_state.json b/checkpoint-12720/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fdfaa80ccdb27ea600d500ac70ba2643029325f3 --- /dev/null +++ b/checkpoint-12720/trainer_state.json @@ -0,0 +1,778 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.9375, + "global_step": 12720, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + 
}, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + 
"learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + 
"loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + }, + { + "epoch": 9.84, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 12600 + }, + { + "epoch": 9.92, + "learning_rate": 0.0001, + "loss": 3.877, + "step": 12700 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9519432478525440.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12720/training_args.bin b/checkpoint-12720/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12720/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12750/README.md b/checkpoint-12750/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12750/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12750/adapter_config.json b/checkpoint-12750/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12750/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12750/adapter_model.bin 
b/checkpoint-12750/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..9d35ca5cb088f43ccbb4131b6891411aa3b19805 --- /dev/null +++ b/checkpoint-12750/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d199a9ff5ae81bb62a5adb0b8b2d283c13b70cbce62eb8b53816aab6b04286 +size 792353 diff --git a/checkpoint-12750/optimizer.pt b/checkpoint-12750/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d251d7d6ebc363750259fa861757f964b54c16a2 --- /dev/null +++ b/checkpoint-12750/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6012ab0a3d8c109fd93f96db70d3f6859f3510a0f54d192057caafede25c36e3 +size 1586873 diff --git a/checkpoint-12750/rng_state.pth b/checkpoint-12750/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12750/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12750/scheduler.pt b/checkpoint-12750/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..42b09301c6e8ee87c28a16a38f2c37a9b2ab6eae --- /dev/null +++ b/checkpoint-12750/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a92e7a6b3117538e24485e02fcd8b1f78cc3929e670f1c8bc3e7936f4e56966 +size 627 diff --git a/checkpoint-12750/trainer_state.json b/checkpoint-12750/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e147b85bdee66dca2cf58b75258d473a6f3e82aa --- /dev/null +++ b/checkpoint-12750/trainer_state.json @@ -0,0 +1,778 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.9609375, + "global_step": 12750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 
1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + 
"loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { 
+ "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + }, + { + "epoch": 9.84, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 12600 + }, + { + "epoch": 9.92, + "learning_rate": 0.0001, + "loss": 3.877, + "step": 12700 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9541096436828160.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12750/training_args.bin b/checkpoint-12750/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12750/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git a/checkpoint-12780/README.md b/checkpoint-12780/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd6cf4fd9a9be6e372b435a19ee0572855d8507e --- /dev/null +++ b/checkpoint-12780/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: fp4 +- bnb_4bit_use_double_quant: False +- bnb_4bit_compute_dtype: float32 +### Framework versions + + +- PEFT 0.5.0.dev0 diff --git a/checkpoint-12780/adapter_config.json 
b/checkpoint-12780/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65984294abc508c1734b2d0181345fce57995606 --- /dev/null +++ b/checkpoint-12780/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "models/TinyStories-33M", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-12780/adapter_model.bin b/checkpoint-12780/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ba0c6f00fc6cc3000338585468c23cd52afb91cd --- /dev/null +++ b/checkpoint-12780/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4d3e0f51f2a1a37482a7387deae09daf97f7aca4a911b2f3beaf6f4107ffbe0 +size 792353 diff --git a/checkpoint-12780/optimizer.pt b/checkpoint-12780/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4db3788c5dea8561c9a02cfe0ec0dc983235d9e8 --- /dev/null +++ b/checkpoint-12780/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7c0ad47a499c6b6ddb9ce0cb8b905afc1caa773bb10bce827f99eb394b512f +size 1586873 diff --git a/checkpoint-12780/rng_state.pth b/checkpoint-12780/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc17ea4b8b08df0dc5b803fd5f37cb0bb1cdadc2 --- /dev/null +++ b/checkpoint-12780/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3860d98363e44bc4704fb9458deec927fd96997d1d09c80639070db959d22 +size 14575 diff --git a/checkpoint-12780/scheduler.pt b/checkpoint-12780/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bfab6b7b760876502eb7e2d74577c46595370b0 --- /dev/null +++ b/checkpoint-12780/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a2a31bf2e699099a99793ca4951a362e09c2751eb9133375ed559545b687b63 +size 627 diff --git a/checkpoint-12780/trainer_state.json b/checkpoint-12780/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae9212f1a14fe655d872b3af3257a0a9256792e3 --- /dev/null +++ b/checkpoint-12780/trainer_state.json @@ -0,0 +1,778 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.984375, + "global_step": 12780, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + 
"loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, 
+ { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + "loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 
0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + }, + { + "epoch": 9.84, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 12600 + }, + { + "epoch": 9.92, + "learning_rate": 0.0001, + "loss": 3.877, + "step": 12700 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9561340803778560.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12780/training_args.bin b/checkpoint-12780/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..687254f15aca82dd39f30bdd294576761e894d01 --- /dev/null +++ b/checkpoint-12780/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e683e2614c4dcf1ee5d4bfe9622ed2cfdbf3faf9319855e4a9b393368f21e44 +size 3899 diff --git 
a/runs/Jul24_15-17-23_65eca3bab58b/events.out.tfevents.1690211843.65eca3bab58b.110.0 b/runs/Jul24_15-17-23_65eca3bab58b/events.out.tfevents.1690211843.65eca3bab58b.110.0 new file mode 100644 index 0000000000000000000000000000000000000000..4fc98ffe34651e753c011e085150af916f15aa24 --- /dev/null +++ b/runs/Jul24_15-17-23_65eca3bab58b/events.out.tfevents.1690211843.65eca3bab58b.110.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c081cc9d3cc85b303b1dd4199431effe50c4c1aeac2f3df04584b4f6c503189 +size 25146 diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c7160d18e9aaa0c046331e373e15477cd182653 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,793 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "global_step": 12800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 10.386, + "step": 100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 7.4061, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001, + "loss": 6.4112, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 6.091, + "step": 400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001, + "loss": 5.8571, + "step": 500 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 5.7144, + "step": 600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001, + "loss": 5.6228, + "step": 700 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 5.546, + "step": 800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001, + "loss": 5.4904, + "step": 900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 5.4413, + "step": 1000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 5.3603, + "step": 1100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 5.3346, + "step": 1200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 5.2744, + "step": 1300 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 5.231, + "step": 1400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001, + "loss": 5.1782, + "step": 1500 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 5.1613, + "step": 1600 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001, + "loss": 5.1091, + "step": 1700 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 5.0939, + "step": 1800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001, + "loss": 5.0667, + "step": 1900 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 5.0054, + "step": 2000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001, + "loss": 4.96, + "step": 2100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 4.9301, + "step": 2200 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 4.9302, + "step": 2300 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001, + "loss": 4.9129, + "step": 2400 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001, + "loss": 4.8627, + "step": 2500 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 4.8336, + "step": 2600 + }, + { + "epoch": 2.11, + "learning_rate": 0.0001, + "loss": 4.8018, + "step": 2700 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 4.7479, + "step": 2800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 4.7572, + "step": 2900 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 4.759, + "step": 3000 + }, + { + "epoch": 2.42, 
+ "learning_rate": 0.0001, + "loss": 4.7441, + "step": 3100 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 4.7195, + "step": 3200 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 4.6821, + "step": 3300 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001, + "loss": 4.6587, + "step": 3400 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001, + "loss": 4.6228, + "step": 3500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 4.6248, + "step": 3600 + }, + { + "epoch": 2.89, + "learning_rate": 0.0001, + "loss": 4.5801, + "step": 3700 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 4.5934, + "step": 3800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 4.5781, + "step": 3900 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 4.5763, + "step": 4000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0001, + "loss": 4.5598, + "step": 4100 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 4.4796, + "step": 4200 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 4.5204, + "step": 4300 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001, + "loss": 4.4858, + "step": 4400 + }, + { + "epoch": 3.52, + "learning_rate": 0.0001, + "loss": 4.4882, + "step": 4500 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001, + "loss": 4.4852, + "step": 4600 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001, + "loss": 4.4302, + "step": 4700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001, + "loss": 4.4267, + "step": 4800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001, + "loss": 4.4298, + "step": 4900 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001, + "loss": 4.3892, + "step": 5000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0001, + "loss": 4.3886, + "step": 5100 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001, + "loss": 4.3915, + "step": 5200 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001, + "loss": 4.3706, + "step": 5300 + }, + { + "epoch": 4.22, + "learning_rate": 0.0001, + "loss": 4.3528, + "step": 5400 + }, + { + "epoch": 4.3, + "learning_rate": 0.0001, + "loss": 4.3272, + "step": 5500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001, + "loss": 4.3679, + "step": 5600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0001, + "loss": 4.3608, + "step": 5700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001, + "loss": 4.3262, + "step": 5800 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001, + "loss": 4.3097, + "step": 5900 + }, + { + "epoch": 4.69, + "learning_rate": 0.0001, + "loss": 4.2973, + "step": 6000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0001, + "loss": 4.2756, + "step": 6100 + }, + { + "epoch": 4.84, + "learning_rate": 0.0001, + "loss": 4.3174, + "step": 6200 + }, + { + "epoch": 4.92, + "learning_rate": 0.0001, + "loss": 4.2786, + "step": 6300 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001, + "loss": 4.2484, + "step": 6400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001, + "loss": 4.2634, + "step": 6500 + }, + { + "epoch": 5.16, + "learning_rate": 0.0001, + "loss": 4.228, + "step": 6600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0001, + "loss": 4.2745, + "step": 6700 + }, + { + "epoch": 5.31, + "learning_rate": 0.0001, + "loss": 4.2297, + "step": 6800 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001, + "loss": 4.1989, + "step": 6900 + }, + { + "epoch": 5.47, + "learning_rate": 0.0001, + "loss": 4.2074, + "step": 7000 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001, + "loss": 4.201, + "step": 7100 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001, + 
"loss": 4.2109, + "step": 7200 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001, + "loss": 4.1945, + "step": 7300 + }, + { + "epoch": 5.78, + "learning_rate": 0.0001, + "loss": 4.1744, + "step": 7400 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001, + "loss": 4.1741, + "step": 7500 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001, + "loss": 4.19, + "step": 7600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001, + "loss": 4.1612, + "step": 7700 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001, + "loss": 4.1559, + "step": 7800 + }, + { + "epoch": 6.17, + "learning_rate": 0.0001, + "loss": 4.1378, + "step": 7900 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001, + "loss": 4.144, + "step": 8000 + }, + { + "epoch": 6.33, + "learning_rate": 0.0001, + "loss": 4.1601, + "step": 8100 + }, + { + "epoch": 6.41, + "learning_rate": 0.0001, + "loss": 4.1339, + "step": 8200 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001, + "loss": 4.13, + "step": 8300 + }, + { + "epoch": 6.56, + "learning_rate": 0.0001, + "loss": 4.0877, + "step": 8400 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001, + "loss": 4.1022, + "step": 8500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001, + "loss": 4.1034, + "step": 8600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001, + "loss": 4.091, + "step": 8700 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001, + "loss": 4.0914, + "step": 8800 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001, + "loss": 4.0937, + "step": 8900 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001, + "loss": 4.0897, + "step": 9000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001, + "loss": 4.0844, + "step": 9100 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001, + "loss": 4.0574, + "step": 9200 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001, + "loss": 4.0837, + "step": 9300 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001, + "loss": 4.0766, + "step": 9400 + }, + { + "epoch": 7.42, + "learning_rate": 0.0001, + "loss": 4.066, + "step": 9500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0001, + "loss": 3.9944, + "step": 9600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001, + "loss": 4.0126, + "step": 9700 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001, + "loss": 4.0273, + "step": 9800 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001, + "loss": 4.0706, + "step": 9900 + }, + { + "epoch": 7.81, + "learning_rate": 0.0001, + "loss": 4.0008, + "step": 10000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0001, + "loss": 4.0323, + "step": 10100 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001, + "loss": 3.9864, + "step": 10200 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001, + "loss": 4.0265, + "step": 10300 + }, + { + "epoch": 8.12, + "learning_rate": 0.0001, + "loss": 3.9874, + "step": 10400 + }, + { + "epoch": 8.2, + "learning_rate": 0.0001, + "loss": 3.9839, + "step": 10500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0001, + "loss": 3.9583, + "step": 10600 + }, + { + "epoch": 8.36, + "learning_rate": 0.0001, + "loss": 4.007, + "step": 10700 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001, + "loss": 3.9567, + "step": 10800 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001, + "loss": 3.9888, + "step": 10900 + }, + { + "epoch": 8.59, + "learning_rate": 0.0001, + "loss": 3.9993, + "step": 11000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001, + "loss": 3.9914, + "step": 11100 + }, + { + "epoch": 8.75, + "learning_rate": 0.0001, + "loss": 4.0094, + "step": 11200 + }, + { + "epoch": 8.83, + "learning_rate": 0.0001, + "loss": 3.9338, + "step": 
11300 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001, + "loss": 3.9472, + "step": 11400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0001, + "loss": 3.9595, + "step": 11500 + }, + { + "epoch": 9.06, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 11600 + }, + { + "epoch": 9.14, + "learning_rate": 0.0001, + "loss": 3.993, + "step": 11700 + }, + { + "epoch": 9.22, + "learning_rate": 0.0001, + "loss": 3.895, + "step": 11800 + }, + { + "epoch": 9.3, + "learning_rate": 0.0001, + "loss": 3.9527, + "step": 11900 + }, + { + "epoch": 9.38, + "learning_rate": 0.0001, + "loss": 3.9205, + "step": 12000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0001, + "loss": 3.914, + "step": 12100 + }, + { + "epoch": 9.53, + "learning_rate": 0.0001, + "loss": 3.8872, + "step": 12200 + }, + { + "epoch": 9.61, + "learning_rate": 0.0001, + "loss": 3.9171, + "step": 12300 + }, + { + "epoch": 9.69, + "learning_rate": 0.0001, + "loss": 3.9148, + "step": 12400 + }, + { + "epoch": 9.77, + "learning_rate": 0.0001, + "loss": 3.9181, + "step": 12500 + }, + { + "epoch": 9.84, + "learning_rate": 0.0001, + "loss": 3.9545, + "step": 12600 + }, + { + "epoch": 9.92, + "learning_rate": 0.0001, + "loss": 3.877, + "step": 12700 + }, + { + "epoch": 10.0, + "learning_rate": 0.0001, + "loss": 3.894, + "step": 12800 + }, + { + "epoch": 10.0, + "step": 12800, + "total_flos": 9576109973975040.0, + "train_loss": 4.495523297786713, + "train_runtime": 7585.914, + "train_samples_per_second": 107.976, + "train_steps_per_second": 1.687 + } + ], + "max_steps": 12800, + "num_train_epochs": 10, + "total_flos": 9576109973975040.0, + "trial_name": null, + "trial_params": null +}
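
For reference, here is a minimal inference sketch built only from the configuration recorded above: it assumes the `transformers`, `peft`, and `bitsandbytes` packages, reuses the 4-bit fp4 quantization settings listed in the training procedure, and points at `models/TinyStories-33M` as named in `adapter_config.json`. The adapter path below is a placeholder for wherever this repository is checked out, 4-bit loading assumes a CUDA GPU with `bitsandbytes` installed, and the adapter can just as well be attached to a full-precision copy of the base model.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Quantization settings copied from the training procedure section (fp4, no double quant, fp32 compute).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float32,
)

# Base model path taken from adapter_config.json; adjust to wherever TinyStories-33M lives locally.
base = AutoModelForCausalLM.from_pretrained(
    "models/TinyStories-33M",
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("models/TinyStories-33M")

# Attach the LoRA adapter stored in this repository (placeholder path).
model = PeftModel.from_pretrained(base, "path/to/this/adapter")
model.eval()

prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```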
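The adapter hyperparameters stored in `adapter_config.json` (r=16, lora_alpha=8, dropout 0.0, LoRA applied to `q_proj` and `v_proj` only, no bias terms) can be reproduced with a PEFT `LoraConfig` as sketched below. This is an illustration of how such a configuration is typically constructed, not a copy of the original training script, which is not included in this repository.

```python
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

# Values mirror adapter_config.json in this repository.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=8,
    lora_dropout=0.0,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

base = AutoModelForCausalLM.from_pretrained("models/TinyStories-33M")
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # only the LoRA matrices are trainable
```

Note that with `r=16` and `lora_alpha=8`, the standard LoRA scaling factor is `alpha / r = 0.5`, so the learned update is applied at half weight relative to an `alpha == r` setup.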
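The `trainer_state.json` files above log the smoothed training loss every 100 steps at a constant learning rate of 1e-4, falling from about 10.39 at step 100 to roughly 3.9 by step 12,800 (10 epochs); no evaluation metric is recorded (`best_metric` is null). A short stdlib-only sketch for inspecting any of these files, assuming it is run from the repository root or a checkpoint directory:

```python
import json
import math

# Works on the top-level trainer_state.json or any checkpoint-*/trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step logging entries; the final summary entry has no "loss" key.
losses = [(entry["step"], entry["loss"]) for entry in state["log_history"] if "loss" in entry]

first_step, first_loss = losses[0]
last_step, last_loss = losses[-1]
print(f"logged points: {len(losses)}")
print(f"step {first_step}: loss {first_loss:.3f} (train perplexity ~{math.exp(first_loss):.0f})")
print(f"step {last_step}: loss {last_loss:.3f} (train perplexity ~{math.exp(last_loss):.1f})")
```

The perplexities here are derived from the smoothed training loss only and are meant as a rough indicator of progress, not as an evaluation result.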