yujia23 committed on Apr 5

Commit

fd6d42c

•

1 Parent(s): 37f92cd

Upload folder using huggingface_hub

Browse files

Files changed (44) hide show

README.md +171 -0
adapter_config.json +34 -0
adapter_model.bin +3 -0
checkpoint-132/README.md +202 -0
checkpoint-132/adapter_config.json +34 -0
checkpoint-132/adapter_model.safetensors +3 -0
checkpoint-132/optimizer.pt +3 -0
checkpoint-132/rng_state_0.pth +3 -0
checkpoint-132/rng_state_1.pth +3 -0
checkpoint-132/scheduler.pt +3 -0
checkpoint-132/special_tokens_map.json +24 -0
checkpoint-132/tokenizer.model +3 -0
checkpoint-132/tokenizer_config.json +44 -0
checkpoint-132/trainer_state.json +1009 -0
checkpoint-132/training_args.bin +3 -0
checkpoint-198/README.md +202 -0
checkpoint-198/adapter_config.json +34 -0
checkpoint-198/adapter_model.safetensors +3 -0
checkpoint-198/optimizer.pt +3 -0
checkpoint-198/rng_state_0.pth +3 -0
checkpoint-198/rng_state_1.pth +3 -0
checkpoint-198/scheduler.pt +3 -0
checkpoint-198/special_tokens_map.json +24 -0
checkpoint-198/tokenizer.model +3 -0
checkpoint-198/tokenizer_config.json +44 -0
checkpoint-198/trainer_state.json +1503 -0
checkpoint-198/training_args.bin +3 -0
checkpoint-66/README.md +202 -0
checkpoint-66/adapter_config.json +34 -0
checkpoint-66/adapter_model.safetensors +3 -0
checkpoint-66/optimizer.pt +3 -0
checkpoint-66/rng_state_0.pth +3 -0
checkpoint-66/rng_state_1.pth +3 -0
checkpoint-66/scheduler.pt +3 -0
checkpoint-66/special_tokens_map.json +24 -0
checkpoint-66/tokenizer.model +3 -0
checkpoint-66/tokenizer_config.json +44 -0
checkpoint-66/trainer_state.json +515 -0
checkpoint-66/training_args.bin +3 -0
config.json +41 -0
runs/Apr04_09-49-49_mala/events.out.tfevents.1712195390.mala.189757.0 +3 -0
special_tokens_map.json +24 -0
tokenizer.model +3 -0
tokenizer_config.json +44 -0

README.md ADDED Viewed

	@@ -0,0 +1,171 @@

+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: mistralai/Mistral-7B-v0.1
+model-index:
+- name: home/yujia/home/CN_Hateful/trained_models/mistral/CN/toxi/1e-5/
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.4.0`
+```yaml
+base_model: mistralai/Mistral-7B-v0.1
+model_type: MistralForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+datasets:
+  # - path: mhenrichsen/alpaca_2k_test
+  # - path: /home/yujia/home/CN_Hateful/train_toxiCN.json
+  - path: /home/yujia/home/CN_Hateful/train_toxiCN_cn.json
+  # - path: /home/yujia/home/CN_Hateful/train.json
+    ds_type: json
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+# output_dir: /home/yujia/home/CN_Hateful/trained_models/mistral/toxi/1e-5/
+output_dir: /home/yujia/home/CN_Hateful/trained_models/mistral/CN/toxi/1e-5/
+# output_dir: /home/yujia/home/CN_Hateful/trained_models/mistral/cold/3e-5/
+adapter: lora
+lora_model_dir:
+sequence_len: 256
+sample_packing: true
+pad_to_sequence_len: true
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+gradient_accumulation_steps: 8
+micro_batch_size: 4
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.00001
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+```
+</details><br>
+# home/yujia/home/CN_Hateful/trained_models/mistral/CN/toxi/1e-5/
+This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on a local Alpaca-format JSON dataset (`train_toxiCN_cn.json`; see the axolotl config above).
+It achieves the following results on the evaluation set:
+- Loss: 0.0627
+## Model description
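+A LoRA adapter (r=32, alpha=16, dropout=0.05) for Mistral-7B-v0.1, targeting the `q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, and `down_proj` linear layers. It was trained with axolotl 0.4.0 with the base model loaded in 8-bit. More information needed on the intended task.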
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
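+Training used the local JSON file `train_toxiCN_cn.json` in Alpaca prompt format; 10% of it (`val_set_size: 0.1`) was held out as the evaluation set. More information needed on the dataset's origin and contents.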
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 2
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 64
+- total_eval_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 3
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 2.5188        | 0.01  | 1    | 2.5282          |
+| 1.0047        | 0.25  | 17   | 0.8628          |
+| 0.086         | 0.51  | 34   | 0.0862          |
+| 0.0732        | 0.76  | 51   | 0.0753          |
+| 0.0719        | 1.02  | 68   | 0.0753          |
+| 0.0722        | 1.25  | 85   | 0.0680          |
+| 0.0676        | 1.51  | 102  | 0.0666          |
+| 0.068         | 1.76  | 119  | 0.0648          |
+| 0.0562        | 2.02  | 136  | 0.0637          |
+| 0.0674        | 2.25  | 153  | 0.0628          |
+| 0.0611        | 2.51  | 170  | 0.0625          |
+| 0.0536        | 2.76  | 187  | 0.0627          |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.40.0.dev0
+- Pytorch 2.2.1+cu121
+- Datasets 2.18.0
+- Tokenizers 0.15.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e9c53742dade340257822a74fcf3c3c29e983be237f0228ceeb13ba1027cad4
+size 335706186

checkpoint-132/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: mistralai/Mistral-7B-v0.1
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-132/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-132/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da2f7026aee1954624a4d121a3034ababb94a9e0255936c4ebe0de69c430489c
+size 335604696

checkpoint-132/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3302863f1cce911c286932f7680a84f90e57287d62d3bd66bf7afbda6431aa7d
+size 168624724

checkpoint-132/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c0ada088704bd2a27d92f392ed3ba36174227fbdb773741e70b712e7821e04e
+size 14512

checkpoint-132/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:143beb2e0beeab66765d2e7979b31a8d5873e1e52fce9217ff7670399111cefc
+size 14512

checkpoint-132/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66291d042452696f9bc3d5ddeefa9641822fdfd15566ea8b675bc7f31f7bce2b
+size 1064

checkpoint-132/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-132/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-132/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-132/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1009 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9570093457943925,
+  "eval_steps": 17,
+  "global_step": 132,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.601767539978027,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 2.5188,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.5281636714935303,
+      "eval_runtime": 136.1276,
+      "eval_samples_per_second": 6.795,
+      "eval_steps_per_second": 0.852,
+      "step": 1
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 8.70827865600586,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 2.561,
+      "step": 2
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 8.504729270935059,
+      "learning_rate": 3e-06,
+      "loss": 2.5392,
+      "step": 3
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 8.70252513885498,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 2.5358,
+      "step": 4
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 8.545235633850098,
+      "learning_rate": 5e-06,
+      "loss": 2.4953,
+      "step": 5
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 8.594281196594238,
+      "learning_rate": 6e-06,
+      "loss": 2.5025,
+      "step": 6
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 8.570807456970215,
+      "learning_rate": 7e-06,
+      "loss": 2.4483,
+      "step": 7
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 8.201045036315918,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 2.3844,
+      "step": 8
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 8.294769287109375,
+      "learning_rate": 9e-06,
+      "loss": 2.3132,
+      "step": 9
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 7.825733661651611,
+      "learning_rate": 1e-05,
+      "loss": 2.2073,
+      "step": 10
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 7.359045028686523,
+      "learning_rate": 9.999301905929286e-06,
+      "loss": 2.0469,
+      "step": 11
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 6.876375198364258,
+      "learning_rate": 9.997207818651273e-06,
+      "loss": 1.8699,
+      "step": 12
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 6.270627021789551,
+      "learning_rate": 9.99371832291393e-06,
+      "loss": 1.6649,
+      "step": 13
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 5.742035388946533,
+      "learning_rate": 9.988834393115768e-06,
+      "loss": 1.4954,
+      "step": 14
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 5.249215602874756,
+      "learning_rate": 9.982557393033758e-06,
+      "loss": 1.3214,
+      "step": 15
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 4.818938732147217,
+      "learning_rate": 9.97488907544252e-06,
+      "loss": 1.1535,
+      "step": 16
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 4.380295753479004,
+      "learning_rate": 9.965831581624872e-06,
+      "loss": 1.0047,
+      "step": 17
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 0.8628284335136414,
+      "eval_runtime": 133.5823,
+      "eval_samples_per_second": 6.925,
+      "eval_steps_per_second": 0.868,
+      "step": 17
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 3.99696683883667,
+      "learning_rate": 9.955387440773902e-06,
+      "loss": 0.8632,
+      "step": 18
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 3.7503676414489746,
+      "learning_rate": 9.943559569286731e-06,
+      "loss": 0.7559,
+      "step": 19
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 3.430863618850708,
+      "learning_rate": 9.930351269950144e-06,
+      "loss": 0.6444,
+      "step": 20
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 3.25299072265625,
+      "learning_rate": 9.915766231018317e-06,
+      "loss": 0.5364,
+      "step": 21
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 3.217674970626831,
+      "learning_rate": 9.899808525182935e-06,
+      "loss": 0.4766,
+      "step": 22
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 3.2001354694366455,
+      "learning_rate": 9.882482608435924e-06,
+      "loss": 0.4038,
+      "step": 23
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 3.011241912841797,
+      "learning_rate": 9.863793318825186e-06,
+      "loss": 0.3333,
+      "step": 24
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.758089303970337,
+      "learning_rate": 9.843745875103628e-06,
+      "loss": 0.2752,
+      "step": 25
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.914292812347412,
+      "learning_rate": 9.822345875271884e-06,
+      "loss": 0.2229,
+      "step": 26
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.1877121925354004,
+      "learning_rate": 9.799599295015154e-06,
+      "loss": 0.1846,
+      "step": 27
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.3617541790008545,
+      "learning_rate": 9.775512486034564e-06,
+      "loss": 0.1556,
+      "step": 28
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.431768774986267,
+      "learning_rate": 9.75009217427352e-06,
+      "loss": 0.1279,
+      "step": 29
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.7117260098457336,
+      "learning_rate": 9.723345458039595e-06,
+      "loss": 0.1061,
+      "step": 30
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.45769569277763367,
+      "learning_rate": 9.695279806022391e-06,
+      "loss": 0.0987,
+      "step": 31
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2953682243824005,
+      "learning_rate": 9.665903055208013e-06,
+      "loss": 0.0919,
+      "step": 32
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.17375054955482483,
+      "learning_rate": 9.635223408690688e-06,
+      "loss": 0.0854,
+      "step": 33
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.11500216275453568,
+      "learning_rate": 9.603249433382145e-06,
+      "loss": 0.086,
+      "step": 34
+    },
+    {
+      "epoch": 0.51,
+      "eval_loss": 0.08624568581581116,
+      "eval_runtime": 132.1775,
+      "eval_samples_per_second": 6.998,
+      "eval_steps_per_second": 0.878,
+      "step": 34
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.12646625936031342,
+      "learning_rate": 9.569990057619414e-06,
+      "loss": 0.0851,
+      "step": 35
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.11912114918231964,
+      "learning_rate": 9.535454568671705e-06,
+      "loss": 0.0834,
+      "step": 36
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.24543708562850952,
+      "learning_rate": 9.49965261014704e-06,
+      "loss": 0.0836,
+      "step": 37
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.0936342254281044,
+      "learning_rate": 9.462594179299408e-06,
+      "loss": 0.0844,
+      "step": 38
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.18341362476348877,
+      "learning_rate": 9.424289624237143e-06,
+      "loss": 0.0805,
+      "step": 39
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.21815626323223114,
+      "learning_rate": 9.384749641033358e-06,
+      "loss": 0.0811,
+      "step": 40
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.21489091217517853,
+      "learning_rate": 9.343985270739184e-06,
+      "loss": 0.0793,
+      "step": 41
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.23281769454479218,
+      "learning_rate": 9.302007896300697e-06,
+      "loss": 0.0775,
+      "step": 42
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6911139488220215,
+      "learning_rate": 9.25882923938038e-06,
+      "loss": 0.0812,
+      "step": 43
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.6087940335273743,
+      "learning_rate": 9.214461357083986e-06,
+      "loss": 0.0801,
+      "step": 44
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.5599693059921265,
+      "learning_rate": 9.168916638593736e-06,
+      "loss": 0.0822,
+      "step": 45
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.23726361989974976,
+      "learning_rate": 9.122207801708802e-06,
+      "loss": 0.0744,
+      "step": 46
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.075922966003418,
+      "learning_rate": 9.074347889294017e-06,
+      "loss": 0.0824,
+      "step": 47
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.49190425872802734,
+      "learning_rate": 9.025350265637816e-06,
+      "loss": 0.0784,
+      "step": 48
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.8844243288040161,
+      "learning_rate": 8.975228612720415e-06,
+      "loss": 0.0722,
+      "step": 49
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.2909235954284668,
+      "learning_rate": 8.923996926393306e-06,
+      "loss": 0.075,
+      "step": 50
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3661656975746155,
+      "learning_rate": 8.871669512471068e-06,
+      "loss": 0.0732,
+      "step": 51
+    },
+    {
+      "epoch": 0.76,
+      "eval_loss": 0.07530223578214645,
+      "eval_runtime": 133.0044,
+      "eval_samples_per_second": 6.955,
+      "eval_steps_per_second": 0.872,
+      "step": 51
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.49425163865089417,
+      "learning_rate": 8.818260982736662e-06,
+      "loss": 0.07,
+      "step": 52
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.9367341995239258,
+      "learning_rate": 8.763786250861258e-06,
+      "loss": 0.0819,
+      "step": 53
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.9425927400588989,
+      "learning_rate": 8.708260528239788e-06,
+      "loss": 0.0754,
+      "step": 54
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.3247818052768707,
+      "learning_rate": 8.651699319743348e-06,
+      "loss": 0.0739,
+      "step": 55
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.8091337084770203,
+      "learning_rate": 8.594118419389648e-06,
+      "loss": 0.0724,
+      "step": 56
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.556105375289917,
+      "learning_rate": 8.535533905932739e-06,
+      "loss": 0.0733,
+      "step": 57
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.9925010800361633,
+      "learning_rate": 8.475962138373212e-06,
+      "loss": 0.0761,
+      "step": 58
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.32729703187942505,
+      "learning_rate": 8.415419751390155e-06,
+      "loss": 0.0694,
+      "step": 59
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.2549174129962921,
+      "learning_rate": 8.353923650696119e-06,
+      "loss": 0.0672,
+      "step": 60
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.2688353657722473,
+      "learning_rate": 8.291491008316409e-06,
+      "loss": 0.0694,
+      "step": 61
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.38362765312194824,
+      "learning_rate": 8.228139257794012e-06,
+      "loss": 0.0671,
+      "step": 62
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.8581087589263916,
+      "learning_rate": 8.163886089321493e-06,
+      "loss": 0.0745,
+      "step": 63
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5643619894981384,
+      "learning_rate": 8.098749444801226e-06,
+      "loss": 0.0681,
+      "step": 64
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.637834906578064,
+      "learning_rate": 8.032747512835338e-06,
+      "loss": 0.0773,
+      "step": 65
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.2533693313598633,
+      "learning_rate": 7.965898723646777e-06,
+      "loss": 0.0663,
+      "step": 66
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.35648882389068604,
+      "learning_rate": 7.898221743932887e-06,
+      "loss": 0.0573,
+      "step": 67
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.638164758682251,
+      "learning_rate": 7.829735471652978e-06,
+      "loss": 0.0719,
+      "step": 68
+    },
+    {
+      "epoch": 1.02,
+      "eval_loss": 0.07532496750354767,
+      "eval_runtime": 133.6875,
+      "eval_samples_per_second": 6.919,
+      "eval_steps_per_second": 0.868,
+      "step": 68
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.6950666308403015,
+      "learning_rate": 7.760459030751285e-06,
+      "loss": 0.0712,
+      "step": 69
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.5866235494613647,
+      "learning_rate": 7.690411765816864e-06,
+      "loss": 0.0753,
+      "step": 70
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.5796181559562683,
+      "learning_rate": 7.619613236681845e-06,
+      "loss": 0.0672,
+      "step": 71
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.5313260555267334,
+      "learning_rate": 7.548083212959588e-06,
+      "loss": 0.0634,
+      "step": 72
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.4373992681503296,
+      "learning_rate": 7.475841668524268e-06,
+      "loss": 0.0632,
+      "step": 73
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.3541436195373535,
+      "learning_rate": 7.402908775933419e-06,
+      "loss": 0.0675,
+      "step": 74
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.5151358842849731,
+      "learning_rate": 7.329304900794991e-06,
+      "loss": 0.066,
+      "step": 75
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.6343135833740234,
+      "learning_rate": 7.25505059608051e-06,
+      "loss": 0.0676,
+      "step": 76
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.6041606664657593,
+      "learning_rate": 7.180166596385915e-06,
+      "loss": 0.0659,
+      "step": 77
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.5856900811195374,
+      "learning_rate": 7.104673812141676e-06,
+      "loss": 0.0585,
+      "step": 78
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.2674119770526886,
+      "learning_rate": 7.028593323773819e-06,
+      "loss": 0.0688,
+      "step": 79
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.5323411226272583,
+      "learning_rate": 6.9519463758174745e-06,
+      "loss": 0.0645,
+      "step": 80
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.5299087166786194,
+      "learning_rate": 6.8747543709846064e-06,
+      "loss": 0.0631,
+      "step": 81
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.1804003715515137,
+      "learning_rate": 6.797038864187564e-06,
+      "loss": 0.0813,
+      "step": 82
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.129088044166565,
+      "learning_rate": 6.718821556520151e-06,
+      "loss": 0.0661,
+      "step": 83
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.37926897406578064,
+      "learning_rate": 6.640124289197845e-06,
+      "loss": 0.0739,
+      "step": 84
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.5506991744041443,
+      "learning_rate": 6.560969037458933e-06,
+      "loss": 0.0722,
+      "step": 85
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 0.06798132508993149,
+      "eval_runtime": 131.6498,
+      "eval_samples_per_second": 7.026,
+      "eval_steps_per_second": 0.881,
+      "step": 85
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.36365798115730286,
+      "learning_rate": 6.481377904428171e-06,
+      "loss": 0.0675,
+      "step": 86
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.24918232858181,
+      "learning_rate": 6.401373114944781e-06,
+      "loss": 0.0654,
+      "step": 87
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.32376477122306824,
+      "learning_rate": 6.3209770093564315e-06,
+      "loss": 0.0666,
+      "step": 88
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.5324047207832336,
+      "learning_rate": 6.240212037280967e-06,
+      "loss": 0.0616,
+      "step": 89
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.2454373985528946,
+      "learning_rate": 6.1591007513376425e-06,
+      "loss": 0.0705,
+      "step": 90
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.5578957796096802,
+      "learning_rate": 6.077665800849568e-06,
+      "loss": 0.0783,
+      "step": 91
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.475643515586853,
+      "learning_rate": 5.995929925519181e-06,
+      "loss": 0.0689,
+      "step": 92
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.5447214841842651,
+      "learning_rate": 5.913915949078453e-06,
+      "loss": 0.0692,
+      "step": 93
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.19609016180038452,
+      "learning_rate": 5.831646772915651e-06,
+      "loss": 0.0684,
+      "step": 94
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.3513399362564087,
+      "learning_rate": 5.7491453696804075e-06,
+      "loss": 0.0606,
+      "step": 95
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.5132963061332703,
+      "learning_rate": 5.666434776868895e-06,
+      "loss": 0.0618,
+      "step": 96
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.3573090732097626,
+      "learning_rate": 5.583538090390882e-06,
+      "loss": 0.066,
+      "step": 97
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.37191343307495117,
+      "learning_rate": 5.500478458120493e-06,
+      "loss": 0.0574,
+      "step": 98
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.5762889385223389,
+      "learning_rate": 5.41727907343245e-06,
+      "loss": 0.0698,
+      "step": 99
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.2511120140552521,
+      "learning_rate": 5.3339631687256085e-06,
+      "loss": 0.0642,
+      "step": 100
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.5286504626274109,
+      "learning_rate": 5.250554008935596e-06,
+      "loss": 0.0702,
+      "step": 101
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.283104807138443,
+      "learning_rate": 5.1670748850383734e-06,
+      "loss": 0.0676,
+      "step": 102
+    },
+    {
+      "epoch": 1.51,
+      "eval_loss": 0.06662092357873917,
+      "eval_runtime": 128.4351,
+      "eval_samples_per_second": 7.202,
+      "eval_steps_per_second": 0.903,
+      "step": 102
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.3505866527557373,
+      "learning_rate": 5.083549107546505e-06,
+      "loss": 0.0749,
+      "step": 103
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.4083852469921112,
+      "learning_rate": 5e-06,
+      "loss": 0.0683,
+      "step": 104
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.6204113364219666,
+      "learning_rate": 4.916450892453495e-06,
+      "loss": 0.0579,
+      "step": 105
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.2663183808326721,
+      "learning_rate": 4.832925114961629e-06,
+      "loss": 0.0606,
+      "step": 106
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.5618534684181213,
+      "learning_rate": 4.7494459910644044e-06,
+      "loss": 0.0702,
+      "step": 107
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.5477017760276794,
+      "learning_rate": 4.666036831274392e-06,
+      "loss": 0.0717,
+      "step": 108
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.5102267265319824,
+      "learning_rate": 4.582720926567552e-06,
+      "loss": 0.0703,
+      "step": 109
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.2536521255970001,
+      "learning_rate": 4.499521541879508e-06,
+      "loss": 0.0653,
+      "step": 110
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.3182464838027954,
+      "learning_rate": 4.416461909609119e-06,
+      "loss": 0.068,
+      "step": 111
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.2570602595806122,
+      "learning_rate": 4.333565223131107e-06,
+      "loss": 0.0636,
+      "step": 112
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.1940167397260666,
+      "learning_rate": 4.250854630319593e-06,
+      "loss": 0.0692,
+      "step": 113
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.33302024006843567,
+      "learning_rate": 4.1683532270843505e-06,
+      "loss": 0.062,
+      "step": 114
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.26795655488967896,
+      "learning_rate": 4.08608405092155e-06,
+      "loss": 0.0629,
+      "step": 115
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.5258805751800537,
+      "learning_rate": 4.004070074480821e-06,
+      "loss": 0.0634,
+      "step": 116
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.2478051334619522,
+      "learning_rate": 3.922334199150433e-06,
+      "loss": 0.0675,
+      "step": 117
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.30332913994789124,
+      "learning_rate": 3.840899248662358e-06,
+      "loss": 0.063,
+      "step": 118
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.21814480423927307,
+      "learning_rate": 3.7597879627190337e-06,
+      "loss": 0.068,
+      "step": 119
+    },
+    {
+      "epoch": 1.76,
+      "eval_loss": 0.06482138484716415,
+      "eval_runtime": 126.2918,
+      "eval_samples_per_second": 7.324,
+      "eval_steps_per_second": 0.919,
+      "step": 119
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.27936533093452454,
+      "learning_rate": 3.6790229906435706e-06,
+      "loss": 0.065,
+      "step": 120
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.2364480048418045,
+      "learning_rate": 3.598626885055219e-06,
+      "loss": 0.0672,
+      "step": 121
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.32356905937194824,
+      "learning_rate": 3.518622095571831e-06,
+      "loss": 0.065,
+      "step": 122
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.24160273373126984,
+      "learning_rate": 3.439030962541069e-06,
+      "loss": 0.0646,
+      "step": 123
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.26129576563835144,
+      "learning_rate": 3.3598757108021546e-06,
+      "loss": 0.064,
+      "step": 124
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.24441897869110107,
+      "learning_rate": 3.281178443479852e-06,
+      "loss": 0.0525,
+      "step": 125
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.23377354443073273,
+      "learning_rate": 3.202961135812437e-06,
+      "loss": 0.0592,
+      "step": 126
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.3297436833381653,
+      "learning_rate": 3.1252456290153952e-06,
+      "loss": 0.0625,
+      "step": 127
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.7371227145195007,
+      "learning_rate": 3.0480536241825263e-06,
+      "loss": 0.0644,
+      "step": 128
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.46529266238212585,
+      "learning_rate": 2.9714066762261825e-06,
+      "loss": 0.0641,
+      "step": 129
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.5547051429748535,
+      "learning_rate": 2.8953261878583263e-06,
+      "loss": 0.0747,
+      "step": 130
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.3253116011619568,
+      "learning_rate": 2.8198334036140873e-06,
+      "loss": 0.0709,
+      "step": 131
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.26438066363334656,
+      "learning_rate": 2.74494940391949e-06,
+      "loss": 0.0651,
+      "step": 132
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 198,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 66,
+  "total_flos": 9.335735208168653e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-132/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3aa8a6641a3601e0e6938116d17cd8cd3d398e0c12208d120adf3e793eb3f1a
+size 5752

checkpoint-198/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: mistralai/Mistral-7B-v0.1
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-198/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-198/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8295242790c22cec577be324c71ccb9f3ff63a80c90fd31ed9b6954aebf186ca
+size 335604696

checkpoint-198/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58450e32d33998679616617fb93e3596329afc890b670205ca86a16a625aa292
+size 168624724

checkpoint-198/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:235cc0bb2525f99c586faca867bf75ca4a9696902b1555023e9c619f8b220ca8
+size 14512

checkpoint-198/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1af87ed6fbb43056a85d0f67c4d0e7ce57477197352fd3ab757e709bfab7ac9
+size 14512

checkpoint-198/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:165f5f16c799555d565b8431f95f01fc4c54f4d825a78bf3b187f60ed59e36fc
+size 1064

checkpoint-198/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-198/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-198/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-198/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1503 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.925233644859813,
+  "eval_steps": 17,
+  "global_step": 198,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.601767539978027,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 2.5188,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.5281636714935303,
+      "eval_runtime": 136.1276,
+      "eval_samples_per_second": 6.795,
+      "eval_steps_per_second": 0.852,
+      "step": 1
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 8.70827865600586,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 2.561,
+      "step": 2
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 8.504729270935059,
+      "learning_rate": 3e-06,
+      "loss": 2.5392,
+      "step": 3
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 8.70252513885498,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 2.5358,
+      "step": 4
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 8.545235633850098,
+      "learning_rate": 5e-06,
+      "loss": 2.4953,
+      "step": 5
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 8.594281196594238,
+      "learning_rate": 6e-06,
+      "loss": 2.5025,
+      "step": 6
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 8.570807456970215,
+      "learning_rate": 7e-06,
+      "loss": 2.4483,
+      "step": 7
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 8.201045036315918,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 2.3844,
+      "step": 8
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 8.294769287109375,
+      "learning_rate": 9e-06,
+      "loss": 2.3132,
+      "step": 9
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 7.825733661651611,
+      "learning_rate": 1e-05,
+      "loss": 2.2073,
+      "step": 10
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 7.359045028686523,
+      "learning_rate": 9.999301905929286e-06,
+      "loss": 2.0469,
+      "step": 11
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 6.876375198364258,
+      "learning_rate": 9.997207818651273e-06,
+      "loss": 1.8699,
+      "step": 12
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 6.270627021789551,
+      "learning_rate": 9.99371832291393e-06,
+      "loss": 1.6649,
+      "step": 13
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 5.742035388946533,
+      "learning_rate": 9.988834393115768e-06,
+      "loss": 1.4954,
+      "step": 14
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 5.249215602874756,
+      "learning_rate": 9.982557393033758e-06,
+      "loss": 1.3214,
+      "step": 15
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 4.818938732147217,
+      "learning_rate": 9.97488907544252e-06,
+      "loss": 1.1535,
+      "step": 16
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 4.380295753479004,
+      "learning_rate": 9.965831581624872e-06,
+      "loss": 1.0047,
+      "step": 17
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 0.8628284335136414,
+      "eval_runtime": 133.5823,
+      "eval_samples_per_second": 6.925,
+      "eval_steps_per_second": 0.868,
+      "step": 17
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 3.99696683883667,
+      "learning_rate": 9.955387440773902e-06,
+      "loss": 0.8632,
+      "step": 18
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 3.7503676414489746,
+      "learning_rate": 9.943559569286731e-06,
+      "loss": 0.7559,
+      "step": 19
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 3.430863618850708,
+      "learning_rate": 9.930351269950144e-06,
+      "loss": 0.6444,
+      "step": 20
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 3.25299072265625,
+      "learning_rate": 9.915766231018317e-06,
+      "loss": 0.5364,
+      "step": 21
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 3.217674970626831,
+      "learning_rate": 9.899808525182935e-06,
+      "loss": 0.4766,
+      "step": 22
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 3.2001354694366455,
+      "learning_rate": 9.882482608435924e-06,
+      "loss": 0.4038,
+      "step": 23
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 3.011241912841797,
+      "learning_rate": 9.863793318825186e-06,
+      "loss": 0.3333,
+      "step": 24
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.758089303970337,
+      "learning_rate": 9.843745875103628e-06,
+      "loss": 0.2752,
+      "step": 25
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.914292812347412,
+      "learning_rate": 9.822345875271884e-06,
+      "loss": 0.2229,
+      "step": 26
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.1877121925354004,
+      "learning_rate": 9.799599295015154e-06,
+      "loss": 0.1846,
+      "step": 27
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.3617541790008545,
+      "learning_rate": 9.775512486034564e-06,
+      "loss": 0.1556,
+      "step": 28
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.431768774986267,
+      "learning_rate": 9.75009217427352e-06,
+      "loss": 0.1279,
+      "step": 29
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.7117260098457336,
+      "learning_rate": 9.723345458039595e-06,
+      "loss": 0.1061,
+      "step": 30
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.45769569277763367,
+      "learning_rate": 9.695279806022391e-06,
+      "loss": 0.0987,
+      "step": 31
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2953682243824005,
+      "learning_rate": 9.665903055208013e-06,
+      "loss": 0.0919,
+      "step": 32
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.17375054955482483,
+      "learning_rate": 9.635223408690688e-06,
+      "loss": 0.0854,
+      "step": 33
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.11500216275453568,
+      "learning_rate": 9.603249433382145e-06,
+      "loss": 0.086,
+      "step": 34
+    },
+    {
+      "epoch": 0.51,
+      "eval_loss": 0.08624568581581116,
+      "eval_runtime": 132.1775,
+      "eval_samples_per_second": 6.998,
+      "eval_steps_per_second": 0.878,
+      "step": 34
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.12646625936031342,
+      "learning_rate": 9.569990057619414e-06,
+      "loss": 0.0851,
+      "step": 35
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.11912114918231964,
+      "learning_rate": 9.535454568671705e-06,
+      "loss": 0.0834,
+      "step": 36
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.24543708562850952,
+      "learning_rate": 9.49965261014704e-06,
+      "loss": 0.0836,
+      "step": 37
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.0936342254281044,
+      "learning_rate": 9.462594179299408e-06,
+      "loss": 0.0844,
+      "step": 38
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.18341362476348877,
+      "learning_rate": 9.424289624237143e-06,
+      "loss": 0.0805,
+      "step": 39
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.21815626323223114,
+      "learning_rate": 9.384749641033358e-06,
+      "loss": 0.0811,
+      "step": 40
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.21489091217517853,
+      "learning_rate": 9.343985270739184e-06,
+      "loss": 0.0793,
+      "step": 41
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.23281769454479218,
+      "learning_rate": 9.302007896300697e-06,
+      "loss": 0.0775,
+      "step": 42
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6911139488220215,
+      "learning_rate": 9.25882923938038e-06,
+      "loss": 0.0812,
+      "step": 43
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.6087940335273743,
+      "learning_rate": 9.214461357083986e-06,
+      "loss": 0.0801,
+      "step": 44
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.5599693059921265,
+      "learning_rate": 9.168916638593736e-06,
+      "loss": 0.0822,
+      "step": 45
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.23726361989974976,
+      "learning_rate": 9.122207801708802e-06,
+      "loss": 0.0744,
+      "step": 46
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.075922966003418,
+      "learning_rate": 9.074347889294017e-06,
+      "loss": 0.0824,
+      "step": 47
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.49190425872802734,
+      "learning_rate": 9.025350265637816e-06,
+      "loss": 0.0784,
+      "step": 48
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.8844243288040161,
+      "learning_rate": 8.975228612720415e-06,
+      "loss": 0.0722,
+      "step": 49
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.2909235954284668,
+      "learning_rate": 8.923996926393306e-06,
+      "loss": 0.075,
+      "step": 50
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3661656975746155,
+      "learning_rate": 8.871669512471068e-06,
+      "loss": 0.0732,
+      "step": 51
+    },
+    {
+      "epoch": 0.76,
+      "eval_loss": 0.07530223578214645,
+      "eval_runtime": 133.0044,
+      "eval_samples_per_second": 6.955,
+      "eval_steps_per_second": 0.872,
+      "step": 51
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.49425163865089417,
+      "learning_rate": 8.818260982736662e-06,
+      "loss": 0.07,
+      "step": 52
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.9367341995239258,
+      "learning_rate": 8.763786250861258e-06,
+      "loss": 0.0819,
+      "step": 53
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.9425927400588989,
+      "learning_rate": 8.708260528239788e-06,
+      "loss": 0.0754,
+      "step": 54
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.3247818052768707,
+      "learning_rate": 8.651699319743348e-06,
+      "loss": 0.0739,
+      "step": 55
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.8091337084770203,
+      "learning_rate": 8.594118419389648e-06,
+      "loss": 0.0724,
+      "step": 56
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.556105375289917,
+      "learning_rate": 8.535533905932739e-06,
+      "loss": 0.0733,
+      "step": 57
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.9925010800361633,
+      "learning_rate": 8.475962138373212e-06,
+      "loss": 0.0761,
+      "step": 58
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.32729703187942505,
+      "learning_rate": 8.415419751390155e-06,
+      "loss": 0.0694,
+      "step": 59
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.2549174129962921,
+      "learning_rate": 8.353923650696119e-06,
+      "loss": 0.0672,
+      "step": 60
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.2688353657722473,
+      "learning_rate": 8.291491008316409e-06,
+      "loss": 0.0694,
+      "step": 61
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.38362765312194824,
+      "learning_rate": 8.228139257794012e-06,
+      "loss": 0.0671,
+      "step": 62
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.8581087589263916,
+      "learning_rate": 8.163886089321493e-06,
+      "loss": 0.0745,
+      "step": 63
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5643619894981384,
+      "learning_rate": 8.098749444801226e-06,
+      "loss": 0.0681,
+      "step": 64
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.637834906578064,
+      "learning_rate": 8.032747512835338e-06,
+      "loss": 0.0773,
+      "step": 65
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.2533693313598633,
+      "learning_rate": 7.965898723646777e-06,
+      "loss": 0.0663,
+      "step": 66
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.35648882389068604,
+      "learning_rate": 7.898221743932887e-06,
+      "loss": 0.0573,
+      "step": 67
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.638164758682251,
+      "learning_rate": 7.829735471652978e-06,
+      "loss": 0.0719,
+      "step": 68
+    },
+    {
+      "epoch": 1.02,
+      "eval_loss": 0.07532496750354767,
+      "eval_runtime": 133.6875,
+      "eval_samples_per_second": 6.919,
+      "eval_steps_per_second": 0.868,
+      "step": 68
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.6950666308403015,
+      "learning_rate": 7.760459030751285e-06,
+      "loss": 0.0712,
+      "step": 69
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.5866235494613647,
+      "learning_rate": 7.690411765816864e-06,
+      "loss": 0.0753,
+      "step": 70
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.5796181559562683,
+      "learning_rate": 7.619613236681845e-06,
+      "loss": 0.0672,
+      "step": 71
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.5313260555267334,
+      "learning_rate": 7.548083212959588e-06,
+      "loss": 0.0634,
+      "step": 72
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.4373992681503296,
+      "learning_rate": 7.475841668524268e-06,
+      "loss": 0.0632,
+      "step": 73
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.3541436195373535,
+      "learning_rate": 7.402908775933419e-06,
+      "loss": 0.0675,
+      "step": 74
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.5151358842849731,
+      "learning_rate": 7.329304900794991e-06,
+      "loss": 0.066,
+      "step": 75
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.6343135833740234,
+      "learning_rate": 7.25505059608051e-06,
+      "loss": 0.0676,
+      "step": 76
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.6041606664657593,
+      "learning_rate": 7.180166596385915e-06,
+      "loss": 0.0659,
+      "step": 77
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.5856900811195374,
+      "learning_rate": 7.104673812141676e-06,
+      "loss": 0.0585,
+      "step": 78
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.2674119770526886,
+      "learning_rate": 7.028593323773819e-06,
+      "loss": 0.0688,
+      "step": 79
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.5323411226272583,
+      "learning_rate": 6.9519463758174745e-06,
+      "loss": 0.0645,
+      "step": 80
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.5299087166786194,
+      "learning_rate": 6.8747543709846064e-06,
+      "loss": 0.0631,
+      "step": 81
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.1804003715515137,
+      "learning_rate": 6.797038864187564e-06,
+      "loss": 0.0813,
+      "step": 82
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.129088044166565,
+      "learning_rate": 6.718821556520151e-06,
+      "loss": 0.0661,
+      "step": 83
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.37926897406578064,
+      "learning_rate": 6.640124289197845e-06,
+      "loss": 0.0739,
+      "step": 84
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.5506991744041443,
+      "learning_rate": 6.560969037458933e-06,
+      "loss": 0.0722,
+      "step": 85
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 0.06798132508993149,
+      "eval_runtime": 131.6498,
+      "eval_samples_per_second": 7.026,
+      "eval_steps_per_second": 0.881,
+      "step": 85
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.36365798115730286,
+      "learning_rate": 6.481377904428171e-06,
+      "loss": 0.0675,
+      "step": 86
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.24918232858181,
+      "learning_rate": 6.401373114944781e-06,
+      "loss": 0.0654,
+      "step": 87
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.32376477122306824,
+      "learning_rate": 6.3209770093564315e-06,
+      "loss": 0.0666,
+      "step": 88
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.5324047207832336,
+      "learning_rate": 6.240212037280967e-06,
+      "loss": 0.0616,
+      "step": 89
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.2454373985528946,
+      "learning_rate": 6.1591007513376425e-06,
+      "loss": 0.0705,
+      "step": 90
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.5578957796096802,
+      "learning_rate": 6.077665800849568e-06,
+      "loss": 0.0783,
+      "step": 91
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.475643515586853,
+      "learning_rate": 5.995929925519181e-06,
+      "loss": 0.0689,
+      "step": 92
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.5447214841842651,
+      "learning_rate": 5.913915949078453e-06,
+      "loss": 0.0692,
+      "step": 93
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.19609016180038452,
+      "learning_rate": 5.831646772915651e-06,
+      "loss": 0.0684,
+      "step": 94
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.3513399362564087,
+      "learning_rate": 5.7491453696804075e-06,
+      "loss": 0.0606,
+      "step": 95
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.5132963061332703,
+      "learning_rate": 5.666434776868895e-06,
+      "loss": 0.0618,
+      "step": 96
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.3573090732097626,
+      "learning_rate": 5.583538090390882e-06,
+      "loss": 0.066,
+      "step": 97
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.37191343307495117,
+      "learning_rate": 5.500478458120493e-06,
+      "loss": 0.0574,
+      "step": 98
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.5762889385223389,
+      "learning_rate": 5.41727907343245e-06,
+      "loss": 0.0698,
+      "step": 99
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.2511120140552521,
+      "learning_rate": 5.3339631687256085e-06,
+      "loss": 0.0642,
+      "step": 100
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.5286504626274109,
+      "learning_rate": 5.250554008935596e-06,
+      "loss": 0.0702,
+      "step": 101
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.283104807138443,
+      "learning_rate": 5.1670748850383734e-06,
+      "loss": 0.0676,
+      "step": 102
+    },
+    {
+      "epoch": 1.51,
+      "eval_loss": 0.06662092357873917,
+      "eval_runtime": 128.4351,
+      "eval_samples_per_second": 7.202,
+      "eval_steps_per_second": 0.903,
+      "step": 102
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.3505866527557373,
+      "learning_rate": 5.083549107546505e-06,
+      "loss": 0.0749,
+      "step": 103
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.4083852469921112,
+      "learning_rate": 5e-06,
+      "loss": 0.0683,
+      "step": 104
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.6204113364219666,
+      "learning_rate": 4.916450892453495e-06,
+      "loss": 0.0579,
+      "step": 105
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.2663183808326721,
+      "learning_rate": 4.832925114961629e-06,
+      "loss": 0.0606,
+      "step": 106
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.5618534684181213,
+      "learning_rate": 4.7494459910644044e-06,
+      "loss": 0.0702,
+      "step": 107
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.5477017760276794,
+      "learning_rate": 4.666036831274392e-06,
+      "loss": 0.0717,
+      "step": 108
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.5102267265319824,
+      "learning_rate": 4.582720926567552e-06,
+      "loss": 0.0703,
+      "step": 109
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.2536521255970001,
+      "learning_rate": 4.499521541879508e-06,
+      "loss": 0.0653,
+      "step": 110
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.3182464838027954,
+      "learning_rate": 4.416461909609119e-06,
+      "loss": 0.068,
+      "step": 111
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 0.2570602595806122,
+      "learning_rate": 4.333565223131107e-06,
+      "loss": 0.0636,
+      "step": 112
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.1940167397260666,
+      "learning_rate": 4.250854630319593e-06,
+      "loss": 0.0692,
+      "step": 113
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.33302024006843567,
+      "learning_rate": 4.1683532270843505e-06,
+      "loss": 0.062,
+      "step": 114
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.26795655488967896,
+      "learning_rate": 4.08608405092155e-06,
+      "loss": 0.0629,
+      "step": 115
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.5258805751800537,
+      "learning_rate": 4.004070074480821e-06,
+      "loss": 0.0634,
+      "step": 116
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.2478051334619522,
+      "learning_rate": 3.922334199150433e-06,
+      "loss": 0.0675,
+      "step": 117
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.30332913994789124,
+      "learning_rate": 3.840899248662358e-06,
+      "loss": 0.063,
+      "step": 118
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.21814480423927307,
+      "learning_rate": 3.7597879627190337e-06,
+      "loss": 0.068,
+      "step": 119
+    },
+    {
+      "epoch": 1.76,
+      "eval_loss": 0.06482138484716415,
+      "eval_runtime": 126.2918,
+      "eval_samples_per_second": 7.324,
+      "eval_steps_per_second": 0.919,
+      "step": 119
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.27936533093452454,
+      "learning_rate": 3.6790229906435706e-06,
+      "loss": 0.065,
+      "step": 120
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.2364480048418045,
+      "learning_rate": 3.598626885055219e-06,
+      "loss": 0.0672,
+      "step": 121
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.32356905937194824,
+      "learning_rate": 3.518622095571831e-06,
+      "loss": 0.065,
+      "step": 122
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.24160273373126984,
+      "learning_rate": 3.439030962541069e-06,
+      "loss": 0.0646,
+      "step": 123
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.26129576563835144,
+      "learning_rate": 3.3598757108021546e-06,
+      "loss": 0.064,
+      "step": 124
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.24441897869110107,
+      "learning_rate": 3.281178443479852e-06,
+      "loss": 0.0525,
+      "step": 125
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.23377354443073273,
+      "learning_rate": 3.202961135812437e-06,
+      "loss": 0.0592,
+      "step": 126
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.3297436833381653,
+      "learning_rate": 3.1252456290153952e-06,
+      "loss": 0.0625,
+      "step": 127
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.7371227145195007,
+      "learning_rate": 3.0480536241825263e-06,
+      "loss": 0.0644,
+      "step": 128
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.46529266238212585,
+      "learning_rate": 2.9714066762261825e-06,
+      "loss": 0.0641,
+      "step": 129
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.5547051429748535,
+      "learning_rate": 2.8953261878583263e-06,
+      "loss": 0.0747,
+      "step": 130
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.3253116011619568,
+      "learning_rate": 2.8198334036140873e-06,
+      "loss": 0.0709,
+      "step": 131
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.26438066363334656,
+      "learning_rate": 2.74494940391949e-06,
+      "loss": 0.0651,
+      "step": 132
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.28292524814605713,
+      "learning_rate": 2.6706950992050097e-06,
+      "loss": 0.0662,
+      "step": 133
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.2487596571445465,
+      "learning_rate": 2.5970912240665815e-06,
+      "loss": 0.0617,
+      "step": 134
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.3353869318962097,
+      "learning_rate": 2.5241583314757327e-06,
+      "loss": 0.0706,
+      "step": 135
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.25893494486808777,
+      "learning_rate": 2.4519167870404126e-06,
+      "loss": 0.0562,
+      "step": 136
+    },
+    {
+      "epoch": 2.02,
+      "eval_loss": 0.06374615430831909,
+      "eval_runtime": 125.462,
+      "eval_samples_per_second": 7.373,
+      "eval_steps_per_second": 0.925,
+      "step": 136
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 0.29034262895584106,
+      "learning_rate": 2.3803867633181575e-06,
+      "loss": 0.0604,
+      "step": 137
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 0.3979719281196594,
+      "learning_rate": 2.309588234183137e-06,
+      "loss": 0.0592,
+      "step": 138
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.4841715395450592,
+      "learning_rate": 2.2395409692487174e-06,
+      "loss": 0.0701,
+      "step": 139
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.3323078453540802,
+      "learning_rate": 2.1702645283470238e-06,
+      "loss": 0.0616,
+      "step": 140
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 0.3198353946208954,
+      "learning_rate": 2.1017782560671124e-06,
+      "loss": 0.0582,
+      "step": 141
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 0.2505796253681183,
+      "learning_rate": 2.0341012763532243e-06,
+      "loss": 0.0698,
+      "step": 142
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.40075603127479553,
+      "learning_rate": 1.967252487164663e-06,
+      "loss": 0.0608,
+      "step": 143
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.23713812232017517,
+      "learning_rate": 1.9012505551987764e-06,
+      "loss": 0.0615,
+      "step": 144
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 0.39941224455833435,
+      "learning_rate": 1.836113910678507e-06,
+      "loss": 0.0721,
+      "step": 145
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 0.22243474423885345,
+      "learning_rate": 1.771860742205988e-06,
+      "loss": 0.0591,
+      "step": 146
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.2193121612071991,
+      "learning_rate": 1.7085089916835924e-06,
+      "loss": 0.0628,
+      "step": 147
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.4138857126235962,
+      "learning_rate": 1.646076349303884e-06,
+      "loss": 0.06,
+      "step": 148
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 0.2536584436893463,
+      "learning_rate": 1.5845802486098461e-06,
+      "loss": 0.0627,
+      "step": 149
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 0.37862974405288696,
+      "learning_rate": 1.5240378616267887e-06,
+      "loss": 0.068,
+      "step": 150
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.484770804643631,
+      "learning_rate": 1.4644660940672628e-06,
+      "loss": 0.0562,
+      "step": 151
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.5063880085945129,
+      "learning_rate": 1.4058815806103542e-06,
+      "loss": 0.0698,
+      "step": 152
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 0.6178727746009827,
+      "learning_rate": 1.3483006802566546e-06,
+      "loss": 0.0674,
+      "step": 153
+    },
+    {
+      "epoch": 2.25,
+      "eval_loss": 0.0628371387720108,
+      "eval_runtime": 32.0386,
+      "eval_samples_per_second": 28.871,
+      "eval_steps_per_second": 3.621,
+      "step": 153
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 0.3401700258255005,
+      "learning_rate": 1.2917394717602123e-06,
+      "loss": 0.0597,
+      "step": 154
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 0.3246176838874817,
+      "learning_rate": 1.2362137491387433e-06,
+      "loss": 0.0582,
+      "step": 155
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 0.29669150710105896,
+      "learning_rate": 1.1817390172633402e-06,
+      "loss": 0.0726,
+      "step": 156
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 0.25248482823371887,
+      "learning_rate": 1.1283304875289335e-06,
+      "loss": 0.0535,
+      "step": 157
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 0.21017588675022125,
+      "learning_rate": 1.0760030736066952e-06,
+      "loss": 0.0628,
+      "step": 158
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.213237002491951,
+      "learning_rate": 1.024771387279585e-06,
+      "loss": 0.0594,
+      "step": 159
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.3133507966995239,
+      "learning_rate": 9.746497343621857e-07,
+      "loss": 0.0616,
+      "step": 160
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 0.29600203037261963,
+      "learning_rate": 9.256521107059834e-07,
+      "loss": 0.0569,
+      "step": 161
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 0.267036497592926,
+      "learning_rate": 8.777921982911996e-07,
+      "loss": 0.0571,
+      "step": 162
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.23227794468402863,
+      "learning_rate": 8.310833614062652e-07,
+      "loss": 0.0652,
+      "step": 163
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 0.28623953461647034,
+      "learning_rate": 7.85538642916015e-07,
+      "loss": 0.0665,
+      "step": 164
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 0.23467646539211273,
+      "learning_rate": 7.411707606196189e-07,
+      "loss": 0.063,
+      "step": 165
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 0.25430357456207275,
+      "learning_rate": 6.979921036993042e-07,
+      "loss": 0.0629,
+      "step": 166
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 0.22180821001529694,
+      "learning_rate": 6.560147292608177e-07,
+      "loss": 0.0584,
+      "step": 167
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.2619880437850952,
+      "learning_rate": 6.152503589666426e-07,
+      "loss": 0.0586,
+      "step": 168
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 0.2673584520816803,
+      "learning_rate": 5.757103757628573e-07,
+      "loss": 0.058,
+      "step": 169
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 0.27297332882881165,
+      "learning_rate": 5.374058207005945e-07,
+      "loss": 0.0611,
+      "step": 170
+    },
+    {
+      "epoch": 2.51,
+      "eval_loss": 0.06254950165748596,
+      "eval_runtime": 32.0576,
+      "eval_samples_per_second": 28.854,
+      "eval_steps_per_second": 3.618,
+      "step": 170
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.752245306968689,
+      "learning_rate": 5.00347389852961e-07,
+      "loss": 0.0626,
+      "step": 171
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 0.3121594190597534,
+      "learning_rate": 4.6454543132829653e-07,
+      "loss": 0.0551,
+      "step": 172
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 0.3015207350254059,
+      "learning_rate": 4.300099423805865e-07,
+      "loss": 0.0564,
+      "step": 173
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 0.25785017013549805,
+      "learning_rate": 3.9675056661785563e-07,
+      "loss": 0.0624,
+      "step": 174
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 0.43912383913993835,
+      "learning_rate": 3.647765913093132e-07,
+      "loss": 0.0561,
+      "step": 175
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.34310853481292725,
+      "learning_rate": 3.340969447919873e-07,
+      "loss": 0.0651,
+      "step": 176
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 0.2365168333053589,
+      "learning_rate": 3.0472019397761065e-07,
+      "loss": 0.0583,
+      "step": 177
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 0.4571215808391571,
+      "learning_rate": 2.7665454196040665e-07,
+      "loss": 0.0728,
+      "step": 178
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.2848273813724518,
+      "learning_rate": 2.4990782572647977e-07,
+      "loss": 0.0577,
+      "step": 179
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 0.2541896402835846,
+      "learning_rate": 2.2448751396543788e-07,
+      "loss": 0.0579,
+      "step": 180
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 0.25773105025291443,
+      "learning_rate": 2.004007049848461e-07,
+      "loss": 0.0598,
+      "step": 181
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 0.4684333801269531,
+      "learning_rate": 1.776541247281177e-07,
+      "loss": 0.0487,
+      "step": 182
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.4208538830280304,
+      "learning_rate": 1.5625412489637337e-07,
+      "loss": 0.0734,
+      "step": 183
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 0.36085304617881775,
+      "learning_rate": 1.3620668117481471e-07,
+      "loss": 0.068,
+      "step": 184
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 0.25514504313468933,
+      "learning_rate": 1.1751739156407649e-07,
+      "loss": 0.0589,
+      "step": 185
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.27741602063179016,
+      "learning_rate": 1.0019147481706626e-07,
+      "loss": 0.0593,
+      "step": 186
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.3092593848705292,
+      "learning_rate": 8.423376898168246e-08,
+      "loss": 0.0536,
+      "step": 187
+    },
+    {
+      "epoch": 2.76,
+      "eval_loss": 0.0627126693725586,
+      "eval_runtime": 32.0187,
+      "eval_samples_per_second": 28.889,
+      "eval_steps_per_second": 3.623,
+      "step": 187
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 0.2300243228673935,
+      "learning_rate": 6.964873004985717e-08,
+      "loss": 0.0585,
+      "step": 188
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 0.6012884974479675,
+      "learning_rate": 5.6440430713269325e-08,
+      "loss": 0.0683,
+      "step": 189
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 0.4536500573158264,
+      "learning_rate": 4.461255922609986e-08,
+      "loss": 0.0637,
+      "step": 190
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 0.2440507858991623,
+      "learning_rate": 3.416841837512952e-08,
+      "loss": 0.0574,
+      "step": 191
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.29257693886756897,
+      "learning_rate": 2.511092455747932e-08,
+      "loss": 0.055,
+      "step": 192
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 0.24556905031204224,
+      "learning_rate": 1.7442606966242005e-08,
+      "loss": 0.0646,
+      "step": 193
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 0.243976429104805,
+      "learning_rate": 1.1165606884234182e-08,
+      "loss": 0.0547,
+      "step": 194
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.44641396403312683,
+      "learning_rate": 6.281677086071303e-09,
+      "loss": 0.0629,
+      "step": 195
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.27298280596733093,
+      "learning_rate": 2.792181348726941e-09,
+      "loss": 0.059,
+      "step": 196
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 0.23333518207073212,
+      "learning_rate": 6.980940707146388e-10,
+      "loss": 0.055,
+      "step": 197
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 0.3281053602695465,
+      "learning_rate": 0.0,
+      "loss": 0.0653,
+      "step": 198
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 198,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 66,
+  "total_flos": 1.4000287567262515e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-198/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3aa8a6641a3601e0e6938116d17cd8cd3d398e0c12208d120adf3e793eb3f1a
+size 5752

checkpoint-66/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: mistralai/Mistral-7B-v0.1
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-66/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-66/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b15eb26078458da96500c2534e9e6a70b4ca21aab1db4b2d3a10a19197fe531f
+size 335604696

checkpoint-66/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b26fda8233e95b0bc0b9c571d03584a9a1c3a15196f5fe89d0b5971b1fee4c5
+size 168624724

checkpoint-66/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:05a4a21e6e7af1623eb3c6bc57e16fd5464c110be52ca3b39df274c713b6e784
+size 14512

checkpoint-66/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edb38aec5b941d876dcc9e5dc2d340494bca120e83efdea07f67b905ca6259be
+size 14512

checkpoint-66/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20687955dcda24f1fd29fd690390cbb14b9af05d547e8b7b9e6686c87349cee5
+size 1064

checkpoint-66/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-66/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-66/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-66/trainer_state.json ADDED Viewed

	@@ -0,0 +1,515 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9869158878504672,
+  "eval_steps": 17,
+  "global_step": 66,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 8.601767539978027,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 2.5188,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.5281636714935303,
+      "eval_runtime": 136.1276,
+      "eval_samples_per_second": 6.795,
+      "eval_steps_per_second": 0.852,
+      "step": 1
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 8.70827865600586,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 2.561,
+      "step": 2
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 8.504729270935059,
+      "learning_rate": 3e-06,
+      "loss": 2.5392,
+      "step": 3
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 8.70252513885498,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 2.5358,
+      "step": 4
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 8.545235633850098,
+      "learning_rate": 5e-06,
+      "loss": 2.4953,
+      "step": 5
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 8.594281196594238,
+      "learning_rate": 6e-06,
+      "loss": 2.5025,
+      "step": 6
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 8.570807456970215,
+      "learning_rate": 7e-06,
+      "loss": 2.4483,
+      "step": 7
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 8.201045036315918,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 2.3844,
+      "step": 8
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 8.294769287109375,
+      "learning_rate": 9e-06,
+      "loss": 2.3132,
+      "step": 9
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 7.825733661651611,
+      "learning_rate": 1e-05,
+      "loss": 2.2073,
+      "step": 10
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 7.359045028686523,
+      "learning_rate": 9.999301905929286e-06,
+      "loss": 2.0469,
+      "step": 11
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 6.876375198364258,
+      "learning_rate": 9.997207818651273e-06,
+      "loss": 1.8699,
+      "step": 12
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 6.270627021789551,
+      "learning_rate": 9.99371832291393e-06,
+      "loss": 1.6649,
+      "step": 13
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 5.742035388946533,
+      "learning_rate": 9.988834393115768e-06,
+      "loss": 1.4954,
+      "step": 14
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 5.249215602874756,
+      "learning_rate": 9.982557393033758e-06,
+      "loss": 1.3214,
+      "step": 15
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 4.818938732147217,
+      "learning_rate": 9.97488907544252e-06,
+      "loss": 1.1535,
+      "step": 16
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 4.380295753479004,
+      "learning_rate": 9.965831581624872e-06,
+      "loss": 1.0047,
+      "step": 17
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 0.8628284335136414,
+      "eval_runtime": 133.5823,
+      "eval_samples_per_second": 6.925,
+      "eval_steps_per_second": 0.868,
+      "step": 17
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 3.99696683883667,
+      "learning_rate": 9.955387440773902e-06,
+      "loss": 0.8632,
+      "step": 18
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 3.7503676414489746,
+      "learning_rate": 9.943559569286731e-06,
+      "loss": 0.7559,
+      "step": 19
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 3.430863618850708,
+      "learning_rate": 9.930351269950144e-06,
+      "loss": 0.6444,
+      "step": 20
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 3.25299072265625,
+      "learning_rate": 9.915766231018317e-06,
+      "loss": 0.5364,
+      "step": 21
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 3.217674970626831,
+      "learning_rate": 9.899808525182935e-06,
+      "loss": 0.4766,
+      "step": 22
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 3.2001354694366455,
+      "learning_rate": 9.882482608435924e-06,
+      "loss": 0.4038,
+      "step": 23
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 3.011241912841797,
+      "learning_rate": 9.863793318825186e-06,
+      "loss": 0.3333,
+      "step": 24
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.758089303970337,
+      "learning_rate": 9.843745875103628e-06,
+      "loss": 0.2752,
+      "step": 25
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.914292812347412,
+      "learning_rate": 9.822345875271884e-06,
+      "loss": 0.2229,
+      "step": 26
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.1877121925354004,
+      "learning_rate": 9.799599295015154e-06,
+      "loss": 0.1846,
+      "step": 27
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.3617541790008545,
+      "learning_rate": 9.775512486034564e-06,
+      "loss": 0.1556,
+      "step": 28
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.431768774986267,
+      "learning_rate": 9.75009217427352e-06,
+      "loss": 0.1279,
+      "step": 29
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.7117260098457336,
+      "learning_rate": 9.723345458039595e-06,
+      "loss": 0.1061,
+      "step": 30
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.45769569277763367,
+      "learning_rate": 9.695279806022391e-06,
+      "loss": 0.0987,
+      "step": 31
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2953682243824005,
+      "learning_rate": 9.665903055208013e-06,
+      "loss": 0.0919,
+      "step": 32
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.17375054955482483,
+      "learning_rate": 9.635223408690688e-06,
+      "loss": 0.0854,
+      "step": 33
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.11500216275453568,
+      "learning_rate": 9.603249433382145e-06,
+      "loss": 0.086,
+      "step": 34
+    },
+    {
+      "epoch": 0.51,
+      "eval_loss": 0.08624568581581116,
+      "eval_runtime": 132.1775,
+      "eval_samples_per_second": 6.998,
+      "eval_steps_per_second": 0.878,
+      "step": 34
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.12646625936031342,
+      "learning_rate": 9.569990057619414e-06,
+      "loss": 0.0851,
+      "step": 35
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.11912114918231964,
+      "learning_rate": 9.535454568671705e-06,
+      "loss": 0.0834,
+      "step": 36
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.24543708562850952,
+      "learning_rate": 9.49965261014704e-06,
+      "loss": 0.0836,
+      "step": 37
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.0936342254281044,
+      "learning_rate": 9.462594179299408e-06,
+      "loss": 0.0844,
+      "step": 38
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.18341362476348877,
+      "learning_rate": 9.424289624237143e-06,
+      "loss": 0.0805,
+      "step": 39
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.21815626323223114,
+      "learning_rate": 9.384749641033358e-06,
+      "loss": 0.0811,
+      "step": 40
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.21489091217517853,
+      "learning_rate": 9.343985270739184e-06,
+      "loss": 0.0793,
+      "step": 41
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.23281769454479218,
+      "learning_rate": 9.302007896300697e-06,
+      "loss": 0.0775,
+      "step": 42
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6911139488220215,
+      "learning_rate": 9.25882923938038e-06,
+      "loss": 0.0812,
+      "step": 43
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.6087940335273743,
+      "learning_rate": 9.214461357083986e-06,
+      "loss": 0.0801,
+      "step": 44
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.5599693059921265,
+      "learning_rate": 9.168916638593736e-06,
+      "loss": 0.0822,
+      "step": 45
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.23726361989974976,
+      "learning_rate": 9.122207801708802e-06,
+      "loss": 0.0744,
+      "step": 46
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.075922966003418,
+      "learning_rate": 9.074347889294017e-06,
+      "loss": 0.0824,
+      "step": 47
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.49190425872802734,
+      "learning_rate": 9.025350265637816e-06,
+      "loss": 0.0784,
+      "step": 48
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.8844243288040161,
+      "learning_rate": 8.975228612720415e-06,
+      "loss": 0.0722,
+      "step": 49
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.2909235954284668,
+      "learning_rate": 8.923996926393306e-06,
+      "loss": 0.075,
+      "step": 50
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3661656975746155,
+      "learning_rate": 8.871669512471068e-06,
+      "loss": 0.0732,
+      "step": 51
+    },
+    {
+      "epoch": 0.76,
+      "eval_loss": 0.07530223578214645,
+      "eval_runtime": 133.0044,
+      "eval_samples_per_second": 6.955,
+      "eval_steps_per_second": 0.872,
+      "step": 51
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.49425163865089417,
+      "learning_rate": 8.818260982736662e-06,
+      "loss": 0.07,
+      "step": 52
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.9367341995239258,
+      "learning_rate": 8.763786250861258e-06,
+      "loss": 0.0819,
+      "step": 53
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.9425927400588989,
+      "learning_rate": 8.708260528239788e-06,
+      "loss": 0.0754,
+      "step": 54
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.3247818052768707,
+      "learning_rate": 8.651699319743348e-06,
+      "loss": 0.0739,
+      "step": 55
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.8091337084770203,
+      "learning_rate": 8.594118419389648e-06,
+      "loss": 0.0724,
+      "step": 56
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.556105375289917,
+      "learning_rate": 8.535533905932739e-06,
+      "loss": 0.0733,
+      "step": 57
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.9925010800361633,
+      "learning_rate": 8.475962138373212e-06,
+      "loss": 0.0761,
+      "step": 58
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.32729703187942505,
+      "learning_rate": 8.415419751390155e-06,
+      "loss": 0.0694,
+      "step": 59
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.2549174129962921,
+      "learning_rate": 8.353923650696119e-06,
+      "loss": 0.0672,
+      "step": 60
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.2688353657722473,
+      "learning_rate": 8.291491008316409e-06,
+      "loss": 0.0694,
+      "step": 61
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.38362765312194824,
+      "learning_rate": 8.228139257794012e-06,
+      "loss": 0.0671,
+      "step": 62
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.8581087589263916,
+      "learning_rate": 8.163886089321493e-06,
+      "loss": 0.0745,
+      "step": 63
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5643619894981384,
+      "learning_rate": 8.098749444801226e-06,
+      "loss": 0.0681,
+      "step": 64
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.637834906578064,
+      "learning_rate": 8.032747512835338e-06,
+      "loss": 0.0773,
+      "step": 65
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.2533693313598633,
+      "learning_rate": 7.965898723646777e-06,
+      "loss": 0.0663,
+      "step": 66
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 198,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 66,
+  "total_flos": 4.667867604084326e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-66/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3aa8a6641a3601e0e6938116d17cd8cd3d398e0c12208d120adf3e793eb3f1a
+size 5752

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "quantization_config": {
+    "_load_in_4bit": false,
+    "_load_in_8bit": true,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": false,
+    "load_in_8bit": true,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.0.dev0",
+  "use_cache": false,
+  "vocab_size": 32000
+}

runs/Apr04_09-49-49_mala/events.out.tfevents.1712195390.mala.189757.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b98399004f8d697069f34d506390ef6cb37eaa622d118b97c6c45f227b33175
+size 50909

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}