Upload 9 files

Browse files

Files changed (9) hide show

README.md +202 -0
adapter_config.json +33 -0
adapter_model.safetensors +3 -0
optimizer.pt +3 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1463 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: mistralai/Mistral-7B-v0.1
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** mistralai/Mistral-7B-v0.1
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+## Framework versions
+- PEFT 0.9.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8da3d6940a0710a4b3d188fdccdb370e9112726b25d4c59167b9cd4620489e1
+size 167832688

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cacaabab08b0f795a13d2985e6a316f11bb4337ac03346ad329fbf9d6ee38274
+size 335922450

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e4d28e39f27ee044f5ee49f5a7645695b76932948b11c1b00584d14fc5b1d8e
+size 14512

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34510b901a214f68e833527d44cd53f157ee8d7281a702ad5a5e36a5552e8f41
+size 14512

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cb081055c2974be825162360067e630830a5dee1233c8f964bb185bd4f65142
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1463 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.963855421686747,
+  "eval_steps": 500,
+  "global_step": 206,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.03125,
+      "learning_rate": 2e-05,
+      "loss": 4.8216,
+      "step": 1
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.75,
+      "learning_rate": 4e-05,
+      "loss": 4.7468,
+      "step": 2
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 3.046875,
+      "learning_rate": 6e-05,
+      "loss": 4.6897,
+      "step": 3
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 3.609375,
+      "learning_rate": 8e-05,
+      "loss": 4.871,
+      "step": 4
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 2.625,
+      "learning_rate": 0.0001,
+      "loss": 4.3603,
+      "step": 5
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.00012,
+      "loss": 4.1471,
+      "step": 6
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.375,
+      "learning_rate": 0.00014,
+      "loss": 4.1348,
+      "step": 7
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.875,
+      "learning_rate": 0.00016,
+      "loss": 3.7162,
+      "step": 8
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 3.09375,
+      "learning_rate": 0.00018,
+      "loss": 3.4262,
+      "step": 9
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 3.203125,
+      "learning_rate": 0.0002,
+      "loss": 3.3557,
+      "step": 10
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 3.84375,
+      "learning_rate": 0.00019998715457999314,
+      "loss": 2.8991,
+      "step": 11
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 3.578125,
+      "learning_rate": 0.0001999486216200688,
+      "loss": 3.058,
+      "step": 12
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 4.1875,
+      "learning_rate": 0.0001998844110196681,
+      "loss": 2.9225,
+      "step": 13
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 2.84375,
+      "learning_rate": 0.00019979453927503364,
+      "loss": 2.7459,
+      "step": 14
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 7.875,
+      "learning_rate": 0.00019967902947497156,
+      "loss": 2.7172,
+      "step": 15
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 3.65625,
+      "learning_rate": 0.00019953791129491983,
+      "loss": 2.5032,
+      "step": 16
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 7.6875,
+      "learning_rate": 0.00019937122098932428,
+      "loss": 2.756,
+      "step": 17
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 6.875,
+      "learning_rate": 0.0001991790013823246,
+      "loss": 2.7377,
+      "step": 18
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.00019896130185675261,
+      "loss": 2.3483,
+      "step": 19
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 23.75,
+      "learning_rate": 0.00019871817834144504,
+      "loss": 2.7189,
+      "step": 20
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 6.40625,
+      "learning_rate": 0.00019844969329687527,
+      "loss": 2.2662,
+      "step": 21
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.8671875,
+      "learning_rate": 0.00019815591569910654,
+      "loss": 2.172,
+      "step": 22
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.00019783692102207155,
+      "loss": 2.3391,
+      "step": 23
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.00019749279121818235,
+      "loss": 2.2976,
+      "step": 24
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0001971236146972764,
+      "loss": 2.3962,
+      "step": 25
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 4.78125,
+      "learning_rate": 0.00019672948630390294,
+      "loss": 2.4274,
+      "step": 26
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 3.375,
+      "learning_rate": 0.00019631050729295707,
+      "loss": 2.0537,
+      "step": 27
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.625,
+      "learning_rate": 0.00019586678530366606,
+      "loss": 2.204,
+      "step": 28
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00019539843433193639,
+      "loss": 2.2146,
+      "step": 29
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00019490557470106686,
+      "loss": 2.1501,
+      "step": 30
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 3.84375,
+      "learning_rate": 0.00019438833303083678,
+      "loss": 2.295,
+      "step": 31
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00019384684220497605,
+      "loss": 2.215,
+      "step": 32
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0001932812413370265,
+      "loss": 2.0909,
+      "step": 33
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.0001926916757346022,
+      "loss": 2.0389,
+      "step": 34
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00019207829686205882,
+      "loss": 2.071,
+      "step": 35
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00019144126230158127,
+      "loss": 2.099,
+      "step": 36
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00019078073571269922,
+      "loss": 1.9823,
+      "step": 37
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.171875,
+      "learning_rate": 0.0001900968867902419,
+      "loss": 2.2366,
+      "step": 38
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.00018938989122074197,
+      "loss": 1.9585,
+      "step": 39
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00018865993063730004,
+      "loss": 2.0241,
+      "step": 40
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.00018790719257292174,
+      "loss": 2.1073,
+      "step": 41
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.00018713187041233896,
+      "loss": 2.0247,
+      "step": 42
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00018633416334232753,
+      "loss": 2.1386,
+      "step": 43
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00018551427630053463,
+      "loss": 1.9789,
+      "step": 44
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00018467241992282843,
+      "loss": 2.0121,
+      "step": 45
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00018380881048918405,
+      "loss": 1.8511,
+      "step": 46
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0001829236698681195,
+      "loss": 1.9241,
+      "step": 47
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0001820172254596956,
+      "loss": 2.1745,
+      "step": 48
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0001810897101370951,
+      "loss": 2.129,
+      "step": 49
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00018014136218679567,
+      "loss": 2.0897,
+      "step": 50
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.000179172425247352,
+      "loss": 1.9669,
+      "step": 51
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.000178183148246803,
+      "loss": 2.1534,
+      "step": 52
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00017717378533872017,
+      "loss": 2.0392,
+      "step": 53
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00017614459583691346,
+      "loss": 2.0854,
+      "step": 54
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00017509584414881113,
+      "loss": 2.0876,
+      "step": 55
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.953125,
+      "learning_rate": 0.00017402779970753155,
+      "loss": 2.056,
+      "step": 56
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00017294073690266344,
+      "loss": 2.1355,
+      "step": 57
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00017183493500977278,
+      "loss": 2.1969,
+      "step": 58
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 2.1154,
+      "step": 59
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00016956825506034867,
+      "loss": 2.1209,
+      "step": 60
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00016840795933293463,
+      "loss": 1.8669,
+      "step": 61
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0001672300890261317,
+      "loss": 2.1942,
+      "step": 62
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00016603494674471593,
+      "loss": 1.9799,
+      "step": 63
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00016482283953077887,
+      "loss": 2.0549,
+      "step": 64
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00016359407878484552,
+      "loss": 2.0861,
+      "step": 65
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 1.9708,
+      "step": 66
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00016108786361015143,
+      "loss": 1.9662,
+      "step": 67
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00015981105304912162,
+      "loss": 2.1796,
+      "step": 68
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00015851887652614237,
+      "loss": 2.1684,
+      "step": 69
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00015721166601221698,
+      "loss": 2.0407,
+      "step": 70
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00015588975734070717,
+      "loss": 1.9925,
+      "step": 71
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00015455349012105486,
+      "loss": 1.9737,
+      "step": 72
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00015320320765153367,
+      "loss": 1.9839,
+      "step": 73
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00015183925683105254,
+      "loss": 2.0463,
+      "step": 74
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0001504619880700346,
+      "loss": 1.9362,
+      "step": 75
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001490717552003938,
+      "loss": 1.9806,
+      "step": 76
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.00014766891538463254,
+      "loss": 2.0698,
+      "step": 77
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00014625382902408356,
+      "loss": 2.1027,
+      "step": 78
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0001448268596663197,
+      "loss": 2.1428,
+      "step": 79
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00014338837391175582,
+      "loss": 1.9171,
+      "step": 80
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001419387413194657,
+      "loss": 2.0363,
+      "step": 81
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00014047833431223938,
+      "loss": 2.1144,
+      "step": 82
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00013900752808090468,
+      "loss": 2.0149,
+      "step": 83
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00013752670048793744,
+      "loss": 1.9503,
+      "step": 84
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00013603623197038536,
+      "loss": 1.9914,
+      "step": 85
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00013453650544213076,
+      "loss": 1.9256,
+      "step": 86
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00013302790619551674,
+      "loss": 1.9014,
+      "step": 87
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0001315108218023621,
+      "loss": 1.8,
+      "step": 88
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00012998564201439116,
+      "loss": 1.9711,
+      "step": 89
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00012845275866310324,
+      "loss": 1.9172,
+      "step": 90
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00012691256555910768,
+      "loss": 1.8175,
+      "step": 91
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00012536545839095074,
+      "loss": 1.8827,
+      "step": 92
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00012381183462345982,
+      "loss": 1.8027,
+      "step": 93
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00012225209339563145,
+      "loss": 2.0469,
+      "step": 94
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00012068663541808909,
+      "loss": 1.9651,
+      "step": 95
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00011911586287013725,
+      "loss": 1.8538,
+      "step": 96
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00011754017929643817,
+      "loss": 1.8906,
+      "step": 97
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011595998950333793,
+      "loss": 1.9807,
+      "step": 98
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00011437569945486819,
+      "loss": 1.8098,
+      "step": 99
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.00011278771616845061,
+      "loss": 1.9236,
+      "step": 100
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011119644761033078,
+      "loss": 1.9111,
+      "step": 101
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00010960230259076818,
+      "loss": 2.0487,
+      "step": 102
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00010800569065900933,
+      "loss": 2.025,
+      "step": 103
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0001064070219980713,
+      "loss": 1.833,
+      "step": 104
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.00010480670731936208,
+      "loss": 1.886,
+      "step": 105
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00010320515775716555,
+      "loss": 1.8777,
+      "step": 106
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0001016027847630174,
+      "loss": 1.6924,
+      "step": 107
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0001,
+      "loss": 1.673,
+      "step": 108
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.234375,
+      "learning_rate": 9.839721523698264e-05,
+      "loss": 1.9082,
+      "step": 109
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.92578125,
+      "learning_rate": 9.679484224283449e-05,
+      "loss": 1.8195,
+      "step": 110
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.0390625,
+      "learning_rate": 9.519329268063795e-05,
+      "loss": 1.6165,
+      "step": 111
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.85546875,
+      "learning_rate": 9.359297800192872e-05,
+      "loss": 1.7139,
+      "step": 112
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.83203125,
+      "learning_rate": 9.199430934099068e-05,
+      "loss": 1.6608,
+      "step": 113
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.79296875,
+      "learning_rate": 9.039769740923183e-05,
+      "loss": 1.8531,
+      "step": 114
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.83203125,
+      "learning_rate": 8.880355238966923e-05,
+      "loss": 1.7978,
+      "step": 115
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.015625,
+      "learning_rate": 8.721228383154939e-05,
+      "loss": 1.5377,
+      "step": 116
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.078125,
+      "learning_rate": 8.562430054513184e-05,
+      "loss": 1.7113,
+      "step": 117
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.53125,
+      "learning_rate": 8.404001049666211e-05,
+      "loss": 1.681,
+      "step": 118
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.96875,
+      "learning_rate": 8.245982070356185e-05,
+      "loss": 1.5415,
+      "step": 119
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.94140625,
+      "learning_rate": 8.08841371298628e-05,
+      "loss": 1.6291,
+      "step": 120
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.9921875,
+      "learning_rate": 7.931336458191092e-05,
+      "loss": 1.6358,
+      "step": 121
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.1015625,
+      "learning_rate": 7.774790660436858e-05,
+      "loss": 1.7496,
+      "step": 122
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.0859375,
+      "learning_rate": 7.618816537654018e-05,
+      "loss": 1.8027,
+      "step": 123
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.9140625,
+      "learning_rate": 7.463454160904928e-05,
+      "loss": 1.7852,
+      "step": 124
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.91015625,
+      "learning_rate": 7.308743444089232e-05,
+      "loss": 1.6982,
+      "step": 125
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.98046875,
+      "learning_rate": 7.154724133689677e-05,
+      "loss": 1.8611,
+      "step": 126
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.85546875,
+      "learning_rate": 7.001435798560883e-05,
+      "loss": 1.5955,
+      "step": 127
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.84765625,
+      "learning_rate": 6.848917819763793e-05,
+      "loss": 1.7338,
+      "step": 128
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.697209380448333e-05,
+      "loss": 1.6586,
+      "step": 129
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.8671875,
+      "learning_rate": 6.546349455786926e-05,
+      "loss": 1.6435,
+      "step": 130
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.953125,
+      "learning_rate": 6.396376802961468e-05,
+      "loss": 1.7111,
+      "step": 131
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.1015625,
+      "learning_rate": 6.24732995120626e-05,
+      "loss": 1.5686,
+      "step": 132
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.9453125,
+      "learning_rate": 6.0992471919095315e-05,
+      "loss": 1.7383,
+      "step": 133
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.9375,
+      "learning_rate": 5.952166568776062e-05,
+      "loss": 1.7647,
+      "step": 134
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.99609375,
+      "learning_rate": 5.806125868053433e-05,
+      "loss": 1.8344,
+      "step": 135
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.984375,
+      "learning_rate": 5.6611626088244194e-05,
+      "loss": 1.8154,
+      "step": 136
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.1953125,
+      "learning_rate": 5.5173140333680306e-05,
+      "loss": 1.5994,
+      "step": 137
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.37461709759165e-05,
+      "loss": 1.6497,
+      "step": 138
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.9296875,
+      "learning_rate": 5.2331084615367485e-05,
+      "loss": 1.8425,
+      "step": 139
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.078125,
+      "learning_rate": 5.092824479960625e-05,
+      "loss": 1.8534,
+      "step": 140
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.03125,
+      "learning_rate": 4.953801192996543e-05,
+      "loss": 1.6864,
+      "step": 141
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.9765625,
+      "learning_rate": 4.8160743168947496e-05,
+      "loss": 1.6976,
+      "step": 142
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.0625,
+      "learning_rate": 4.6796792348466356e-05,
+      "loss": 1.7416,
+      "step": 143
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.91015625,
+      "learning_rate": 4.544650987894514e-05,
+      "loss": 1.485,
+      "step": 144
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.93359375,
+      "learning_rate": 4.4110242659292836e-05,
+      "loss": 1.6244,
+      "step": 145
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 1.0,
+      "learning_rate": 4.278833398778306e-05,
+      "loss": 1.6941,
+      "step": 146
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.0390625,
+      "learning_rate": 4.148112347385762e-05,
+      "loss": 1.7538,
+      "step": 147
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.9609375,
+      "learning_rate": 4.0188946950878404e-05,
+      "loss": 1.7124,
+      "step": 148
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.046875,
+      "learning_rate": 3.8912136389848576e-05,
+      "loss": 1.668,
+      "step": 149
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.7651019814126654e-05,
+      "loss": 1.6419,
+      "step": 150
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.6405921215154494e-05,
+      "loss": 1.6867,
+      "step": 151
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.99609375,
+      "learning_rate": 3.517716046922118e-05,
+      "loss": 1.8303,
+      "step": 152
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.9921875,
+      "learning_rate": 3.3965053255284084e-05,
+      "loss": 1.6061,
+      "step": 153
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.0078125,
+      "learning_rate": 3.276991097386831e-05,
+      "loss": 1.5528,
+      "step": 154
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.2109375,
+      "learning_rate": 3.159204066706539e-05,
+      "loss": 1.6806,
+      "step": 155
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.0431744939651364e-05,
+      "loss": 1.7271,
+      "step": 156
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 1.0625,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 1.6939,
+      "step": 157
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.1875,
+      "learning_rate": 2.8165064990227252e-05,
+      "loss": 1.9148,
+      "step": 158
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.0078125,
+      "learning_rate": 2.7059263097336597e-05,
+      "loss": 1.5374,
+      "step": 159
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.2265625,
+      "learning_rate": 2.5972200292468464e-05,
+      "loss": 1.6695,
+      "step": 160
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.96875,
+      "learning_rate": 2.4904155851188872e-05,
+      "loss": 1.6876,
+      "step": 161
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.3855404163086558e-05,
+      "loss": 1.731,
+      "step": 162
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.0625,
+      "learning_rate": 2.282621466127982e-05,
+      "loss": 1.5561,
+      "step": 163
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.171875,
+      "learning_rate": 2.181685175319702e-05,
+      "loss": 1.7871,
+      "step": 164
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.25,
+      "learning_rate": 2.0827574752648038e-05,
+      "loss": 1.7695,
+      "step": 165
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.03125,
+      "learning_rate": 1.985863781320435e-05,
+      "loss": 1.6149,
+      "step": 166
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.891028986290492e-05,
+      "loss": 1.5127,
+      "step": 167
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.7982774540304403e-05,
+      "loss": 1.757,
+      "step": 168
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.7076330131880526e-05,
+      "loss": 1.5873,
+      "step": 169
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.619118951081594e-05,
+      "loss": 1.6649,
+      "step": 170
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.5327580077171587e-05,
+      "loss": 1.7183,
+      "step": 171
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.0,
+      "learning_rate": 1.4485723699465392e-05,
+      "loss": 1.7235,
+      "step": 172
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.3665836657672493e-05,
+      "loss": 1.758,
+      "step": 173
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.286812958766106e-05,
+      "loss": 1.8054,
+      "step": 174
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.95703125,
+      "learning_rate": 1.2092807427078279e-05,
+      "loss": 1.7808,
+      "step": 175
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.134006936269999e-05,
+      "loss": 1.7116,
+      "step": 176
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.98828125,
+      "learning_rate": 1.0610108779258044e-05,
+      "loss": 1.6665,
+      "step": 177
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.9921875,
+      "learning_rate": 9.903113209758096e-06,
+      "loss": 1.6648,
+      "step": 178
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.1171875,
+      "learning_rate": 9.219264287300799e-06,
+      "loss": 1.6068,
+      "step": 179
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.0625,
+      "learning_rate": 8.558737698418761e-06,
+      "loss": 1.6528,
+      "step": 180
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.0234375,
+      "learning_rate": 7.921703137941173e-06,
+      "loss": 1.6026,
+      "step": 181
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.984375,
+      "learning_rate": 7.308324265397836e-06,
+      "loss": 1.7228,
+      "step": 182
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.234375,
+      "learning_rate": 6.718758662973523e-06,
+      "loss": 1.7112,
+      "step": 183
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.96875,
+      "learning_rate": 6.153157795023956e-06,
+      "loss": 1.6732,
+      "step": 184
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.611666969163243e-06,
+      "loss": 1.6743,
+      "step": 185
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.1015625,
+      "learning_rate": 5.094425298933136e-06,
+      "loss": 1.5847,
+      "step": 186
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.21875,
+      "learning_rate": 4.601565668063623e-06,
+      "loss": 1.6576,
+      "step": 187
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.984375,
+      "learning_rate": 4.133214696333942e-06,
+      "loss": 1.6032,
+      "step": 188
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.0390625,
+      "learning_rate": 3.689492707042974e-06,
+      "loss": 1.634,
+      "step": 189
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.0234375,
+      "learning_rate": 3.270513696097055e-06,
+      "loss": 1.6969,
+      "step": 190
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.876385302723628e-06,
+      "loss": 1.6449,
+      "step": 191
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.96484375,
+      "learning_rate": 2.5072087818176382e-06,
+      "loss": 1.4763,
+      "step": 192
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.96875,
+      "learning_rate": 2.1630789779284675e-06,
+      "loss": 1.719,
+      "step": 193
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.8440843008934561e-06,
+      "loss": 1.8517,
+      "step": 194
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.9296875,
+      "learning_rate": 1.5503067031247598e-06,
+      "loss": 1.5597,
+      "step": 195
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.2818216585549825e-06,
+      "loss": 1.7673,
+      "step": 196
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.125,
+      "learning_rate": 1.0386981432474074e-06,
+      "loss": 1.6707,
+      "step": 197
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.9453125,
+      "learning_rate": 8.209986176753948e-07,
+      "loss": 1.6395,
+      "step": 198
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.1875,
+      "learning_rate": 6.287790106757396e-07,
+      "loss": 1.5912,
+      "step": 199
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 1.0234375,
+      "learning_rate": 4.62088705080177e-07,
+      "loss": 1.5877,
+      "step": 200
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.94140625,
+      "learning_rate": 3.2097052502843007e-07,
+      "loss": 1.5594,
+      "step": 201
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.3125,
+      "learning_rate": 2.054607249663665e-07,
+      "loss": 1.6429,
+      "step": 202
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.078125,
+      "learning_rate": 1.1558898033191546e-07,
+      "loss": 1.6142,
+      "step": 203
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.0546875,
+      "learning_rate": 5.137837993121064e-08,
+      "loss": 1.6624,
+      "step": 204
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.9765625,
+      "learning_rate": 1.2845420006879494e-08,
+      "loss": 1.5157,
+      "step": 205
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0,
+      "loss": 1.6079,
+      "step": 206
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 206,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 103,
+  "total_flos": 5.827761978432553e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:05285fd2f0f92792a9921a780968c1959edd9be78eac25715595f43216594e50
+size 5624