Helw150
committed on
Commit
•
ae8d8b0
0
Parent(s):
Restore
Browse files
- .gitattributes +39 -0
- .gitignore +1 -0
- README.md +77 -0
- added_tokens.json +60 -0
- all_results.json +20 -0
- config.json +32 -0
- eval_results.json +9 -0
- generated_predictions.txt +0 -0
- predict_results.json +8 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +5 -0
- spiece.model +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +11 -0
- train_results.json +8 -0
- trainer_state.json +250 -0
- training_args.bin +3 -0
.gitattributes
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
36 |
+
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
37 |
+
spiece.model filter=lfs diff=lfs merge=lfs -text
|
38 |
+
tokenizer filter=lfs diff=lfs merge=lfs -text
|
39 |
+
training_args.bin filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
1 |
+
checkpoint-*/
|
README.md
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- en
|
4 |
+
license: apache-2.0
|
5 |
+
tags:
|
6 |
+
- generated_from_trainer
|
7 |
+
datasets:
|
8 |
+
- cstop_artificial
|
9 |
+
model-index:
|
10 |
+
- name: t5-base-pointer-adv-cstop_artificial
|
11 |
+
results: []
|
12 |
+
---
|
13 |
+
|
14 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
15 |
+
should probably proofread and complete it, then remove this comment. -->
|
16 |
+
|
17 |
+
# t5-base-pointer-adv-cstop_artificial
|
18 |
+
|
19 |
+
This model is a fine-tuned version of [google/mt5-base](https://huggingface.co/google/mt5-base) on the cstop_artificial dataset.
|
20 |
+
It achieves the following results on the evaluation set:
|
21 |
+
- Loss: 0.0728
|
22 |
+
- Exact Match: 0.7925
|
23 |
+
|
24 |
+
## Model description
|
25 |
+
|
26 |
+
More information needed
|
27 |
+
|
28 |
+
## Intended uses & limitations
|
29 |
+
|
30 |
+
More information needed
|
31 |
+
|
32 |
+
## Training and evaluation data
|
33 |
+
|
34 |
+
More information needed
|
35 |
+
|
36 |
+
## Training procedure
|
37 |
+
|
38 |
+
### Training hyperparameters
|
39 |
+
|
40 |
+
The following hyperparameters were used during training:
|
41 |
+
- learning_rate: 0.001
|
42 |
+
- train_batch_size: 8
|
43 |
+
- eval_batch_size: 8
|
44 |
+
- seed: 42
|
45 |
+
- gradient_accumulation_steps: 64
|
46 |
+
- total_train_batch_size: 512
|
47 |
+
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
48 |
+
- lr_scheduler_type: linear
|
49 |
+
- training_steps: 3000
|
50 |
+
|
51 |
+
### Training results
|
52 |
+
|
53 |
+
| Training Loss | Epoch | Step | Validation Loss | Exact Match |
|
54 |
+
|:-------------:|:-----:|:----:|:---------------:|:-----------:|
|
55 |
+
| 1.7423 | 12.5 | 200 | 0.1173 | 0.2397 |
|
56 |
+
| 0.3678 | 25.0 | 400 | 0.0728 | 0.3363 |
|
57 |
+
| 0.3202 | 37.5 | 600 | 0.0879 | 0.3381 |
|
58 |
+
| 0.3452 | 50.0 | 800 | 0.0908 | 0.3363 |
|
59 |
+
| 0.3099 | 62.5 | 1000 | 0.1056 | 0.3435 |
|
60 |
+
| 0.3057 | 75.0 | 1200 | 0.1109 | 0.3470 |
|
61 |
+
| 0.3045 | 87.5 | 1400 | 0.1273 | 0.3453 |
|
62 |
+
| 0.3052 | 100.0 | 1600 | 0.1065 | 0.3417 |
|
63 |
+
| 0.3037 | 112.5 | 1800 | 0.1387 | 0.3381 |
|
64 |
+
| 0.3036 | 125.0 | 2000 | 0.1421 | 0.3453 |
|
65 |
+
| 0.3023 | 137.5 | 2200 | 0.1649 | 0.3399 |
|
66 |
+
| 0.3028 | 150.0 | 2400 | 0.1574 | 0.3399 |
|
67 |
+
| 0.3025 | 162.5 | 2600 | 0.1563 | 0.3399 |
|
68 |
+
| 0.3017 | 175.0 | 2800 | 0.1589 | 0.3399 |
|
69 |
+
| 0.302 | 187.5 | 3000 | 0.1587 | 0.3417 |
|
70 |
+
|
71 |
+
|
72 |
+
### Framework versions
|
73 |
+
|
74 |
+
- Transformers 4.24.0
|
75 |
+
- PyTorch 1.13.0+cu117
|
76 |
+
- Datasets 2.7.0
|
77 |
+
- Tokenizers 0.13.2
|
added_tokens.json
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<pt-0>": 250131,
|
3 |
+
"<pt-10>": 250103,
|
4 |
+
"<pt-11>": 250108,
|
5 |
+
"<pt-12>": 250146,
|
6 |
+
"<pt-13>": 250135,
|
7 |
+
"<pt-14>": 250139,
|
8 |
+
"<pt-15>": 250147,
|
9 |
+
"<pt-16>": 250143,
|
10 |
+
"<pt-17>": 250155,
|
11 |
+
"<pt-18>": 250156,
|
12 |
+
"<pt-19>": 250157,
|
13 |
+
"<pt-1>": 250129,
|
14 |
+
"<pt-2>": 250114,
|
15 |
+
"<pt-3>": 250116,
|
16 |
+
"<pt-4>": 250106,
|
17 |
+
"<pt-5>": 250101,
|
18 |
+
"<pt-6>": 250100,
|
19 |
+
"<pt-7>": 250119,
|
20 |
+
"<pt-8>": 250117,
|
21 |
+
"<pt-9>": 250121,
|
22 |
+
"[IN:CAMERA_FOLLOW": 250104,
|
23 |
+
"[IN:CAMERA_STOP_FOLLOWING": 250107,
|
24 |
+
"[IN:CLOSE_RESOURCE": 250109,
|
25 |
+
"[IN:DECREASE_BRIGHTNESS": 250124,
|
26 |
+
"[IN:DECREASE_VOLUME": 250145,
|
27 |
+
"[IN:GET_INFO_CONTACT": 250152,
|
28 |
+
"[IN:GET_WEATHER": 250141,
|
29 |
+
"[IN:INCREASE_BRIGHTNESS": 250115,
|
30 |
+
"[IN:INCREASE_VOLUME": 250144,
|
31 |
+
"[IN:MAXIMIZE_VOLUME": 250136,
|
32 |
+
"[IN:MUTE_VOLUME": 250128,
|
33 |
+
"[IN:OPEN_HOMESCREEN": 250111,
|
34 |
+
"[IN:OPEN_RESOURCE": 250102,
|
35 |
+
"[IN:SET_BRIGHTNESS": 250127,
|
36 |
+
"[IN:SET_VOLUME": 250150,
|
37 |
+
"[IN:SLEEP": 250130,
|
38 |
+
"[IN:TURN_OFF": 250105,
|
39 |
+
"[IN:TURN_ON": 250110,
|
40 |
+
"[IN:UNMUTE_VOLUME": 250112,
|
41 |
+
"[IN:UNSUPPORTED_DEVICE": 250137,
|
42 |
+
"[IN:UNSUPPORTED_PEOPLE": 250149,
|
43 |
+
"[IN:UNSUPPORTED_WEATHER": 250142,
|
44 |
+
"[IN:WAKE_UP": 250133,
|
45 |
+
"[IN:ZOOM_IN": 250120,
|
46 |
+
"[IN:ZOOM_OUT": 250126,
|
47 |
+
"[SL:ABSTRACT_AMOUNT": 250132,
|
48 |
+
"[SL:COMPONENT": 250113,
|
49 |
+
"[SL:CONTACT": 250154,
|
50 |
+
"[SL:CONTACT_RELATED": 250148,
|
51 |
+
"[SL:DATE_TIME": 250151,
|
52 |
+
"[SL:LOCATION": 250138,
|
53 |
+
"[SL:PERCENT": 250125,
|
54 |
+
"[SL:PRECISE_AMOUNT": 250118,
|
55 |
+
"[SL:RESOURCE": 250123,
|
56 |
+
"[SL:RESOURCE_TYPE": 250122,
|
57 |
+
"[SL:TYPE_RELATION": 250153,
|
58 |
+
"[SL:WEATHER_ATTRIBUTE": 250140,
|
59 |
+
"[SL:WEATHER_TEMPERATURE_UNIT": 250134
|
60 |
+
}
|
all_results.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 187.5,
|
3 |
+
"eval_exact_match": 0.7924865831842576,
|
4 |
+
"eval_loss": 0.0727909728884697,
|
5 |
+
"eval_runtime": 45.8385,
|
6 |
+
"eval_samples": 559,
|
7 |
+
"eval_samples_per_second": 12.195,
|
8 |
+
"eval_steps_per_second": 1.527,
|
9 |
+
"predict_exact_match": 0.8063410454155956,
|
10 |
+
"predict_loss": 0.07482758909463882,
|
11 |
+
"predict_runtime": 97.1755,
|
12 |
+
"predict_samples": 1167,
|
13 |
+
"predict_samples_per_second": 12.009,
|
14 |
+
"predict_steps_per_second": 1.502,
|
15 |
+
"train_loss": 0.4079610900878906,
|
16 |
+
"train_runtime": 34544.7098,
|
17 |
+
"train_samples": 8224,
|
18 |
+
"train_samples_per_second": 44.464,
|
19 |
+
"train_steps_per_second": 0.087
|
20 |
+
}
|
config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "google/mt5-base",
|
3 |
+
"architectures": [
|
4 |
+
"AlignedMT5ForConditionalGeneration"
|
5 |
+
],
|
6 |
+
"d_ff": 2048,
|
7 |
+
"d_kv": 64,
|
8 |
+
"d_model": 768,
|
9 |
+
"decoder_start_token_id": 0,
|
10 |
+
"dense_act_fn": "gelu_new",
|
11 |
+
"dropout_rate": 0.1,
|
12 |
+
"eos_token_id": 1,
|
13 |
+
"feed_forward_proj": "gated-gelu",
|
14 |
+
"initializer_factor": 1.0,
|
15 |
+
"is_encoder_decoder": true,
|
16 |
+
"is_gated_act": true,
|
17 |
+
"layer_norm_epsilon": 1e-06,
|
18 |
+
"model_type": "mt5",
|
19 |
+
"num_decoder_layers": 12,
|
20 |
+
"num_heads": 12,
|
21 |
+
"num_layers": 12,
|
22 |
+
"output_past": true,
|
23 |
+
"pad_token_id": 0,
|
24 |
+
"relative_attention_max_distance": 128,
|
25 |
+
"relative_attention_num_buckets": 32,
|
26 |
+
"tie_word_embeddings": false,
|
27 |
+
"tokenizer_class": "T5Tokenizer",
|
28 |
+
"torch_dtype": "float32",
|
29 |
+
"transformers_version": "4.24.0",
|
30 |
+
"use_cache": true,
|
31 |
+
"vocab_size": 250158
|
32 |
+
}
|
eval_results.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 187.5,
|
3 |
+
"eval_exact_match": 0.7924865831842576,
|
4 |
+
"eval_loss": 0.0727909728884697,
|
5 |
+
"eval_runtime": 45.8385,
|
6 |
+
"eval_samples": 559,
|
7 |
+
"eval_samples_per_second": 12.195,
|
8 |
+
"eval_steps_per_second": 1.527
|
9 |
+
}
|
generated_predictions.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
predict_results.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"predict_exact_match": 0.8063410454155956,
|
3 |
+
"predict_loss": 0.07482758909463882,
|
4 |
+
"predict_runtime": 97.1755,
|
5 |
+
"predict_samples": 1167,
|
6 |
+
"predict_samples_per_second": 12.009,
|
7 |
+
"predict_steps_per_second": 1.502
|
8 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b69347ce660a344f2233d6fca17edd3abaebdd3df1ddaf57c3cb5e6f1c028e7f
|
3 |
+
size 2332353369
|
special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eos_token": "</s>",
|
3 |
+
"pad_token": "<pad>",
|
4 |
+
"unk_token": "<unk>"
|
5 |
+
}
|
spiece.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
|
3 |
+
size 4309802
|
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9929b4c5b9acf43dac7a509446035e5dd7ad4b416483deb07b3067e47a5b81e2
|
3 |
+
size 16341505
|
tokenizer_config.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": null,
|
3 |
+
"eos_token": "</s>",
|
4 |
+
"extra_ids": 0,
|
5 |
+
"name_or_path": "google/mt5-base",
|
6 |
+
"pad_token": "<pad>",
|
7 |
+
"sp_model_kwargs": {},
|
8 |
+
"special_tokens_map_file": "/home/patrick/.cache/torch/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276",
|
9 |
+
"tokenizer_class": "T5Tokenizer",
|
10 |
+
"unk_token": "<unk>"
|
11 |
+
}
|
train_results.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 187.5,
|
3 |
+
"train_loss": 0.4079610900878906,
|
4 |
+
"train_runtime": 34544.7098,
|
5 |
+
"train_samples": 8224,
|
6 |
+
"train_samples_per_second": 44.464,
|
7 |
+
"train_steps_per_second": 0.087
|
8 |
+
}
|
trainer_state.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.0727909728884697,
|
3 |
+
"best_model_checkpoint": "/data/wheld3/mt5-base-pointer-adv-cstop_artificial/checkpoint-400",
|
4 |
+
"epoch": 187.49805447470817,
|
5 |
+
"global_step": 3000,
|
6 |
+
"is_hyper_param_search": false,
|
7 |
+
"is_local_process_zero": true,
|
8 |
+
"is_world_process_zero": true,
|
9 |
+
"log_history": [
|
10 |
+
{
|
11 |
+
"epoch": 12.5,
|
12 |
+
"learning_rate": 0.0009333333333333333,
|
13 |
+
"loss": 1.7423,
|
14 |
+
"step": 200
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"epoch": 12.5,
|
18 |
+
"eval_exact_match": 0.23971377459749552,
|
19 |
+
"eval_loss": 0.11730749905109406,
|
20 |
+
"eval_runtime": 27.8131,
|
21 |
+
"eval_samples_per_second": 20.098,
|
22 |
+
"eval_steps_per_second": 2.517,
|
23 |
+
"step": 200
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 25.0,
|
27 |
+
"learning_rate": 0.0008666666666666667,
|
28 |
+
"loss": 0.3678,
|
29 |
+
"step": 400
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"epoch": 25.0,
|
33 |
+
"eval_exact_match": 0.3363148479427549,
|
34 |
+
"eval_loss": 0.0727909728884697,
|
35 |
+
"eval_runtime": 29.0466,
|
36 |
+
"eval_samples_per_second": 19.245,
|
37 |
+
"eval_steps_per_second": 2.41,
|
38 |
+
"step": 400
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"epoch": 37.5,
|
42 |
+
"learning_rate": 0.0008,
|
43 |
+
"loss": 0.3202,
|
44 |
+
"step": 600
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 37.5,
|
48 |
+
"eval_exact_match": 0.33810375670840787,
|
49 |
+
"eval_loss": 0.08794570714235306,
|
50 |
+
"eval_runtime": 29.1974,
|
51 |
+
"eval_samples_per_second": 19.146,
|
52 |
+
"eval_steps_per_second": 2.397,
|
53 |
+
"step": 600
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"epoch": 50.0,
|
57 |
+
"learning_rate": 0.0007333333333333333,
|
58 |
+
"loss": 0.3452,
|
59 |
+
"step": 800
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"epoch": 50.0,
|
63 |
+
"eval_exact_match": 0.3363148479427549,
|
64 |
+
"eval_loss": 0.09075574576854706,
|
65 |
+
"eval_runtime": 28.6647,
|
66 |
+
"eval_samples_per_second": 19.501,
|
67 |
+
"eval_steps_per_second": 2.442,
|
68 |
+
"step": 800
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"epoch": 62.5,
|
72 |
+
"learning_rate": 0.0006666666666666666,
|
73 |
+
"loss": 0.3099,
|
74 |
+
"step": 1000
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"epoch": 62.5,
|
78 |
+
"eval_exact_match": 0.3434704830053667,
|
79 |
+
"eval_loss": 0.10556001961231232,
|
80 |
+
"eval_runtime": 28.9715,
|
81 |
+
"eval_samples_per_second": 19.295,
|
82 |
+
"eval_steps_per_second": 2.416,
|
83 |
+
"step": 1000
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"epoch": 75.0,
|
87 |
+
"learning_rate": 0.0006,
|
88 |
+
"loss": 0.3057,
|
89 |
+
"step": 1200
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"epoch": 75.0,
|
93 |
+
"eval_exact_match": 0.3470483005366726,
|
94 |
+
"eval_loss": 0.11086518317461014,
|
95 |
+
"eval_runtime": 29.2047,
|
96 |
+
"eval_samples_per_second": 19.141,
|
97 |
+
"eval_steps_per_second": 2.397,
|
98 |
+
"step": 1200
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"epoch": 87.5,
|
102 |
+
"learning_rate": 0.0005333333333333334,
|
103 |
+
"loss": 0.3045,
|
104 |
+
"step": 1400
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"epoch": 87.5,
|
108 |
+
"eval_exact_match": 0.34525939177101966,
|
109 |
+
"eval_loss": 0.1273432970046997,
|
110 |
+
"eval_runtime": 29.0031,
|
111 |
+
"eval_samples_per_second": 19.274,
|
112 |
+
"eval_steps_per_second": 2.414,
|
113 |
+
"step": 1400
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"epoch": 100.0,
|
117 |
+
"learning_rate": 0.00046666666666666666,
|
118 |
+
"loss": 0.3052,
|
119 |
+
"step": 1600
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"epoch": 100.0,
|
123 |
+
"eval_exact_match": 0.3416815742397138,
|
124 |
+
"eval_loss": 0.10654404759407043,
|
125 |
+
"eval_runtime": 28.7998,
|
126 |
+
"eval_samples_per_second": 19.41,
|
127 |
+
"eval_steps_per_second": 2.431,
|
128 |
+
"step": 1600
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 112.5,
|
132 |
+
"learning_rate": 0.0004,
|
133 |
+
"loss": 0.3037,
|
134 |
+
"step": 1800
|
135 |
+
},
|
136 |
+
{
|
137 |
+
"epoch": 112.5,
|
138 |
+
"eval_exact_match": 0.33810375670840787,
|
139 |
+
"eval_loss": 0.13873372972011566,
|
140 |
+
"eval_runtime": 29.2765,
|
141 |
+
"eval_samples_per_second": 19.094,
|
142 |
+
"eval_steps_per_second": 2.391,
|
143 |
+
"step": 1800
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"epoch": 125.0,
|
147 |
+
"learning_rate": 0.0003333333333333333,
|
148 |
+
"loss": 0.3036,
|
149 |
+
"step": 2000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 125.0,
|
153 |
+
"eval_exact_match": 0.34525939177101966,
|
154 |
+
"eval_loss": 0.1421414017677307,
|
155 |
+
"eval_runtime": 28.8117,
|
156 |
+
"eval_samples_per_second": 19.402,
|
157 |
+
"eval_steps_per_second": 2.43,
|
158 |
+
"step": 2000
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"epoch": 137.5,
|
162 |
+
"learning_rate": 0.0002666666666666667,
|
163 |
+
"loss": 0.3023,
|
164 |
+
"step": 2200
|
165 |
+
},
|
166 |
+
{
|
167 |
+
"epoch": 137.5,
|
168 |
+
"eval_exact_match": 0.33989266547406083,
|
169 |
+
"eval_loss": 0.16489343345165253,
|
170 |
+
"eval_runtime": 30.9662,
|
171 |
+
"eval_samples_per_second": 18.052,
|
172 |
+
"eval_steps_per_second": 2.261,
|
173 |
+
"step": 2200
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"epoch": 150.0,
|
177 |
+
"learning_rate": 0.0002,
|
178 |
+
"loss": 0.3028,
|
179 |
+
"step": 2400
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"epoch": 150.0,
|
183 |
+
"eval_exact_match": 0.33989266547406083,
|
184 |
+
"eval_loss": 0.1573849618434906,
|
185 |
+
"eval_runtime": 29.0042,
|
186 |
+
"eval_samples_per_second": 19.273,
|
187 |
+
"eval_steps_per_second": 2.413,
|
188 |
+
"step": 2400
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"epoch": 162.5,
|
192 |
+
"learning_rate": 0.00013333333333333334,
|
193 |
+
"loss": 0.3025,
|
194 |
+
"step": 2600
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"epoch": 162.5,
|
198 |
+
"eval_exact_match": 0.33989266547406083,
|
199 |
+
"eval_loss": 0.15625949203968048,
|
200 |
+
"eval_runtime": 28.8424,
|
201 |
+
"eval_samples_per_second": 19.381,
|
202 |
+
"eval_steps_per_second": 2.427,
|
203 |
+
"step": 2600
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"epoch": 175.0,
|
207 |
+
"learning_rate": 6.666666666666667e-05,
|
208 |
+
"loss": 0.3017,
|
209 |
+
"step": 2800
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"epoch": 175.0,
|
213 |
+
"eval_exact_match": 0.33989266547406083,
|
214 |
+
"eval_loss": 0.1589040458202362,
|
215 |
+
"eval_runtime": 28.7706,
|
216 |
+
"eval_samples_per_second": 19.43,
|
217 |
+
"eval_steps_per_second": 2.433,
|
218 |
+
"step": 2800
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"epoch": 187.5,
|
222 |
+
"learning_rate": 0.0,
|
223 |
+
"loss": 0.302,
|
224 |
+
"step": 3000
|
225 |
+
},
|
226 |
+
{
|
227 |
+
"epoch": 187.5,
|
228 |
+
"eval_exact_match": 0.3416815742397138,
|
229 |
+
"eval_loss": 0.15874968469142914,
|
230 |
+
"eval_runtime": 33.2641,
|
231 |
+
"eval_samples_per_second": 16.805,
|
232 |
+
"eval_steps_per_second": 2.104,
|
233 |
+
"step": 3000
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 187.5,
|
237 |
+
"step": 3000,
|
238 |
+
"total_flos": 6.474719775139762e+16,
|
239 |
+
"train_loss": 0.4079610900878906,
|
240 |
+
"train_runtime": 34544.7098,
|
241 |
+
"train_samples_per_second": 44.464,
|
242 |
+
"train_steps_per_second": 0.087
|
243 |
+
}
|
244 |
+
],
|
245 |
+
"max_steps": 3000,
|
246 |
+
"num_train_epochs": 188,
|
247 |
+
"total_flos": 6.474719775139762e+16,
|
248 |
+
"trial_name": null,
|
249 |
+
"trial_params": null
|
250 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f0f9fc79fe4d1b66c314882732d3b427b10b38f9fef2d225acf1c1a1db62712
|
3 |
+
size 3643
|