ottomoritz committed on
Commit
767733c
·
1 Parent(s): 68f4349

Uploading testfile

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +71 -0
  2. all_results.json +13 -0
  3. checkpoint-10000/config.json +21 -0
  4. checkpoint-10000/model.safetensors +3 -0
  5. checkpoint-10000/optimizer.pt +3 -0
  6. checkpoint-10000/rng_state.pth +3 -0
  7. checkpoint-10000/scheduler.pt +3 -0
  8. checkpoint-10000/special_tokens_map.json +23 -0
  9. checkpoint-10000/tokenizer.json +62 -0
  10. checkpoint-10000/tokenizer_config.json +34 -0
  11. checkpoint-10000/trainer_state.json +48 -0
  12. checkpoint-10000/training_args.bin +3 -0
  13. checkpoint-100000/config.json +21 -0
  14. checkpoint-100000/model.safetensors +3 -0
  15. checkpoint-100000/optimizer.pt +3 -0
  16. checkpoint-100000/rng_state.pth +3 -0
  17. checkpoint-100000/scheduler.pt +3 -0
  18. checkpoint-100000/special_tokens_map.json +23 -0
  19. checkpoint-100000/tokenizer.json +62 -0
  20. checkpoint-100000/tokenizer_config.json +34 -0
  21. checkpoint-100000/trainer_state.json +183 -0
  22. checkpoint-100000/training_args.bin +3 -0
  23. checkpoint-110000/config.json +21 -0
  24. checkpoint-110000/model.safetensors +3 -0
  25. checkpoint-110000/optimizer.pt +3 -0
  26. checkpoint-110000/rng_state.pth +3 -0
  27. checkpoint-110000/scheduler.pt +3 -0
  28. checkpoint-110000/special_tokens_map.json +23 -0
  29. checkpoint-110000/tokenizer.json +62 -0
  30. checkpoint-110000/tokenizer_config.json +34 -0
  31. checkpoint-110000/trainer_state.json +198 -0
  32. checkpoint-110000/training_args.bin +3 -0
  33. checkpoint-120000/config.json +21 -0
  34. checkpoint-120000/model.safetensors +3 -0
  35. checkpoint-120000/optimizer.pt +3 -0
  36. checkpoint-120000/rng_state.pth +3 -0
  37. checkpoint-120000/scheduler.pt +3 -0
  38. checkpoint-120000/special_tokens_map.json +23 -0
  39. checkpoint-120000/tokenizer.json +62 -0
  40. checkpoint-120000/tokenizer_config.json +34 -0
  41. checkpoint-120000/trainer_state.json +213 -0
  42. checkpoint-120000/training_args.bin +3 -0
  43. checkpoint-20000/config.json +21 -0
  44. checkpoint-20000/model.safetensors +3 -0
  45. checkpoint-20000/optimizer.pt +3 -0
  46. checkpoint-20000/rng_state.pth +3 -0
  47. checkpoint-20000/scheduler.pt +3 -0
  48. checkpoint-20000/special_tokens_map.json +23 -0
  49. checkpoint-20000/tokenizer.json +62 -0
  50. checkpoint-20000/tokenizer_config.json +34 -0
README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - generated_from_trainer
4
+ datasets:
5
+ - ottomoritz/TriboliumCastaneum
6
+ model-index:
7
+ - name: your_output_dir
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ # your_output_dir
15
+
16
+ This model is a fine-tuned version of [an unspecified base model](https://huggingface.co/) on the ottomoritz/TriboliumCastaneum dataset.
17
+ It achieves the following results on the evaluation set:
18
+ - Loss: nan
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 0.001
38
+ - train_batch_size: 4
39
+ - eval_batch_size: 4
40
+ - seed: 42
41
+ - distributed_type: multi-GPU
42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
43
+ - lr_scheduler_type: constant_with_warmup
44
+ - lr_scheduler_warmup_steps: 1000
45
+ - training_steps: 120000
46
+ - mixed_precision_training: Native AMP
47
+
48
+ ### Training results
49
+
50
+ | Training Loss | Epoch | Step | Validation Loss |
51
+ |:-------------:|:------:|:------:|:---------------:|
52
+ | 1.217 | 0.0833 | 10000 | nan |
53
+ | 1.1568 | 0.1667 | 20000 | nan |
54
+ | 1.1522 | 0.25 | 30000 | nan |
55
+ | 1.1443 | 0.3333 | 40000 | nan |
56
+ | 1.1404 | 0.4167 | 50000 | nan |
57
+ | 1.1329 | 0.5 | 60000 | nan |
58
+ | 1.1323 | 0.5833 | 70000 | nan |
59
+ | 1.1292 | 0.6667 | 80000 | nan |
60
+ | 1.1264 | 0.75 | 90000 | nan |
61
+ | 1.1312 | 0.8333 | 100000 | nan |
62
+ | 1.1305 | 0.9167 | 110000 | nan |
63
+ | 1.1285 | 1.0 | 120000 | nan |
64
+
65
+
66
+ ### Framework versions
67
+
68
+ - Transformers 4.41.2
69
+ - Pytorch 2.3.0+cu121
70
+ - Datasets 2.19.2
71
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_loss": NaN,
4
+ "eval_runtime": 571.893,
5
+ "eval_samples_per_second": 241.171,
6
+ "eval_steps_per_second": 60.293,
7
+ "perplexity": NaN,
8
+ "total_flos": 9.714412005674189e+16,
9
+ "train_loss": 1.1434922200520834,
10
+ "train_runtime": 17035.2815,
11
+ "train_samples_per_second": 28.177,
12
+ "train_steps_per_second": 7.044
13
+ }
checkpoint-10000/config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ConvNetForMaskedLM"
4
+ ],
5
+ "aux_features_vocab_size": 5,
6
+ "dilation_base": 2,
7
+ "dilation_cycle": 6,
8
+ "dilation_double_every": 1,
9
+ "dilation_max": 32,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "kernel_size": 9,
15
+ "model_type": "ConvNet",
16
+ "n_aux_features": 0,
17
+ "n_layers": 25,
18
+ "torch_dtype": "float32",
19
+ "transformers_version": "4.41.2",
20
+ "vocab_size": 7
21
+ }
checkpoint-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99c73170894e6f908793e7d8092ac085344046391e384a526242a89c2b1274f5
3
+ size 263540548
checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:867b4b6c09b5edc24d783718bd9aa13d088eaceb78d285e0c4529e6711ea98f2
3
+ size 527212602
checkpoint-10000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec915ecc453dbbfdd0ab75a49b4e4d24df74862ad77a9f68dc03bfa3b2a11f70
3
+ size 14244
checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:651c17881739fb1e81c6ac9a2b03392313a3e3f597c8803b041c9a31b8c0d76c
3
+ size 1064
checkpoint-10000/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mask_token": {
3
+ "content": "[MASK]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "[PAD]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "[UNK]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
checkpoint-10000/tokenizer.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[MASK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[UNK]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": {
35
+ "type": "Lowercase"
36
+ },
37
+ "pre_tokenizer": {
38
+ "type": "Whitespace"
39
+ },
40
+ "post_processor": null,
41
+ "decoder": null,
42
+ "model": {
43
+ "type": "BPE",
44
+ "dropout": null,
45
+ "unk_token": "[UNK]",
46
+ "continuing_subword_prefix": null,
47
+ "end_of_word_suffix": null,
48
+ "fuse_unk": false,
49
+ "byte_fallback": false,
50
+ "ignore_merges": false,
51
+ "vocab": {
52
+ "[PAD]": 0,
53
+ "[MASK]": 1,
54
+ "[UNK]": 2,
55
+ "a": 3,
56
+ "c": 4,
57
+ "g": 5,
58
+ "t": 6
59
+ },
60
+ "merges": []
61
+ }
62
+ }
checkpoint-10000/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[UNK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "mask_token": "[MASK]",
30
+ "model_max_length": 1000000000000000019884624838656,
31
+ "pad_token": "[PAD]",
32
+ "tokenizer_class": "PreTrainedTokenizerFast",
33
+ "unk_token": "[UNK]"
34
+ }
checkpoint-10000/trainer_state.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.08333333333333333,
5
+ "eval_steps": 10000,
6
+ "global_step": 10000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08333333333333333,
13
+ "grad_norm": 0.6651873588562012,
14
+ "learning_rate": 0.001,
15
+ "loss": 1.217,
16
+ "step": 10000
17
+ },
18
+ {
19
+ "epoch": 0.08333333333333333,
20
+ "eval_loss": NaN,
21
+ "eval_runtime": 874.824,
22
+ "eval_samples_per_second": 157.659,
23
+ "eval_steps_per_second": 39.415,
24
+ "step": 10000
25
+ }
26
+ ],
27
+ "logging_steps": 10000,
28
+ "max_steps": 120000,
29
+ "num_input_tokens_seen": 0,
30
+ "num_train_epochs": 9223372036854775807,
31
+ "save_steps": 10000,
32
+ "stateful_callbacks": {
33
+ "TrainerControl": {
34
+ "args": {
35
+ "should_epoch_stop": false,
36
+ "should_evaluate": false,
37
+ "should_log": false,
38
+ "should_save": true,
39
+ "should_training_stop": false
40
+ },
41
+ "attributes": {}
42
+ }
43
+ },
44
+ "total_flos": 8095343338061824.0,
45
+ "train_batch_size": 4,
46
+ "trial_name": null,
47
+ "trial_params": null
48
+ }
checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04e24d240c0212150205bb434684f1d705979477cac1d7cdb25ba76821568db2
3
+ size 5112
checkpoint-100000/config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ConvNetForMaskedLM"
4
+ ],
5
+ "aux_features_vocab_size": 5,
6
+ "dilation_base": 2,
7
+ "dilation_cycle": 6,
8
+ "dilation_double_every": 1,
9
+ "dilation_max": 32,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "kernel_size": 9,
15
+ "model_type": "ConvNet",
16
+ "n_aux_features": 0,
17
+ "n_layers": 25,
18
+ "torch_dtype": "float32",
19
+ "transformers_version": "4.41.2",
20
+ "vocab_size": 7
21
+ }
checkpoint-100000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d97c12a35b705d932e7aa1aeff8c96c09c4774c700a49fadc0dd9772b8eeea7f
3
+ size 263540548
checkpoint-100000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f82bda1bf25f895adbe9376bae9508b34618f4b525151e86e7bf99f6c259698
3
+ size 527212602
checkpoint-100000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56038c7844026d477cf6aa7ee2ca7b5dfd712a7bb2ab5d1bd89d0285c1140839
3
+ size 14244
checkpoint-100000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0f585e27de5d2f3265c6a9c63d374ed84caea636d087de5c766dcffdefe1484
3
+ size 1064
checkpoint-100000/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mask_token": {
3
+ "content": "[MASK]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "[PAD]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "[UNK]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
checkpoint-100000/tokenizer.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[MASK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[UNK]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": {
35
+ "type": "Lowercase"
36
+ },
37
+ "pre_tokenizer": {
38
+ "type": "Whitespace"
39
+ },
40
+ "post_processor": null,
41
+ "decoder": null,
42
+ "model": {
43
+ "type": "BPE",
44
+ "dropout": null,
45
+ "unk_token": "[UNK]",
46
+ "continuing_subword_prefix": null,
47
+ "end_of_word_suffix": null,
48
+ "fuse_unk": false,
49
+ "byte_fallback": false,
50
+ "ignore_merges": false,
51
+ "vocab": {
52
+ "[PAD]": 0,
53
+ "[MASK]": 1,
54
+ "[UNK]": 2,
55
+ "a": 3,
56
+ "c": 4,
57
+ "g": 5,
58
+ "t": 6
59
+ },
60
+ "merges": []
61
+ }
62
+ }
checkpoint-100000/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[UNK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "mask_token": "[MASK]",
30
+ "model_max_length": 1000000000000000019884624838656,
31
+ "pad_token": "[PAD]",
32
+ "tokenizer_class": "PreTrainedTokenizerFast",
33
+ "unk_token": "[UNK]"
34
+ }
checkpoint-100000/trainer_state.json ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.8333333333333334,
5
+ "eval_steps": 10000,
6
+ "global_step": 100000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08333333333333333,
13
+ "grad_norm": 0.6651873588562012,
14
+ "learning_rate": 0.001,
15
+ "loss": 1.217,
16
+ "step": 10000
17
+ },
18
+ {
19
+ "epoch": 0.08333333333333333,
20
+ "eval_loss": NaN,
21
+ "eval_runtime": 874.824,
22
+ "eval_samples_per_second": 157.659,
23
+ "eval_steps_per_second": 39.415,
24
+ "step": 10000
25
+ },
26
+ {
27
+ "epoch": 0.16666666666666666,
28
+ "grad_norm": 0.3876087963581085,
29
+ "learning_rate": 0.001,
30
+ "loss": 1.1568,
31
+ "step": 20000
32
+ },
33
+ {
34
+ "epoch": 0.16666666666666666,
35
+ "eval_loss": NaN,
36
+ "eval_runtime": 774.5634,
37
+ "eval_samples_per_second": 178.067,
38
+ "eval_steps_per_second": 44.517,
39
+ "step": 20000
40
+ },
41
+ {
42
+ "epoch": 0.25,
43
+ "grad_norm": 0.2238207757472992,
44
+ "learning_rate": 0.001,
45
+ "loss": 1.1522,
46
+ "step": 30000
47
+ },
48
+ {
49
+ "epoch": 0.25,
50
+ "eval_loss": NaN,
51
+ "eval_runtime": 739.1972,
52
+ "eval_samples_per_second": 186.586,
53
+ "eval_steps_per_second": 46.647,
54
+ "step": 30000
55
+ },
56
+ {
57
+ "epoch": 0.3333333333333333,
58
+ "grad_norm": 0.1445535570383072,
59
+ "learning_rate": 0.001,
60
+ "loss": 1.1443,
61
+ "step": 40000
62
+ },
63
+ {
64
+ "epoch": 0.3333333333333333,
65
+ "eval_loss": NaN,
66
+ "eval_runtime": 500.0215,
67
+ "eval_samples_per_second": 275.836,
68
+ "eval_steps_per_second": 68.959,
69
+ "step": 40000
70
+ },
71
+ {
72
+ "epoch": 0.4166666666666667,
73
+ "grad_norm": 0.17618992924690247,
74
+ "learning_rate": 0.001,
75
+ "loss": 1.1404,
76
+ "step": 50000
77
+ },
78
+ {
79
+ "epoch": 0.4166666666666667,
80
+ "eval_loss": NaN,
81
+ "eval_runtime": 589.6171,
82
+ "eval_samples_per_second": 233.921,
83
+ "eval_steps_per_second": 58.48,
84
+ "step": 50000
85
+ },
86
+ {
87
+ "epoch": 0.5,
88
+ "grad_norm": 0.12298904359340668,
89
+ "learning_rate": 0.001,
90
+ "loss": 1.1329,
91
+ "step": 60000
92
+ },
93
+ {
94
+ "epoch": 0.5,
95
+ "eval_loss": NaN,
96
+ "eval_runtime": 599.1773,
97
+ "eval_samples_per_second": 230.189,
98
+ "eval_steps_per_second": 57.547,
99
+ "step": 60000
100
+ },
101
+ {
102
+ "epoch": 0.5833333333333334,
103
+ "grad_norm": 0.21368645131587982,
104
+ "learning_rate": 0.001,
105
+ "loss": 1.1323,
106
+ "step": 70000
107
+ },
108
+ {
109
+ "epoch": 0.5833333333333334,
110
+ "eval_loss": NaN,
111
+ "eval_runtime": 526.8173,
112
+ "eval_samples_per_second": 261.806,
113
+ "eval_steps_per_second": 65.452,
114
+ "step": 70000
115
+ },
116
+ {
117
+ "epoch": 0.6666666666666666,
118
+ "grad_norm": 0.15141108632087708,
119
+ "learning_rate": 0.001,
120
+ "loss": 1.1292,
121
+ "step": 80000
122
+ },
123
+ {
124
+ "epoch": 0.6666666666666666,
125
+ "eval_loss": NaN,
126
+ "eval_runtime": 542.434,
127
+ "eval_samples_per_second": 254.269,
128
+ "eval_steps_per_second": 63.567,
129
+ "step": 80000
130
+ },
131
+ {
132
+ "epoch": 0.75,
133
+ "grad_norm": 0.15512333810329437,
134
+ "learning_rate": 0.001,
135
+ "loss": 1.1264,
136
+ "step": 90000
137
+ },
138
+ {
139
+ "epoch": 0.75,
140
+ "eval_loss": NaN,
141
+ "eval_runtime": 700.6971,
142
+ "eval_samples_per_second": 196.838,
143
+ "eval_steps_per_second": 49.21,
144
+ "step": 90000
145
+ },
146
+ {
147
+ "epoch": 0.8333333333333334,
148
+ "grad_norm": 0.15970458090305328,
149
+ "learning_rate": 0.001,
150
+ "loss": 1.1312,
151
+ "step": 100000
152
+ },
153
+ {
154
+ "epoch": 0.8333333333333334,
155
+ "eval_loss": NaN,
156
+ "eval_runtime": 774.1988,
157
+ "eval_samples_per_second": 178.151,
158
+ "eval_steps_per_second": 44.538,
159
+ "step": 100000
160
+ }
161
+ ],
162
+ "logging_steps": 10000,
163
+ "max_steps": 120000,
164
+ "num_input_tokens_seen": 0,
165
+ "num_train_epochs": 9223372036854775807,
166
+ "save_steps": 10000,
167
+ "stateful_callbacks": {
168
+ "TrainerControl": {
169
+ "args": {
170
+ "should_epoch_stop": false,
171
+ "should_evaluate": false,
172
+ "should_log": false,
173
+ "should_save": true,
174
+ "should_training_stop": false
175
+ },
176
+ "attributes": {}
177
+ }
178
+ },
179
+ "total_flos": 8.095343338061824e+16,
180
+ "train_batch_size": 4,
181
+ "trial_name": null,
182
+ "trial_params": null
183
+ }
checkpoint-100000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04e24d240c0212150205bb434684f1d705979477cac1d7cdb25ba76821568db2
3
+ size 5112
checkpoint-110000/config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ConvNetForMaskedLM"
4
+ ],
5
+ "aux_features_vocab_size": 5,
6
+ "dilation_base": 2,
7
+ "dilation_cycle": 6,
8
+ "dilation_double_every": 1,
9
+ "dilation_max": 32,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "kernel_size": 9,
15
+ "model_type": "ConvNet",
16
+ "n_aux_features": 0,
17
+ "n_layers": 25,
18
+ "torch_dtype": "float32",
19
+ "transformers_version": "4.41.2",
20
+ "vocab_size": 7
21
+ }
checkpoint-110000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f709e4d698ae62bb811225a5e87798d09f5a6579d137e29ddf0d1efa2d2d033
3
+ size 263540548
checkpoint-110000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24df7f0d093ad6bd5e95f5b40158b8c0365ad766813a200557716da11b5ba7f6
3
+ size 527212602
checkpoint-110000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc5a17f58f700ac501af8e40f71184ae5e7a36fe3304bb4dac612aebb94a26c
3
+ size 14244
checkpoint-110000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9df09116e63bfdc0e09627da3aa2680596ae9b928f01740c0acedf00513cf2cb
3
+ size 1064
checkpoint-110000/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mask_token": {
3
+ "content": "[MASK]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "[PAD]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "[UNK]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
checkpoint-110000/tokenizer.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[MASK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[UNK]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": {
35
+ "type": "Lowercase"
36
+ },
37
+ "pre_tokenizer": {
38
+ "type": "Whitespace"
39
+ },
40
+ "post_processor": null,
41
+ "decoder": null,
42
+ "model": {
43
+ "type": "BPE",
44
+ "dropout": null,
45
+ "unk_token": "[UNK]",
46
+ "continuing_subword_prefix": null,
47
+ "end_of_word_suffix": null,
48
+ "fuse_unk": false,
49
+ "byte_fallback": false,
50
+ "ignore_merges": false,
51
+ "vocab": {
52
+ "[PAD]": 0,
53
+ "[MASK]": 1,
54
+ "[UNK]": 2,
55
+ "a": 3,
56
+ "c": 4,
57
+ "g": 5,
58
+ "t": 6
59
+ },
60
+ "merges": []
61
+ }
62
+ }
checkpoint-110000/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[UNK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "mask_token": "[MASK]",
30
+ "model_max_length": 1000000000000000019884624838656,
31
+ "pad_token": "[PAD]",
32
+ "tokenizer_class": "PreTrainedTokenizerFast",
33
+ "unk_token": "[UNK]"
34
+ }
checkpoint-110000/trainer_state.json ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9166666666666666,
5
+ "eval_steps": 10000,
6
+ "global_step": 110000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08333333333333333,
13
+ "grad_norm": 0.6651873588562012,
14
+ "learning_rate": 0.001,
15
+ "loss": 1.217,
16
+ "step": 10000
17
+ },
18
+ {
19
+ "epoch": 0.08333333333333333,
20
+ "eval_loss": NaN,
21
+ "eval_runtime": 874.824,
22
+ "eval_samples_per_second": 157.659,
23
+ "eval_steps_per_second": 39.415,
24
+ "step": 10000
25
+ },
26
+ {
27
+ "epoch": 0.16666666666666666,
28
+ "grad_norm": 0.3876087963581085,
29
+ "learning_rate": 0.001,
30
+ "loss": 1.1568,
31
+ "step": 20000
32
+ },
33
+ {
34
+ "epoch": 0.16666666666666666,
35
+ "eval_loss": NaN,
36
+ "eval_runtime": 774.5634,
37
+ "eval_samples_per_second": 178.067,
38
+ "eval_steps_per_second": 44.517,
39
+ "step": 20000
40
+ },
41
+ {
42
+ "epoch": 0.25,
43
+ "grad_norm": 0.2238207757472992,
44
+ "learning_rate": 0.001,
45
+ "loss": 1.1522,
46
+ "step": 30000
47
+ },
48
+ {
49
+ "epoch": 0.25,
50
+ "eval_loss": NaN,
51
+ "eval_runtime": 739.1972,
52
+ "eval_samples_per_second": 186.586,
53
+ "eval_steps_per_second": 46.647,
54
+ "step": 30000
55
+ },
56
+ {
57
+ "epoch": 0.3333333333333333,
58
+ "grad_norm": 0.1445535570383072,
59
+ "learning_rate": 0.001,
60
+ "loss": 1.1443,
61
+ "step": 40000
62
+ },
63
+ {
64
+ "epoch": 0.3333333333333333,
65
+ "eval_loss": NaN,
66
+ "eval_runtime": 500.0215,
67
+ "eval_samples_per_second": 275.836,
68
+ "eval_steps_per_second": 68.959,
69
+ "step": 40000
70
+ },
71
+ {
72
+ "epoch": 0.4166666666666667,
73
+ "grad_norm": 0.17618992924690247,
74
+ "learning_rate": 0.001,
75
+ "loss": 1.1404,
76
+ "step": 50000
77
+ },
78
+ {
79
+ "epoch": 0.4166666666666667,
80
+ "eval_loss": NaN,
81
+ "eval_runtime": 589.6171,
82
+ "eval_samples_per_second": 233.921,
83
+ "eval_steps_per_second": 58.48,
84
+ "step": 50000
85
+ },
86
+ {
87
+ "epoch": 0.5,
88
+ "grad_norm": 0.12298904359340668,
89
+ "learning_rate": 0.001,
90
+ "loss": 1.1329,
91
+ "step": 60000
92
+ },
93
+ {
94
+ "epoch": 0.5,
95
+ "eval_loss": NaN,
96
+ "eval_runtime": 599.1773,
97
+ "eval_samples_per_second": 230.189,
98
+ "eval_steps_per_second": 57.547,
99
+ "step": 60000
100
+ },
101
+ {
102
+ "epoch": 0.5833333333333334,
103
+ "grad_norm": 0.21368645131587982,
104
+ "learning_rate": 0.001,
105
+ "loss": 1.1323,
106
+ "step": 70000
107
+ },
108
+ {
109
+ "epoch": 0.5833333333333334,
110
+ "eval_loss": NaN,
111
+ "eval_runtime": 526.8173,
112
+ "eval_samples_per_second": 261.806,
113
+ "eval_steps_per_second": 65.452,
114
+ "step": 70000
115
+ },
116
+ {
117
+ "epoch": 0.6666666666666666,
118
+ "grad_norm": 0.15141108632087708,
119
+ "learning_rate": 0.001,
120
+ "loss": 1.1292,
121
+ "step": 80000
122
+ },
123
+ {
124
+ "epoch": 0.6666666666666666,
125
+ "eval_loss": NaN,
126
+ "eval_runtime": 542.434,
127
+ "eval_samples_per_second": 254.269,
128
+ "eval_steps_per_second": 63.567,
129
+ "step": 80000
130
+ },
131
+ {
132
+ "epoch": 0.75,
133
+ "grad_norm": 0.15512333810329437,
134
+ "learning_rate": 0.001,
135
+ "loss": 1.1264,
136
+ "step": 90000
137
+ },
138
+ {
139
+ "epoch": 0.75,
140
+ "eval_loss": NaN,
141
+ "eval_runtime": 700.6971,
142
+ "eval_samples_per_second": 196.838,
143
+ "eval_steps_per_second": 49.21,
144
+ "step": 90000
145
+ },
146
+ {
147
+ "epoch": 0.8333333333333334,
148
+ "grad_norm": 0.15970458090305328,
149
+ "learning_rate": 0.001,
150
+ "loss": 1.1312,
151
+ "step": 100000
152
+ },
153
+ {
154
+ "epoch": 0.8333333333333334,
155
+ "eval_loss": NaN,
156
+ "eval_runtime": 774.1988,
157
+ "eval_samples_per_second": 178.151,
158
+ "eval_steps_per_second": 44.538,
159
+ "step": 100000
160
+ },
161
+ {
162
+ "epoch": 0.9166666666666666,
163
+ "grad_norm": 0.16049639880657196,
164
+ "learning_rate": 0.001,
165
+ "loss": 1.1305,
166
+ "step": 110000
167
+ },
168
+ {
169
+ "epoch": 0.9166666666666666,
170
+ "eval_loss": NaN,
171
+ "eval_runtime": 612.5819,
172
+ "eval_samples_per_second": 225.152,
173
+ "eval_steps_per_second": 56.288,
174
+ "step": 110000
175
+ }
176
+ ],
177
+ "logging_steps": 10000,
178
+ "max_steps": 120000,
179
+ "num_input_tokens_seen": 0,
180
+ "num_train_epochs": 9223372036854775807,
181
+ "save_steps": 10000,
182
+ "stateful_callbacks": {
183
+ "TrainerControl": {
184
+ "args": {
185
+ "should_epoch_stop": false,
186
+ "should_evaluate": false,
187
+ "should_log": false,
188
+ "should_save": true,
189
+ "should_training_stop": false
190
+ },
191
+ "attributes": {}
192
+ }
193
+ },
194
+ "total_flos": 8.904877671868006e+16,
195
+ "train_batch_size": 4,
196
+ "trial_name": null,
197
+ "trial_params": null
198
+ }
checkpoint-110000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04e24d240c0212150205bb434684f1d705979477cac1d7cdb25ba76821568db2
3
+ size 5112
checkpoint-120000/config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ConvNetForMaskedLM"
4
+ ],
5
+ "aux_features_vocab_size": 5,
6
+ "dilation_base": 2,
7
+ "dilation_cycle": 6,
8
+ "dilation_double_every": 1,
9
+ "dilation_max": 32,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "kernel_size": 9,
15
+ "model_type": "ConvNet",
16
+ "n_aux_features": 0,
17
+ "n_layers": 25,
18
+ "torch_dtype": "float32",
19
+ "transformers_version": "4.41.2",
20
+ "vocab_size": 7
21
+ }
checkpoint-120000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bde649b0d713fd5b2ba7286dc1ab9bacff3d4bf38d38661784348d23d47a014
3
+ size 263540548
checkpoint-120000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a03cf05ae3b962c46c40d8346a7cb1ddb98c00fff5ea3197f4630bb73e953f9f
3
+ size 527212602
checkpoint-120000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab2c92317db8387da7d95090bd49efb7856eaec0bb349c768b2e290d8b50f843
3
+ size 14244
checkpoint-120000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:972940722fecab610fecaa9267d56b9e65a388a1d5ec43b18bbdde9fa7a69235
3
+ size 1064
checkpoint-120000/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mask_token": {
3
+ "content": "[MASK]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "[PAD]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "[UNK]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
checkpoint-120000/tokenizer.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[MASK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[UNK]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": {
35
+ "type": "Lowercase"
36
+ },
37
+ "pre_tokenizer": {
38
+ "type": "Whitespace"
39
+ },
40
+ "post_processor": null,
41
+ "decoder": null,
42
+ "model": {
43
+ "type": "BPE",
44
+ "dropout": null,
45
+ "unk_token": "[UNK]",
46
+ "continuing_subword_prefix": null,
47
+ "end_of_word_suffix": null,
48
+ "fuse_unk": false,
49
+ "byte_fallback": false,
50
+ "ignore_merges": false,
51
+ "vocab": {
52
+ "[PAD]": 0,
53
+ "[MASK]": 1,
54
+ "[UNK]": 2,
55
+ "a": 3,
56
+ "c": 4,
57
+ "g": 5,
58
+ "t": 6
59
+ },
60
+ "merges": []
61
+ }
62
+ }
checkpoint-120000/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[UNK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "mask_token": "[MASK]",
30
+ "model_max_length": 1000000000000000019884624838656,
31
+ "pad_token": "[PAD]",
32
+ "tokenizer_class": "PreTrainedTokenizerFast",
33
+ "unk_token": "[UNK]"
34
+ }
checkpoint-120000/trainer_state.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 10000,
6
+ "global_step": 120000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08333333333333333,
13
+ "grad_norm": 0.6651873588562012,
14
+ "learning_rate": 0.001,
15
+ "loss": 1.217,
16
+ "step": 10000
17
+ },
18
+ {
19
+ "epoch": 0.08333333333333333,
20
+ "eval_loss": NaN,
21
+ "eval_runtime": 874.824,
22
+ "eval_samples_per_second": 157.659,
23
+ "eval_steps_per_second": 39.415,
24
+ "step": 10000
25
+ },
26
+ {
27
+ "epoch": 0.16666666666666666,
28
+ "grad_norm": 0.3876087963581085,
29
+ "learning_rate": 0.001,
30
+ "loss": 1.1568,
31
+ "step": 20000
32
+ },
33
+ {
34
+ "epoch": 0.16666666666666666,
35
+ "eval_loss": NaN,
36
+ "eval_runtime": 774.5634,
37
+ "eval_samples_per_second": 178.067,
38
+ "eval_steps_per_second": 44.517,
39
+ "step": 20000
40
+ },
41
+ {
42
+ "epoch": 0.25,
43
+ "grad_norm": 0.2238207757472992,
44
+ "learning_rate": 0.001,
45
+ "loss": 1.1522,
46
+ "step": 30000
47
+ },
48
+ {
49
+ "epoch": 0.25,
50
+ "eval_loss": NaN,
51
+ "eval_runtime": 739.1972,
52
+ "eval_samples_per_second": 186.586,
53
+ "eval_steps_per_second": 46.647,
54
+ "step": 30000
55
+ },
56
+ {
57
+ "epoch": 0.3333333333333333,
58
+ "grad_norm": 0.1445535570383072,
59
+ "learning_rate": 0.001,
60
+ "loss": 1.1443,
61
+ "step": 40000
62
+ },
63
+ {
64
+ "epoch": 0.3333333333333333,
65
+ "eval_loss": NaN,
66
+ "eval_runtime": 500.0215,
67
+ "eval_samples_per_second": 275.836,
68
+ "eval_steps_per_second": 68.959,
69
+ "step": 40000
70
+ },
71
+ {
72
+ "epoch": 0.4166666666666667,
73
+ "grad_norm": 0.17618992924690247,
74
+ "learning_rate": 0.001,
75
+ "loss": 1.1404,
76
+ "step": 50000
77
+ },
78
+ {
79
+ "epoch": 0.4166666666666667,
80
+ "eval_loss": NaN,
81
+ "eval_runtime": 589.6171,
82
+ "eval_samples_per_second": 233.921,
83
+ "eval_steps_per_second": 58.48,
84
+ "step": 50000
85
+ },
86
+ {
87
+ "epoch": 0.5,
88
+ "grad_norm": 0.12298904359340668,
89
+ "learning_rate": 0.001,
90
+ "loss": 1.1329,
91
+ "step": 60000
92
+ },
93
+ {
94
+ "epoch": 0.5,
95
+ "eval_loss": NaN,
96
+ "eval_runtime": 599.1773,
97
+ "eval_samples_per_second": 230.189,
98
+ "eval_steps_per_second": 57.547,
99
+ "step": 60000
100
+ },
101
+ {
102
+ "epoch": 0.5833333333333334,
103
+ "grad_norm": 0.21368645131587982,
104
+ "learning_rate": 0.001,
105
+ "loss": 1.1323,
106
+ "step": 70000
107
+ },
108
+ {
109
+ "epoch": 0.5833333333333334,
110
+ "eval_loss": NaN,
111
+ "eval_runtime": 526.8173,
112
+ "eval_samples_per_second": 261.806,
113
+ "eval_steps_per_second": 65.452,
114
+ "step": 70000
115
+ },
116
+ {
117
+ "epoch": 0.6666666666666666,
118
+ "grad_norm": 0.15141108632087708,
119
+ "learning_rate": 0.001,
120
+ "loss": 1.1292,
121
+ "step": 80000
122
+ },
123
+ {
124
+ "epoch": 0.6666666666666666,
125
+ "eval_loss": NaN,
126
+ "eval_runtime": 542.434,
127
+ "eval_samples_per_second": 254.269,
128
+ "eval_steps_per_second": 63.567,
129
+ "step": 80000
130
+ },
131
+ {
132
+ "epoch": 0.75,
133
+ "grad_norm": 0.15512333810329437,
134
+ "learning_rate": 0.001,
135
+ "loss": 1.1264,
136
+ "step": 90000
137
+ },
138
+ {
139
+ "epoch": 0.75,
140
+ "eval_loss": NaN,
141
+ "eval_runtime": 700.6971,
142
+ "eval_samples_per_second": 196.838,
143
+ "eval_steps_per_second": 49.21,
144
+ "step": 90000
145
+ },
146
+ {
147
+ "epoch": 0.8333333333333334,
148
+ "grad_norm": 0.15970458090305328,
149
+ "learning_rate": 0.001,
150
+ "loss": 1.1312,
151
+ "step": 100000
152
+ },
153
+ {
154
+ "epoch": 0.8333333333333334,
155
+ "eval_loss": NaN,
156
+ "eval_runtime": 774.1988,
157
+ "eval_samples_per_second": 178.151,
158
+ "eval_steps_per_second": 44.538,
159
+ "step": 100000
160
+ },
161
+ {
162
+ "epoch": 0.9166666666666666,
163
+ "grad_norm": 0.16049639880657196,
164
+ "learning_rate": 0.001,
165
+ "loss": 1.1305,
166
+ "step": 110000
167
+ },
168
+ {
169
+ "epoch": 0.9166666666666666,
170
+ "eval_loss": NaN,
171
+ "eval_runtime": 612.5819,
172
+ "eval_samples_per_second": 225.152,
173
+ "eval_steps_per_second": 56.288,
174
+ "step": 110000
175
+ },
176
+ {
177
+ "epoch": 1.0,
178
+ "grad_norm": 0.1032409593462944,
179
+ "learning_rate": 0.001,
180
+ "loss": 1.1285,
181
+ "step": 120000
182
+ },
183
+ {
184
+ "epoch": 1.0,
185
+ "eval_loss": NaN,
186
+ "eval_runtime": 585.9083,
187
+ "eval_samples_per_second": 235.402,
188
+ "eval_steps_per_second": 58.851,
189
+ "step": 120000
190
+ }
191
+ ],
192
+ "logging_steps": 10000,
193
+ "max_steps": 120000,
194
+ "num_input_tokens_seen": 0,
195
+ "num_train_epochs": 9223372036854775807,
196
+ "save_steps": 10000,
197
+ "stateful_callbacks": {
198
+ "TrainerControl": {
199
+ "args": {
200
+ "should_epoch_stop": false,
201
+ "should_evaluate": false,
202
+ "should_log": false,
203
+ "should_save": true,
204
+ "should_training_stop": true
205
+ },
206
+ "attributes": {}
207
+ }
208
+ },
209
+ "total_flos": 9.714412005674189e+16,
210
+ "train_batch_size": 4,
211
+ "trial_name": null,
212
+ "trial_params": null
213
+ }
checkpoint-120000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04e24d240c0212150205bb434684f1d705979477cac1d7cdb25ba76821568db2
3
+ size 5112
checkpoint-20000/config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ConvNetForMaskedLM"
4
+ ],
5
+ "aux_features_vocab_size": 5,
6
+ "dilation_base": 2,
7
+ "dilation_cycle": 6,
8
+ "dilation_double_every": 1,
9
+ "dilation_max": 32,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "kernel_size": 9,
15
+ "model_type": "ConvNet",
16
+ "n_aux_features": 0,
17
+ "n_layers": 25,
18
+ "torch_dtype": "float32",
19
+ "transformers_version": "4.41.2",
20
+ "vocab_size": 7
21
+ }
checkpoint-20000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ad9ac6b3b5978abc8064224397508f994dff694d460ac0f949031324fe8bae8
3
+ size 263540548
checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88fc9044e7c2e1856c407f01030ffb844c63770bd0511b2868f0c08482a2cdb2
3
+ size 527212602
checkpoint-20000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7efde149035b4a6a5c64f8ee01eaa17edfa91c1e5a7933194a15152b3e7090e2
3
+ size 14244
checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b907d95b8c893b8c166b45f79e5cbc1c7d4f6eac179082acaa5709176eb0786
3
+ size 1064
checkpoint-20000/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mask_token": {
3
+ "content": "[MASK]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "[PAD]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "[UNK]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
checkpoint-20000/tokenizer.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[MASK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[UNK]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": {
35
+ "type": "Lowercase"
36
+ },
37
+ "pre_tokenizer": {
38
+ "type": "Whitespace"
39
+ },
40
+ "post_processor": null,
41
+ "decoder": null,
42
+ "model": {
43
+ "type": "BPE",
44
+ "dropout": null,
45
+ "unk_token": "[UNK]",
46
+ "continuing_subword_prefix": null,
47
+ "end_of_word_suffix": null,
48
+ "fuse_unk": false,
49
+ "byte_fallback": false,
50
+ "ignore_merges": false,
51
+ "vocab": {
52
+ "[PAD]": 0,
53
+ "[MASK]": 1,
54
+ "[UNK]": 2,
55
+ "a": 3,
56
+ "c": 4,
57
+ "g": 5,
58
+ "t": 6
59
+ },
60
+ "merges": []
61
+ }
62
+ }
checkpoint-20000/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[UNK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "mask_token": "[MASK]",
30
+ "model_max_length": 1000000000000000019884624838656,
31
+ "pad_token": "[PAD]",
32
+ "tokenizer_class": "PreTrainedTokenizerFast",
33
+ "unk_token": "[UNK]"
34
+ }