gumran committed · verified
Commit d52e265 · 1 Parent(s): 1abe7e3

End of training

Files changed (6):
  1. README.md +1 -1
  2. config.json +1 -1
  3. config.yaml +5 -5
  4. model.safetensors +2 -2
  5. special_tokens_map.json +18 -30
  6. training_args.bin +2 -2
README.md CHANGED
@@ -36,7 +36,7 @@ This model was trained with SFT.
 
 - TRL: 0.18.1
 - Transformers: 4.52.4
-- Pytorch: 2.6.0+cu124
+- Pytorch: 2.7.1+cu118
 - Datasets: 3.6.0
 - Tokenizers: 0.21.1
 
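The only substantive card change is the PyTorch pin moving from 2.6.0+cu124 to 2.7.1+cu118. A minimal sketch for checking that a local environment matches the pinned stack; the expected strings are copied from the card, everything else is standard package metadata:

```python
# Minimal environment check; expected versions are the ones pinned in the card.
import datasets
import tokenizers
import torch
import transformers
import trl

pinned = {
    "TRL": (trl.__version__, "0.18.1"),
    "Transformers": (transformers.__version__, "4.52.4"),
    "Pytorch": (torch.__version__, "2.7.1+cu118"),
    "Datasets": (datasets.__version__, "3.6.0"),
    "Tokenizers": (tokenizers.__version__, "0.21.1"),
}
for name, (installed, expected) in pinned.items():
    flag = "" if installed == expected else "  <- differs"
    print(f"{name}: {installed} (card: {expected}){flag}")
```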
config.json CHANGED
@@ -32,7 +32,7 @@
       "max_length": 50
     }
   },
-  "torch_dtype": "float16",
+  "torch_dtype": "float32",
   "transformers_version": "4.52.4",
   "use_cache": true,
   "vocab_size": 50259
config.yaml CHANGED
@@ -2,17 +2,18 @@ model_name_or_path: "openai-community/gpt2-large"
 dataset_name_or_path: "allenai/tulu-3-sft-olmo-2-mixture-0225"
 project_name: "scaling-post-training"
 training_args:
+  seed: 42
   num_train_epochs: 1
   per_device_train_batch_size: 2
   per_device_eval_batch_size: 2
-  gradient_accumulation_steps: 32
+  gradient_accumulation_steps: 8
   warmup_ratio: 0.05
   weight_decay: 0.01
   logging_steps: 10
   eval_strategy: "steps"
-  eval_steps: 100
+  eval_steps: 50
   report_to: "wandb"
-  bf16: true
+  fp16: true
   learning_rate: 3.0e-5
   lr_scheduler_type: "cosine"
   run_name: "gpt2-large-sft"
@@ -21,5 +22,4 @@ training_args:
   metric_for_best_model: "eval_loss"
   load_best_model_at_end: true
   save_total_limit: 1
-  hub_model_id: "gpt2-large-sft"
-  max_seq_length: 1024
+  hub_model_id: "gpt2-large-sft"
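Net effect of the YAML changes: the seed is now pinned, gradient accumulation drops from 32 to 8 (so the effective per-device batch shrinks from 2 × 32 = 64 to 2 × 8 = 16 sequences per optimizer step), evaluation runs twice as often, mixed precision switches from bf16 to fp16 (consistent with the move to a cu118 build above), and max_seq_length is dropped while hub_model_id is kept. A hedged sketch of how these keys plausibly map onto TRL's SFTConfig; the project's actual training script is not part of this commit, and output_dir is assumed:

```python
from trl import SFTConfig

args = SFTConfig(
    output_dir="gpt2-large-sft",    # assumed; not present in the YAML
    seed=42,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # was 32; effective per-device batch 64 -> 16
    warmup_ratio=0.05,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,                  # was 100; evaluates twice as often
    report_to="wandb",
    fp16=True,                      # switched from bf16
    learning_rate=3.0e-5,
    lr_scheduler_type="cosine",
    run_name="gpt2-large-sft",
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    save_total_limit=1,
    hub_model_id="gpt2-large-sft",
)
```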
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5d05e964aa4ec887aa6d74feb4886ada1dc46f292f6598e298b3f82001ecafc9
-size 1548110536
+oid sha256:d8281c2976ad8d88173ee3987577c57dc5fc09dec9e42ccbc57c0efc0bb64613
+size 3096176168
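The new weights file is almost exactly twice the old one, which is what the float16 to float32 change in config.json predicts. A back-of-envelope check, treating GPT-2 large as roughly 774M parameters (with the vocab extended to 50259); the small per-file remainder is the safetensors header:

```python
# Rough size check: bytes ~= parameter_count * bytes_per_parameter.
params = 774_000_000  # approximate GPT-2 large parameter count

print(f"fp16: ~{params * 2 / 1e9:.2f} GB")  # old file: 1,548,110,536 bytes
print(f"fp32: ~{params * 4 / 1e9:.2f} GB")  # new file: 3,096,176,168 bytes
```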
special_tokens_map.json CHANGED
@@ -1,34 +1,22 @@
 {
   "additional_special_tokens": [
-    "<|im_start|>",
-    "<|im_end|>"
+    {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
   ],
-  "bos_token": {
-    "content": "<|im_start|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|im_end|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<|im_end|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
+  "bos_token": "<|im_start|>",
+  "eos_token": "<|im_end|>",
+  "pad_token": "<|im_end|>",
+  "unk_token": "<|endoftext|>"
 }
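The rewrite changes the JSON shape, not the token assignments: bos/eos/pad/unk collapse to plain strings while the ChatML markers in additional_special_tokens gain explicit AddedToken fields. A minimal check that the tokenizer resolves to the same special tokens either way, again assuming the hub path used above:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gumran/gpt2-large-sft")  # assumed hub path

print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token)
# expected: <|im_start|> <|im_end|> <|im_end|> <|endoftext|>
print(tokenizer.additional_special_tokens)
# expected: ['<|im_start|>', '<|im_end|>']
```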
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8f789ac2f351e9d6ab0c56301cdc999f1a692da331f32f97078f325930e5323c
-size 5624
+oid sha256:aac7eeffffdb956445972568f726a050e85b19840e295e3e495a4ddbd2ec9d9f
+size 6097
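training_args.bin is the TrainingArguments object the Trainer serializes next to the checkpoint, so its hash and size move whenever any argument changes. It can be unpickled to confirm the YAML edits landed; a sketch (unpickling runs arbitrary code, so only do this for files you trust):

```python
import torch

# weights_only=False is required because this is a pickled Python object,
# not a tensor file. Only load training_args.bin from sources you trust.
args = torch.load("training_args.bin", weights_only=False)
print(args.seed, args.gradient_accumulation_steps, args.eval_steps, args.fp16)
# expected from this commit's config.yaml: 42 8 50 True
```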