okaemon committed
Commit 0131997
1 Parent(s): f140c32

model improved

all_results.json CHANGED
@@ -1,23 +1,23 @@
  {
  "epoch": 30.0,
- "eval_loss": 0.03293671831488609,
- "eval_mem_cpu_alloc_delta": 85377,
- "eval_mem_cpu_peaked_delta": 33067,
+ "eval_loss": 0.027720022946596146,
+ "eval_mem_cpu_alloc_delta": 0,
+ "eval_mem_cpu_peaked_delta": 0,
  "eval_mem_gpu_alloc_delta": 0,
  "eval_mem_gpu_peaked_delta": 569275904,
- "eval_runtime": 4.0268,
- "eval_samples": 23,
- "eval_samples_per_second": 5.712,
- "init_mem_cpu_alloc_delta": 1868923,
- "init_mem_cpu_peaked_delta": 18306,
+ "eval_runtime": 8.6733,
+ "eval_samples": 49,
+ "eval_samples_per_second": 5.65,
+ "init_mem_cpu_alloc_delta": 702398464,
+ "init_mem_cpu_peaked_delta": 0,
  "init_mem_gpu_alloc_delta": 1370738688,
  "init_mem_gpu_peaked_delta": 0,
- "perplexity": 1.0334851364909519,
- "train_mem_cpu_alloc_delta": 480835,
- "train_mem_cpu_peaked_delta": 329934,
+ "perplexity": 1.0281077975307098,
+ "train_mem_cpu_alloc_delta": 18444288,
+ "train_mem_cpu_peaked_delta": 0,
  "train_mem_gpu_alloc_delta": 4036706816,
  "train_mem_gpu_peaked_delta": 8663361536,
- "train_runtime": 399.4076,
- "train_samples": 23,
- "train_samples_per_second": 1.728
+ "train_runtime": 924.1697,
+ "train_samples": 49,
+ "train_samples_per_second": 1.591
  }
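The drop in eval_loss (0.0329 → 0.0277) lines up with the lower perplexity: the reported perplexity appears to simply be exp(eval_loss), as the Transformers language-modeling example scripts compute it. A quick sanity check in Python, with that exponential relationship as the only assumption:

```python
import math

# The reported perplexity looks like exp(eval_loss); verify against the new metrics.
eval_loss = 0.027720022946596146
print(math.exp(eval_loss))  # ~1.0281077975307098, the "perplexity" value above
```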
config.json CHANGED
@@ -1,14 +1,16 @@
  {
  "_name_or_path": "rinna/japanese-gpt2-medium",
  "activation_function": "gelu_new",
- "architectures": ["GPT2LMHeadModel"],
+ "architectures": [
+   "GPT2LMHeadModel"
+ ],
  "attn_pdrop": 0.1,
  "bos_token_id": 1,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
- "layer_norm_epsilon": 1e-5,
+ "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
@@ -26,11 +28,10 @@
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
-     "max_length": 500,
-     "min_length": 200
+     "max_length": 50
    }
  },
- "transformers_version": "4.4.2",
+ "transformers_version": "4.6.1",
  "use_cache": true,
  "vocab_size": 32000
  }
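Besides recording the newer library version (4.4.2 → 4.6.1), the config change shrinks the default text-generation length: max_length drops from 500 to 50 and min_length is removed. A minimal sketch of reading those defaults back and passing them to generate(); the local path, the prompt, and the assumption that tokenizer files sit next to the model are all illustrative, not taken from this commit:

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_dir = "./japanese-gpt2-medium-finetuned"  # hypothetical local checkout of this repo

config = AutoConfig.from_pretrained(model_dir)
# After this commit: {"do_sample": True, "max_length": 50}
gen_defaults = config.task_specific_params["text-generation"]

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

inputs = tokenizer("こんにちは", return_tensors="pt")  # illustrative prompt
outputs = model.generate(**inputs, **gen_defaults)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```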
eval_results.json CHANGED
@@ -1,12 +1,12 @@
  {
  "epoch": 30.0,
- "eval_loss": 0.03293671831488609,
- "eval_mem_cpu_alloc_delta": 85377,
- "eval_mem_cpu_peaked_delta": 33067,
+ "eval_loss": 0.027720022946596146,
+ "eval_mem_cpu_alloc_delta": 0,
+ "eval_mem_cpu_peaked_delta": 0,
  "eval_mem_gpu_alloc_delta": 0,
  "eval_mem_gpu_peaked_delta": 569275904,
- "eval_runtime": 4.0268,
- "eval_samples": 23,
- "eval_samples_per_second": 5.712,
- "perplexity": 1.0334851364909519
+ "eval_runtime": 8.6733,
+ "eval_samples": 49,
+ "eval_samples_per_second": 5.65,
+ "perplexity": 1.0281077975307098
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d12438c9ecb3f0be9106cde6cc82fb132611c13e2535515f2ad5b8285a9a3662
+ oid sha256:1bda60b0d6cc78a1cdb2b1e3edf663f5e76a0d66dca1cffb8b63ee79dbb26374
  size 1369800665
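pytorch_model.bin is tracked with Git LFS, so only the pointer changes here; the oid is the SHA-256 of the actual weight file. A small check that a downloaded copy matches this commit (the local filename is simply whatever the weights were saved as):

```python
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    """Stream a file and return its SHA-256 hex digest."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Should print 1bda60b0d6cc78a1cdb2b1e3edf663f5e76a0d66dca1cffb8b63ee79dbb26374
# for the weights introduced by this commit.
print(sha256_of("pytorch_model.bin"))
```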
train_results.json CHANGED
@@ -1,14 +1,14 @@
  {
  "epoch": 30.0,
- "init_mem_cpu_alloc_delta": 1868923,
- "init_mem_cpu_peaked_delta": 18306,
+ "init_mem_cpu_alloc_delta": 702398464,
+ "init_mem_cpu_peaked_delta": 0,
  "init_mem_gpu_alloc_delta": 1370738688,
  "init_mem_gpu_peaked_delta": 0,
- "train_mem_cpu_alloc_delta": 480835,
- "train_mem_cpu_peaked_delta": 329934,
+ "train_mem_cpu_alloc_delta": 18444288,
+ "train_mem_cpu_peaked_delta": 0,
  "train_mem_gpu_alloc_delta": 4036706816,
  "train_mem_gpu_peaked_delta": 8663361536,
- "train_runtime": 399.4076,
- "train_samples": 23,
- "train_samples_per_second": 1.728
+ "train_runtime": 924.1697,
+ "train_samples": 49,
+ "train_samples_per_second": 1.591
  }
trainer_state.json CHANGED
@@ -2,28 +2,34 @@
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 30.0,
- "global_step": 690,
+ "global_step": 1470,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
-     "epoch": 21.74,
-     "learning_rate": 1.3768115942028985e-05,
-     "loss": 1.1133,
+     "epoch": 10.2,
+     "learning_rate": 3.2993197278911564e-05,
+     "loss": 1.9694,
      "step": 500
    },
+   {
+     "epoch": 20.41,
+     "learning_rate": 1.5986394557823133e-05,
+     "loss": 0.4335,
+     "step": 1000
+   },
    {
      "epoch": 30.0,
-     "step": 690,
-     "total_flos": 1424967598080000.0,
-     "train_runtime": 399.4076,
-     "train_samples_per_second": 1.728
+     "step": 1470,
+     "total_flos": 3035800535040000.0,
+     "train_runtime": 924.1697,
+     "train_samples_per_second": 1.591
    }
  ],
- "max_steps": 690,
+ "max_steps": 1470,
  "num_train_epochs": 30,
- "total_flos": 1424967598080000.0,
+ "total_flos": 3035800535040000.0,
  "trial_name": null,
  "trial_params": null
  }
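The new trainer state is consistent with a dataset of 49 samples at an effective batch size of 1: 49 steps per epoch × 30 epochs = 1470 global steps (the parent commit's 23 samples gave 690), and train_samples_per_second ≈ 1470 / 924.17 ≈ 1.591. The logged learning rates also fit a plain linear decay with no warmup from a base rate of 5e-5; that base rate and the zero warmup steps are assumptions inferred from the numbers, not stated anywhere in the repo:

```python
# Assumed schedule: linear decay from base_lr to 0 over max_steps, no warmup.
base_lr, max_steps = 5e-5, 1470

def linear_lr(step):
    return base_lr * (max_steps - step) / max_steps

print(linear_lr(500))   # ~3.2993e-05, as logged at step 500
print(linear_lr(1000))  # ~1.5986e-05, as logged at step 1000
```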
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:59cb39dc91671c522f484e1862e34286b78345983173bb19730f50a974ad81f2
- size 2287
+ oid sha256:bf9b36f634f2e25ce41f64466f8899bfd197b34e1f4605d3c6f8209bec172180
+ size 2415