model improved
Browse files- all_results.json +14 -14
- config.json +6 -5
- eval_results.json +7 -7
- pytorch_model.bin +1 -1
- train_results.json +7 -7
- trainer_state.json +16 -10
- training_args.bin +2 -2
all_results.json
CHANGED
@@ -1,23 +1,23 @@
|
|
1 |
{
|
2 |
"epoch": 30.0,
|
3 |
-
"eval_loss": 0.
|
4 |
-
"eval_mem_cpu_alloc_delta":
|
5 |
-
"eval_mem_cpu_peaked_delta":
|
6 |
"eval_mem_gpu_alloc_delta": 0,
|
7 |
"eval_mem_gpu_peaked_delta": 569275904,
|
8 |
-
"eval_runtime":
|
9 |
-
"eval_samples":
|
10 |
-
"eval_samples_per_second": 5.
|
11 |
-
"init_mem_cpu_alloc_delta":
|
12 |
-
"init_mem_cpu_peaked_delta":
|
13 |
"init_mem_gpu_alloc_delta": 1370738688,
|
14 |
"init_mem_gpu_peaked_delta": 0,
|
15 |
-
"perplexity": 1.
|
16 |
-
"train_mem_cpu_alloc_delta":
|
17 |
-
"train_mem_cpu_peaked_delta":
|
18 |
"train_mem_gpu_alloc_delta": 4036706816,
|
19 |
"train_mem_gpu_peaked_delta": 8663361536,
|
20 |
-
"train_runtime":
|
21 |
-
"train_samples":
|
22 |
-
"train_samples_per_second": 1.
|
23 |
}
|
|
|
1 |
{
|
2 |
"epoch": 30.0,
|
3 |
+
"eval_loss": 0.027720022946596146,
|
4 |
+
"eval_mem_cpu_alloc_delta": 0,
|
5 |
+
"eval_mem_cpu_peaked_delta": 0,
|
6 |
"eval_mem_gpu_alloc_delta": 0,
|
7 |
"eval_mem_gpu_peaked_delta": 569275904,
|
8 |
+
"eval_runtime": 8.6733,
|
9 |
+
"eval_samples": 49,
|
10 |
+
"eval_samples_per_second": 5.65,
|
11 |
+
"init_mem_cpu_alloc_delta": 702398464,
|
12 |
+
"init_mem_cpu_peaked_delta": 0,
|
13 |
"init_mem_gpu_alloc_delta": 1370738688,
|
14 |
"init_mem_gpu_peaked_delta": 0,
|
15 |
+
"perplexity": 1.0281077975307098,
|
16 |
+
"train_mem_cpu_alloc_delta": 18444288,
|
17 |
+
"train_mem_cpu_peaked_delta": 0,
|
18 |
"train_mem_gpu_alloc_delta": 4036706816,
|
19 |
"train_mem_gpu_peaked_delta": 8663361536,
|
20 |
+
"train_runtime": 924.1697,
|
21 |
+
"train_samples": 49,
|
22 |
+
"train_samples_per_second": 1.591
|
23 |
}
|
config.json
CHANGED
@@ -1,14 +1,16 @@
|
|
1 |
{
|
2 |
"_name_or_path": "rinna/japanese-gpt2-medium",
|
3 |
"activation_function": "gelu_new",
|
4 |
-
"architectures": [
|
|
|
|
|
5 |
"attn_pdrop": 0.1,
|
6 |
"bos_token_id": 1,
|
7 |
"embd_pdrop": 0.1,
|
8 |
"eos_token_id": 2,
|
9 |
"gradient_checkpointing": false,
|
10 |
"initializer_range": 0.02,
|
11 |
-
"layer_norm_epsilon": 1e-
|
12 |
"model_type": "gpt2",
|
13 |
"n_ctx": 1024,
|
14 |
"n_embd": 1024,
|
@@ -26,11 +28,10 @@
|
|
26 |
"task_specific_params": {
|
27 |
"text-generation": {
|
28 |
"do_sample": true,
|
29 |
-
"max_length":
|
30 |
-
"min_length": 200
|
31 |
}
|
32 |
},
|
33 |
-
"transformers_version": "4.
|
34 |
"use_cache": true,
|
35 |
"vocab_size": 32000
|
36 |
}
|
|
|
1 |
{
|
2 |
"_name_or_path": "rinna/japanese-gpt2-medium",
|
3 |
"activation_function": "gelu_new",
|
4 |
+
"architectures": [
|
5 |
+
"GPT2LMHeadModel"
|
6 |
+
],
|
7 |
"attn_pdrop": 0.1,
|
8 |
"bos_token_id": 1,
|
9 |
"embd_pdrop": 0.1,
|
10 |
"eos_token_id": 2,
|
11 |
"gradient_checkpointing": false,
|
12 |
"initializer_range": 0.02,
|
13 |
+
"layer_norm_epsilon": 1e-05,
|
14 |
"model_type": "gpt2",
|
15 |
"n_ctx": 1024,
|
16 |
"n_embd": 1024,
|
|
|
28 |
"task_specific_params": {
|
29 |
"text-generation": {
|
30 |
"do_sample": true,
|
31 |
+
"max_length": 50
|
|
|
32 |
}
|
33 |
},
|
34 |
+
"transformers_version": "4.6.1",
|
35 |
"use_cache": true,
|
36 |
"vocab_size": 32000
|
37 |
}
|
eval_results.json
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
{
|
2 |
"epoch": 30.0,
|
3 |
-
"eval_loss": 0.
|
4 |
-
"eval_mem_cpu_alloc_delta":
|
5 |
-
"eval_mem_cpu_peaked_delta":
|
6 |
"eval_mem_gpu_alloc_delta": 0,
|
7 |
"eval_mem_gpu_peaked_delta": 569275904,
|
8 |
-
"eval_runtime":
|
9 |
-
"eval_samples":
|
10 |
-
"eval_samples_per_second": 5.
|
11 |
-
"perplexity": 1.
|
12 |
}
|
|
|
1 |
{
|
2 |
"epoch": 30.0,
|
3 |
+
"eval_loss": 0.027720022946596146,
|
4 |
+
"eval_mem_cpu_alloc_delta": 0,
|
5 |
+
"eval_mem_cpu_peaked_delta": 0,
|
6 |
"eval_mem_gpu_alloc_delta": 0,
|
7 |
"eval_mem_gpu_peaked_delta": 569275904,
|
8 |
+
"eval_runtime": 8.6733,
|
9 |
+
"eval_samples": 49,
|
10 |
+
"eval_samples_per_second": 5.65,
|
11 |
+
"perplexity": 1.0281077975307098
|
12 |
}
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1369800665
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1bda60b0d6cc78a1cdb2b1e3edf663f5e76a0d66dca1cffb8b63ee79dbb26374
|
3 |
size 1369800665
|
train_results.json
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
{
|
2 |
"epoch": 30.0,
|
3 |
-
"init_mem_cpu_alloc_delta":
|
4 |
-
"init_mem_cpu_peaked_delta":
|
5 |
"init_mem_gpu_alloc_delta": 1370738688,
|
6 |
"init_mem_gpu_peaked_delta": 0,
|
7 |
-
"train_mem_cpu_alloc_delta":
|
8 |
-
"train_mem_cpu_peaked_delta":
|
9 |
"train_mem_gpu_alloc_delta": 4036706816,
|
10 |
"train_mem_gpu_peaked_delta": 8663361536,
|
11 |
-
"train_runtime":
|
12 |
-
"train_samples":
|
13 |
-
"train_samples_per_second": 1.
|
14 |
}
|
|
|
1 |
{
|
2 |
"epoch": 30.0,
|
3 |
+
"init_mem_cpu_alloc_delta": 702398464,
|
4 |
+
"init_mem_cpu_peaked_delta": 0,
|
5 |
"init_mem_gpu_alloc_delta": 1370738688,
|
6 |
"init_mem_gpu_peaked_delta": 0,
|
7 |
+
"train_mem_cpu_alloc_delta": 18444288,
|
8 |
+
"train_mem_cpu_peaked_delta": 0,
|
9 |
"train_mem_gpu_alloc_delta": 4036706816,
|
10 |
"train_mem_gpu_peaked_delta": 8663361536,
|
11 |
+
"train_runtime": 924.1697,
|
12 |
+
"train_samples": 49,
|
13 |
+
"train_samples_per_second": 1.591
|
14 |
}
|
trainer_state.json
CHANGED
@@ -2,28 +2,34 @@
|
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
"epoch": 30.0,
|
5 |
-
"global_step":
|
6 |
"is_hyper_param_search": false,
|
7 |
"is_local_process_zero": true,
|
8 |
"is_world_process_zero": true,
|
9 |
"log_history": [
|
10 |
{
|
11 |
-
"epoch":
|
12 |
-
"learning_rate":
|
13 |
-
"loss": 1.
|
14 |
"step": 500
|
15 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
{
|
17 |
"epoch": 30.0,
|
18 |
-
"step":
|
19 |
-
"total_flos":
|
20 |
-
"train_runtime":
|
21 |
-
"train_samples_per_second": 1.
|
22 |
}
|
23 |
],
|
24 |
-
"max_steps":
|
25 |
"num_train_epochs": 30,
|
26 |
-
"total_flos":
|
27 |
"trial_name": null,
|
28 |
"trial_params": null
|
29 |
}
|
|
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
"epoch": 30.0,
|
5 |
+
"global_step": 1470,
|
6 |
"is_hyper_param_search": false,
|
7 |
"is_local_process_zero": true,
|
8 |
"is_world_process_zero": true,
|
9 |
"log_history": [
|
10 |
{
|
11 |
+
"epoch": 10.2,
|
12 |
+
"learning_rate": 3.2993197278911564e-05,
|
13 |
+
"loss": 1.9694,
|
14 |
"step": 500
|
15 |
},
|
16 |
+
{
|
17 |
+
"epoch": 20.41,
|
18 |
+
"learning_rate": 1.5986394557823133e-05,
|
19 |
+
"loss": 0.4335,
|
20 |
+
"step": 1000
|
21 |
+
},
|
22 |
{
|
23 |
"epoch": 30.0,
|
24 |
+
"step": 1470,
|
25 |
+
"total_flos": 3035800535040000.0,
|
26 |
+
"train_runtime": 924.1697,
|
27 |
+
"train_samples_per_second": 1.591
|
28 |
}
|
29 |
],
|
30 |
+
"max_steps": 1470,
|
31 |
"num_train_epochs": 30,
|
32 |
+
"total_flos": 3035800535040000.0,
|
33 |
"trial_name": null,
|
34 |
"trial_params": null
|
35 |
}
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf9b36f634f2e25ce41f64466f8899bfd197b34e1f4605d3c6f8209bec172180
|
3 |
+
size 2415
|