baiges committed
Commit 1c285c3
1 Parent(s): 9ddad89

Upload folder using huggingface_hub

config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "models/final_model",
+  "_name_or_path": "baiges/CatGPT",
   "activation_function": "gelu_new",
   "architectures": [
     "GPT2LMHeadModel"
@@ -28,7 +28,7 @@
   "summary_type": "cls_index",
   "summary_use_proj": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.42.3",
+  "transformers_version": "4.44.2",
   "use_cache": true,
   "vocab_size": 32768
 }
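
The "_name_or_path" change points the config at the published Hub repo instead of a local training directory. A minimal loading sketch, assuming "baiges/CatGPT" is a public repo that also ships tokenizer files (not part of this commit):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("baiges/CatGPT")     # assumes tokenizer files exist in the repo
model = AutoModelForCausalLM.from_pretrained("baiges/CatGPT")  # resolves to GPT2LMHeadModel, vocab_size 32768
print(model.config.transformers_version)                       # "4.44.2" after this commit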
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 0,
   "eos_token_id": 2,
   "pad_token_id": 1,
-  "transformers_version": "4.42.3"
+  "transformers_version": "4.44.2"
 }
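
The special-token ids in generation_config.json are applied by generate() as defaults, so no extra arguments are needed for them. A hedged usage sketch; the repo id and the Catalan prompt are assumptions, not part of this diff:

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("baiges/CatGPT")
model = AutoModelForCausalLM.from_pretrained("baiges/CatGPT")

# bos_token_id=0, pad_token_id=1, eos_token_id=2 are read from generation_config.json
inputs = tok("Bon dia", return_tensors="pt")  # hypothetical prompt
out = model.generate(**inputs, max_new_tokens=30)
print(tok.decode(out[0], skip_special_tokens=True))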
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aca488b3a34599938d4d60881ed94d9bdd771e69ee9e14445f200ce4f813967b
+oid sha256:352404be1b1166d0f819ba0bfde674390992da31c000d1f3ad7e3f684eaa1658
 size 444048000
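
Only the Git LFS pointer changes here (same size, new sha256 oid); the weights themselves live in LFS storage. A small integrity-check sketch, assuming the file has already been downloaded to the working directory:

import hashlib

expected = "352404be1b1166d0f819ba0bfde674390992da31c000d1f3ad7e3f684eaa1658"  # oid from this diff

h = hashlib.sha256()
with open("model.safetensors", "rb") as f:  # placeholder local path
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print("match" if h.hexdigest() == expected else "checksum mismatch")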
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e2fdbbf2a7754074af730de9089f98ee7fb463665aca8d7ffa22b4cdc4bd0c7
+size 888189882
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8532a4b700094891bee71a4cd5621c15b04da2f3e774119925cac6e29b195ab
+size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c95a339ccdee60c82ce6d430ae6160c74f81f6a8f27cde3b69029269054f566c
+size 1064
trainer_state.json ADDED
@@ -0,0 +1,153 @@
+{
+  "best_metric": 1.3409229516983032,
+  "best_model_checkpoint": "results/checkpoint-2000",
+  "epoch": 1.4553392759687103,
+  "eval_steps": 250,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.18191740949608878,
+      "grad_norm": 0.2860318124294281,
+      "learning_rate": 2.976310216494586e-05,
+      "loss": 1.4647,
+      "step": 250
+    },
+    {
+      "epoch": 0.18191740949608878,
+      "eval_loss": 1.352885127067566,
+      "eval_runtime": 13.6056,
+      "eval_samples_per_second": 163.241,
+      "eval_steps_per_second": 20.433,
+      "step": 250
+    },
+    {
+      "epoch": 0.36383481899217757,
+      "grad_norm": 0.2909524440765381,
+      "learning_rate": 2.8342402320213494e-05,
+      "loss": 1.4643,
+      "step": 500
+    },
+    {
+      "epoch": 0.36383481899217757,
+      "eval_loss": 1.3520773649215698,
+      "eval_runtime": 13.6233,
+      "eval_samples_per_second": 163.029,
+      "eval_steps_per_second": 20.406,
+      "step": 500
+    },
+    {
+      "epoch": 0.5457522284882663,
+      "grad_norm": 0.2827763557434082,
+      "learning_rate": 2.5756526053283042e-05,
+      "loss": 1.4622,
+      "step": 750
+    },
+    {
+      "epoch": 0.5457522284882663,
+      "eval_loss": 1.350306749343872,
+      "eval_runtime": 13.6314,
+      "eval_samples_per_second": 162.932,
+      "eval_steps_per_second": 20.394,
+      "step": 750
+    },
+    {
+      "epoch": 0.7276696379843551,
+      "grad_norm": 0.29242751002311707,
+      "learning_rate": 2.223129492047081e-05,
+      "loss": 1.4625,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7276696379843551,
+      "eval_loss": 1.3474653959274292,
+      "eval_runtime": 13.6357,
+      "eval_samples_per_second": 162.882,
+      "eval_steps_per_second": 20.388,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9095870474804438,
+      "grad_norm": 0.2826482653617859,
+      "learning_rate": 1.8074563242173716e-05,
+      "loss": 1.4591,
+      "step": 1250
+    },
+    {
+      "epoch": 0.9095870474804438,
+      "eval_loss": 1.3454625606536865,
+      "eval_runtime": 13.6252,
+      "eval_samples_per_second": 163.006,
+      "eval_steps_per_second": 20.403,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0915044569765326,
+      "grad_norm": 0.28281259536743164,
+      "learning_rate": 1.3649333544377501e-05,
+      "loss": 1.4496,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0915044569765326,
+      "eval_loss": 1.3437364101409912,
+      "eval_runtime": 13.6378,
+      "eval_samples_per_second": 162.856,
+      "eval_steps_per_second": 20.384,
+      "step": 1500
+    },
+    {
+      "epoch": 1.2734218664726216,
+      "grad_norm": 0.2805255651473999,
+      "learning_rate": 9.34205597173652e-06,
+      "loss": 1.4453,
+      "step": 1750
+    },
+    {
+      "epoch": 1.2734218664726216,
+      "eval_loss": 1.3426544666290283,
+      "eval_runtime": 13.6433,
+      "eval_samples_per_second": 162.791,
+      "eval_steps_per_second": 20.376,
+      "step": 1750
+    },
+    {
+      "epoch": 1.4553392759687103,
+      "grad_norm": 0.2710939645767212,
+      "learning_rate": 5.528880047481714e-06,
+      "loss": 1.4438,
+      "step": 2000
+    },
+    {
+      "epoch": 1.4553392759687103,
+      "eval_loss": 1.3409229516983032,
+      "eval_runtime": 13.6488,
+      "eval_samples_per_second": 162.725,
+      "eval_steps_per_second": 20.368,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 250,
+  "max_steps": 2748,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.67219584303104e+17,
+  "train_batch_size": 40,
+  "trial_name": null,
+  "trial_params": null
+}
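
The added trainer_state.json logs training up to step 2000, where eval_loss reaches the reported best_metric of 1.3409 at results/checkpoint-2000. A small inspection sketch, assuming the file has been downloaded locally:

import json

with open("trainer_state.json") as f:
    state = json.load(f)

# collect the evaluation entries from log_history
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
for step, loss in evals:
    print(f"step {step:>4}: eval_loss {loss:.4f}")

best_step, best_loss = min(evals, key=lambda x: x[1])
print("best:", best_step, best_loss)                        # 2000, 1.3409... (matches best_metric)
print("best checkpoint:", state["best_model_checkpoint"])   # results/checkpoint-2000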
training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e37234b8c597153081cceecae088e34e3af06badf790f270f4f82ad98044cdda
+size 5112