joearul committed on
Commit
90a761a
·
verified ·
1 Parent(s): b2a6480
README.md CHANGED
@@ -31,15 +31,15 @@ More information needed
31
 
32
  The following hyperparameters were used during training:
33
  - learning_rate: 0.0005
34
- - train_batch_size: 32
35
  - eval_batch_size: 8
36
  - seed: 42
37
  - gradient_accumulation_steps: 8
38
- - total_train_batch_size: 256
39
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
40
  - lr_scheduler_type: cosine
41
  - lr_scheduler_warmup_steps: 1000
42
- - num_epochs: 100
43
  - mixed_precision_training: Native AMP
44
 
45
  ### Training results
 
31
 
32
  The following hyperparameters were used during training:
33
  - learning_rate: 0.0005
34
+ - train_batch_size: 4
35
  - eval_batch_size: 8
36
  - seed: 42
37
  - gradient_accumulation_steps: 8
38
+ - total_train_batch_size: 32
39
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
40
  - lr_scheduler_type: cosine
41
  - lr_scheduler_warmup_steps: 1000
42
+ - num_epochs: 5
43
  - mixed_precision_training: Native AMP
44
 
45
  ### Training results
config.json CHANGED
@@ -10,7 +10,7 @@
10
  "initializer_range": 0.02,
11
  "layer_norm_epsilon": 1e-05,
12
  "model_type": "gpt2",
13
- "n_ctx": 2,
14
  "n_embd": 768,
15
  "n_head": 12,
16
  "n_inner": null,
@@ -28,5 +28,5 @@
28
  "torch_dtype": "float32",
29
  "transformers_version": "4.38.2",
30
  "use_cache": true,
31
- "vocab_size": 2151
32
  }
 
10
  "initializer_range": 0.02,
11
  "layer_norm_epsilon": 1e-05,
12
  "model_type": "gpt2",
13
+ "n_ctx": 4,
14
  "n_embd": 768,
15
  "n_head": 12,
16
  "n_inner": null,
 
28
  "torch_dtype": "float32",
29
  "transformers_version": "4.38.2",
30
  "use_cache": true,
31
+ "vocab_size": 10000
32
  }
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f52f8a1292ac6e5c387992eb89f10e2eee591eb264ee03321d2ac5486da9b648
3
- size 349992576
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90cb4672d4203626dec85e5a6c5f4c2f62a7cc198d9aa12b4223928656bc1c1a
3
+ size 374104704
runs/Mar21_16-17-46_09a0a2fe07b8/events.out.tfevents.1711037874.09a0a2fe07b8.895.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:491812a3283e1f9dbdfd9176f8ce06d971132849e1bb5ba99fc4d152d6d2b1d9
3
+ size 9066
runs/Mar21_16-19-50_09a0a2fe07b8/events.out.tfevents.1711037995.09a0a2fe07b8.895.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d6e3d076af47b58e860263de6fbf54b634752ce5928d2ab8cb33cd6ca357c3c
3
+ size 9066
runs/Mar21_16-21-31_09a0a2fe07b8/events.out.tfevents.1711038096.09a0a2fe07b8.895.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8a10530a47635a403dd6ad8ae11aed3f1cc230fc07db0f63dbe4b36af3f9715
3
+ size 4576
runs/Mar21_16-22-04_09a0a2fe07b8/events.out.tfevents.1711038134.09a0a2fe07b8.895.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10ad60e0644329a2c5fe265aa3e669c9ce4deccc21bb21a6cd4ea6b77767681a
3
+ size 4576
runs/Mar21_16-22-54_09a0a2fe07b8/events.out.tfevents.1711038180.09a0a2fe07b8.895.4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c63c59fb77bc490ae37eb297634871960c10895dffeec99aabacd6b97025fb1
3
+ size 9064
runs/Mar21_16-27-29_09a0a2fe07b8/events.out.tfevents.1711038456.09a0a2fe07b8.895.5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cf2c9a5dd982ad18afa62a876fd343554b97831237c2895ca68cb31c2a8f681
3
+ size 4928
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:993503e1a20a4bb2d809c1dd7b741777b5f08da0bfeace9c0db2cad9f7d1a275
3
  size 4856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:923bc57551b1102c6e0be1b3877f4b1f5bcaee49a52a667822c08d7f43752ffa
3
  size 4856
vocab.json CHANGED
The diff for this file is too large to render. See raw diff