empty-michael committed on
Commit
b497d54
1 Parent(s): c2e5a7b

End of training

Browse files
Files changed (5) hide show
  1. README.md +15 -2
  2. all_results.json +33 -0
  3. eval_results.json +15 -0
  4. train_results.json +21 -0
  5. trainer_state.json +0 -0
README.md CHANGED
@@ -1,11 +1,24 @@
1
  ---
 
2
  tags:
3
  - generated_from_trainer
 
 
4
  metrics:
5
  - accuracy
6
  model-index:
7
  - name: tinystories_1layer_attn_mlp_C10k_k100
8
- results: []
 
 
 
 
 
 
 
 
 
 
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,7 +26,7 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # tinystories_1layer_attn_mlp_C10k_k100
15
 
16
- This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
  - Loss: 1.8957
19
  - Accuracy: 0.5429
 
1
  ---
2
+ base_model: roneneldan/TinyStories-1Layer-21M
3
  tags:
4
  - generated_from_trainer
5
+ datasets:
6
+ - roneneldan/TinyStories
7
  metrics:
8
  - accuracy
9
  model-index:
10
  - name: tinystories_1layer_attn_mlp_C10k_k100
11
+ results:
12
+ - task:
13
+ name: Causal Language Modeling
14
+ type: text-generation
15
+ dataset:
16
+ name: roneneldan/TinyStories
17
+ type: roneneldan/TinyStories
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.5429091526514649
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # tinystories_1layer_attn_mlp_C10k_k100
28
 
29
+ This model is a fine-tuned version of [roneneldan/TinyStories-1Layer-21M](https://huggingface.co/roneneldan/TinyStories-1Layer-21M) on the roneneldan/TinyStories dataset.
30
  It achieves the following results on the evaluation set:
31
  - Loss: 1.8957
32
  - Accuracy: 0.5429
all_results.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "MSE": 0.0,
3
+ "MSE/layer0": 0.0,
4
+ "dead_code_fraction": 1.0,
5
+ "dead_code_fraction/layer0": 1.0,
6
+ "epoch": 1.04,
7
+ "eval_MSE/layer0": 611.1571513346564,
8
+ "eval_accuracy": 0.5429091526514649,
9
+ "eval_dead_code_fraction/layer0": 0.0,
10
+ "eval_input_norm/layer0": 31.997479090978388,
11
+ "eval_loss": 1.89570152759552,
12
+ "eval_multicode_k": 1,
13
+ "eval_output_norm/layer0": 15.087154228553715,
14
+ "eval_runtime": 73.291,
15
+ "eval_samples": 4623,
16
+ "eval_samples_per_second": 63.077,
17
+ "eval_steps_per_second": 7.886,
18
+ "input_norm": 0.0,
19
+ "input_norm/layer0": 0.0,
20
+ "loss": 2.0762174885749816,
21
+ "max_norm": 79.63946533203125,
22
+ "max_norm/layer0": 79.63946533203125,
23
+ "mean_norm": 68.03947448730469,
24
+ "mean_norm/layer0": 68.03947448730469,
25
+ "multicode_k": 1,
26
+ "output_norm": 0.0,
27
+ "output_norm/layer0": 0.0,
28
+ "perplexity": 6.657216988297924,
29
+ "runtime": 12054.7701,
30
+ "samples_per_second": 39.818,
31
+ "steps_per_second": 0.83,
32
+ "train_samples": 459760
33
+ }
eval_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.04,
3
+ "eval_MSE/layer0": 611.1571513346564,
4
+ "eval_accuracy": 0.5429091526514649,
5
+ "eval_dead_code_fraction/layer0": 0.0,
6
+ "eval_input_norm/layer0": 31.997479090978388,
7
+ "eval_loss": 1.89570152759552,
8
+ "eval_multicode_k": 1,
9
+ "eval_output_norm/layer0": 15.087154228553715,
10
+ "eval_runtime": 73.291,
11
+ "eval_samples": 4623,
12
+ "eval_samples_per_second": 63.077,
13
+ "eval_steps_per_second": 7.886,
14
+ "perplexity": 6.657216988297924
15
+ }
train_results.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "MSE": 0.0,
3
+ "MSE/layer0": 0.0,
4
+ "dead_code_fraction": 1.0,
5
+ "dead_code_fraction/layer0": 1.0,
6
+ "epoch": 1.04,
7
+ "input_norm": 0.0,
8
+ "input_norm/layer0": 0.0,
9
+ "loss": 2.0762174885749816,
10
+ "max_norm": 79.63946533203125,
11
+ "max_norm/layer0": 79.63946533203125,
12
+ "mean_norm": 68.03947448730469,
13
+ "mean_norm/layer0": 68.03947448730469,
14
+ "multicode_k": 1,
15
+ "output_norm": 0.0,
16
+ "output_norm/layer0": 0.0,
17
+ "runtime": 12054.7701,
18
+ "samples_per_second": 39.818,
19
+ "steps_per_second": 0.83,
20
+ "train_samples": 459760
21
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff