lapp0 committed on
Commit
8e782e8
1 Parent(s): 5961958

End of training

Browse files
README.md CHANGED
@@ -41,38 +41,38 @@ More information needed
41
 
42
  # Benchmark Metrics Comparison
43
 
44
- | Metric | attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0 | teacher |
45
- | :--- | :--- | :--- |
46
- | ai2_arc (acc) | 0.305 | 0.354 |
47
- | ai2_arc (acc_norm) | 0.302 | 0.339 |
48
- | arc_challenge (acc) | 0.173 | 0.188 |
49
- | arc_challenge (acc_norm) | 0.223 | 0.222 |
50
- | arc_easy (acc) | 0.37 | 0.436 |
51
- | arc_easy (acc_norm) | 0.34 | 0.396 |
52
- | boolq (acc) | 0.387 | 0.51 |
53
- | cola (mcc) | 0.044 | 0.01 |
54
- | glue (acc) | 0.412 | 0.403 |
55
- | glue (f1) | 0.451 | 0.529 |
56
- | glue (mcc) | 0.044 | 0.01 |
57
- | hellaswag (acc) | 0.315 | 0.343 |
58
- | hellaswag (acc_norm) | 0.344 | 0.393 |
59
- | mnli (acc) | 0.338 | 0.338 |
60
- | mnli_mismatch (acc) | 0.351 | 0.346 |
61
- | mrpc (acc) | 0.353 | 0.515 |
62
- | mrpc (f1) | 0.143 | 0.631 |
63
- | qnli (acc) | 0.497 | 0.491 |
64
- | qqp (acc) | 0.406 | 0.367 |
65
- | qqp (f1) | 0.501 | 0.512 |
66
- | rte (acc) | 0.549 | 0.516 |
67
- | sst2 (acc) | 0.545 | 0.511 |
68
- | wikitext (bits_per_byte) | 1.127 | 0.98 |
69
- | wikitext (byte_perplexity) | 2.184 | 1.973 |
70
- | wikitext (word_perplexity) | 65.25 | 37.82 |
71
- | wnli (acc) | 0.451 | 0.451 |
72
 
73
  # Resource Usage Comparison
74
 
75
- - VRAM Use: 7.7830 GB
76
 
77
  # Distillation (Teacher -> Student) Architecture Difference:
78
 
@@ -102,7 +102,7 @@ Trained on 145,724,804 tokens from the [wikimedia/wikipedia](https://huggingface
102
  # Training Objective
103
 
104
  ```
105
- DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=raw_mse, layer_mapper=layer-2))
106
  ```
107
 
108
  # Hyperparameters
@@ -119,9 +119,9 @@ The following hyperparameters were used during training:
119
  - lr_scheduler_type: `cosine_with_min_lr`
120
  - lr_scheduler_warmup_ratio: `0.5`
121
  - num_epochs: `1.0`
122
- - distillation_objective: `DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=raw_mse, layer_mapper=layer-2))`
123
  - train_embeddings: `True`
124
- - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f0d1223cb50>`
125
  - student_model_name_or_path: `None`
126
  - student_config_name_or_path: `None`
127
  - student_model_config: `None`
 
41
 
42
  # Benchmark Metrics Comparison
43
 
44
+ | Metric | attn_layer_mapper=all, attn_loss_fn=cos, attn_projector=orthogonal, attn_weight=5 | attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0 | teacher |
45
+ | :--- | :--- | :--- | :--- |
46
+ | ai2_arc (acc) | 0.313 | 0.305 | 0.354 |
47
+ | ai2_arc (acc_norm) | 0.31 | 0.302 | 0.339 |
48
+ | arc_challenge (acc) | 0.181 | 0.173 | 0.188 |
49
+ | arc_challenge (acc_norm) | 0.224 | 0.223 | 0.222 |
50
+ | arc_easy (acc) | 0.378 | 0.37 | 0.436 |
51
+ | arc_easy (acc_norm) | 0.353 | 0.34 | 0.396 |
52
+ | boolq (acc) | 0.49 | 0.387 | 0.51 |
53
+ | cola (mcc) | -0.041 | 0.044 | 0.01 |
54
+ | glue (acc) | 0.396 | 0.412 | 0.403 |
55
+ | glue (f1) | 0.516 | 0.451 | 0.529 |
56
+ | glue (mcc) | -0.041 | 0.044 | 0.01 |
57
+ | hellaswag (acc) | 0.32 | 0.315 | 0.343 |
58
+ | hellaswag (acc_norm) | 0.348 | 0.344 | 0.393 |
59
+ | mnli (acc) | 0.336 | 0.338 | 0.338 |
60
+ | mnli_mismatch (acc) | 0.343 | 0.351 | 0.346 |
61
+ | mrpc (acc) | 0.444 | 0.353 | 0.515 |
62
+ | mrpc (f1) | 0.478 | 0.143 | 0.631 |
63
+ | qnli (acc) | 0.488 | 0.497 | 0.491 |
64
+ | qqp (acc) | 0.356 | 0.406 | 0.367 |
65
+ | qqp (f1) | 0.522 | 0.501 | 0.512 |
66
+ | rte (acc) | 0.56 | 0.549 | 0.516 |
67
+ | sst2 (acc) | 0.498 | 0.545 | 0.511 |
68
+ | wikitext (bits_per_byte) | 1.118 | 1.127 | 0.98 |
69
+ | wikitext (byte_perplexity) | 2.17 | 2.184 | 1.973 |
70
+ | wikitext (word_perplexity) | 63.05 | 65.25 | 37.82 |
71
+ | wnli (acc) | 0.408 | 0.451 | 0.451 |
72
 
73
  # Resource Usage Comparison
74
 
75
+ - VRAM Use: 8.2855 GB
76
 
77
  # Distillation (Teacher -> Student) Architecture Difference:
78
 
 
102
  # Training Objective
103
 
104
  ```
105
+ DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=5, loss_fn=cos, layer_mapper=all))
106
  ```
107
 
108
  # Hyperparameters
 
119
  - lr_scheduler_type: `cosine_with_min_lr`
120
  - lr_scheduler_warmup_ratio: `0.5`
121
  - num_epochs: `1.0`
122
+ - distillation_objective: `DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=5, loss_fn=cos, layer_mapper=all))`
123
  - train_embeddings: `True`
124
+ - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f05c40e2050>`
125
  - student_model_name_or_path: `None`
126
  - student_config_name_or_path: `None`
127
  - student_model_config: `None`
benchmarks.shelve.bak CHANGED
@@ -1,2 +1,3 @@
1
  'teacher', (0, 26029753)
2
  'attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0', (26030080, 26029753)
 
 
1
  'teacher', (0, 26029753)
2
  'attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0', (26030080, 26029753)
3
+ 'attn_layer_mapper=all, attn_loss_fn=cos, attn_projector=orthogonal, attn_weight=5', (52060160, 26029753)
benchmarks.shelve.dat CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b08da2c1102a7b8635c1aac31997fbdc32e594beca1614e4a38096dec1f9bf07
3
- size 52059833
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:777b1d28fe282a5405865474430509b697a416a88b6dec206322e7edbf2f1e2d
3
+ size 78089913
benchmarks.shelve.dir CHANGED
@@ -1,2 +1,3 @@
1
  'teacher', (0, 26029753)
2
  'attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0', (26030080, 26029753)
 
 
1
  'teacher', (0, 26029753)
2
  'attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0', (26030080, 26029753)
3
+ 'attn_layer_mapper=all, attn_loss_fn=cos, attn_projector=orthogonal, attn_weight=5', (52060160, 26029753)
tokenizer.json CHANGED
@@ -1,19 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 1023,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": "BatchLongest",
11
- "direction": "Right",
12
- "pad_to_multiple_of": null,
13
- "pad_id": 50256,
14
- "pad_type_id": 0,
15
- "pad_token": "<|endoftext|>"
16
- },
17
  "added_tokens": [
18
  {
19
  "id": 50256,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 50256,