Kat380 committed
Commit e7abee2
1 Parent(s): ec4a7c3

update model checkpoints
README.md CHANGED
@@ -1,14 +1,11 @@
  ---
- license: gemma
  library_name: peft
  tags:
  - alignment-handbook
- - trl
- - sft
  - generated_from_trainer
- base_model: google/gemma-7b
  datasets:
  - llama-duo/synth_summarize_dataset_dedup
+ base_model: google/gemma-7b
  model-index:
  - name: gemma7b-summarize-gpt4o-64k
    results: []
@@ -21,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
  It achieves the following results on the evaluation set:
- - Loss: 5.5381
+ - Loss: 2.5157
 
  ## Model description
 
@@ -42,43 +39,38 @@ More information needed
  The following hyperparameters were used during training:
  - learning_rate: 0.0002
  - train_batch_size: 4
- - eval_batch_size: 4
+ - eval_batch_size: 2
  - seed: 42
  - distributed_type: multi-GPU
- - num_devices: 2
+ - num_devices: 8
  - gradient_accumulation_steps: 2
- - total_train_batch_size: 16
- - total_eval_batch_size: 8
+ - total_train_batch_size: 64
+ - total_eval_batch_size: 16
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_ratio: 0.1
- - num_epochs: 15
+ - num_epochs: 10
 
  ### Training results
 
- | Training Loss | Epoch | Step | Validation Loss |
- |:-------------:|:-----:|:----:|:---------------:|
- | 0.9333        | 1.0   | 406  | 2.4449          |
- | 0.8399        | 2.0   | 812  | 2.3513          |
- | 0.7898        | 3.0   | 1218 | 2.4311          |
- | 0.7295        | 4.0   | 1624 | 2.5969          |
- | 0.6377        | 5.0   | 2030 | 2.8601          |
- | 0.5891        | 6.0   | 2436 | 3.1033          |
- | 0.5095        | 7.0   | 2842 | 3.5735          |
- | 0.4514        | 8.0   | 3248 | 3.9319          |
- | 0.3872        | 9.0   | 3654 | 4.4386          |
- | 0.3371        | 10.0  | 4060 | 4.8561          |
- | 0.2993        | 11.0  | 4466 | 5.2020          |
- | 0.29          | 12.0  | 4872 | 5.4070          |
- | 0.2802        | 13.0  | 5278 | 5.5084          |
- | 0.2693        | 14.0  | 5684 | 5.5384          |
- | 0.2715        | 15.0  | 6090 | 5.5381          |
+ | Training Loss | Epoch  | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 1.3554        | 0.9954 | 109  | 2.6451          |
+ | 1.0898        | 2.0    | 219  | 2.5083          |
+ | 1.0434        | 2.9954 | 328  | 2.4801          |
+ | 0.9864        | 4.0    | 438  | 2.4743          |
+ | 0.9371        | 4.9954 | 547  | 2.4854          |
+ | 0.9157        | 6.0    | 657  | 2.4642          |
+ | 0.8657        | 6.9954 | 766  | 2.5076          |
+ | 0.8393        | 8.0    | 876  | 2.5159          |
+ | 0.8462        | 8.9954 | 985  | 2.5185          |
+ | 0.8359        | 9.9543 | 1090 | 2.5157          |
 
 
  ### Framework versions
 
  - PEFT 0.10.0
  - Transformers 4.40.0
- - Pytorch 2.2.1+cu121
+ - Pytorch 2.1.2+cu121
  - Datasets 2.18.0
  - Tokenizers 0.19.1
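For context, here is a minimal sketch of how the updated adapter could be loaded on top of the base model. The repository id `llama-duo/gemma7b-summarize-gpt4o-64k` is an assumption inferred from the model-index name and dataset organization above and may differ from the actual repo; the code is illustrative, not part of this commit.

```python
# Minimal loading sketch (assumption: adapter repo id "llama-duo/gemma7b-summarize-gpt4o-64k";
# versions as listed under "Framework versions": PEFT 0.10.0, Transformers 4.40.0).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "google/gemma-7b"                           # base model named in the README
adapter_id = "llama-duo/gemma7b-summarize-gpt4o-64k"  # assumed repo id for this adapter

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(base, adapter_id)   # attach the LoRA weights

prompt = "Summarize the following text: ..."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```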
adapter_config.json CHANGED
@@ -10,23 +10,18 @@
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
- "lora_alpha": 16,
+ "lora_alpha": 64,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
- "r": 8,
+ "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
- "q_proj",
  "v_proj",
- "o_proj",
- "gate_proj",
- "k_proj",
- "up_proj",
- "down_proj"
+ "q_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:530dc4e1db6c3022c545725157c6e5d2e7ecf9b878bff1f8f1217168bfc5439e
- size 50056096
+ oid sha256:96a298c3833fe7338cd4c0311b9ecf38c8370339602da5926c9ab18098266f0a
+ size 25705248
all_results.json CHANGED
@@ -1,14 +1,14 @@
  {
- "epoch": 15.0,
- "eval_loss": 5.538125038146973,
- "eval_runtime": 2.0562,
+ "epoch": 9.954337899543379,
+ "eval_loss": 2.515676259994507,
+ "eval_runtime": 0.2332,
  "eval_samples": 25,
- "eval_samples_per_second": 4.863,
- "eval_steps_per_second": 0.973,
- "total_flos": 4.655418812370256e+18,
- "train_loss": 0.9284773617542436,
- "train_runtime": 39545.4398,
- "train_samples": 59995,
- "train_samples_per_second": 2.463,
- "train_steps_per_second": 0.154
+ "eval_samples_per_second": 42.882,
+ "eval_steps_per_second": 4.288,
+ "total_flos": 3.327732991202951e+18,
+ "train_loss": 2.136770288659892,
+ "train_runtime": 2636.8816,
+ "train_samples": 64610,
+ "train_samples_per_second": 26.554,
+ "train_steps_per_second": 0.413
  }
config.json CHANGED
@@ -23,9 +23,9 @@
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
- "bnb_4bit_quant_storage": "bfloat16",
+ "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
- "bnb_4bit_use_double_quant": true,
+ "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
eval_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 15.0,
- "eval_loss": 5.538125038146973,
- "eval_runtime": 2.0562,
+ "epoch": 9.954337899543379,
+ "eval_loss": 2.515676259994507,
+ "eval_runtime": 0.2332,
  "eval_samples": 25,
- "eval_samples_per_second": 4.863,
- "eval_steps_per_second": 0.973
+ "eval_samples_per_second": 42.882,
+ "eval_steps_per_second": 4.288
  }
runs/Jun13_06-35-32_gpu1-2/events.out.tfevents.1718231867.gpu1-2.1149169.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4a1cbd13dabfedd2947086fc7c972e092e4e141b4de2b42a321c0f7eb62c0d1
+ size 54615
runs/Jun13_06-35-32_gpu1-2/events.out.tfevents.1718234504.gpu1-2.1149169.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc27e99d1c02ca03be4cb57d3d5c9799a043e9730d9316d1740ac833876cbcc9
+ size 359
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 15.0,
- "total_flos": 4.655418812370256e+18,
- "train_loss": 0.9284773617542436,
- "train_runtime": 39545.4398,
- "train_samples": 59995,
- "train_samples_per_second": 2.463,
- "train_steps_per_second": 0.154
+ "epoch": 9.954337899543379,
+ "total_flos": 3.327732991202951e+18,
+ "train_loss": 2.136770288659892,
+ "train_runtime": 2636.8816,
+ "train_samples": 64610,
+ "train_samples_per_second": 26.554,
+ "train_steps_per_second": 0.413
  }
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:086f802354141d938ececcb24962509b618fe3c83ba49547cc844ad2a9563e39
+ oid sha256:342607486cb1c55e0124fb386be3828031f9153fde8abaf253c30aa5f8095a8b
  size 5176
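The serialized `training_args.bin` is binary, but the README hyperparameters above map roughly onto a `transformers.TrainingArguments` like the sketch below. This is illustrative only; the `output_dir` and `bf16` flag are assumptions, and the real run also used multi-GPU launch settings and SFT/PEFT options not shown in this commit.

```python
# Illustrative TrainingArguments mirroring the README hyperparameter list.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="gemma7b-summarize-gpt4o-64k",  # assumed output directory
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,   # with 8 devices: 4 * 8 * 2 = 64 effective train batch
    num_train_epochs=10,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    seed=42,
    bf16=True,                       # assumption, consistent with the bfloat16 compute dtype
)
```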