haqishen committed
Commit 5f10221
1 Parent(s): 9e1a888

update model weights

cfg.yaml CHANGED
@@ -1,6 +1,5 @@
 architecture:
     backbone_dtype: bfloat16
-    force_embedding_gradients: false
     gradient_checkpointing: true
     intermediate_dropout: 0.0
     pretrained: true
@@ -27,19 +26,19 @@ dataset:
     personalize: false
     prompt_column:
     - instruction
-    system_column: None
+    system_column: system
     text_answer_separator: <|answer|>
     text_prompt_start: <|prompt|>
     text_system_start: <|system|>
-    train_dataframe: /home/qishen/src/h2o-llmstudio/data/user/japanese_hh-rlhf-49k/train-00000-of-00001-157934b4864eb8e0.parquet
+    train_dataframe: /home/user/src/h2o-llmstudio/data/user/japanese_hh-rlhf-49k/japanese_hh-rlhf-49k.csv
     validation_dataframe: None
     validation_size: 0.01
     validation_strategy: automatic
 environment:
     compile_model: false
-    deepspeed_allgather_bucket_size: 500000000
+    deepspeed_allgather_bucket_size: 100000000
     deepspeed_method: ZeRO2
-    deepspeed_reduce_bucket_size: 500000000
+    deepspeed_reduce_bucket_size: 100000000
     deepspeed_stage3_param_persistence_threshold: 1000000
     deepspeed_stage3_prefetch_bucket_size: 1000000
     find_unused_parameters: false
@@ -47,22 +46,23 @@ environment:
     - '0'
     - '1'
     huggingface_branch: main
-    mixed_precision: true
+    mixed_precision: false
+    mixed_precision_dtype: bfloat16
     number_of_workers: 8
     seed: -1
     trust_remote_code: true
     use_deepspeed: true
-experiment_name: Llama-3-8B-Instruct
+experiment_name: llama-3-8b-ja
 llm_backbone: meta-llama/Meta-Llama-3-8B-Instruct
 logging:
     logger: None
     neptune_project: ''
-output_directory: /home/qishen/src/h2o-llmstudio/output/user/Llama-3-8B-Instruct/
+output_directory: /home/user/src/h2o-llmstudio/output/user/llama-3-8b-ja/
 prediction:
     batch_size_inference: 0
     do_sample: false
-    max_length_inference: 256
-    max_time: 120.0
+    max_length_inference: 512
+    max_time: 0.0
     metric: Perplexity
     metric_gpt_model: gpt-3.5-turbo-0301
     metric_gpt_template: general
@@ -77,11 +77,9 @@ prediction:
 problem_type: text_causal_language_modeling
 tokenizer:
     add_prompt_answer_tokens: false
-    max_length: 8160
-    max_length_answer: 4064
-    max_length_prompt: 4096
+    max_length: 1024
     padding_quantile: 1.0
-    use_fast: true
+    tokenizer_kwargs: '{"use_fast": true, "add_prefix_space": false}'
 training:
     batch_size: 2
     differential_learning_rate: 1.0e-05
@@ -90,19 +88,22 @@ training:
     epochs: 1
     evaluate_before_training: false
     evaluation_epochs: 1.0
-    grad_accumulation: 1
+    freeze_layers: []
+    grad_accumulation: 4
     gradient_clip: 0.0
-    learning_rate: 0.0001
+    learning_rate: 1.0e-05
     lora: true
     lora_alpha: 16
     lora_dropout: 0.05
     lora_r: 4
-    lora_target_modules: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
+    lora_target_modules: ''
+    lora_unfreeze_layers: []
     loss_function: TokenAveragedCrossEntropy
     optimizer: AdamW
-    save_best_checkpoint: false
+    save_checkpoint: last
     schedule: Cosine
     train_validation_data: false
-    use_flash_attention_2: false
-    warmup_epochs: 0.0
+    use_dora: false
+    use_flash_attention_2: true
+    warmup_epochs: 0.05
     weight_decay: 0.0
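
For reference, the updated hyperparameters can be read back from the exported cfg.yaml. A minimal sketch, assuming PyYAML is installed and cfg.yaml sits in the working directory; the snippet is illustrative and not part of H2O LLM Studio itself:

import yaml

# Load the exported experiment config and print the settings this commit changes.
with open("cfg.yaml") as f:
    cfg = yaml.safe_load(f)

# Key paths follow the sections shown in the diff above.
print(cfg["dataset"]["system_column"])           # "system" after this commit (was None)
print(cfg["training"]["grad_accumulation"])      # 4 (was 1)
print(cfg["training"]["learning_rate"])          # 1e-05 (was 0.0001)
print(cfg["training"]["use_flash_attention_2"])  # True (was False)
print(cfg["environment"]["mixed_precision"])     # False; bfloat16 comes from architecture.backbone_dtype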
generation_config.json CHANGED
@@ -1,10 +1,10 @@
 {
   "_from_model_config": true,
   "bos_token_id": 128000,
-  "eos_token_id": 128001,
-  "max_new_tokens": 256,
-  "max_time": 120.0,
-  "min_new_tokens": 2,
-  "pad_token_id": 128001,
-  "transformers_version": "4.38.2"
+  "eos_token_id": 128009,
+  "pad_token_id": 128009,
+  "temperature": null,
+  "top_k": null,
+  "top_p": null,
+  "transformers_version": "4.40.2"
 }
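
The practical effect of the new token ids (128009 is the <|eot_id|> token that Llama 3 Instruct models use to end a turn) can be checked with the transformers GenerationConfig loader. A minimal sketch; "user/model-repo" is a placeholder for this repository's id, not the real identifier:

from transformers import GenerationConfig

# Load generation_config.json from the Hub (placeholder repo id) and confirm
# the ids written by this commit.
gen_cfg = GenerationConfig.from_pretrained("user/model-repo")
print(gen_cfg.eos_token_id)  # 128009
print(gen_cfg.pad_token_id)  # 128009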
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:61f4c8da26c66d743d1bd7815b2970c7ced7440f1c78cd6b8f386c8c1ebf9a5b
+oid sha256:97f086559f12923ca1b59a1c1c0367d4b0913ab6bfc66efd687af54e5dff8a41
 size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:37c782e9bd0d0b1456039b3167ba69a9872cffc263af643e522b615136cf34ac
+oid sha256:ac035c87cda3f09cc332ab50ec588537b44b447d165a81fb92d510155198b777
 size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7633ed632f3d6ffef34ce6d1a0f0fc978a1a96f4a1030d91939ebc562629f056
+oid sha256:99313f6223273a020ecc15a3a249307eb34e49ef4ca24ce60982cda6430c7272
 size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0f006959be3288f0af300a0f6bda45f98e14237fbed62da6d8994656cfff21f2
+oid sha256:1c7049e9604e4838318948587dd94b0f31cd101a4a90529ec266540ac1d2035f
 size 1168138808
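
Only the Git LFS pointers change for the weight shards, so a freshly downloaded shard can be verified against the new sha256 values. A minimal sketch for the first shard, assuming the file has already been pulled locally:

import hashlib

# Expected hash is the "+" oid from the model-00001-of-00004.safetensors diff above.
expected = "97f086559f12923ca1b59a1c1c0367d4b0913ab6bfc66efd687af54e5dff8a41"

h = hashlib.sha256()
with open("model-00001-of-00004.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)

assert h.hexdigest() == expected, "shard does not match its LFS pointer"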