Update metadata with huggingface_hub
Browse files
README.md
CHANGED
@@ -196,6 +196,104 @@ Training Procedure:
|
|
196 |
Training processing: 'dataset = dataset.shuffle(seed=55)
|
197 |
|
198 |
dataset = dataset[''train''].train_test_split(test_size=0.1)'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
---
|
200 |
## Training procedure
|
201 |
|
|
|
196 |
Training processing: 'dataset = dataset.shuffle(seed=55)
|
197 |
|
198 |
dataset = dataset[''train''].train_test_split(test_size=0.1)'
|
199 |
+
training_regime:
|
200 |
+
output_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/
|
201 |
+
overwrite_output_dir: false
|
202 |
+
do_train: false
|
203 |
+
do_eval: true
|
204 |
+
do_predict: false
|
205 |
+
evaluation_strategy: epoch
|
206 |
+
prediction_loss_only: false
|
207 |
+
per_device_train_batch_size: 2
|
208 |
+
per_device_eval_batch_size: 8
|
209 |
+
gradient_accumulation_steps: 2
|
210 |
+
eval_accumulation_steps: 1
|
211 |
+
eval_delay: 0
|
212 |
+
learning_rate: 0.0004
|
213 |
+
weight_decay: 0.01
|
214 |
+
adam_beta1: 0.9
|
215 |
+
adam_beta2: 0.999
|
216 |
+
adam_epsilon: 1.0e-08
|
217 |
+
max_grad_norm: 0.3
|
218 |
+
num_train_epochs: 10
|
219 |
+
max_steps: -1
|
220 |
+
lr_scheduler_type: cosine
|
221 |
+
warmup_ratio: 0.1
|
222 |
+
warmup_steps: 0
|
223 |
+
log_level: passive
|
224 |
+
log_level_replica: warning
|
225 |
+
log_on_each_node: true
|
226 |
+
logging_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/runs/Dec01_21-53-07_fd10189bb234
|
227 |
+
logging_strategy: steps
|
228 |
+
logging_first_step: false
|
229 |
+
logging_steps: 500
|
230 |
+
logging_nan_inf_filter: true
|
231 |
+
save_strategy: epoch
|
232 |
+
save_steps: 500
|
233 |
+
save_total_limit: 5
|
234 |
+
save_safetensors: true
|
235 |
+
save_on_each_node: false
|
236 |
+
no_cuda: false
|
237 |
+
use_mps_device: false
|
238 |
+
seed: 42
|
239 |
+
jit_mode_eval: false
|
240 |
+
use_ipex: false
|
241 |
+
bf16: false
|
242 |
+
fp16: true
|
243 |
+
fp16_opt_level: O1
|
244 |
+
half_precision_backend: auto
|
245 |
+
bf16_full_eval: false
|
246 |
+
fp16_full_eval: false
|
247 |
+
local_rank: 0
|
248 |
+
tpu_metrics_debug: false
|
249 |
+
debug: []
|
250 |
+
dataloader_drop_last: false
|
251 |
+
dataloader_num_workers: 0
|
252 |
+
past_index: -1
|
253 |
+
run_name: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/
|
254 |
+
disable_tqdm: false
|
255 |
+
remove_unused_columns: true
|
256 |
+
load_best_model_at_end: true
|
257 |
+
metric_for_best_model: eval_loss
|
258 |
+
greater_is_better: false
|
259 |
+
ignore_data_skip: false
|
260 |
+
sharded_ddp: []
|
261 |
+
fsdp: []
|
262 |
+
fsdp_min_num_params: 0
|
263 |
+
fsdp_config:
|
264 |
+
fsdp_min_num_params: 0
|
265 |
+
xla: false
|
266 |
+
xla_fsdp_grad_ckpt: false
|
267 |
+
label_smoothing_factor: 0.0
|
268 |
+
optim: adamw_torch
|
269 |
+
adafactor: false
|
270 |
+
group_by_length: false
|
271 |
+
length_column_name: length
|
272 |
+
report_to:
|
273 |
+
- tensorboard
|
274 |
+
dataloader_pin_memory: true
|
275 |
+
skip_memory_metrics: true
|
276 |
+
use_legacy_prediction_loop: false
|
277 |
+
push_to_hub: true
|
278 |
+
hub_model_id: Weni/ZeroShot-2.2.1-Llama2-13b-Multilanguage-3.0.3
|
279 |
+
hub_strategy: all_checkpoints
|
280 |
+
hub_token: <HUB_TOKEN>
|
281 |
+
hub_private_repo: false
|
282 |
+
gradient_checkpointing: true
|
283 |
+
include_inputs_for_metrics: false
|
284 |
+
fp16_backend: auto
|
285 |
+
push_to_hub_token: <PUSH_TO_HUB_TOKEN>
|
286 |
+
mp_parameters: ''
|
287 |
+
auto_find_batch_size: false
|
288 |
+
full_determinism: false
|
289 |
+
ray_scope: last
|
290 |
+
ddp_timeout: 1800
|
291 |
+
torch_compile: false
|
292 |
+
training_data:
|
293 |
+
name: Weni/zeroshot-3.0.3
|
294 |
+
preprocessing: 'dataset = dataset.shuffle(seed=55)
|
295 |
+
|
296 |
+
dataset = dataset[''train''].train_test_split(test_size=0.1)'
|
297 |
---
|
298 |
## Training procedure
|
299 |
|