Built with Oumi

Oumi train config (oumi version 0.1.3):

```yaml
data:
  train:
    datasets:
    - dataset_name: HuggingFaceH4/ultrachat_200k
      dataset_path: null
      subset: null
      split: train_sft
      dataset_kwargs: {}
      sample_count: null
      mixture_proportion: null
      shuffle: false
      seed: null
      shuffle_buffer_size: 1000
      trust_remote_code: true
      transform_num_workers: null
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
  test:
    datasets: []
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
  validation:
    datasets: []
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
model:
  model_name: meta-llama/Meta-Llama-3.1-8B
  adapter_model: null
  tokenizer_name: null
  tokenizer_pad_token: null
  tokenizer_kwargs: {}
  model_max_length: 8192
  load_pretrained_weights: true
  trust_remote_code: true
  torch_dtype_str: bfloat16
  compile: false
  chat_template: llama3-instruct
  attn_implementation: flash_attention_2
  device_map: auto
  model_kwargs: {}
  enable_liger_kernel: true
  shard_for_eval: false
  freeze_layers: []
training:
  use_peft: false
  trainer_type: TRL_SFT
  enable_gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  output_dir: output/llama8b-ultrachat
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 8
  max_steps: -1
  num_train_epochs: 1
  save_epoch: false
  save_steps: 800
  save_final_model: true
  seed: 42
  run_name: llama8b-ultrachat.sky-2025-01-30-21-19-10-053582_sky-e018-bf996_1
  metrics_function: null
  log_level: info
  dep_log_level: warning
  enable_wandb: true
  enable_tensorboard: true
  logging_strategy: steps
  logging_dir: null
  logging_steps: 100
  logging_first_step: false
  eval_strategy: 'no'
  eval_steps: 500
  learning_rate: 2.0e-05
  lr_scheduler_type: linear
  lr_scheduler_kwargs: {}
  warmup_ratio: null
  warmup_steps: null
  optimizer: paged_adamw_8bit
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  sgd_momentum: 0.0
  mixed_precision_dtype: NONE
  compile: false
  include_performance_metrics: true
  include_alternative_mfu_metrics: false
  log_model_summary: false
  resume_from_checkpoint: null
  try_resume_from_last_checkpoint: false
  dataloader_num_workers: 8
  dataloader_prefetch_factor: 32
  dataloader_main_process_only: null
  ddp_find_unused_parameters: false
  max_grad_norm: 1.0
  trainer_kwargs:
    max_seq_length: 8192
  profiler:
    save_dir: null
    enable_cpu_profiling: false
    enable_cuda_profiling: false
    record_shapes: false
    profile_memory: false
    with_stack: false
    with_flops: false
    with_modules: false
    row_limit: 50
    schedule:
      enable_schedule: false
      wait: 0
      warmup: 1
      active: 3
      repeat: 1
      skip_first: 1
  telemetry:
    telemetry_dir: telemetry
    collect_telemetry_for_all_ranks: false
    track_gpu_temperature: false
  empty_device_cache_steps: 50
  nccl_default_timeout_minutes: null
peft:
  lora_r: 8
  lora_alpha: 8
  lora_dropout: 0.0
  lora_target_modules: null
  lora_modules_to_save: null
  lora_bias: none
  lora_init_weights: DEFAULT
  lora_task_type: CAUSAL_LM
  q_lora: false
  q_lora_bits: 4
  bnb_4bit_quant_type: fp4
  use_bnb_nested_quant: false
  bnb_4bit_quant_storage: uint8
  bnb_4bit_compute_dtype: float32
  peft_save_mode: ADAPTER_ONLY
fsdp:
  enable_fsdp: false
  sharding_strategy: FULL_SHARD
  cpu_offload: false
  mixed_precision: null
  backward_prefetch: BACKWARD_PRE
  forward_prefetch: false
  use_orig_params: null
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: NO_WRAP
  min_num_params: 100000
  transformer_layer_cls: null
  sync_module_states: true
```
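
For orientation, the sketch below shows what the data and model sections above point at, using the plain Hugging Face APIs rather than Oumi's own loaders; the variable names are ours. It also spells out the effective batch size implied by this config on the 4-GPU node described below.

```python
# Illustrative sketch (ours, not Oumi internals): load the same dataset and
# tokenizer that the config above references.
from datasets import load_dataset
from transformers import AutoTokenizer

ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
print(ds.column_names)  # ['prompt', 'prompt_id', 'messages']

tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
# The base-model tokenizer does not necessarily ship a chat template, which
# is why the config pins chat_template: llama3-instruct -- Oumi renders the
# `messages` column with that template when building training text.

# Effective global batch size implied by this config on the 4x A100 node
# described below: 1 (per device) x 8 (grad accumulation) x 4 (GPUs) = 32.
```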

Oumi cloud config:

```yaml
name: llama8b-ultrachat-sft

num_nodes: 1
resources:
  cloud: gcp
  accelerators: "A100-80GB:4"
  use_spot: false
  disk_size: 2000 # Disk size in GBs

working_dir: .

file_mounts:
  ~/.netrc: ~/.netrc  # WandB credentials
  # Mount HF token, which is needed to download locked-down models from HF Hub.
  # This is created on the local machine by running `huggingface-cli login`.
  ~/.cache/huggingface/token: ~/.cache/huggingface/token

envs:
  WANDB_PROJECT: oumi-train
  OUMI_RUN_NAME: llama8b-ultrachat
  OUMI_USER_NAME: penfever
  ACCELERATE_LOG_LEVEL: info
  # https://github.com/huggingface/tokenizers/issues/899#issuecomment-1027739758
  TOKENIZERS_PARALLELISM: false
setup: |
  set -e
  pip install uv && uv pip install -e .[gpu,evaluation] hf_transfer
  # Install model from HF Hub. This tool increases download speed compared to
  # downloading the model during training.
  HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download meta-llama/Meta-Llama-3.1-8B --exclude original/*
  pip install -U flash-attn --no-build-isolation

run: |
  set -e  # Exit if any command failed.
  source ./configs/examples/misc/sky_init.sh

  set -x
  oumi distributed torchrun \
    -m oumi train \
    -c configs/recipes/llama3_1/sft/8b_full/base_ultrachat.yaml \
    --training.run_name "${OUMI_RUN_NAME}.${SKYPILOT_TASK_ID}"

  echo "Node ${SKYPILOT_NODE_RANK} is all done!"

Llama-3-8B-UltraChat-200K-Oumi

This model is a fine-tuned version of meta-llama/Meta-Llama-3.1-8B on the HuggingFaceH4/ultrachat_200k dataset. It achieves a final training loss of 1.0435.

Model description

This model was trained as a partial reproduction of results from the recent WildChat-50M paper.

```bibtex
@misc{feuer2025wildchat50mdeepdiverole,
      title={WILDCHAT-50M: A Deep Dive Into the Role of Synthetic Data in Post-Training},
      author={Benjamin Feuer and Chinmay Hegde},
      year={2025},
      eprint={2501.18511},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2501.18511},
}
```

Intended uses & limitations

This model is intended for research use; it has not received any safety-oriented post-training.
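
For research experimentation, the checkpoint loads with the standard transformers API. A minimal inference sketch (ours, not an official snippet), assuming a bfloat16-capable GPU and that the saved tokenizer carries the llama3-instruct chat template used during training:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "penfever/oumi-l8b-ultrachat"
tok = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(
    repo, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "Explain gradient checkpointing in one paragraph."}]
inputs = tok.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
out = model.generate(inputs, max_new_tokens=256)
print(tok.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
```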

Artifacts

The following artifacts may be present in this repository, with brief descriptions of their contents.

Logs

Contains logs from the training process, one for each rank.

Telemetry

devices_info.txt: Information about the devices used to train the model.

telemetry_callback_metrics.json: Metrics from the training process, such as loss and the number of tokens seen.

telemetry_callback_wandb.json: Weights & Biases run parameters.

telemetry_callback.json: Metadata such as wall-clock training time and the number of epochs trained.

training_config.yaml: The training configuration used to train the model (also found in this README).

world_size.json: The world size used to train the model.
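
These are plain JSON (and YAML) files and can be inspected directly; a quick sketch, ours, assuming the layout above (the exact keys inside each file are not documented in this card):

```python
import json
from pathlib import Path

for name in ["telemetry_callback_metrics.json",
             "telemetry_callback_wandb.json",
             "telemetry_callback.json",
             "world_size.json"]:
    path = Path("telemetry") / name
    if path.exists():
        data = json.loads(path.read_text())
        keys = list(data) if isinstance(data, dict) else data
        print(name, "->", keys)
```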

Datasets

Summary statistics about the datasets used to train this model.

HuggingFaceH4/ultrachat_200k

Split: train_sft

Version: 0.0.0

Dataset size: 3047427114 bytes

Download size: 1624049723 bytes

Total size (dataset + download): 4671476837 bytes

Rows: 207865

Columns: ['prompt', 'prompt_id', 'messages']
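
These figures match the dataset's published metadata, so they can be reproduced without downloading the data; a small sketch (numbers may shift across dataset revisions):

```python
from datasets import load_dataset_builder

info = load_dataset_builder("HuggingFaceH4/ultrachat_200k").info
print("Dataset size:", info.dataset_size)                          # 3047427114
print("Download size:", info.download_size)                        # 1624049723
print("Total size:", info.dataset_size + info.download_size)       # 4671476837
print("Rows (train_sft):", info.splits["train_sft"].num_examples)  # 207865
```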

Results

Training Loss

| Training Loss | Epoch | Tokens Seen |
|---------------|-------|-------------|
| 1.043         | 0.999 | 246M        |

Evaluation

Following the WildChat-50M paper, our benchmark results are reported using Evalchemy; for details on the evaluation metrics, please refer to the paper. We compare against the baseline model used in the paper.

| Metric                        | Oumi Repro | Baseline |
|-------------------------------|------------|----------|
| MTBench                       | 5.2313     | 5.0187   |
| Alpaca Eval (LC)              | 1.6157     | 4.1260   |
| BBH                           | 0.4861     | 0.4845   |
| GPQA                          | 0.2903     | 0.3204   |
| MATH                          | 0.0552     | 0.0458   |
| MUSR                          | 0.4116     | 0.3917   |
| IFEval (Prompt Level, Strict) | 0.1978     | 0.2643   |
| MMLU Pro                      | 0.3118     | 0.3198   |
| MixEval                       | 0.5935     | 0.63     |
| Average                       | 0.321      | 0.333    |
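
The Average row appears to put all metrics on a 0-1 scale first, dividing MTBench by 10 and Alpaca Eval (LC) by 100 before taking the unweighted mean; a quick arithmetic check (our reading of the table, not a documented formula):

```python
# Reproduce the Average row: MTBench / 10, Alpaca Eval (LC) / 100, then mean.
oumi = [5.2313 / 10, 1.6157 / 100, 0.4861, 0.2903, 0.0552,
        0.4116, 0.1978, 0.3118, 0.5935]
base = [5.0187 / 10, 4.1260 / 100, 0.4845, 0.3204, 0.0458,
        0.3917, 0.2643, 0.3198, 0.63]
print(round(sum(oumi) / len(oumi), 3))  # 0.321
print(round(sum(base) / len(base), 3))  # 0.333
```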