Hemlock-Philter-14B

Training Configuration

| Parameter | Value |
|---|---|
| Training Mode | ORPO |
| Base Model | nbeerbower/Hemlock-Tincture-14B |
| Learning Rate | 8e-06 |
| Epochs | 2 |
| Batch Size | 2 |
| Gradient Accumulation | 8 |
| Effective Batch Size | 16 |
| Max Sequence Length | 2048 |
| Optimizer | paged_adamw_8bit |
| LR Scheduler | cosine |
| Warmup Ratio | 0.05 |
| Weight Decay | 0.01 |
| Max Grad Norm | 0.3 |
| Seed | 42 |
| Beta | 0.1 |
| Max Prompt Length | 1024 |
| LoRA Rank (r) | 128 |
| LoRA Alpha | 128 |
| LoRA Dropout | 0.05 |
| Target Modules | up_proj, down_proj, gate_proj, k_proj, q_proj, v_proj, o_proj |
| Quantization | 4-bit (NF4) |
| GPU | NVIDIA RTX A6000 |

Reproduce this training run

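This model was trained with Merlina. To reproduce the exact training setup, save the JSON below to `data/configs/<name>.json` (or import it via the Load Configuration dialog). Credentials are not included — Merlina will use your own `HF_TOKEN` and `WANDB_API_KEY`, read from `.env` or entered in the form. Note that this configuration enables both Weights & Biases logging (`use_wandb`) and upload to the Hugging Face Hub (`push_to_hub`), so both credentials are used unless you turn those options off.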

{
  "_metadata": {
    "name": "Hemlock-Philter-14B",
    "description": "Training configuration shared from a Merlina-trained model.",
    "tags": [],
    "schema": "merlina/training-config",
    "schema_version": 1,
    "merlina_version": "2.0.3"
  },
  "base_model": "nbeerbower/Hemlock-Tincture-14B",
  "output_name": "Hemlock-Philter-14B",
  "use_lora": true,
  "lora_r": 128,
  "lora_alpha": 128,
  "lora_dropout": 0.05,
  "target_modules": [
    "up_proj",
    "down_proj",
    "gate_proj",
    "k_proj",
    "q_proj",
    "v_proj",
    "o_proj"
  ],
  "modules_to_save": [],
  "lora_task_type": "CAUSAL_LM",
  "learning_rate": 8e-06,
  "num_epochs": 2,
  "batch_size": 2,
  "gradient_accumulation_steps": 8,
  "max_length": 2048,
  "max_prompt_length": 1024,
  "model_type": "auto",
  "training_mode": "orpo",
  "beta": 0.1,
  "label_smoothing": 0.0,
  "gamma": 0.5,
  "vision_model_id": null,
  "stage": null,
  "unfreeze_vision_top_n": null,
  "image_token_id": null,
  "min_pixels": null,
  "max_pixels": null,
  "image_column": null,
  "caption_column": null,
  "instruction": null,
  "streaming": null,
  "model_name": null,
  "image_resolution": null,
  "lora_rank": null,
  "lora_target_modules": null,
  "lora_use_dora": null,
  "mid_training_samples": null,
  "dataset_jsonl_path": null,
  "dataset_name": null,
  "dataset_split": null,
  "sample_prompts": null,
  "sample_num_steps": null,
  "dataset": {
    "source": {
      "source_type": "huggingface",
      "repo_id": "hemlang/Hemlock2-DPO",
      "split": "train",
      "file_path": null,
      "file_format": null,
      "dataset_id": null,
      "streaming": false,
      "streaming_batch_size": 10000,
      "column_mapping": {}
    },
    "additional_sources": [],
    "format": {
      "format_type": "tokenizer",
      "custom_templates": null,
      "enable_thinking": false
    },
    "model_name": null,
    "column_mapping": {},
    "convert_messages_format": false,
    "deduplicate": false,
    "dedupe_strategy": "prompt_chosen",
    "test_size": 0.01,
    "max_samples": null,
    "system_prompt": null,
    "system_prompt_mode": "fill_empty",
    "training_mode": "orpo"
  },
  "seed": 42,
  "max_grad_norm": 0.3,
  "warmup_ratio": 0.05,
  "eval_steps": 0.2,
  "use_4bit": true,
  "use_wandb": true,
  "push_to_hub": true,
  "merge_lora_before_upload": true,
  "hf_hub_private": true,
  "export_gguf": false,
  "gguf_quant_types": [
    "Q4_K_M"
  ],
  "keep_gguf_fp16": false,
  "shuffle_dataset": true,
  "weight_decay": 0.01,
  "lr_scheduler_type": "cosine",
  "gradient_checkpointing": true,
  "logging_steps": 1,
  "optimizer_type": "paged_adamw_8bit",
  "adam_beta1": 0.9,
  "adam_beta2": 0.999,
  "adam_epsilon": 1e-08,
  "adafactor_relative_step": false,
  "adafactor_scale_parameter": false,
  "adafactor_warmup_init": false,
  "adafactor_decay_rate": -0.8,
  "adafactor_beta1": null,
  "adafactor_clip_threshold": 1.0,
  "attn_implementation": "auto",
  "use_liger": false,
  "torch_compile": false,
  "neftune_alpha": null,
  "eval_on_start": false,
  "gpu_ids": null,
  "multi_gpu_strategy": "auto",
  "wandb_project": null,
  "wandb_run_name": null,
  "wandb_tags": null,
  "wandb_notes": null
}

Trained with Merlina

Merlina on GitHub

Downloads last month: 19

Safetensors · Model size: 15B params · Tensor type: BF16
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for nbeerbower/Hemlock-Philter-14B

Base model

Qwen/Qwen2.5-14B
Finetuned
(1)
this model
Quantizations
2 models

Dataset used to train nbeerbower/Hemlock-Philter-14B