| """ |
finetune_local.py — Local adaptation of Soci_FineTune_3_Incremental
| Fine-tunes Qwen2.5-0.5B-Instruct on Soci city-simulation tasks using Unsloth. |
| |
| Differences from the Colab version: |
| - No Google Drive / google.colab dependencies |
| - Local checkpoint and adapter storage in data/training/ |
| - Loads live conversation data from data/training/processed/ |
| - HF token from HF_TOKEN env var (or .env file) |
| - --debug flag for quick 1-epoch smoke test (no HF push) |
| - --resume flag to continue from saved LoRA adapters |
| |
| Usage (from project root): |
| # Debug / smoke test (fast, no push): |
| "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py --debug |
| |
| # Full round-1 training on default 0.5b model + push to HF: |
| "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py |
| |
| # Fine-tune specific model sizes: |
| "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py --base-model 7b |
| "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py --base-model 8b |
| |
| # Resume round 2 for a specific model: |
| "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py --base-model 7b --resume |
| |
| Model profiles (base model -> HF repo): |
| 0.5b -> RayMelius/soci-agent-q4 (Qwen2.5-0.5B, batch=2, seq=2048) |
| 1.5b -> RayMelius/soci-agent-1b5 (Qwen2.5-1.5B, batch=2, seq=2048) |
| 3b -> RayMelius/soci-agent-3b (Qwen2.5-3B, batch=2, seq=2048) |
7b -> RayMelius/soci-agent-7b (Qwen2.5-7B, batch=1, seq=512)
8b -> RayMelius/soci-agent-8b (Llama-3.1-8B, batch=1, seq=512)
| """ |
|
|
| from __future__ import annotations |
|
|
| import sys |
| import io |
| import os |
|
|
| |
if sys.platform == "win32":
    # Windows consoles default to a legacy code page; rewrap both standard
    # streams as UTF-8 so non-ASCII log output never raises UnicodeEncodeError
    # (unencodable characters are replaced instead).
    for _stream_name in ("stdout", "stderr"):
        _raw = getattr(sys, _stream_name).buffer
        setattr(sys, _stream_name,
                io.TextIOWrapper(_raw, encoding="utf-8", errors="replace"))
|
|
| |
| |
| |
# Turn off torch.compile / TorchInductor before torch is imported.
# setdefault keeps any value the user has already exported explicitly.
# (Presumably avoids compile-path issues with unsloth's kernels — not
# documented here; confirm before removing.)
for _compile_flag in ("TORCHINDUCTOR_DISABLE", "TORCH_COMPILE_DISABLE"):
    os.environ.setdefault(_compile_flag, "1")
|
|
| |
| |
| |
import unsloth
import transformers.utils.hub
import transformers.tokenization_utils_base


def _noop(*args, **kwargs) -> list:
    """Stand-in for transformers' ``list_repo_templates``: always report
    that a repo has no chat templates.

    NOTE(review): presumably this prevents Hub lookups for remote chat
    templates (e.g. offline/flaky-network runs) — confirm intent.
    """
    return []


# Patch both attribute paths transformers may resolve the helper through.
# (PEP 8: a named def is preferred over assigning a lambda.)
transformers.tokenization_utils_base.list_repo_templates = _noop
transformers.utils.hub.list_repo_templates = _noop
|
|
| import argparse |
| import json |
| import os |
| import shutil |
| from datetime import datetime |
| from pathlib import Path |
|
|
| |
# CLI flags.  --debug implies a tiny 20-example, 1-epoch run with no Hub push;
# --resume continues from adapters saved by a previous round (see load step).
parser = argparse.ArgumentParser(description="Soci local fine-tune")
parser.add_argument("--resume", action="store_true", help="Resume from saved LoRA adapters")
parser.add_argument("--debug", action="store_true", help="Debug/smoke-test: 1 epoch, 20 examples, no push")
parser.add_argument("--no-push", action="store_true", help="Skip HF Hub push")
parser.add_argument("--no-gguf", action="store_true", help="Skip GGUF export")
parser.add_argument("--epochs", type=int, default=None, help="Override epoch count")
parser.add_argument("--hf-repo", default=None, help="HF repo ID (overrides default)")
parser.add_argument("--base-model", default="0.5b",
                    choices=["0.5b", "1.5b", "3b", "7b", "8b"],
                    help="Base model size to fine-tune (default: 0.5b)")
args = parser.parse_args()
|
|
| |
# Per-size training profiles.  Keys:
#   model_id     - 4-bit quantized base checkpoint on the HF Hub
#   repo_name    - target repo name under the HF_USERNAME account
#   seq_len      - max sequence length (tokens) for training
#   batch        - per-device batch size
#   grad_accum   - gradient-accumulation steps (effective batch = batch * grad_accum)
#   lora_r       - LoRA rank (lora_alpha is set equal to r at attach time)
#   lora_targets - projection modules that receive LoRA adapters
_MODEL_PROFILES = {
    "0.5b": dict(
        model_id = "unsloth/Qwen2.5-0.5B-Instruct-unsloth-bnb-4bit",
        repo_name = "soci-agent-q4",
        seq_len = 2048,
        batch = 2,
        grad_accum = 4,
        lora_r = 16,
        lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
    ),
    "1.5b": dict(
        model_id = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
        repo_name = "soci-agent-1b5",
        seq_len = 2048,
        batch = 2,
        grad_accum = 4,
        lora_r = 16,
        lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
    ),
    "3b": dict(
        model_id = "unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
        repo_name = "soci-agent-3b",
        seq_len = 2048,
        batch = 2,
        grad_accum = 4,
        lora_r = 16,
        lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
    ),
    # Larger models trade context for memory: shorter sequences, batch 1,
    # and attention-only LoRA (q/v) at rank 8.
    "7b": dict(
        model_id = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        repo_name = "soci-agent-7b",
        seq_len = 512,
        batch = 1,
        grad_accum = 8,
        lora_r = 8,
        lora_targets = ["q_proj", "v_proj"],
    ),
    "8b": dict(
        model_id = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        repo_name = "soci-agent-8b",
        seq_len = 512,
        batch = 1,
        grad_accum = 8,
        lora_r = 8,
        lora_targets = ["q_proj", "v_proj"],
    ),
}
# Profile selected by --base-model (default "0.5b").
_PROFILE = _MODEL_PROFILES[args.base_model]
|
|
| |
# Directory layout: everything lives under data/training/<model-size>/ so
# different base models never clobber each other's artifacts.
TRAIN_DIR = Path("data/training")
MODEL_DIR = TRAIN_DIR / args.base_model
LORA_SAVE_DIR = MODEL_DIR / "lora_adapters"    # saved / resumable LoRA weights
DATA_ARCHIVE_DIR = MODEL_DIR / "data_archive"  # per-round example archives (replay)
GGUF_DIR = MODEL_DIR / "gguf"                  # exported GGUF quantizations
CHECKPOINTS_DIR = MODEL_DIR / "checkpoints"    # trainer checkpoints
ROUND_FILE = MODEL_DIR / "training_round.json" # round metadata read on --resume
CORE_DATA_FILE = TRAIN_DIR / "core_examples.json"                 # shared curated set
LIVE_DATA_FILE = TRAIN_DIR / "processed" / "soci_training.jsonl"  # shared live sim data


for d in [LORA_SAVE_DIR, DATA_ARCHIVE_DIR, GGUF_DIR, CHECKPOINTS_DIR]:
    d.mkdir(parents=True, exist_ok=True)


MAX_SEQ_LENGTH = _PROFILE["seq_len"]
HF_USERNAME = "RayMelius"
# An explicit --hf-repo overrides the profile's default repo.
HF_REPO_ID = args.hf_repo or f"{HF_USERNAME}/{_PROFILE['repo_name']}"
|
|
| |
| try: |
| from dotenv import load_dotenv |
| load_dotenv() |
| except ImportError: |
| pass |
| HF_TOKEN = os.environ.get("HF_TOKEN", "") |
| if not HF_TOKEN: |
| |
| env_file = Path(".env") |
| if env_file.exists(): |
| for line in env_file.read_text().splitlines(): |
| if line.startswith("HF_TOKEN="): |
| HF_TOKEN = line.split("=", 1)[1].strip().strip('"') |
|
|
| |
import torch

# Report the accelerator up front so a slow CPU run is a conscious choice.
# (Fixes mojibake "β" — originally an em dash — in the warning text.)
if not torch.cuda.is_available():
    print("[WARN] No CUDA GPU detected — training will be very slow on CPU.")
    print(" Consider running on Colab or a machine with a GPU.")
else:
    print(f"GPU : {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
|
|
| |
| |
| |
| |
| |
| import functools |
| import unsloth_zoo.fused_losses.cross_entropy_loss as _unsloth_ce |
|
|
| @functools.cache |
| def _safe_chunk_multiplier(vocab_size, target_gb=None): |
| if target_gb is None: |
| try: |
| free, _ = torch.cuda.mem_get_info(0) |
| free_gb = free / (1024 ** 3) * 0.5 |
| except Exception: |
| free_gb = 0.0 |
| target_gb = max(free_gb, 0.1) |
| if target_gb <= 1e-9: |
| target_gb = 0.1 |
| multiplier = (vocab_size * 4 / (1024 ** 3)) / target_gb |
| multiplier = multiplier / 4 |
| return multiplier |
|
|
# Swap unsloth_zoo's chunk-size heuristic for the VRAM-aware one above.
_unsloth_ce._get_chunk_multiplier = _safe_chunk_multiplier
print("Patched unsloth fused CE loss for low-VRAM GPU")
|
|
| |
# Work out which training round this is.  ROUND_FILE is written at the end
# of every run (per model size) and read back here on --resume.
RESUME = args.resume
if not RESUME:
    CURRENT_ROUND = 1
    print("Starting fresh (round 1)")
elif ROUND_FILE.exists():
    round_info = json.loads(ROUND_FILE.read_text())
    prev_round = round_info["round"]
    CURRENT_ROUND = prev_round + 1
    print(f"Resuming from round {prev_round} -> round {CURRENT_ROUND}")
    print(f"Previous loss: {round_info.get('final_loss', 'N/A')}")
else:
    # Resuming without metadata: assume exactly one prior round happened.
    CURRENT_ROUND = 2
    print("No round file found, assuming round 2")
|
|
| |
from unsloth import FastLanguageModel


# Load either the previously saved adapters (resume path, directory must be
# non-empty) or the 4-bit base model.  dtype=None lets unsloth pick the
# float precision for this GPU.
if RESUME and LORA_SAVE_DIR.exists() and any(LORA_SAVE_DIR.iterdir()):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = str(LORA_SAVE_DIR),
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = None,
        load_in_4bit = True,
    )
    print(f"Resumed LoRA adapters from {LORA_SAVE_DIR}")
else:
    if RESUME:
        # Requested a resume but nothing saved on disk: fall back to round 1.
        print(f"[WARN] No LoRA adapters at {LORA_SAVE_DIR}, starting fresh.")
        CURRENT_ROUND = 1
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = _PROFILE["model_id"],
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = None,
        load_in_4bit = True,
    )
    print(f"Fresh base model loaded (round 1): {_PROFILE['model_id']}")
|
|
| |
# Attach LoRA adapters on round 1 only; on resume rounds they came back with
# the checkpoint, so just re-enable gradient checkpointing.
if CURRENT_ROUND == 1:
    model = FastLanguageModel.get_peft_model(
        model,
        r = _PROFILE["lora_r"],
        target_modules = _PROFILE["lora_targets"],
        lora_alpha = _PROFILE["lora_r"],  # alpha set equal to rank
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",  # unsloth's checkpointing variant
        random_state = 42,
    )
    print("Fresh LoRA adapters attached")
else:
    model.gradient_checkpointing_enable()
    print(f"Resumed LoRA adapters from round {CURRENT_ROUND - 1}")


model.print_trainable_parameters()
|
|
| |
# Shared system prompt: prepended to every training example (format_example)
# and to every post-training test query (ask).
SYSTEM_PROMPT = (
    "You are the reasoning engine for Soci, an LLM-powered city population simulator. "
    "You control AI agents (NPCs) living in a city. Each agent has a persona, needs "
    "(hunger, energy, social, purpose, comfort, fun), memories, and relationships. "
    "You receive structured context and must respond ONLY with valid JSON. "
    "Never add explanation outside the JSON."
)
|
|
| |
print("\nLoading training data...")


# Core curated examples: a JSON list of {"instruction", "response"} dicts.
# (Fixes mojibake "β" — originally an em dash — in the warning text.)
core_examples: list[dict] = []
if CORE_DATA_FILE.exists():
    core_examples = json.loads(CORE_DATA_FILE.read_text(encoding="utf-8"))
    print(f" Core examples: {len(core_examples)}")
else:
    print(f" [WARN] {CORE_DATA_FILE} not found — run extract step or collect_training_data.py first")
|
|
| |
def _parse_live_example(line: str) -> dict | None:
    """Convert one chat-format JSONL line into an instruction/response pair.

    Expects ``{"messages": [system, user, assistant, ...]}``; the system
    (persona) content is folded into the instruction ahead of the user turn.
    Returns None for lines that are malformed or have fewer than 3 messages.
    """
    try:
        ex = json.loads(line)
        msgs = ex.get("messages", [])
        if len(msgs) >= 3:
            persona_ctx = msgs[0]["content"]
            user_content = msgs[1]["content"]
            return {
                "instruction": f"{persona_ctx}\n\n{user_content}",
                "response": msgs[2]["content"],
            }
    # TypeError/IndexError added: rows whose "messages" is not a list of
    # dicts previously escaped the except and crashed the whole run.
    except (json.JSONDecodeError, KeyError, TypeError, IndexError):
        pass
    return None


# Live examples captured from the running simulation (JSONL, one chat per line).
live_examples: list[dict] = []
if LIVE_DATA_FILE.exists():
    with open(LIVE_DATA_FILE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parsed = _parse_live_example(line)
            if parsed is not None:
                live_examples.append(parsed)
print(f" Live examples: {len(live_examples)} (from Render simulation)")
|
|
| |
# Replay buffer: on incremental rounds, re-train on every earlier round's
# archived examples so prior behavior is retained.
replay_examples: list[dict] = []
if CURRENT_ROUND > 1:
    for archive_path in sorted(DATA_ARCHIVE_DIR.glob("round_*.json")):
        try:
            replay_examples.extend(
                json.loads(archive_path.read_text(encoding="utf-8"))
            )
        except Exception:
            # A corrupt archive is skipped rather than aborting training.
            pass
    print(f" Replay examples: {len(replay_examples)}")
|
|
| |
# Slot for hand-curated examples added for this specific round.  Left empty
# by default — live simulator data is picked up automatically above.
new_examples_this_round: list[dict] = [


]
if new_examples_this_round:
    print(f" New examples this round: {len(new_examples_this_round)}")
|
|
| |
# Merge all sources and drop duplicates, keyed on the first 100 chars of the
# instruction (earlier sources win: core, then live, then new, then replay).
seen: set[str] = set()
all_examples: list[dict] = []
for candidate in core_examples + live_examples + new_examples_this_round + replay_examples:
    fingerprint = candidate.get("instruction", "")[:100]
    if fingerprint in seen:
        continue
    seen.add(fingerprint)
    all_examples.append(candidate)


if args.debug:
    # Smoke test: cap the dataset so a full pipeline run finishes quickly.
    all_examples = all_examples[:20]
    print(f" DEBUG mode: using {len(all_examples)} examples")


print(f" Total (deduped): {len(all_examples)}")
|
|
| |
| from datasets import Dataset |
|
|
def format_example(ex: dict) -> dict:
    """Render one instruction/response pair through the model's chat template.

    Returns {"text": <templated string>} as expected by SFTTrainer's
    dataset_text_field="text".
    """
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": ex["instruction"]},
        {"role": "assistant", "content": ex["response"]},
    ]
    rendered = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=False
    )
    return {"text": rendered}


dataset = Dataset.from_list(all_examples).map(format_example)
print(f"Formatted {len(dataset)} examples. Sample:")
print(dataset[0]["text"][:400])
|
|
| |
| from trl import SFTTrainer, SFTConfig |
| from unsloth import is_bfloat16_supported |
|
|
# Hyperparameters per phase: round 1 trains from scratch at full LR with a
# linear schedule; later rounds use a lower LR and cosine decay for gentler
# incremental updates.  (Fixes mojibake "β" — originally an em dash — and a
# pointless f-prefix on a placeholder-free string.)
if args.debug:
    LR, EPOCHS, WARMUP, SCHEDULER = 2e-4, 1, 2, "linear"
    print("\nDEBUG: 1 epoch smoke test")
elif CURRENT_ROUND == 1:
    LR, EPOCHS, WARMUP, SCHEDULER = 2e-4, 3, 5, "linear"
    print(f"\nRound 1: Full training — LR={LR}, epochs={EPOCHS}")
else:
    LR, EPOCHS, WARMUP, SCHEDULER = 5e-5, 2, 10, "cosine"
    print(f"\nRound {CURRENT_ROUND}: Incremental — LR={LR}, epochs={EPOCHS}")


if args.epochs is not None:
    # Explicit --epochs overrides the phase default.
    EPOCHS = args.epochs
    print(f"Epoch override: {EPOCHS}")
|
|
# Trainer setup.  Batch size and grad accumulation come from the model
# profile; precision follows GPU capability (bf16 when supported, else fp16).
# NOTE(review): dataset_text_field and max_seq_length are passed both to
# SFTTrainer directly and inside SFTConfig; unsloth's patched trainer
# apparently tolerates the duplication, but stock recent TRL may not —
# verify against the pinned trl version before changing.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    args = SFTConfig(
        per_device_train_batch_size = _PROFILE["batch"],
        gradient_accumulation_steps = _PROFILE["grad_accum"],
        warmup_steps = WARMUP,
        num_train_epochs = EPOCHS,
        learning_rate = LR,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 5,
        optim = "adamw_8bit",  # 8-bit AdamW to reduce optimizer VRAM
        weight_decay = 0.01,
        lr_scheduler_type = SCHEDULER,
        seed = 42,
        output_dir = str(CHECKPOINTS_DIR),
        report_to = "none",  # no wandb/tensorboard logging
        dataset_text_field = "text",
        max_seq_length = MAX_SEQ_LENGTH,
    ),
)
|
|
# Run training, then persist adapters + tokenizer to the per-model directory
# (this is what the --resume path loads next run).
print(f"\nTraining round {CURRENT_ROUND} on {len(dataset)} examples...")
torch.cuda.empty_cache()  # release load-time allocations before training
stats = trainer.train()
print(f"\nRound {CURRENT_ROUND} complete!")
print(f" Steps: {stats.global_step} | Final loss: {stats.training_loss:.4f}")


print(f"\nSaving LoRA adapters to {LORA_SAVE_DIR}...")
model.save_pretrained(str(LORA_SAVE_DIR))
tokenizer.save_pretrained(str(LORA_SAVE_DIR))
print(" Saved.")
|
|
| |
# Persist round metadata; read back by the --resume path on the next run.
round_info = {
    "round": CURRENT_ROUND,
    "final_loss": stats.training_loss,
    "global_steps": stats.global_step,
    "total_examples": len(all_examples),
    "new_examples": len(new_examples_this_round) + len(live_examples),
    "learning_rate": LR,
    "epochs": EPOCHS,
    "timestamp": datetime.now().isoformat(),
}
ROUND_FILE.write_text(json.dumps(round_info, indent=2))
print(f" Round info: {ROUND_FILE}")


# Archive this round's new examples so later rounds can replay them.
all_new = new_examples_this_round + live_examples
if all_new:
    archive_file = DATA_ARCHIVE_DIR / f"round_{CURRENT_ROUND:03d}.json"
    archive_file.write_text(json.dumps(all_new, indent=2, ensure_ascii=False))
    print(f" Archived {len(all_new)} new examples")


# Append to the cross-round history log (shared across model sizes).
history_file = TRAIN_DIR / "training_history.jsonl"
with open(history_file, "a", encoding="utf-8") as f:
    f.write(json.dumps(round_info) + "\n")
|
|
| |
# Qualitative smoke test: sample the freshly trained model on two
# representative prompts and show whether its output parses as JSON.
print(f"\n=== Testing after Round {CURRENT_ROUND} ===\n")
FastLanguageModel.for_inference(model)


def ask(question: str, label: str = "") -> None:
    """Generate one sampled reply to `question` and pretty-print it.

    The reply is printed as formatted JSON when it parses, raw text otherwise.
    """
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]
    encoded = tokenizer.apply_chat_template(
        chat, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    )
    # Templates may return either a BatchEncoding or a bare tensor.
    input_ids = encoded.input_ids if hasattr(encoded, "input_ids") else encoded
    input_ids = input_ids.to("cuda")
    generated = model.generate(
        input_ids=input_ids, max_new_tokens=200,
        temperature=0.7, top_p=0.9, do_sample=True,
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    resp = tokenizer.decode(generated[0][input_ids.shape[1]:], skip_special_tokens=True)
    print(f"[{label}]")
    print(f"Q: {question[:100]}...")
    try:
        parsed = json.loads(resp)
        print(f"A (valid JSON):\n{json.dumps(parsed, indent=2)}")
    except Exception:
        print(f"A (raw): {resp}")
    print("-" * 60)


ask(
    "You are playing Elena Vasquez, 34, software engineer. "
    "Needs: energy=0.3, hunger=0.7. Location: office. Time: 12:30. "
    "Decide next action. JSON: {\"action\": str, \"location\": str, \"reason\": str}",
    "decide_action",
)
ask(
    "You are playing Marcus Chen talking to Zoe. "
    "Zoe says: 'Marcus, I bombed my exam.' Continue as Marcus. "
    "JSON: {\"speech\": str, \"emotion\": str}",
    "conversation_turn",
)
|
|
| |
| |
| |
# GGUF export.  Per the message below, unsloth's export path depends on a
# llama.cpp build that is not supported on Windows, so it is skipped there
# as well as in debug mode / with --no-gguf.  (Fixes mojibake "β" and a
# pointless f-prefix on a placeholder-free string.)
import platform
_on_windows = platform.system() == "Windows"
skip_gguf = args.no_gguf or args.debug or _on_windows
if _on_windows and not args.no_gguf and not args.debug:
    print("\nSkipping GGUF export (Windows — llama.cpp build not supported via unsloth on Win)")
    print(" To export GGUF manually, use llama.cpp's convert_hf_to_gguf.py")
    print(f" LoRA merged weights saved to: {GGUF_DIR}/ (after push)")


if not skip_gguf:
    print("\nExporting GGUF Q4_K_M (takes a few minutes)...")
    model.save_pretrained_gguf(str(GGUF_DIR), tokenizer, quantization_method="q4_k_m")
    gguf_files = list(GGUF_DIR.glob("*.gguf"))
    for gf in gguf_files:
        print(f" GGUF: {gf.name} ({gf.stat().st_size / 1e6:.0f} MB)")
else:
    if args.debug:
        print("\nSkipping GGUF export (debug mode)")
    gguf_files = []  # later steps (push, summary) rely on this being defined
|
|
| |
# HuggingFace Hub push: LoRA adapters (folder), any GGUF files, and round
# metadata.  Skipped in debug mode, with --no-push, or without a token.
# (Fixes mojibake "β" — originally an em dash — in the token warning.)
skip_push = args.no_push or args.debug
if skip_push:
    print("\nSkipping HF push (debug mode or --no-push)")
else:
    if not HF_TOKEN:
        print("\n[WARN] No HF_TOKEN found — skipping push.")
        print(" Set HF_TOKEN env var or add to .env file.")
    else:
        from huggingface_hub import login, HfApi
        print(f"\nPushing to HuggingFace: {HF_REPO_ID}")
        login(token=HF_TOKEN)
        api = HfApi()
        api.create_repo(repo_id=HF_REPO_ID, repo_type="model", exist_ok=True)

        # Adapters go under lora_adapters/ in the repo.
        print(" Uploading LoRA adapters...")
        api.upload_folder(
            folder_path = str(LORA_SAVE_DIR),
            repo_id = HF_REPO_ID,
            repo_type = "model",
            path_in_repo = "lora_adapters",
        )
        print(f" LoRA -> https://huggingface.co/{HF_REPO_ID}/tree/main/lora_adapters")

        # GGUF files at the repo root (list is empty when export was skipped).
        for gf in gguf_files:
            mb = gf.stat().st_size / 1e6
            print(f" Uploading {gf.name} ({mb:.0f} MB)...")
            api.upload_file(
                path_or_fileobj = str(gf),
                path_in_repo = gf.name,
                repo_id = HF_REPO_ID,
                repo_type = "model",
            )
            print(f" Done: https://huggingface.co/{HF_REPO_ID}/blob/main/{gf.name}")

        # Round metadata alongside the weights for provenance.
        api.upload_file(
            path_or_fileobj = str(ROUND_FILE),
            path_in_repo = "training_round.json",
            repo_id = HF_REPO_ID,
            repo_type = "model",
        )

        print(f"\nUpload complete! Model at: https://huggingface.co/{HF_REPO_ID}")
|
|
| |
# Final summary: tabulate every recorded round, then print next-step hints.
# (Drops pointless f-prefixes — ruff F541 — from placeholder-free strings.)
print("\n=== Training History ===\n")
if history_file.exists():
    print(f"{'Round':>6} {'Loss':>8} {'Steps':>7} {'Examples':>9} {'New':>5} {'LR':>10} {'Date':>12}")
    print("-" * 65)
    with open(history_file, encoding="utf-8") as f:
        for line in f:
            r = json.loads(line)
            date = r.get("timestamp", "")[:10]
            print(f"{r['round']:>6} {r['final_loss']:>8.4f} {r['global_steps']:>7} "
                  f"{r['total_examples']:>9} {r['new_examples']:>5} "
                  f"{r['learning_rate']:>10.1e} {date:>12}")


print("\nTo resume: python scripts/finetune_local.py --resume")
print(f"LoRA adapters: {LORA_SAVE_DIR}")
if gguf_files:
    print(f"GGUF: {gguf_files[0]}")
print("\nOllama integration:")
print(" ollama create soci-agent -f Modelfile")
print(" set SOCI_PROVIDER=ollama && set OLLAMA_MODEL=soci-agent")
|
|