"""
Harbour fine-tuning script for qwen3.6:35b (Qwen3.6-35B-A3B MoE).

Uses LoRA and trains on the CPU (121 GB RAM available).
"""
|
|
| import json |
| import torch |
| from pathlib import Path |
| from transformers import ( |
| AutoModelForCausalLM, |
| AutoTokenizer, |
| TrainingArguments, |
| Trainer, |
| DataCollatorForLanguageModeling, |
| ) |
| from peft import LoraConfig, get_peft_model, TaskType |
| from datasets import Dataset |
|
|
| |
# --- Configuration ---
# Identifier handed to ``from_pretrained`` for both tokenizer and model.
MODEL_NAME = "Qwen/Qwen3.6-35B-A3B"
# JSONL inputs (one record per line) and the directory the Trainer writes to.
TRAIN_FILE = Path("/home/fivetech/finetune", "harbour_train.jsonl")
VAL_FILE = Path("/home/fivetech/finetune", "harbour_val.jsonl")
OUTPUT_DIR = Path("/home/fivetech/finetune", "output")
# Upper bound on tokens per example; longer examples are truncated.
MAX_SEQ_LENGTH = 2048


# Start-up banner: rule, title, rule.
print(
    "=" * 60,
    "Harbour Fine-tuning - qwen3.6:35b (MoE) with LoRA",
    "=" * 60,
    sep="\n",
)
|
|
| |
# --- Step 1: tokenizer ---
print("\n1. Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, padding_side="right", trust_remote_code=True
)
# Batching needs a padding token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# --- Step 2: raw data ---
print("2. Loading dataset...")
|
|
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the ``.jsonl`` file; anything ``open`` accepts.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with ``path`` and the 1-based line number.
    """
    records = []
    # Read as UTF-8 explicitly rather than relying on the locale default.
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank or whitespace-only lines carry no record; skip them
                # instead of letting json.loads("") raise.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type, but say which file and line failed.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return records
|
|
# Read both splits fully into memory, then report how many records each has.
train_data, val_data = (load_jsonl(split) for split in (TRAIN_FILE, VAL_FILE))

print(f" Train: {len(train_data)} entries")
print(f" Val: {len(val_data)} entries")


# --- Step 3: render chat messages to text ---
print("3. Formatting conversations...")
|
|
def format_conversation(entry):
    """Render one entry's chat messages into a single training string.

    Args:
        entry: Mapping with a "messages" key; its value is passed unchanged
            to the tokenizer's chat template.

    Returns:
        A dict ``{"text": rendered}`` where ``rendered`` is the template
        output as text (not token ids), with no generation prompt appended.
    """
    rendered = tokenizer.apply_chat_template(
        entry["messages"], tokenize=False, add_generation_prompt=False
    )
    return {"text": rendered}
|
|
# Apply the chat template to every record and wrap each split in a Dataset
# that has a single "text" column.
train_dataset = Dataset.from_list(list(map(format_conversation, train_data)))
val_dataset = Dataset.from_list(list(map(format_conversation, val_data)))


# --- Step 4: tokenization ---
print("4. Tokenizing...")
|
|
def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncating to MAX_SEQ_LENGTH.

    Args:
        examples: Batch mapping with a "text" entry (the rendered
            conversations).

    Returns:
        Whatever the tokenizer returns for that text. No padding is applied
        here, so sequences keep their own lengths.
    """
    encode_options = {
        "truncation": True,
        "max_length": MAX_SEQ_LENGTH,
        "padding": False,
    }
    return tokenizer(examples["text"], **encode_options)
|
|
# Tokenize each split in batches and drop the raw "text" column so only the
# tokenizer's output columns remain.
train_dataset = train_dataset.map(
    tokenize_function, desc="Tokenizing train", remove_columns=["text"], batched=True
)
val_dataset = val_dataset.map(
    tokenize_function, desc="Tokenizing val", remove_columns=["text"], batched=True
)


# Total token count per split (sum of per-example input_ids lengths).
print(f" Train tokens: {sum(len(x) for x in train_dataset['input_ids']):,}")
print(f" Val tokens: {sum(len(x) for x in val_dataset['input_ids']):,}")
|
|
| |
# --- Step 5: base model ---
print("5. Loading model (CPU mode)...")
print(" This may take a few minutes...")


# Memory budget: float32 weights cost 4 bytes per parameter, so a checkpoint
# of ~35B parameters (assuming the "35B" in the model name is the total
# parameter count) needs about 140 GB -- more than the 121 GB of RAM stated
# in the module docstring. bfloat16 halves that to about 70 GB, leaving room
# for activations, LoRA gradients and optimizer state.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="cpu",  # keep every weight in host memory
    trust_remote_code=True,
)
|
|
| |
# --- Step 6: LoRA adapters ---
print("6. Configuring LoRA...")
lora_config = LoraConfig(
    # Adapters attach to modules named q_proj / k_proj / v_proj / o_proj.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)


# Wrap the base model with the adapters and report the trainable share.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
|
|
| |
# --- Step 7: training hyper-parameters ---
print("7. Setting up training...")
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    # Schedule: 3 epochs; batch size 1 with 16 accumulation steps.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    # Optimizer: cosine schedule with 10% warm-up, gradient norm clipped at 1.0.
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Evaluate and checkpoint on the same 50-step cadence, keep at most three
    # checkpoints, and reload the one with the best eval_loss when done.
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Both mixed-precision modes are switched off.
    fp16=False,
    bf16=False,
    # Logging and data loading.
    logging_steps=5,
    report_to="none",
    dataloader_num_workers=1,
    remove_unused_columns=False,
)
|
|
| |
# mlm=False: collate for causal language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


# --- Step 8: trainer ---
print("8. Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
|
|
| |
# --- Step 9: run the training loop ---
print("\n9. Starting training...", "=" * 60, sep="\n")
trainer.train()
|
|
| |
# --- Step 10: persist the result ---
print("\n10. Saving model...")
final_dir = str(OUTPUT_DIR / "final")
# Model and tokenizer go into the same directory.
# NOTE(review): with a PEFT-wrapped model this presumably writes the LoRA
# adapter rather than full merged weights -- confirm before deployment.
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)


print("\n" + "=" * 60)
print("Training complete!")
print(f"Model saved to: {OUTPUT_DIR / 'final'}")
print("=" * 60)
|
|