#!/usr/bin/env python3
"""
Harbour Fine-tuning Script for qwen3.6:35b (Qwen3.6-35B-A3B MoE)
Uses LoRA with CPU training (121GB RAM available)
"""
import json
import torch
from pathlib import Path
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
# Configuration
# Hugging Face identifier of the base checkpoint to fine-tune.
MODEL_NAME = "Qwen/Qwen3.6-35B-A3B"
# JSONL files holding the training and validation examples.
TRAIN_FILE = Path("/home/fivetech/finetune/harbour_train.jsonl")
VAL_FILE = Path("/home/fivetech/finetune/harbour_val.jsonl")
# Directory used for trainer output and the final saved model.
OUTPUT_DIR = Path("/home/fivetech/finetune/output")
# Maximum number of tokens kept per example when tokenizing.
MAX_SEQ_LENGTH = 2048

# Startup banner: rule, title, rule (one item per line).
print(
    "=" * 60,
    "Harbour Fine-tuning - qwen3.6:35b (MoE) with LoRA",
    "=" * 60,
    sep="\n",
)
# 1. Load tokenizer
print("\n1. Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, padding_side="right", trust_remote_code=True
)
# Fall back to the end-of-sequence token when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Load dataset
print("2. Loading dataset...")
def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the JSONL file (anything ``open`` accepts).

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry
        # no record; skip them instead of failing in json.loads.
        return [json.loads(line) for line in f if line.strip()]
# Read the training split first, then the validation split, and report sizes.
train_data, val_data = map(load_jsonl, (TRAIN_FILE, VAL_FILE))
print(f" Train: {len(train_data)} entries")
print(f" Val: {len(val_data)} entries")

# 3. Format conversations for Qwen ChatML
print("3. Formatting conversations...")
def format_conversation(entry):
    """Render one dataset entry as a single chat-template string.

    The entry's ``"messages"`` value is passed to the tokenizer's chat
    template without tokenizing and without a trailing generation prompt.

    Returns:
        A dict with a single ``"text"`` key holding the rendered conversation.
    """
    rendered = tokenizer.apply_chat_template(
        entry["messages"],
        add_generation_prompt=False,
        tokenize=False,
    )
    return {"text": rendered}
# Apply the chat template to every entry and wrap each split in a Dataset.
train_dataset = Dataset.from_list(list(map(format_conversation, train_data)))
val_dataset = Dataset.from_list(list(map(format_conversation, val_data)))

# 4. Tokenize
print("4. Tokenizing...")
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch.

    Sequences longer than ``MAX_SEQ_LENGTH`` tokens are truncated; no padding
    is applied at this stage.

    Returns:
        The tokenizer's output for the batch.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        padding=False,
    )
    return encoded
# Tokenize both splits in batches; the raw "text" column is dropped afterwards.
train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"], desc="Tokenizing train"
)
val_dataset = val_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"], desc="Tokenizing val"
)

# Report the total token count of each split.
train_token_total = sum(map(len, train_dataset["input_ids"]))
print(f" Train tokens: {train_token_total:,}")
val_token_total = sum(map(len, val_dataset["input_ids"]))
print(f" Val tokens: {val_token_total:,}")
# 5. Load model (CPU, bfloat16 weights)
# float32 costs 4 bytes per parameter, i.e. roughly 140 GB for a 35B-parameter
# checkpoint -- more than the 121 GB of RAM this script is written for.
# bfloat16 halves the weight footprint to roughly 70 GB.
print("5. Loading model (CPU mode)...")
print(" This may take a few minutes...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="cpu",  # keep every weight on the CPU
    trust_remote_code=True,
)
# 6. LoRA configuration
print("6. Configuring LoRA...")
# Rank-16 adapters (alpha 32, dropout 0.05) attached to the modules named
# q_proj, k_proj, v_proj and o_proj.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type=TaskType.CAUSAL_LM,
)
# Wrap the base model with the adapters and print the trainable-parameter summary.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# 7. Training arguments
print("7. Setting up training...")
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    # This script is designed for CPU training. Without this flag the Trainer
    # selects an accelerator whenever one is visible and moves the model onto
    # it, regardless of where the weights were loaded.
    use_cpu=True,
    num_train_epochs=3,
    # One example per step, accumulated over 16 steps per optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=5,
    # Checkpoint and evaluate on the same 50-step cadence so the best
    # checkpoint (lowest eval_loss) can be restored at the end.
    save_steps=50,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Mixed-precision autocast is disabled.
    bf16=False,
    fp16=False,
    dataloader_num_workers=1,
    report_to="none",
    remove_unused_columns=False,
    max_grad_norm=1.0,
)
# 8. Data collator
# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)
# 9. Create trainer
print("8. Creating trainer...")
# Bundle the LoRA-wrapped model, both tokenized splits, the collator and the
# training arguments into a single Trainer.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    args=training_args,
)
# 10. Train
print("\n9. Starting training...")
print("=" * 60)
trainer.train()

# 11. Save the model and the tokenizer side by side in the "final" subdirectory.
print("\n10. Saving model...")
final_dir = OUTPUT_DIR / "final"
trainer.save_model(str(final_dir))
tokenizer.save_pretrained(str(final_dir))

# Closing banner with the save location.
print("\n" + "=" * 60)
print("Training complete!")
print(f"Model saved to: {final_dir}")
print("=" * 60)
|