Harbour / train.py
fivetech's picture
Upload train.py with huggingface_hub
71dd803 verified
Raw
History Blame Contribute Delete
4.42 kB
#!/usr/bin/env python3
"""
Harbour Fine-tuning Script for qwen3.6:35b (Qwen3.6-35B-A3B MoE)
Uses LoRA with CPU training (121GB RAM available)
"""
import json
import torch
from pathlib import Path
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
# Configuration
MODEL_NAME = "Qwen/Qwen3.6-35B-A3B"
TRAIN_FILE = Path("/home/fivetech/finetune/harbour_train.jsonl")
VAL_FILE = Path("/home/fivetech/finetune/harbour_val.jsonl")
OUTPUT_DIR = Path("/home/fivetech/finetune/output")
MAX_SEQ_LENGTH = 2048
print("=" * 60)
print("Harbour Fine-tuning - qwen3.6:35b (MoE) with LoRA")
print("=" * 60)
# 1. Load tokenizer
print("\n1. Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
padding_side="right",
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# 2. Load dataset
print("2. Loading dataset...")
def load_jsonl(path):
data = []
with open(path) as f:
for line in f:
data.append(json.loads(line))
return data
train_data = load_jsonl(TRAIN_FILE)
val_data = load_jsonl(VAL_FILE)
print(f" Train: {len(train_data)} entries")
print(f" Val: {len(val_data)} entries")
# 3. Format conversations for Qwen ChatML
print("3. Formatting conversations...")
def format_conversation(entry):
"""Convert messages to Qwen ChatML format."""
messages = entry["messages"]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=False,
)
return {"text": text}
train_dataset = Dataset.from_list([format_conversation(e) for e in train_data])
val_dataset = Dataset.from_list([format_conversation(e) for e in val_data])
# 4. Tokenize
print("4. Tokenizing...")
def tokenize_function(examples):
return tokenizer(
examples["text"],
truncation=True,
max_length=MAX_SEQ_LENGTH,
padding=False,
)
train_dataset = train_dataset.map(
tokenize_function,
batched=True,
remove_columns=["text"],
desc="Tokenizing train",
)
val_dataset = val_dataset.map(
tokenize_function,
batched=True,
remove_columns=["text"],
desc="Tokenizing val",
)
print(f" Train tokens: {sum(len(x) for x in train_dataset['input_ids']):,}")
print(f" Val tokens: {sum(len(x) for x in val_dataset['input_ids']):,}")
# 5. Load model (CPU with float32)
print("5. Loading model (CPU mode)...")
print(" This may take a few minutes...")
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.float32,
device_map="cpu",
trust_remote_code=True,
)
# 6. LoRA configuration
print("6. Configuring LoRA...")
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=16,
lora_alpha=32,
lora_dropout=0.05,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# 7. Training arguments
print("7. Setting up training...")
training_args = TrainingArguments(
output_dir=str(OUTPUT_DIR),
num_train_epochs=3,
per_device_train_batch_size=1,
gradient_accumulation_steps=16,
learning_rate=1e-4,
weight_decay=0.01,
warmup_ratio=0.1,
lr_scheduler_type="cosine",
logging_steps=5,
save_steps=50,
save_total_limit=3,
eval_strategy="steps",
eval_steps=50,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
bf16=False,
fp16=False,
dataloader_num_workers=1,
report_to="none",
remove_unused_columns=False,
max_grad_norm=1.0,
)
# 8. Data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False,
)
# 9. Create trainer
print("8. Creating trainer...")
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=val_dataset,
data_collator=data_collator,
)
# 10. Train
print("\n9. Starting training...")
print("=" * 60)
trainer.train()
# 11. Save
print("\n10. Saving model...")
trainer.save_model(str(OUTPUT_DIR / "final"))
tokenizer.save_pretrained(str(OUTPUT_DIR / "final"))
print("\n" + "=" * 60)
print("Training complete!")
print(f"Model saved to: {OUTPUT_DIR / 'final'}")
print("=" * 60)