# src/tools/finetune_modal.py
import modal
import os
app = modal.App("finetune-census-phi3")
# Volumes
vol_dataset = modal.Volume.from_name("finetune-dataset")
vol_checkpoints = modal.Volume.from_name("model-checkpoints", create_if_missing=True)
# Image: build from a CUDA base to ensure compatibility
image = modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") \
    .apt_install("git") \
    .run_commands(
        "pip install --upgrade pip",
        "pip install packaging ninja psutil",
        "pip install unsloth_zoo",   # Installs a compatible torch/torchvision
        "pip install torchvision",   # Ensure torchvision is present
        # Skip flash-attn: it can OOM during the build and is optional
        "pip install xformers trl peft accelerate bitsandbytes wandb scipy huggingface_hub protobuf sentencepiece einops",
        # hf_transfer and datasets made explicit: HF_HUB_ENABLE_HF_TRANSFER=1 below
        # fails at download time if hf_transfer is not installed
        "pip install hf_transfer datasets",
        "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'",
    ) \
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
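# Note: the pip installs above run once at image build time and the resulting image
# is cached by Modal, so the GPU container starts with its dependencies already in
# place instead of reinstalling them on every run.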
@app.function(
    image=image,
    volumes={
        "/data/dataset": vol_dataset,
        "/data/checkpoints": vol_checkpoints,
    },
    gpu="H200",     # H200 (141 GB HBM3e); roughly 3-4x faster than an A100 for this job
    timeout=86400,  # 24 hours
)
def finetune():
    from unsloth import FastLanguageModel
    from trl import SFTTrainer
    from transformers import TrainingArguments
    from datasets import load_dataset
    import torch

    print("🚀 Starting Fine-tuning Job...")

    # 1. Configuration
    max_seq_length = 2048  # Phi-3-mini-4k supports up to a 4096-token context
    dtype = None           # None = auto-detect
    load_in_4bit = True    # 4-bit quantization to reduce memory usage
    model_name = "unsloth/Phi-3-mini-4k-instruct"

    # 2. Load Model and Tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
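    # Rough memory intuition (back-of-the-envelope estimate, not measured): Phi-3-mini
    # has ~3.8B parameters, so 4-bit weights occupy on the order of 3.8B * 0.5 bytes
    # ≈ 1.9 GB, leaving most of the H200's 141 GB for activations, gradients, and
    # optimizer state.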
    # 3. Add LoRA Adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # Rank
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,                        # Any value works, but 0 is optimized
        bias="none",                           # Any value works, but "none" is optimized
        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
        random_state=3407,
        use_rslora=False,                      # Rank-stabilized LoRA
        loftq_config=None,                     # LoftQ
    )
    # 4. Load Dataset
    # We generated JSONL files with the format:
    # {"instruction": ..., "input": ..., "output": ...}
    dataset = load_dataset(
        "json",
        data_files={"train": "/data/dataset/train.jsonl", "test": "/data/dataset/val.jsonl"},
    )
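    # For illustration only (hypothetical record; the real field contents depend on how
    # the census dataset was generated), each JSONL line looks like:
    # {"instruction": "Answer the question about this census record.",
    #  "input": "<record fields>",
    #  "output": "<expected answer>"}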
    # 5. Formatting Function (Alpaca format)
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

    EOS_TOKEN = tokenizer.eos_token  # Must append EOS so the model learns to stop

    def formatting_prompts_func(examples):
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]
        texts = []
        for instruction, input_text, output in zip(instructions, inputs, outputs):
            # Without the EOS token, generation would never learn to stop.
            text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
            texts.append(text)
        return {"text": texts}

    dataset = dataset.map(formatting_prompts_func, batched=True)
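    # Optional sanity check (assumption: the train split is non-empty): print one
    # formatted example to confirm the template and EOS token are applied as expected.
    print("Sample formatted record:\n", dataset["train"][0]["text"][:500])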
    # 6. Training Arguments (optimized for the H200)
    training_args = TrainingArguments(
        per_device_train_batch_size=4,   # Raised to use the H200's 141 GB of memory
        gradient_accumulation_steps=2,   # Effective batch size = 4 * 2 = 8
        warmup_steps=100,                # Increased for the larger dataset
        max_steps=10000,                 # ~4% of a full epoch; completes in ~90 minutes
        # num_train_epochs=1,            # A full epoch takes ~30 hours with 1.9M samples
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=100,               # Log less frequently
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",                # Disable wandb logging
        save_strategy="steps",
        save_steps=10000,                # Save a checkpoint every 10k steps
        save_total_limit=2,              # Keep only the 2 most recent checkpoints
    )
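    # Arithmetic behind the comments above: each optimizer step consumes
    # 4 (batch) * 2 (accumulation) = 8 examples, so 10,000 steps cover roughly
    # 80,000 examples, i.e. about 4% of a ~1.9M-sample epoch.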
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,  # Setting this to True can speed up training ~5x on short sequences
        args=training_args,
    )
    # 7. Train
    print("Training...")
    trainer_stats = trainer.train()

    # 8. Save Model
    print("Saving model to /data/checkpoints/phi3-census-lora...")
    model.save_pretrained("/data/checkpoints/phi3-census-lora")
    tokenizer.save_pretrained("/data/checkpoints/phi3-census-lora")

    # Optionally export to GGUF as well (Unsloth supports this), e.g.:
    # model.save_pretrained_gguf("/data/checkpoints/phi3-census-gguf", tokenizer, quantization_method="q4_k_m")

    # Persist the saved files to the Modal volume
    vol_checkpoints.commit()
    print("✅ Fine-tuning Complete!")
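
# The sketch below is not part of the original training pipeline: it is a minimal,
# hedged example of loading the saved LoRA adapter back for a quick generation smoke
# test. The function name, GPU choice, and default prompt are illustrative
# assumptions; adjust them to your setup before relying on it.
@app.function(
    image=image,
    volumes={"/data/checkpoints": vol_checkpoints},
    gpu="A10G",   # A smaller GPU is enough for 4-bit inference on a ~3.8B model
    timeout=600,
)
def smoke_test(prompt: str = "### Instruction:\nAnswer the question about this census record.\n### Input:\n<record fields>\n### Response:\n"):
    from unsloth import FastLanguageModel

    # Unsloth can load the adapter directory written by save_pretrained() above.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="/data/checkpoints/phi3-census-lora",
        max_seq_length=2048,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)  # switch to Unsloth's faster inference mode

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))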
@app.local_entrypoint()
def main():
    finetune.remote()
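
# Usage (from the repo root): `modal run src/tools/finetune_modal.py` executes
# main() locally, which launches finetune() remotely on Modal.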