# src/tools/finetune_modal.py
import modal
import os
app = modal.App("finetune-census-phi3")
# Volumes
vol_dataset = modal.Volume.from_name("finetune-dataset")
vol_checkpoints = modal.Volume.from_name("model-checkpoints", create_if_missing=True)
# Image: build from a CUDA base to ensure compatibility
image = modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") \
    .apt_install("git") \
    .run_commands(
        "pip install --upgrade pip",
        "pip install packaging ninja psutil",
        "pip install unsloth_zoo",   # Installs a compatible torch/torchvision
        "pip install torchvision",   # Ensure torchvision is present
        # Skip flash-attn: it can OOM during the build and is optional
        "pip install xformers trl peft accelerate bitsandbytes wandb scipy huggingface_hub protobuf sentencepiece einops",
        # hf_transfer and datasets made explicit: HF_HUB_ENABLE_HF_TRANSFER=1 below
        # fails at download time if hf_transfer is not installed
        "pip install hf_transfer datasets",
        "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'",
    ) \
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
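# Note: the pip installs above run once at image build time and the resulting image
# is cached by Modal, so the GPU container starts with its dependencies already in
# place instead of reinstalling them on every run.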
@app.function(
    image=image,
    volumes={
        "/data/dataset": vol_dataset,
        "/data/checkpoints": vol_checkpoints,
    },
    gpu="H200",     # H200 (141 GB HBM3e); roughly 3-4x faster than an A100 for this job
    timeout=86400,  # 24 hours
)
def finetune():
    from unsloth import FastLanguageModel
    from trl import SFTTrainer
    from transformers import TrainingArguments
    from datasets import load_dataset
    import torch

    print("🚀 Starting Fine-tuning Job...")

    # 1. Configuration
    max_seq_length = 2048  # Phi-3-mini-4k supports up to a 4096-token context
    dtype = None           # None = auto-detect
    load_in_4bit = True    # 4-bit quantization to reduce memory usage
    model_name = "unsloth/Phi-3-mini-4k-instruct"

    # 2. Load Model and Tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
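    # Rough memory intuition (back-of-the-envelope estimate, not measured): Phi-3-mini
    # has ~3.8B parameters, so 4-bit weights occupy on the order of 3.8B * 0.5 bytes
    # ≈ 1.9 GB, leaving most of the H200's 141 GB for activations, gradients, and
    # optimizer state.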
    # 3. Add LoRA Adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # Rank
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,                        # Any value works, but 0 is optimized
        bias="none",                           # Any value works, but "none" is optimized
        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
        random_state=3407,
        use_rslora=False,                      # Rank-stabilized LoRA
        loftq_config=None,                     # LoftQ
    )
    # 4. Load Dataset
    # We generated JSONL files with the format:
    # {"instruction": ..., "input": ..., "output": ...}
    dataset = load_dataset(
        "json",
        data_files={"train": "/data/dataset/train.jsonl", "test": "/data/dataset/val.jsonl"},
    )
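    # For illustration only (hypothetical record; the real field contents depend on how
    # the census dataset was generated), each JSONL line looks like:
    # {"instruction": "Answer the question about this census record.",
    #  "input": "<record fields>",
    #  "output": "<expected answer>"}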
    # 5. Formatting Function (Alpaca format)
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

    EOS_TOKEN = tokenizer.eos_token  # Must append EOS so the model learns to stop

    def formatting_prompts_func(examples):
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]
        texts = []
        for instruction, input_text, output in zip(instructions, inputs, outputs):
            # Without the EOS token, generation would never learn to stop.
            text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
            texts.append(text)
        return {"text": texts}

    dataset = dataset.map(formatting_prompts_func, batched=True)
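    # Optional sanity check (assumption: the train split is non-empty): print one
    # formatted example to confirm the template and EOS token are applied as expected.
    print("Sample formatted record:\n", dataset["train"][0]["text"][:500])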
    # 6. Training Arguments (optimized for the H200)
    training_args = TrainingArguments(
        per_device_train_batch_size=4,   # Raised to use the H200's 141 GB of memory
        gradient_accumulation_steps=2,   # Effective batch size = 4 * 2 = 8
        warmup_steps=100,                # Increased for the larger dataset
        max_steps=10000,                 # ~4% of a full epoch; completes in ~90 minutes
        # num_train_epochs=1,            # A full epoch takes ~30 hours with 1.9M samples
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=100,               # Log less frequently
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",                # Disable wandb logging
        save_strategy="steps",
        save_steps=10000,                # Save a checkpoint every 10k steps
        save_total_limit=2,              # Keep only the 2 most recent checkpoints
    )
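    # Arithmetic behind the comments above: each optimizer step consumes
    # 4 (batch) * 2 (accumulation) = 8 examples, so 10,000 steps cover roughly
    # 80,000 examples, i.e. about 4% of a ~1.9M-sample epoch.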
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,  # Setting this to True can speed up training ~5x on short sequences
        args=training_args,
    )
    # 7. Train
    print("Training...")
    trainer_stats = trainer.train()

    # 8. Save Model
    print("Saving model to /data/checkpoints/phi3-census-lora...")
    model.save_pretrained("/data/checkpoints/phi3-census-lora")
    tokenizer.save_pretrained("/data/checkpoints/phi3-census-lora")

    # Optionally export to GGUF as well (Unsloth supports this), e.g.:
    # model.save_pretrained_gguf("/data/checkpoints/phi3-census-gguf", tokenizer, quantization_method="q4_k_m")

    # Persist the saved files to the Modal volume
    vol_checkpoints.commit()
    print("✅ Fine-tuning Complete!")
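
# The sketch below is not part of the original training pipeline: it is a minimal,
# hedged example of loading the saved LoRA adapter back for a quick generation smoke
# test. The function name, GPU choice, and default prompt are illustrative
# assumptions; adjust them to your setup before relying on it.
@app.function(
    image=image,
    volumes={"/data/checkpoints": vol_checkpoints},
    gpu="A10G",   # A smaller GPU is enough for 4-bit inference on a ~3.8B model
    timeout=600,
)
def smoke_test(prompt: str = "### Instruction:\nAnswer the question about this census record.\n### Input:\n<record fields>\n### Response:\n"):
    from unsloth import FastLanguageModel

    # Unsloth can load the adapter directory written by save_pretrained() above.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="/data/checkpoints/phi3-census-lora",
        max_seq_length=2048,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)  # switch to Unsloth's faster inference mode

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))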
@app.local_entrypoint()
def main():
    finetune.remote()
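
# Usage (from the repo root): `modal run src/tools/finetune_modal.py` executes
# main() locally, which launches finetune() remotely on Modal.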