"""
Schemas for fine-tuning pipeline configuration and management.

This module contains Pydantic models for training jobs, configurations,
evaluation results, and model deployment requests and responses.
"""
|
|
|
|
|
from datetime import datetime |
|
|
from enum import Enum |
|
|
from typing import Any, Dict, List, Optional |
|
|
from uuid import UUID |
|
|
|
|
|
from pydantic import BaseModel, Field, field_validator |
|
|
|
|
|
|
|
|
class TrainingStatus(str, Enum):
    """Status of a training job."""

    # Members mix in ``str``, so each compares equal to its lowercase value.
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
|
|
|
|
|
|
|
class TrainingStrategy(str, Enum):
    """Training strategy type."""

    # Members mix in ``str``, so each compares equal to its lowercase value.
    SUPERVISED = "supervised"
    RLHF = "rlhf"
    DPO = "dpo"
|
|
|
|
|
|
|
|
class ModelType(str, Enum):
    """Type of model to train."""

    # Members mix in ``str``, so each compares equal to its lowercase value.
    LLM = "llm"
    MODERATION = "moderation"
|
|
|
|
|
|
|
|
class DatasetSplit(BaseModel):
    """Dataset split configuration."""

    # Each ratio is a fraction of the whole dataset. The three together must
    # add up to 1.0, which validate_ratios_sum_to_one enforces.
    train_ratio: float = Field(
        default=0.8,
        ge=0.1,
        le=0.9,
        description="Ratio of data for training",
    )
    validation_ratio: float = Field(
        default=0.1,
        ge=0.05,
        le=0.3,
        description="Ratio of data for validation",
    )
    test_ratio: float = Field(
        default=0.1,
        ge=0.05,
        le=0.3,
        description="Ratio of data for testing",
        # Field validators are skipped for a field left at its default unless
        # this flag is set; without it, omitting test_ratio bypassed the
        # sum-to-one check entirely.
        validate_default=True,
    )

    @field_validator("test_ratio")
    @classmethod
    def validate_ratios_sum_to_one(cls, v, info):
        """Validate that all ratios sum to 1.0.

        The check is attached to ``test_ratio`` because it is declared last,
        so the two earlier ratios are already present in ``info.data``.

        Args:
            v: Candidate ``test_ratio`` value.
            info: Pydantic validation info; ``info.data`` holds the fields
                validated before this one.

        Returns:
            ``v`` unchanged when the check passes or is skipped.

        Raises:
            ValueError: If the three ratios differ from 1.0 by more than 0.001.
        """
        validated = info.data
        # A ratio that failed its own range check is missing from info.data
        # and has already been reported. Substituting a default for it would
        # only add a misleading second error, so skip the sum check.
        if "train_ratio" not in validated or "validation_ratio" not in validated:
            return v

        total = validated["train_ratio"] + validated["validation_ratio"] + v
        if abs(total - 1.0) > 0.001:
            raise ValueError("Train, validation, and test ratios must sum to 1.0")
        return v
|
|
|
|
|
|
|
|
class TrainingConfig(BaseModel):
    """Configuration for training job."""

    # --- Model selection ---
    model_name: str = Field(
        ...,
        description="Base model name or path",
        min_length=1,
        max_length=200,
    )
    model_type: ModelType = Field(
        default=ModelType.LLM,
        description="Type of model to train",
    )

    # --- Training strategy ---
    strategy: TrainingStrategy = Field(
        default=TrainingStrategy.SUPERVISED,
        description="Training strategy to use",
    )

    # --- Training-data selection ---
    min_quality_score: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum quality score for training data",
    )
    require_feedback: bool = Field(
        default=True,
        description="Only use responses with human feedback",
    )
    feedback_types: List[str] = Field(
        default=["good"],
        description="Feedback types to include in training",
    )
    max_toxicity_score: float = Field(
        default=0.3,
        ge=0.0,
        le=1.0,
        description="Maximum toxicity score for training data",
    )
    dataset_split: DatasetSplit = Field(
        default_factory=DatasetSplit,
        description="Dataset split configuration",
    )

    # --- Optimisation hyperparameters ---
    learning_rate: float = Field(
        default=2e-5,
        ge=1e-6,
        le=1e-3,
        description="Learning rate for training",
    )
    batch_size: int = Field(
        default=8,
        ge=1,
        le=128,
        description="Training batch size",
    )
    gradient_accumulation_steps: int = Field(
        default=4,
        ge=1,
        le=32,
        description="Gradient accumulation steps",
    )
    num_epochs: int = Field(
        default=3,
        ge=1,
        le=20,
        description="Number of training epochs",
    )
    max_length: int = Field(
        default=512,
        ge=128,
        le=2048,
        description="Maximum sequence length",
    )
    warmup_steps: int = Field(
        default=100,
        ge=0,
        le=1000,
        description="Number of warmup steps",
    )
    weight_decay: float = Field(
        default=0.01,
        ge=0.0,
        le=0.1,
        description="Weight decay for regularization",
    )

    # --- Efficiency and checkpointing ---
    use_lora: bool = Field(
        default=True,
        description="Use LoRA (Low-Rank Adaptation) for efficient fine-tuning",
    )
    lora_rank: int = Field(
        default=16,
        ge=4,
        le=128,
        description="LoRA rank parameter",
    )
    lora_alpha: int = Field(
        default=32,
        ge=8,
        le=256,
        description="LoRA alpha parameter",
    )
    use_mixed_precision: bool = Field(
        default=True,
        description="Use mixed precision training",
    )
    save_steps: int = Field(
        default=500,
        ge=50,
        le=5000,
        description="Save checkpoint every N steps",
    )
    eval_steps: int = Field(
        default=100,
        ge=10,
        le=1000,
        description="Evaluate every N steps",
    )

    # --- Experiment tracking ---
    experiment_name: Optional[str] = Field(
        default=None,
        max_length=100,
        description="Name for experiment tracking",
    )
    tags: List[str] = Field(
        default_factory=list,
        description="Tags for organizing experiments",
    )

    model_config = {
        # model_name and model_type begin with "model_", a prefix Pydantic v2
        # can treat as a protected namespace (it emits a UserWarning at
        # class-creation time in releases where that prefix is protected).
        # Opt out so these intentional field names are declared cleanly.
        "protected_namespaces": (),
        "json_schema_extra": {
            "example": {
                "model_name": "microsoft/DialoGPT-small",
                "model_type": "llm",
                "strategy": "supervised",
                "min_quality_score": 0.8,
                "require_feedback": True,
                "feedback_types": ["good"],
                "learning_rate": 2e-5,
                "batch_size": 8,
                "num_epochs": 3,
                "use_lora": True,
                "experiment_name": "quality-improvement-v1",
            }
        },
    }
|
|
|
|
|
|
|
|
class TrainingJobRequest(BaseModel):
    """Request to start a training job."""

    # Schema example is declared first so the field list reads uninterrupted.
    model_config = {
        "json_schema_extra": {
            "example": {
                "config": {
                    "model_name": "microsoft/DialoGPT-small",
                    "strategy": "supervised",
                    "learning_rate": 2e-5,
                    "batch_size": 8,
                    "num_epochs": 3,
                },
                "description": "Fine-tune model on high-quality responses",
            }
        }
    }

    # Required: the full training configuration for the job.
    config: TrainingConfig = Field(
        ...,
        description="Training configuration",
    )
    # Optional free-text note, capped at 500 characters.
    description: Optional[str] = Field(
        default=None,
        max_length=500,
        description="Description of the training job",
    )
|
|
|
|
|
|
|
|
class TrainingMetrics(BaseModel):
    """Training metrics and statistics."""

    # Schema example is declared first so the field list reads uninterrupted.
    model_config = {
        "json_schema_extra": {
            "example": {
                "current_epoch": 2,
                "total_epochs": 3,
                "current_step": 450,
                "total_steps": 600,
                "progress_percentage": 75.0,
                "train_loss": 0.85,
                "eval_loss": 0.92,
                "best_eval_loss": 0.89,
                "learning_rate": 1.5e-5,
                "examples_per_second": 12.5,
                "elapsed_time": 1800.0,
                "estimated_remaining": 600.0,
            }
        }
    }

    # --- Progress counters (all required) ---
    current_epoch: int = Field(
        ...,
        description="Current training epoch",
    )
    total_epochs: int = Field(
        ...,
        description="Total number of epochs",
    )
    current_step: int = Field(
        ...,
        description="Current training step",
    )
    total_steps: int = Field(
        ...,
        description="Total number of steps",
    )
    progress_percentage: float = Field(
        ...,
        ge=0.0,
        le=100.0,
        description="Training progress percentage",
    )

    # --- Loss values (optional) ---
    train_loss: Optional[float] = Field(
        default=None,
        description="Current training loss",
    )
    eval_loss: Optional[float] = Field(
        default=None,
        description="Current evaluation loss",
    )
    best_eval_loss: Optional[float] = Field(
        default=None,
        description="Best evaluation loss so far",
    )

    # --- Optimiser and throughput readings (optional) ---
    learning_rate: Optional[float] = Field(
        default=None,
        description="Current learning rate",
    )
    grad_norm: Optional[float] = Field(
        default=None,
        description="Gradient norm",
    )
    examples_per_second: Optional[float] = Field(
        default=None,
        description="Training speed",
    )

    # --- Timing, in seconds (optional) ---
    elapsed_time: Optional[float] = Field(
        default=None,
        description="Elapsed time in seconds",
    )
    estimated_remaining: Optional[float] = Field(
        default=None,
        description="Estimated remaining time in seconds",
    )
|
|
|
|
|
|
|
|
class EvaluationResult(BaseModel):
    """Results from model evaluation."""

    # Schema example is declared first so the field list reads uninterrupted.
    model_config = {
        "json_schema_extra": {
            "example": {
                "perplexity": 15.2,
                "bleu_score": 0.65,
                "rouge_l": 0.72,
                "avg_quality_score": 0.83,
                "avg_toxicity_score": 0.05,
                "response_length_avg": 45.2,
                "sample_inputs": ["How to set up a bot?"],
                "sample_outputs": ["To set up a bot, follow these steps..."],
                "sample_scores": [0.9],
            }
        }
    }

    # --- Text-generation metrics (optional) ---
    perplexity: Optional[float] = Field(
        default=None,
        description="Model perplexity",
    )
    bleu_score: Optional[float] = Field(
        default=None,
        description="BLEU score",
    )
    rouge_l: Optional[float] = Field(
        default=None,
        description="ROUGE-L score",
    )

    # --- Aggregate response statistics (optional) ---
    avg_quality_score: Optional[float] = Field(
        default=None,
        description="Average quality score",
    )
    avg_toxicity_score: Optional[float] = Field(
        default=None,
        description="Average toxicity score",
    )
    response_length_avg: Optional[float] = Field(
        default=None,
        description="Average response length",
    )

    # --- Sample records; each list defaults to a fresh empty list ---
    sample_inputs: List[str] = Field(
        default_factory=list,
        description="Sample input messages",
    )
    sample_outputs: List[str] = Field(
        default_factory=list,
        description="Sample generated outputs",
    )
    sample_scores: List[float] = Field(
        default_factory=list,
        description="Sample quality scores",
    )
|
|
|
|
|
|
|
|
class TrainingJob(BaseModel):
    """Training job information."""

    # --- Identity and request data ---
    id: UUID = Field(..., description="Training job ID")
    status: TrainingStatus = Field(..., description="Current job status")
    config: TrainingConfig = Field(..., description="Training configuration")
    description: Optional[str] = Field(default=None, description="Job description")

    # --- Timestamps ---
    created_at: datetime = Field(..., description="Job creation time")
    started_at: Optional[datetime] = Field(default=None, description="Job start time")
    completed_at: Optional[datetime] = Field(
        default=None,
        description="Job completion time",
    )

    # --- Progress and results ---
    metrics: Optional[TrainingMetrics] = Field(
        default=None,
        description="Training metrics",
    )
    evaluation: Optional[EvaluationResult] = Field(
        default=None,
        description="Evaluation results",
    )

    # --- Output artifacts ---
    model_path: Optional[str] = Field(default=None, description="Path to trained model")
    model_version: Optional[str] = Field(
        default=None,
        description="Model version identifier",
    )
    logs_path: Optional[str] = Field(default=None, description="Path to training logs")

    # --- Failure information ---
    error_message: Optional[str] = Field(
        default=None,
        description="Error message if failed",
    )
    error_details: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Detailed error information",
    )

    model_config = {
        # Allows the model to be populated from attribute-bearing objects
        # rather than only from dicts.
        "from_attributes": True,
        # model_path and model_version begin with "model_", a prefix Pydantic
        # v2 can treat as a protected namespace (it emits a UserWarning at
        # class-creation time in releases where that prefix is protected).
        # Opt out so these intentional field names are declared cleanly.
        "protected_namespaces": (),
        "json_schema_extra": {
            "example": {
                "id": "123e4567-e89b-12d3-a456-426614174000",
                "status": "running",
                "config": {
                    "model_name": "microsoft/DialoGPT-small",
                    "strategy": "supervised",
                    "learning_rate": 2e-5,
                },
                "description": "Quality improvement training",
                "created_at": "2025-01-07T10:00:00Z",
                "started_at": "2025-01-07T10:05:00Z",
                "model_version": "v1.2.0",
            }
        },
    }
|
|
|
|
|
|
|
|
class TrainingJobList(BaseModel):
    """List of training jobs."""

    # Schema example is declared first so the field list reads uninterrupted.
    model_config = {
        "json_schema_extra": {
            "example": {
                "jobs": [],
                "total": 15,
                "limit": 10,
                "offset": 0,
            }
        }
    }

    # The jobs on this page of results.
    jobs: List[TrainingJob] = Field(
        ...,
        description="List of training jobs",
    )
    # Paging information echoed back alongside the page.
    total: int = Field(
        ...,
        description="Total number of jobs",
    )
    limit: int = Field(
        ...,
        description="Limit used in query",
    )
    offset: int = Field(
        ...,
        description="Offset used in query",
    )
|
|
|
|
|
|
|
|
class ModelDeployRequest(BaseModel):
    """Request to deploy a trained model."""

    # Which training job's output to deploy, and as what kind of model.
    job_id: UUID = Field(..., description="Training job ID")
    model_type: ModelType = Field(..., description="Type of model to deploy")

    # Deployment options; both default to True.
    set_as_default: bool = Field(
        default=True,
        description="Set as default model for the service",
    )
    backup_current: bool = Field(
        default=True,
        description="Backup current model before deployment",
    )

    model_config = {
        # model_type begins with "model_", a prefix Pydantic v2 can treat as a
        # protected namespace (it emits a UserWarning at class-creation time
        # in releases where that prefix is protected). Opt out so this
        # intentional field name is declared cleanly.
        "protected_namespaces": (),
        "json_schema_extra": {
            "example": {
                "job_id": "123e4567-e89b-12d3-a456-426614174000",
                "model_type": "llm",
                "set_as_default": True,
                "backup_current": True,
            }
        },
    }
|
|
|
|
|
|
|
|
class ModelDeployResponse(BaseModel):
    """Response from model deployment."""

    # Outcome of the deployment.
    success: bool = Field(..., description="Whether deployment was successful")
    model_version: str = Field(..., description="Deployed model version")

    # Details about the model that was replaced; both are optional.
    previous_version: Optional[str] = Field(
        default=None,
        description="Previous model version",
    )
    backup_path: Optional[str] = Field(
        default=None,
        description="Path to backup model",
    )

    # Status text to accompany the result.
    message: str = Field(..., description="Deployment status message")

    model_config = {
        # model_version begins with "model_", a prefix Pydantic v2 can treat
        # as a protected namespace (it emits a UserWarning at class-creation
        # time in releases where that prefix is protected). Opt out so this
        # intentional field name is declared cleanly.
        "protected_namespaces": (),
        "json_schema_extra": {
            "example": {
                "success": True,
                "model_version": "v1.2.0",
                "previous_version": "v1.1.5",
                "backup_path": "/models/backups/llm_v1.1.5",
                "message": "Model deployed successfully",
            }
        },
    }
|
|
|