"""
FinEE Training Pipeline v1.0

Master orchestrator for training the Finance Entity Extractor.
Handles data generation, domain adaptation, fine-tuning, and export.
"""

import argparse
import json
import subprocess
import sys
import logging
import time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any

# Console logging shared by every pipeline step.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
| |
|
| | |
# Central pipeline configuration: model paths, per-step scripts, and
# training hyperparameters. Edit here rather than inside the step methods.
CONFIG = {
    "version": "1.0.0",
    "project_name": "finee",

    # Model locations: base HF checkpoint, optional domain-adapted base,
    # fused final model, and the LoRA adapter output directory.
    "models": {
        "base": "microsoft/Phi-3-mini-4k-instruct",
        "domain": "models/base/phi3-finance-base",
        "final": "models/finee-v1.0",
        "adapter": "models/adapters/finee-adapter-v1",
    },

    "data_generation": {
        "script": "scripts/generate_comprehensive_data.py",
        "output_dir": "data/training",
        "samples": 10000,
    },

    # Optional continued pre-training on domain text; disabled by default.
    "domain_pretrain": {
        "enabled": False,
        "script": "scripts/domain_pretrain.py",
        "iters": 2000,
    },

    # LoRA fine-tuning hyperparameters passed to the mlx_lm.lora CLI.
    "finetune": {
        "script": "scripts/retrain_v8.py",
        "iters": 1000,
        "batch_size": 4,
        "learning_rate": 1e-5,
        "lora_layers": 16,
    },

    "evaluation": {
        "script": "scripts/test_multi_bank.py",
        "benchmark_dir": "data/benchmark",
    },

    "export": {
        "script": "scripts/upload_to_hf.py",
        "repo_id": "Ranjit0034/finance-entity-extractor",
    }
}
| |
|
class Pipeline:
    """Orchestrates the FinEE training workflow.

    Each step shells out to a helper script or an mlx_lm CLI tool and
    reports success/failure as a bool; run_all() aborts on the first
    failing step. With dry_run=True, commands are logged but not executed.
    """

    def __init__(self, dry_run: bool = False):
        self.dry_run = dry_run
        self.start_time = time.time()
        self.ensure_directories()

    def ensure_directories(self):
        """Create the directory layout the pipeline steps expect."""
        dirs = [
            "data/training",
            "data/benchmark",
            "models/base",
            "models/adapters",
            "logs",
        ]
        for d in dirs:
            Path(d).mkdir(parents=True, exist_ok=True)

    def run_step(self, name: str, cmd: List[str], cwd: str = ".") -> bool:
        """Run a single pipeline step as a subprocess.

        Returns True on success (or when dry_run is set), False when the
        command exits non-zero or cannot be launched at all.
        """
        logger.info(f"▶️ STARTING STEP: {name}")
        logger.info(f"Command: {' '.join(cmd)}")

        if self.dry_run:
            logger.info("Dry run - Skipping execution")
            return True

        try:
            subprocess.run(cmd, cwd=cwd, check=True)
            logger.info(f"✅ COMPLETED STEP: {name}")
            return True
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            # FileNotFoundError covers a missing executable (e.g. mlx_lm
            # not installed) — report it as a failed step instead of
            # crashing the orchestrator with an unhandled exception.
            logger.error(f"❌ FAILED STEP: {name}")
            logger.error(str(e))
            return False

    def check_dependencies(self):
        """Verify the MLX and finee packages are importable."""
        logger.info("Verifying dependencies...")
        try:
            import mlx.core
            import finee
            logger.info(f"Found finee version: {finee.__version__}")
            return True
        except ImportError as e:
            logger.error(f"Missing dependency: {e}")
            logger.error("Please run: pip install -e .[metal]")
            return False

    def generate_data(self):
        """Step 1: Generate synthetic training data."""
        script = CONFIG["data_generation"]["script"]
        return self.run_step(
            "Data Generation",
            [sys.executable, script]
        )

    def domain_pretrain(self):
        """Step 2: Domain adaptation (optional, gated by config)."""
        if not CONFIG["domain_pretrain"]["enabled"]:
            logger.info("Skipping domain pre-training (disabled in config)")
            return True

        script = CONFIG["domain_pretrain"]["script"]
        return self.run_step(
            "Domain Pre-training",
            [sys.executable, script]
        )

    def finetune(self):
        """Step 3: LoRA fine-tuning via the mlx_lm.lora CLI."""
        cmd = [
            "mlx_lm.lora",
            "--model", CONFIG["models"]["base"],
            "--train",
            "--data", CONFIG["data_generation"]["output_dir"],
            "--adapter-path", CONFIG["models"]["adapter"],
            "--iters", str(CONFIG["finetune"]["iters"]),
            "--batch-size", str(CONFIG["finetune"]["batch_size"]),
            "--learning-rate", str(CONFIG["finetune"]["learning_rate"]),
            "--lora-layers", str(CONFIG["finetune"]["lora_layers"]),
            "--seed", "42"  # fixed seed for reproducible training runs
        ]
        return self.run_step("Fine-tuning", cmd)

    def fuse_model(self):
        """Step 4: Fuse the LoRA adapter into a standalone model."""
        cmd = [
            "mlx_lm.fuse",
            "--model", CONFIG["models"]["base"],
            "--adapter-path", CONFIG["models"]["adapter"],
            "--save-path", CONFIG["models"]["final"]
        ]
        return self.run_step("Model Fusion", cmd)

    def evaluate(self):
        """Step 5: Run the multi-bank benchmark evaluation."""
        script = CONFIG["evaluation"]["script"]
        return self.run_step(
            "Evaluation",
            [sys.executable, script]
        )

    def export(self):
        """Step 6: Upload the final model to the Hugging Face Hub."""
        script = CONFIG["export"]["script"]
        return self.run_step(
            "HuggingFace Export",  # fixed typo: was "HugginFace Export"
            [sys.executable, script]
        )

    def run_all(self):
        """Run every step in order; exit(1) on the first failure."""
        if not self.check_dependencies():
            return

        steps = [
            self.generate_data,
            self.domain_pretrain,
            self.finetune,
            self.fuse_model,
            self.evaluate,
            self.export,
        ]

        for step in steps:
            if not step():
                logger.error("Pipeline aborted due to failure.")
                sys.exit(1)

        duration = time.time() - self.start_time
        logger.info(f"🎉 Pipeline completed successfully in {duration/60:.2f} minutes")
| |
|
def main():
    """CLI entry point: run the full pipeline or one named step."""
    parser = argparse.ArgumentParser(description="FinEE Training Pipeline")
    parser.add_argument("--step", choices=["data", "pretrain", "finetune", "fuse", "eval", "export", "all"], default="all")
    parser.add_argument("--dry-run", action="store_true", help="Print commands without executing")
    args = parser.parse_args()

    pipeline = Pipeline(dry_run=args.dry_run)

    if args.step == "all":
        pipeline.run_all()
    else:
        # Single-step mode still logs missing dependencies, but (unlike
        # run_all) does not gate the step on the check's result.
        pipeline.check_dependencies()
        steps = {
            "data": pipeline.generate_data,
            "pretrain": pipeline.domain_pretrain,
            "finetune": pipeline.finetune,
            "fuse": pipeline.fuse_model,
            "eval": pipeline.evaluate,
            "export": pipeline.export,
        }
        steps[args.step]()


if __name__ == "__main__":
    main()
| |
|