"""
Example: ModelOpt Quantization and Export with SGLang

This example demonstrates the streamlined workflow for quantizing a model with
ModelOpt and automatically exporting it for deployment with SGLang.
"""

import argparse
import glob
import json
import os
from typing import Optional

import torch

import sglang as sgl
from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.distributed.parallel_state import (
    init_distributed_environment,
    initialize_model_parallel,
)
from sglang.srt.model_loader.loader import get_model_loader


def _validate_export(export_dir: str) -> bool:
    """Validate that an exported model directory contains the expected files."""
    required_files = ["config.json", "tokenizer_config.json"]

    if not os.path.exists(export_dir):
        return False

    # All required metadata files must be present.
    for file in required_files:
        if not os.path.exists(os.path.join(export_dir, file)):
            return False

    # The export must also contain at least one weights file in a known format.
    model_patterns = [
        "model*.safetensors",
        "pytorch_model*.bin",
    ]

    has_model_file = False
    for pattern in model_patterns:
        matching_files = glob.glob(os.path.join(export_dir, pattern))
        if matching_files:
            has_model_file = True
            break

    return has_model_file


def _get_export_info(export_dir: str) -> Optional[dict]:
    """Get information about an exported model."""
    if not _validate_export(export_dir):
        return None

    try:
        config_path = os.path.join(export_dir, "config.json")
        with open(config_path, "r") as f:
            config = json.load(f)

        return {
            "model_type": config.get("model_type", "unknown"),
            "architectures": config.get("architectures", []),
            "quantization_config": config.get("quantization_config", {}),
            "export_dir": export_dir,
        }
    except Exception:
        return None
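
# Example return value from _get_export_info (hypothetical values for a
# Llama-style FP8 export; the fields are read straight from config.json):
#
#   {
#       "model_type": "llama",
#       "architectures": ["LlamaForCausalLM"],
#       "quantization_config": {...},
#       "export_dir": "./quantized_model",
#   }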


def quantize_and_export_model(
    model_path: str,
    export_dir: str,
    quantization_method: str = "modelopt_fp8",
    checkpoint_save_path: Optional[str] = None,
    device: str = "cuda",
) -> None:
    """
    Quantize a model with ModelOpt and export it for SGLang deployment.

    Args:
        model_path: Path to the original model
        export_dir: Directory to export the quantized model
        quantization_method: Quantization method ("modelopt_fp8" or "modelopt_fp4")
        checkpoint_save_path: Optional path to save a ModelOpt checkpoint
        device: Device to use for quantization
    """
    print("🚀 Starting ModelOpt quantization and export workflow")
    print(f"📥 Input model: {model_path}")
    print(f"📤 Export directory: {export_dir}")
    print(f"⚙️ Quantization method: {quantization_method}")

    # Set up a minimal single-process distributed environment before loading,
    # since the SGLang model loader runs inside a distributed context.
    if not torch.distributed.is_initialized():
        print("🔧 Initializing distributed environment...")
        os.environ["RANK"] = "0"
        os.environ["WORLD_SIZE"] = "1"
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        os.environ["LOCAL_RANK"] = "0"

        init_distributed_environment(
            world_size=1,
            rank=0,
            local_rank=0,
            backend="nccl" if device == "cuda" else "gloo",
        )
        initialize_model_parallel(
            tensor_model_parallel_size=1,
            pipeline_model_parallel_size=1,
        )

    # Build the SGLang configs: the ModelOpt quantization method on ModelConfig
    # plus the export path on LoadConfig drive the quantize-then-export flow.
    model_config = ModelConfig(
        model_path=model_path,
        quantization=quantization_method,
        trust_remote_code=True,
    )
    load_config = LoadConfig(
        modelopt_checkpoint_save_path=checkpoint_save_path,
        modelopt_export_path=export_dir,
    )
    device_config = DeviceConfig(device=device)

    # Loading the model triggers ModelOpt quantization and, because
    # modelopt_export_path is set, exports the result to export_dir.
    print("🔄 Loading and quantizing model...")
    model_loader = get_model_loader(load_config, model_config)

    try:
        model_loader.load_model(
            model_config=model_config,
            device_config=device_config,
        )
        print("✅ Model quantized successfully!")

        # Sanity-check the exported artifacts before declaring success.
        if _validate_export(export_dir):
            print("✅ Export validation passed!")

            info = _get_export_info(export_dir)
            if info:
                print("📋 Model info:")
                print(f"  - Type: {info['model_type']}")
                print(f"  - Architecture: {info['architectures']}")
                print(f"  - Quantization: {info['quantization_config']}")
        else:
            print("❌ Export validation failed!")
            return

    except Exception as e:
        print(f"❌ Quantization failed: {e}")
        return

    print("\n🎉 Workflow completed successfully!")
    print(f"📁 Quantized model exported to: {export_dir}")
    print("\n📝 To use the exported model:")
    print(
        f"  python -m sglang.launch_server --model-path {export_dir} --quantization modelopt"
    )
    print("\n  # Or in Python:")
    print("  import sglang as sgl")
    print(f"  llm = sgl.Engine(model_path='{export_dir}', quantization='modelopt')")
    print("  # Note: 'modelopt' auto-detects FP4/FP8 from model config")


def deploy_exported_model(
    export_dir: str,
    host: str = "127.0.0.1",
    port: int = 30000,
) -> None:
    """
    Deploy an exported ModelOpt quantized model with SGLang.

    Args:
        export_dir: Directory containing the exported model
        host: Host to bind the server to
        port: Port to bind the server to
    """
    print(f"🚀 Deploying exported model from: {export_dir}")

    # Refuse to deploy from a directory that fails export validation.
    if not _validate_export(export_dir):
        print("❌ Invalid export directory!")
        return

    try:
        # The generic "modelopt" method lets SGLang auto-detect FP8 vs. FP4
        # from the exported model's quantization config.
        llm = sgl.Engine(
            model_path=export_dir,
            quantization="modelopt",
            host=host,
            port=port,
        )

        print("✅ Model deployed successfully!")
        print(f"🌐 Server running at http://{host}:{port}")

        # Run a small smoke test against the freshly deployed model.
        prompts = ["Hello, how are you?", "What is the capital of France?"]
        sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 100}

        print("\n🧪 Running example inference...")
        outputs = llm.generate(prompts, sampling_params)

        for i, output in enumerate(outputs):
            print(f"Prompt {i+1}: {prompts[i]}")
            print(f"Output: {output['text']}")
            print()

    except Exception as e:
        print(f"❌ Deployment failed: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="ModelOpt Quantization and Export with SGLang",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Quantize and export a model (recommended workflow)
  python modelopt_quantize_and_export.py quantize \\
      --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \\
      --export-dir ./quantized_model \\
      --quantization-method modelopt_fp8

  # Deploy a pre-exported model
  python modelopt_quantize_and_export.py deploy \\
      --export-dir ./quantized_model
""",
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Quantize command
    quantize_parser = subparsers.add_parser(
        "quantize", help="Quantize and export a model"
    )
    quantize_parser.add_argument(
        "--model-path", required=True, help="Path to the model to quantize"
    )
    quantize_parser.add_argument(
        "--export-dir", required=True, help="Directory to export the quantized model"
    )
    quantize_parser.add_argument(
        "--quantization-method",
        choices=["modelopt_fp8", "modelopt_fp4"],
        default="modelopt_fp8",
        help="Quantization method to use",
    )
    quantize_parser.add_argument(
        "--checkpoint-save-path", help="Optional path to save ModelOpt checkpoint"
    )
    quantize_parser.add_argument(
        "--device", default="cuda", help="Device to use for quantization"
    )

    # Deploy command
    deploy_parser = subparsers.add_parser("deploy", help="Deploy an exported model")
    deploy_parser.add_argument(
        "--export-dir", required=True, help="Directory containing the exported model"
    )
    deploy_parser.add_argument(
        "--host", default="127.0.0.1", help="Host to bind the server to"
    )
    deploy_parser.add_argument(
        "--port", type=int, default=30000, help="Port to bind the server to"
    )

    args = parser.parse_args()

    if args.command == "quantize":
        quantize_and_export_model(
            model_path=args.model_path,
            export_dir=args.export_dir,
            quantization_method=args.quantization_method,
            checkpoint_save_path=args.checkpoint_save_path,
            device=args.device,
        )
    elif args.command == "deploy":
        deploy_exported_model(
            export_dir=args.export_dir,
            host=args.host,
            port=args.port,
        )
    else:
        parser.print_help()


if __name__ == "__main__":
    main()