| """ | |
| Hyper-advanced configuration system with environment-aware settings. | |
| """ | |
| from pydantic_settings import BaseSettings | |
| from pydantic import Field, validator | |
| from typing import Dict, List, Optional, Literal, Any | |
| from enum import Enum | |
| from pathlib import Path | |
| import torch | |
class OptimizationLevel(str, Enum):
    NONE = "none"
    BASIC = "basic"
    ADVANCED = "advanced"
    HYPER = "hyper"


class QuantizationType(str, Enum):
    NONE = "none"
    INT8 = "int8"
    INT4 = "int4"
    GPTQ = "gptq"
    GGUF = "gguf"
    ONNX = "onnx"


class DeviceType(str, Enum):
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"  # Apple Silicon
    AUTO = "auto"
class HyperAdvancedConfig(BaseSettings):
    """Hyper-advanced configuration for production RAG system."""

    # ===== Paths =====
    base_dir: Path = Path(__file__).parent
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "data")
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "models")
    cache_dir: Path = Field(default_factory=lambda: Path(__file__).parent / ".cache")
    logs_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "logs")

    # ===== Model Configuration =====
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_quantization: QuantizationType = QuantizationType.ONNX
    embedding_device: DeviceType = DeviceType.CPU
    embedding_batch_size: int = 32

    llm_model: str = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
    llm_quantization: QuantizationType = QuantizationType.GGUF
    llm_device: DeviceType = DeviceType.CPU
    llm_max_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95
    llm_repetition_penalty: float = 1.1

    # ===== RAG Optimization =====
    optimization_level: OptimizationLevel = OptimizationLevel.HYPER
    chunk_size: int = 512
    chunk_overlap: int = 64
    dynamic_top_k: Dict[str, int] = {
        "simple": 2,   # < 5 words
        "medium": 4,   # 5-15 words
        "complex": 6,  # 15-30 words
        "expert": 8    # > 30 words
    }

    # ===== Advanced Caching =====
    enable_embedding_cache: bool = True
    enable_semantic_cache: bool = True   # Cache similar queries
    enable_response_cache: bool = True
    cache_max_size_mb: int = 1024        # 1GB cache limit
    cache_ttl_seconds: int = 3600        # 1 hour

    # ===== Pre-filtering =====
    enable_keyword_filter: bool = True
    enable_semantic_filter: bool = True  # Use embeddings for pre-filter
    enable_hybrid_filter: bool = True    # Combine keyword + semantic
    filter_threshold: float = 0.3        # Cosine similarity threshold
    max_candidates: int = 100            # Max candidates for filtering

    # ===== Prompt Optimization =====
    enable_prompt_compression: bool = True
    enable_prompt_summarization: bool = True  # Summarize chunks
    max_prompt_tokens: int = 1024
    compression_ratio: float = 0.5            # Keep 50% of original content

    # ===== Inference Optimization =====
    enable_kv_cache: bool = True               # Key-value caching for LLM
    enable_speculative_decoding: bool = False  # Experimental
    enable_continuous_batching: bool = True    # vLLM feature
    inference_batch_size: int = 1
    num_beams: int = 1                         # For beam search

    # ===== Memory Optimization =====
    enable_memory_mapping: bool = True      # MMAP for large models
    enable_weight_offloading: bool = False  # Offload to disk if needed
    max_memory_usage_gb: float = 4.0        # Limit memory usage

    # ===== Monitoring & Metrics =====
    enable_prometheus: bool = True
    enable_tracing: bool = True  # OpenTelemetry tracing
    metrics_port: int = 9090
    health_check_interval: int = 30

    # ===== Distributed Features =====
    enable_redis_cache: bool = False
    enable_celery_tasks: bool = False
    enable_model_sharding: bool = False  # Shard model across devices

    # ===== Experimental Features =====
    enable_retrieval_augmentation: bool = False  # Learn to retrieve better
    enable_feedback_loop: bool = False           # Learn from user feedback
    enable_adaptive_chunking: bool = False       # Dynamic chunk sizes

    # ===== Performance Targets =====
    target_latency_ms: Dict[str, int] = {
        "p95": 200,   # 95% of queries under 200ms
        "p99": 500,   # 99% under 500ms
        "max": 1000   # Never exceed 1s
    }
    # ===== Automatic Configuration =====
    @validator("embedding_device", "llm_device", pre=True)
    @classmethod
    def auto_detect_device(cls, v):
        """Resolve DeviceType.AUTO to the best available backend."""
        if v == DeviceType.AUTO:
            if torch.cuda.is_available():
                return DeviceType.CUDA
            elif torch.backends.mps.is_available():
                return DeviceType.MPS
            else:
                return DeviceType.CPU
        return v
    def use_quantized_llm(self) -> bool:
        """Check if we're using a quantized LLM."""
        return self.llm_quantization != QuantizationType.NONE

    def is_cpu_only(self) -> bool:
        """Check if running on CPU only."""
        return self.llm_device == DeviceType.CPU and self.embedding_device == DeviceType.CPU

    def model_paths(self) -> Dict[str, Path]:
        """Get all model paths."""
        return {
            "embedding": self.models_dir / self.embedding_model.split("/")[-1],
            "llm": self.models_dir / self.llm_model.split("/")[-1]
        }

    def get_optimization_flags(self) -> Dict[str, bool]:
        """Get optimization flags based on level."""
        flags = {
            "basic": self.optimization_level in [OptimizationLevel.BASIC, OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "advanced": self.optimization_level in [OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "hyper": self.optimization_level == OptimizationLevel.HYPER,
            "experimental": self.optimization_level == OptimizationLevel.HYPER
        }
        return flags
    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False

# Global config instance
config = HyperAdvancedConfig()

if __name__ == "__main__":
    # Quick smoke test when this module is run directly
    print("⚡ Hyper-Advanced Configuration Loaded:")
    print(f"  - Optimization Level: {config.optimization_level}")
    print(f"  - LLM Device: {config.llm_device}")
    print(f"  - Quantization: {config.llm_quantization}")
    print(f"  - CPU Only: {config.is_cpu_only()}")