#!/usr/bin/env python3
"""
Centralized Configuration Management for Unified AI Services
Manages configuration for NER, OCR, and RAG services

All values are read from environment variables (optionally populated from a
``.env`` file via python-dotenv) at the time a config object is constructed.
"""

import os
import logging
from pathlib import Path
from typing import Optional, Dict, Any, List

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is only needed to read a .env file; without it the
    # process environment is still honoured. A warning is logged below once
    # logging has been configured.
    load_dotenv = None

# Values of a boolean environment variable that count as "enabled".
_TRUTHY_VALUES = frozenset({"1", "true", "yes", "on"})


def _resolve_log_level(name: str) -> int:
    """Map a level name such as ``"debug"`` to its ``logging`` constant.

    Unknown names fall back to ``logging.INFO`` instead of raising at import.
    """
    level = getattr(logging, name.strip().upper(), None)
    return level if isinstance(level, int) else logging.INFO


def _env_int(name: str, default: int) -> int:
    """Read environment variable *name* as an integer.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or blank.

    Raises:
        ValueError: If the variable is set to a non-integer value. The
            message names the offending variable.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err


def _env_flag(name: str, default: bool = False) -> bool:
    """Read environment variable *name* as a boolean flag.

    ``1``, ``true``, ``yes`` and ``on`` (case-insensitive) are True; any other
    value is False. *default* is returned when the variable is unset.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in _TRUTHY_VALUES


# Load environment variables
env_path = Path(__file__).parent / '.env'
if load_dotenv is not None:
    if env_path.exists():
        load_dotenv(dotenv_path=env_path)
    else:
        load_dotenv()  # Load from default location

# Setup logging (after the .env load so LOG_LEVEL may come from the file)
logging.basicConfig(
    level=_resolve_log_level(os.getenv("LOG_LEVEL", "INFO")),
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

if load_dotenv is None:
    logger.warning(
        "python-dotenv is not installed; .env files are ignored and only "
        "process environment variables are used"
    )


class BaseConfig:
    """Base configuration class with common settings.

    Every attribute is read from the environment when the instance is
    created; later changes to the environment are not reflected.
    """

    def __init__(self):
        # Server Configuration
        self.HOST = os.getenv("HOST", "0.0.0.0")
        self.DEBUG = _env_flag("DEBUG")

        # Database Configuration (shared by NER and RAG)
        self.POSTGRES_HOST = os.getenv("POSTGRES_HOST", "")
        self.POSTGRES_PORT = _env_int("POSTGRES_PORT", 5432)
        self.POSTGRES_USER = os.getenv("POSTGRES_USER", "")
        self.POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
        self.POSTGRES_DATABASE = os.getenv("POSTGRES_DATABASE", "postgres")

        # Azure OpenAI Configuration (shared by NER and RAG)
        self.AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        self.AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
        self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
        self.AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "text-embedding-3-large")

        # Azure Storage Configuration (shared by NER and RAG)
        self.AZURE_STORAGE_ACCOUNT_URL = os.getenv("AZURE_STORAGE_ACCOUNT_URL", "")
        self.AZURE_BLOB_SAS_TOKEN = os.getenv("AZURE_BLOB_SAS_TOKEN", "")
        self.BLOB_CONTAINER = os.getenv("BLOB_CONTAINER", "historylog")

        # Processing Configuration
        self.MAX_FILE_SIZE = _env_int("MAX_FILE_SIZE", 50) * 1024 * 1024  # Convert MB to bytes
        self.REQUEST_TIMEOUT = _env_int("REQUEST_TIMEOUT", 300)

        # CORS Configuration
        self.ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*")

    def validate_azure_openai(self) -> bool:
        """Return True when the Azure OpenAI endpoint and key are set.

        The template placeholder values are treated as "not configured".
        """
        return bool(
            self.AZURE_OPENAI_ENDPOINT and
            self.AZURE_OPENAI_API_KEY and
            self.AZURE_OPENAI_ENDPOINT != "YOUR_AZURE_OPENAI_ENDPOINT" and
            self.AZURE_OPENAI_API_KEY != "YOUR_AZURE_OPENAI_KEY"
        )

    def validate_postgres(self) -> bool:
        """Return True when host, user, password and database are all non-empty."""
        return bool(
            self.POSTGRES_HOST and
            self.POSTGRES_USER and
            self.POSTGRES_PASSWORD and
            self.POSTGRES_DATABASE
        )

    def validate_azure_storage(self) -> bool:
        """Return True when the storage account URL and SAS token are non-empty."""
        return bool(
            self.AZURE_STORAGE_ACCOUNT_URL and
            self.AZURE_BLOB_SAS_TOKEN
        )


class NERConfig(BaseConfig):
    """Configuration for NER Service"""

    def __init__(self):
        super().__init__()
        self.PORT = _env_int("NER_PORT", 8500)

        # DeepSeek Configuration
        self.DEEPSEEK_ENDPOINT = os.getenv("DEEPSEEK_ENDPOINT", "")
        self.DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
        self.DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "DeepSeek-R1-0528")

        # OCR Service Configuration
        self.OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")

        # NER Specific Settings
        self.MAX_TEXT_LENGTH = 100000  # 100KB
        self.SUPPORTED_TEXT_FORMATS = {'.txt', '.doc', '.docx', '.rtf'}
        self.SUPPORTED_OCR_FORMATS = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif'}

        # Entity and Relationship Types
        self.ENTITY_TYPES = [
            "PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY",
            "PRODUCT", "EVENT", "VEHICLE", "SUSPICIOUS_OBJECT", "ILLEGAL_ACTIVITY",
            "EVIDENCE", "ILLEGAL_ITEM", "WEAPON", "DRUG", "CHEMICAL", "DOCUMENT",
            "PHONE_NUMBER", "ADDRESS", "EMAIL"
        ]

        self.RELATIONSHIP_TYPES = [
            # Standard relationships
            "works_for", "founded", "located_in", "part_of", "associated_with",
            "owns", "manages", "leads", "reports_to", "collaborates_with",
            "partners_with", "supplies_to", "acquires", "invests_in",
            "headquartered_in", "operates_in", "born_in", "lives_in",
            "studied_at", "graduated_from", "worked_at", "visited", "attended",
            "participated_in", "sponsored", "developed", "created", "invented",
            "discovered", "published", "authored", "edited",

            # Thai relationships
            "ทำงานที่", "ก่อตั้ง", "ตั้งอยู่ที่", "เป็นส่วนหนึ่งของ", "เกี่ยวข้องกับ",
            "เป็นเจ้าของ", "จัดการ", "นำโดย", "รายงานต่อ", "ร่วมงานกับ",
            "เป็นพันธมิตรกับ", "จัดหาให้", "ซื้อกิจการ", "ลงทุนใน",
            "สำนักงานใหญ่ที่", "ดำเนินการใน", "เกิดที่", "อาศัยอยู่ที่",
            "ศึกษาที่", "จบการศึกษาจาก",

            # Law enforcement relationships
            "arrested_by", "investigated_by", "confiscated_from", "used_in",
            "evidence_of", "witness_of", "victim_of", "suspect_in",
            "charged_with", "convicted_of", "sentenced_by", "defended_by",
            "prosecuted_by", "testified_against", "alibi_for", "found_at",
            "seized_from", "linked_to",
            # NOTE(review): "ยึดจาก" appears twice below (once in the position
            # matching "confiscated_from", once matching "seized_from") --
            # confirm whether consumers rely on the duplicate entry.
            "จับกุมโดย", "สอบสวนโดย", "ยึดจาก", "ใช้ในการ", "หลักฐานของ",
            "พยานใน", "เหยื่อของ", "ผู้ต้องสงสัยใน", "ถูกตั้งข้อหา",
            "ถูกตัดสิน", "ถูกพิพากษาโดย", "ต่อสู้คดีโดย", "ฟ้องร้องโดย",
            "ให้การต่อต้าน", "เป็นข้อแก้ตัวสำหรับ", "พบที่", "ยึดจาก",
            "เชื่อมโยงกับ",

            # Criminal relationships
            "possess_illegal", "transport_illegal", "sell_illegal",
            "buy_illegal", "hide_evidence", "plan_crime", "commit_crime",
            "flee_from", "escape_from", "hide_at", "meet_with",
            "communicate_with", "threaten", "blackmail", "bribe", "corrupt",
            "money_launder",
            "ครอบครองของผิดกฎหมาย", "ขนส่งของผิดกฎหมาย", "ขายของผิดกฎหมาย",
            "ซื้อของผิดกฎหมาย", "ซ่อนหลักฐาน", "วางแผนอาชญากรรม",
            "กระทำอาชญากรรม", "หลบหนีจาก", "แอบซ่อนที่", "พบปะกับ",
            "ติดต่อกับ", "ข่มขู่", "แบล็คเมล์", "ให้สินบน", "ทุจริต", "ฟอกเงิน"
        ]

    def validate_deepseek(self) -> bool:
        """Return True when the DeepSeek endpoint and key are set.

        The template placeholder values are treated as "not configured".
        """
        return bool(
            self.DEEPSEEK_ENDPOINT and
            self.DEEPSEEK_API_KEY and
            self.DEEPSEEK_ENDPOINT != "YOUR_DEEPSEEK_ENDPOINT" and
            self.DEEPSEEK_API_KEY != "YOUR_DEEPSEEK_API_KEY"
        )


class OCRConfig(BaseConfig):
    """Configuration for OCR Service"""

    def __init__(self):
        super().__init__()
        self.PORT = _env_int("OCR_PORT", 8400)

        # Azure Document Intelligence Configuration
        self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "")
        self.AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY", "")

        # Web scraping configuration
        self.MAX_IMAGES_PER_PAGE = _env_int("MAX_IMAGES_PER_PAGE", 10)
        self.USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    def validate_azure_document_intelligence(self) -> bool:
        """Return True when the Document Intelligence endpoint and key are set.

        The template placeholder values are treated as "not configured".
        """
        return bool(
            self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
            self.AZURE_DOCUMENT_INTELLIGENCE_KEY and
            self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT != "YOUR_FORM_RECOGNIZER_ENDPOINT" and
            self.AZURE_DOCUMENT_INTELLIGENCE_KEY != "YOUR_FORM_RECOGNIZER_KEY"
        )


class RAGConfig(BaseConfig):
    """Configuration for RAG Service"""

    def __init__(self):
        super().__init__()
        self.PORT = _env_int("RAG_PORT", 8401)

        # OCR Service Configuration
        self.OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")

        # PostgreSQL Configuration (specific to RAG)
        # PG_* mirror the shared POSTGRES_* values except for the database
        # name, which has its own variable and default.
        self.PG_HOST = self.POSTGRES_HOST
        self.PG_PORT = self.POSTGRES_PORT
        self.PG_DATABASE = os.getenv("PG_DATABASE", "vectorsearch")  # RAG uses different default DB
        self.PG_USER = self.POSTGRES_USER
        self.PG_PASSWORD = self.POSTGRES_PASSWORD
        self.PG_SSL_MODE = os.getenv("PG_SSL_MODE", "require")

        # Chunking Configuration
        self.CHUNK_SIZE = _env_int("CHUNK_SIZE", 1000)
        self.CHUNK_OVERLAP = _env_int("CHUNK_OVERLAP", 200)
        self.MIN_CHUNK_SIZE = _env_int("MIN_CHUNK_SIZE", 50)

        # Azure OpenAI Configuration (RAG specific)
        self.AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small")
        self.AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")


class UnifiedConfig:
    """Unified configuration for all services.

    Holds one config object per service (``ner``, ``ocr``, ``rag``) plus the
    settings of the main app that fronts them.
    """

    def __init__(self):
        self.ner = NERConfig()
        self.ocr = OCRConfig()
        self.rag = RAGConfig()

        # Main app configuration
        self.MAIN_PORT = _env_int("MAIN_PORT", 8000)
        self.MAIN_HOST = os.getenv("MAIN_HOST", "0.0.0.0")

        # Service URLs (for inter-service communication)
        # Built from each service's port on localhost; the OCR_SERVICE_URL
        # environment variable is not consulted here.
        self.NER_SERVICE_URL = f"http://localhost:{self.ner.PORT}"
        self.OCR_SERVICE_URL = f"http://localhost:{self.ocr.PORT}"
        self.RAG_SERVICE_URL = f"http://localhost:{self.rag.PORT}"

        # Service Health Check Configuration
        self.HEALTH_CHECK_TIMEOUT = 30
        self.HEALTH_CHECK_RETRIES = 3
        self.HEALTH_CHECK_INTERVAL = 5

        # Load balancing and routing
        self.SERVICE_WEIGHTS = {
            "ner": 1.0,
            "ocr": 1.0,
            "rag": 1.0
        }

    def validate_all(self) -> Dict[str, Dict[str, bool]]:
        """Validate all service configurations.

        Returns:
            A mapping ``service -> component -> is_configured`` for the
            ``ner``, ``ocr`` and ``rag`` services.
        """
        validation_results = {
            "ner": {
                "deepseek": self.ner.validate_deepseek(),
                "azure_openai": self.ner.validate_azure_openai(),
                "postgres": self.ner.validate_postgres(),
                "azure_storage": self.ner.validate_azure_storage()
            },
            "ocr": {
                "azure_document_intelligence": self.ocr.validate_azure_document_intelligence()
            },
            "rag": {
                "azure_openai": self.rag.validate_azure_openai(),
                "postgres": self.rag.validate_postgres()
            }
        }
        return validation_results

    def get_service_config(self, service_name: str) -> Optional[BaseConfig]:
        """Get configuration for a specific service.

        Args:
            service_name: ``"ner"``, ``"ocr"`` or ``"rag"`` (case-insensitive).

        Returns:
            The matching config object, or ``None`` for an unknown name.
        """
        service_configs = {
            "ner": self.ner,
            "ocr": self.ocr,
            "rag": self.rag
        }
        return service_configs.get(service_name.lower())

    def get_database_config(self) -> Dict[str, str]:
        """Get database configuration for services that need it.

        Connection values come from the NER config (the shared POSTGRES_*
        settings); ``ssl_mode`` comes from the RAG config. All values are
        strings, including the port.
        """
        return {
            "host": self.ner.POSTGRES_HOST,
            "port": str(self.ner.POSTGRES_PORT),
            "user": self.ner.POSTGRES_USER,
            "password": self.ner.POSTGRES_PASSWORD,
            "database": self.ner.POSTGRES_DATABASE,
            "ssl_mode": getattr(self.rag, 'PG_SSL_MODE', 'require')
        }

    def get_azure_openai_config(self) -> Dict[str, str]:
        """Get Azure OpenAI configuration for services that need it.

        Values are taken from the NER config's shared Azure OpenAI settings.
        """
        return {
            "endpoint": self.ner.AZURE_OPENAI_ENDPOINT,
            "api_key": self.ner.AZURE_OPENAI_API_KEY,
            "embedding_model": self.ner.EMBEDDING_MODEL,
            "deployment_name": self.ner.AZURE_OPENAI_DEPLOYMENT_NAME
        }

    def print_configuration_summary(self):
        """Print a summary of all configurations to stdout.

        Secrets (passwords, API keys, tokens) are not printed; only their
        configured/not-configured status is shown.
        """
        print("🔧 Configuration Summary")
        print("=" * 50)

        # Validate all configurations
        validation_results = self.validate_all()
        ner_status = validation_results['ner']
        ocr_status = validation_results['ocr']
        rag_status = validation_results['rag']

        def mark(is_valid: bool) -> str:
            return '✅' if is_valid else '❌'

        # NER Service
        print(f"📝 NER Service (Port {self.ner.PORT}):")
        print(f" DeepSeek: {mark(ner_status['deepseek'])}")
        print(f" Azure OpenAI: {mark(ner_status['azure_openai'])}")
        print(f" PostgreSQL: {mark(ner_status['postgres'])}")
        print(f" Azure Storage: {mark(ner_status['azure_storage'])}")
        print(f" OCR Service URL: {self.ner.OCR_SERVICE_URL}")

        # OCR Service
        print(f"\n🔍 OCR Service (Port {self.ocr.PORT}):")
        print(f" Azure Document Intelligence: {mark(ocr_status['azure_document_intelligence'])}")
        print(f" Max File Size: {self.ocr.MAX_FILE_SIZE / (1024*1024):.0f} MB")

        # RAG Service
        print(f"\n🧠 RAG Service (Port {self.rag.PORT}):")
        print(f" Azure OpenAI: {mark(rag_status['azure_openai'])}")
        print(f" PostgreSQL: {mark(rag_status['postgres'])}")
        print(f" OCR Service URL: {self.rag.OCR_SERVICE_URL}")
        print(f" Chunk Size: {self.rag.CHUNK_SIZE}")

        # Main App
        print(f"\n🌐 Main App (Port {self.MAIN_PORT}):")
        print(f" NER Service: {self.NER_SERVICE_URL}")
        print(f" OCR Service: {self.OCR_SERVICE_URL}")
        print(f" RAG Service: {self.RAG_SERVICE_URL}")

        # Database Configuration
        print("\n🗄️ Database Configuration:")
        print(f" Host: {self.ner.POSTGRES_HOST}")
        print(f" Port: {self.ner.POSTGRES_PORT}")
        print(f" User: {self.ner.POSTGRES_USER}")
        print(f" NER Database: {self.ner.POSTGRES_DATABASE}")
        print(f" RAG Database: {self.rag.PG_DATABASE}")

        # Critical Issues
        failures = [
            (service, component)
            for service, validations in validation_results.items()
            for component, is_valid in validations.items()
            if not is_valid
        ]
        if failures:
            print("\n⚠️ CONFIGURATION ISSUES DETECTED:")
            for service, component in failures:
                print(f" ❌ {service.upper()}: {component} not configured")
        else:
            print("\n✅ All configurations are valid!")


# Global configuration instance
config = UnifiedConfig()


def get_config() -> UnifiedConfig:
    """Get the global configuration instance"""
    return config


def validate_environment() -> bool:
    """Validate the entire environment configuration.

    Returns:
        True when every critical component is configured. DeepSeek and Azure
        Storage are reported by ``validate_all`` but are not treated as
        critical here.
    """
    validation_results = config.validate_all()

    # Check critical components
    critical_components = [
        validation_results['ner']['azure_openai'],
        validation_results['ner']['postgres'],
        validation_results['ocr']['azure_document_intelligence'],
        validation_results['rag']['azure_openai'],
        validation_results['rag']['postgres']
    ]

    return all(critical_components)


def main() -> None:
    """Test configuration loading and validation"""
    print("🧪 Testing Configuration Loading")
    print("=" * 40)

    try:
        config.print_configuration_summary()

        if validate_environment():
            print("\n🎉 Environment validation passed!")
            print("All critical services are properly configured.")
        else:
            print("\n❌ Environment validation failed!")
            print("Some critical services are not properly configured.")
            print("Please check your .env file and update missing values.")

    except Exception as e:
        # Top-level boundary of the diagnostic script: report and keep the
        # traceback in the log rather than crashing.
        print(f"\n❌ Configuration loading failed: {e}")
        logger.exception("Configuration error: %s", e)


if __name__ == "__main__":
    main()