SB-PoC / configs.py
Chirapath's picture
First draft coding project
963ae98 verified
#!/usr/bin/env python3
"""
Centralized Configuration Management for Unified AI Services
Manages configuration for NER, OCR, and RAG services
"""
import os
import logging
from pathlib import Path
from typing import Optional, Dict, Any, List
from dotenv import load_dotenv
# Load environment variables: prefer a .env file sitting next to this module,
# otherwise fall back to load_dotenv()'s default lookup.
env_path = Path(__file__).parent / '.env'
if env_path.exists():
    load_dotenv(dotenv_path=env_path)
else:
    load_dotenv()  # Load from default location

# Setup logging.
# LOG_LEVEL is resolved defensively: a blank or unknown name (for example
# LOG_LEVEL=verbose) would otherwise raise AttributeError while this module is
# being imported. Anything that does not resolve to a numeric logging level
# falls back to INFO.
_log_level = getattr(logging, os.getenv("LOG_LEVEL", "INFO").strip().upper(), None)
if not isinstance(_log_level, int):
    _log_level = logging.INFO
logging.basicConfig(
    level=_log_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class BaseConfig:
    """Base configuration class with common settings.

    All values are read from the process environment when the object is
    constructed. Integer settings treat a variable that is present but blank
    (for example ``POSTGRES_PORT=`` in a .env file) the same as an unset one
    and use the default; a non-numeric value still raises ``ValueError``.
    """

    def __init__(self):
        # Server Configuration
        self.HOST = os.getenv("HOST", "0.0.0.0")
        # Only the literal "true" (any letter case) enables debug mode.
        self.DEBUG = os.getenv("DEBUG", "False").lower() == "true"

        # Database Configuration (shared by NER and RAG)
        self.POSTGRES_HOST = os.getenv("POSTGRES_HOST", "")
        # `or` instead of a getenv default so that a blank value also falls
        # back to the default rather than crashing in int("").
        self.POSTGRES_PORT = int(os.getenv("POSTGRES_PORT") or "5432")
        self.POSTGRES_USER = os.getenv("POSTGRES_USER", "")
        self.POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
        self.POSTGRES_DATABASE = os.getenv("POSTGRES_DATABASE", "postgres")

        # Azure OpenAI Configuration (shared by NER and RAG)
        self.AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        self.AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
        self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
        self.AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "text-embedding-3-large")

        # Azure Storage Configuration (shared by NER and RAG)
        self.AZURE_STORAGE_ACCOUNT_URL = os.getenv("AZURE_STORAGE_ACCOUNT_URL", "")
        self.AZURE_BLOB_SAS_TOKEN = os.getenv("AZURE_BLOB_SAS_TOKEN", "")
        self.BLOB_CONTAINER = os.getenv("BLOB_CONTAINER", "historylog")

        # Processing Configuration
        # MAX_FILE_SIZE is given in MB in the environment and stored in bytes.
        self.MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE") or "50") * 1024 * 1024
        self.REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT") or "300")

        # CORS Configuration (kept as the raw string from the environment)
        self.ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*")

    def validate_azure_openai(self) -> bool:
        """Return True when the Azure OpenAI endpoint and key are both set
        and neither is still the ``YOUR_...`` template placeholder."""
        return bool(
            self.AZURE_OPENAI_ENDPOINT and
            self.AZURE_OPENAI_API_KEY and
            self.AZURE_OPENAI_ENDPOINT != "YOUR_AZURE_OPENAI_ENDPOINT" and
            self.AZURE_OPENAI_API_KEY != "YOUR_AZURE_OPENAI_KEY"
        )

    def validate_postgres(self) -> bool:
        """Return True when host, user, password and database are all non-empty."""
        return bool(
            self.POSTGRES_HOST and
            self.POSTGRES_USER and
            self.POSTGRES_PASSWORD and
            self.POSTGRES_DATABASE
        )

    def validate_azure_storage(self) -> bool:
        """Return True when both the storage account URL and the SAS token are set."""
        return bool(
            self.AZURE_STORAGE_ACCOUNT_URL and
            self.AZURE_BLOB_SAS_TOKEN
        )
class NERConfig(BaseConfig):
    """Configuration for NER Service.

    Adds the NER port, DeepSeek credentials, the OCR service URL and the
    entity / relationship vocabularies on top of the shared ``BaseConfig``
    settings.
    """

    def __init__(self):
        super().__init__()
        # `or` instead of a getenv default: a blank NER_PORT (e.g. `NER_PORT=`
        # in a .env file) falls back to 8500 instead of crashing in int("").
        self.PORT = int(os.getenv("NER_PORT") or "8500")

        # DeepSeek Configuration
        self.DEEPSEEK_ENDPOINT = os.getenv("DEEPSEEK_ENDPOINT", "")
        self.DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
        self.DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "DeepSeek-R1-0528")

        # OCR Service Configuration
        self.OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")

        # NER Specific Settings (hard-coded, not read from the environment)
        self.MAX_TEXT_LENGTH = 100000  # 100KB
        self.SUPPORTED_TEXT_FORMATS = {'.txt', '.doc', '.docx', '.rtf'}
        self.SUPPORTED_OCR_FORMATS = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif'}

        # Entity and Relationship Types
        self.ENTITY_TYPES = [
            "PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY", "PRODUCT", "EVENT",
            "VEHICLE", "SUSPICIOUS_OBJECT", "ILLEGAL_ACTIVITY", "EVIDENCE", "ILLEGAL_ITEM",
            "WEAPON", "DRUG", "CHEMICAL", "DOCUMENT", "PHONE_NUMBER", "ADDRESS", "EMAIL"
        ]
        self.RELATIONSHIP_TYPES = [
            # Standard relationships
            "works_for", "founded", "located_in", "part_of", "associated_with", "owns", "manages",
            "leads", "reports_to", "collaborates_with", "partners_with", "supplies_to", "acquires",
            "invests_in", "headquartered_in", "operates_in", "born_in", "lives_in", "studied_at",
            "graduated_from", "worked_at", "visited", "attended", "participated_in", "sponsored",
            "developed", "created", "invented", "discovered", "published", "authored", "edited",
            # Thai relationships
            "ทำงานที่", "ก่อตั้ง", "ตั้งอยู่ที่", "เป็นส่วนหนึ่งของ", "เกี่ยวข้องกับ", "เป็นเจ้าของ", "จัดการ",
            "นำโดย", "รายงานต่อ", "ร่วมงานกับ", "เป็นพันธมิตรกับ", "จัดหาให้", "ซื้อกิจการ", "ลงทุนใน",
            "สำนักงานใหญ่ที่", "ดำเนินการใน", "เกิดที่", "อาศัยอยู่ที่", "ศึกษาที่", "จบการศึกษาจาก",
            # Law enforcement relationships
            "arrested_by", "investigated_by", "confiscated_from", "used_in", "evidence_of", "witness_of",
            "victim_of", "suspect_in", "charged_with", "convicted_of", "sentenced_by", "defended_by",
            "prosecuted_by", "testified_against", "alibi_for", "found_at", "seized_from", "linked_to",
            # NOTE(review): "ยึดจาก" occurs twice in the Thai entries below, at
            # the positions that line up with confiscated_from and seized_from
            # above. Presumably intentional; confirm whether consumers need
            # unique entries.
            "จับกุมโดย", "สอบสวนโดย", "ยึดจาก", "ใช้ในการ", "หลักฐานของ", "พยานใน", "เหยื่อของ",
            "ผู้ต้องสงสัยใน", "ถูกตั้งข้อหา", "ถูกตัดสิน", "ถูกพิพากษาโดย", "ต่อสู้คดีโดย", "ฟ้องร้องโดย",
            "ให้การต่อต้าน", "เป็นข้อแก้ตัวสำหรับ", "พบที่", "ยึดจาก", "เชื่อมโยงกับ",
            # Criminal relationships
            "possess_illegal", "transport_illegal", "sell_illegal", "buy_illegal", "hide_evidence",
            "plan_crime", "commit_crime", "flee_from", "escape_from", "hide_at", "meet_with",
            "communicate_with", "threaten", "blackmail", "bribe", "corrupt", "money_launder",
            "ครอบครองของผิดกฎหมาย", "ขนส่งของผิดกฎหมาย", "ขายของผิดกฎหมาย", "ซื้อของผิดกฎหมาย",
            "ซ่อนหลักฐาน", "วางแผนอาชญากรรม", "กระทำอาชญากรรม", "หลบหนีจาก", "แอบซ่อนที่",
            "พบปะกับ", "ติดต่อกับ", "ข่มขู่", "แบล็คเมล์", "ให้สินบน", "ทุจริต", "ฟอกเงิน"
        ]

    def validate_deepseek(self) -> bool:
        """Return True when the DeepSeek endpoint and key are both set and
        neither is still the ``YOUR_...`` template placeholder."""
        return bool(
            self.DEEPSEEK_ENDPOINT and
            self.DEEPSEEK_API_KEY and
            self.DEEPSEEK_ENDPOINT != "YOUR_DEEPSEEK_ENDPOINT" and
            self.DEEPSEEK_API_KEY != "YOUR_DEEPSEEK_API_KEY"
        )
class OCRConfig(BaseConfig):
    """Configuration for OCR Service.

    Adds the OCR port, Azure Document Intelligence credentials and the
    web-scraping settings on top of the shared ``BaseConfig`` settings.
    """

    def __init__(self):
        super().__init__()
        # `or` instead of a getenv default: a blank value (e.g. `OCR_PORT=` in
        # a .env file) falls back to the default instead of crashing in int("").
        self.PORT = int(os.getenv("OCR_PORT") or "8400")

        # Azure Document Intelligence Configuration
        self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "")
        self.AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY", "")

        # Web scraping configuration
        self.MAX_IMAGES_PER_PAGE = int(os.getenv("MAX_IMAGES_PER_PAGE") or "10")
        self.USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    def validate_azure_document_intelligence(self) -> bool:
        """Return True when the Document Intelligence endpoint and key are
        both set and neither is still the ``YOUR_FORM_RECOGNIZER_...``
        template placeholder."""
        return bool(
            self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
            self.AZURE_DOCUMENT_INTELLIGENCE_KEY and
            self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT != "YOUR_FORM_RECOGNIZER_ENDPOINT" and
            self.AZURE_DOCUMENT_INTELLIGENCE_KEY != "YOUR_FORM_RECOGNIZER_KEY"
        )
class RAGConfig(BaseConfig):
    """Configuration for RAG Service.

    Adds the RAG port, ``PG_*`` connection settings, chunking parameters and
    RAG-specific Azure OpenAI settings on top of the shared ``BaseConfig``
    settings.
    """

    def __init__(self):
        super().__init__()
        # `or` instead of a getenv default: a blank value (e.g. `RAG_PORT=` in
        # a .env file) falls back to the default instead of crashing in int("").
        self.PORT = int(os.getenv("RAG_PORT") or "8401")

        # OCR Service Configuration
        self.OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")

        # PostgreSQL Configuration (specific to RAG)
        # Host, port, user and password mirror the shared POSTGRES_* values;
        # only the database name and SSL mode have their own variables.
        self.PG_HOST = self.POSTGRES_HOST
        self.PG_PORT = self.POSTGRES_PORT
        self.PG_DATABASE = os.getenv("PG_DATABASE", "vectorsearch")  # RAG uses different default DB
        self.PG_USER = self.POSTGRES_USER
        self.PG_PASSWORD = self.POSTGRES_PASSWORD
        self.PG_SSL_MODE = os.getenv("PG_SSL_MODE", "require")

        # Chunking Configuration
        self.CHUNK_SIZE = int(os.getenv("CHUNK_SIZE") or "1000")
        self.CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP") or "200")
        self.MIN_CHUNK_SIZE = int(os.getenv("MIN_CHUNK_SIZE") or "50")

        # Azure OpenAI Configuration (RAG specific)
        # NOTE: this is a separate variable from AZURE_OPENAI_DEPLOYMENT_NAME
        # set by the base class, and it has a different default.
        self.AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small")
        self.AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")
class UnifiedConfig:
    """Unified configuration for all services.

    Holds one configuration object per service (``ner``, ``ocr``, ``rag``)
    plus the main-app settings, and offers helpers to validate and summarise
    them.
    """

    def __init__(self):
        self.ner = NERConfig()
        self.ocr = OCRConfig()
        self.rag = RAGConfig()

        # Main app configuration
        # `or` instead of a getenv default: a blank MAIN_PORT falls back to
        # 8000 instead of crashing in int("").
        self.MAIN_PORT = int(os.getenv("MAIN_PORT") or "8000")
        self.MAIN_HOST = os.getenv("MAIN_HOST", "0.0.0.0")

        # Service URLs (for inter-service communication); always localhost
        # with the port taken from the matching service configuration.
        self.NER_SERVICE_URL = f"http://localhost:{self.ner.PORT}"
        self.OCR_SERVICE_URL = f"http://localhost:{self.ocr.PORT}"
        self.RAG_SERVICE_URL = f"http://localhost:{self.rag.PORT}"

        # Service Health Check Configuration
        self.HEALTH_CHECK_TIMEOUT = 30
        self.HEALTH_CHECK_RETRIES = 3
        self.HEALTH_CHECK_INTERVAL = 5

        # Load balancing and routing
        self.SERVICE_WEIGHTS = {
            "ner": 1.0,
            "ocr": 1.0,
            "rag": 1.0
        }

    def validate_all(self) -> Dict[str, Dict[str, bool]]:
        """Validate all service configurations.

        Returns:
            A mapping ``service -> component -> bool`` where each bool is the
            result of the corresponding ``validate_*`` method.
        """
        return {
            "ner": {
                "deepseek": self.ner.validate_deepseek(),
                "azure_openai": self.ner.validate_azure_openai(),
                "postgres": self.ner.validate_postgres(),
                "azure_storage": self.ner.validate_azure_storage()
            },
            "ocr": {
                "azure_document_intelligence": self.ocr.validate_azure_document_intelligence()
            },
            "rag": {
                "azure_openai": self.rag.validate_azure_openai(),
                "postgres": self.rag.validate_postgres()
            }
        }

    def get_service_config(self, service_name: str) -> Optional["BaseConfig"]:
        """Get configuration for a specific service.

        Args:
            service_name: ``"ner"``, ``"ocr"`` or ``"rag"`` (case-insensitive).

        Returns:
            The matching configuration object, or ``None`` when the name is
            not one of the three known services.
        """
        service_configs = {
            "ner": self.ner,
            "ocr": self.ocr,
            "rag": self.rag
        }
        return service_configs.get(service_name.lower())

    def get_database_config(self) -> Dict[str, str]:
        """Get database configuration for services that need it.

        Connection values come from the NER configuration (the port is
        converted to a string); ``ssl_mode`` comes from the RAG
        configuration, defaulting to ``"require"`` when it has no
        ``PG_SSL_MODE`` attribute.
        """
        return {
            "host": self.ner.POSTGRES_HOST,
            "port": str(self.ner.POSTGRES_PORT),
            "user": self.ner.POSTGRES_USER,
            "password": self.ner.POSTGRES_PASSWORD,
            "database": self.ner.POSTGRES_DATABASE,
            "ssl_mode": getattr(self.rag, 'PG_SSL_MODE', 'require')
        }

    def get_azure_openai_config(self) -> Dict[str, str]:
        """Get Azure OpenAI configuration (taken from the NER configuration)."""
        return {
            "endpoint": self.ner.AZURE_OPENAI_ENDPOINT,
            "api_key": self.ner.AZURE_OPENAI_API_KEY,
            "embedding_model": self.ner.EMBEDDING_MODEL,
            "deployment_name": self.ner.AZURE_OPENAI_DEPLOYMENT_NAME
        }

    def print_configuration_summary(self):
        """Print a summary of all configurations to stdout.

        Shows a pass/fail mark per validated component, the service ports and
        URLs, the database settings (the password is not printed), and
        finally either the list of unconfigured components or a success line.
        """
        def mark(ok: bool) -> str:
            # Single place for the pass/fail glyphs used throughout the summary.
            return '✅' if ok else '❌'

        print("🔧 Configuration Summary")
        print("=" * 50)

        # Validate all configurations
        validation_results = self.validate_all()
        ner_checks = validation_results['ner']
        ocr_checks = validation_results['ocr']
        rag_checks = validation_results['rag']

        # NER Service
        print(f"📝 NER Service (Port {self.ner.PORT}):")
        print(f" DeepSeek: {mark(ner_checks['deepseek'])}")
        print(f" Azure OpenAI: {mark(ner_checks['azure_openai'])}")
        print(f" PostgreSQL: {mark(ner_checks['postgres'])}")
        print(f" Azure Storage: {mark(ner_checks['azure_storage'])}")
        print(f" OCR Service URL: {self.ner.OCR_SERVICE_URL}")

        # OCR Service
        print(f"\n🔍 OCR Service (Port {self.ocr.PORT}):")
        print(f" Azure Document Intelligence: {mark(ocr_checks['azure_document_intelligence'])}")
        print(f" Max File Size: {self.ocr.MAX_FILE_SIZE / (1024*1024):.0f} MB")

        # RAG Service
        print(f"\n🧠 RAG Service (Port {self.rag.PORT}):")
        print(f" Azure OpenAI: {mark(rag_checks['azure_openai'])}")
        print(f" PostgreSQL: {mark(rag_checks['postgres'])}")
        print(f" OCR Service URL: {self.rag.OCR_SERVICE_URL}")
        print(f" Chunk Size: {self.rag.CHUNK_SIZE}")

        # Main App
        print(f"\n🌐 Main App (Port {self.MAIN_PORT}):")
        print(f" NER Service: {self.NER_SERVICE_URL}")
        print(f" OCR Service: {self.OCR_SERVICE_URL}")
        print(f" RAG Service: {self.RAG_SERVICE_URL}")

        # Database Configuration
        print("\n🗄️ Database Configuration:")
        print(f" Host: {self.ner.POSTGRES_HOST}")
        print(f" Port: {self.ner.POSTGRES_PORT}")
        print(f" User: {self.ner.POSTGRES_USER}")
        print(f" NER Database: {self.ner.POSTGRES_DATABASE}")
        print(f" RAG Database: {self.rag.PG_DATABASE}")

        # Critical Issues: collect every component whose validation failed,
        # keeping the service/component order of validate_all().
        failures = []
        for service, validations in validation_results.items():
            for component, is_valid in validations.items():
                if not is_valid:
                    failures.append((service, component))

        if failures:
            print("\n⚠️ CONFIGURATION ISSUES DETECTED:")
            for service, component in failures:
                print(f" ❌ {service.upper()}: {component} not configured")
        else:
            print("\n✅ All configurations are valid!")
# Global configuration instance: constructed once when this module is imported,
# so all environment variables are read at import time. get_config() returns
# this same object.
config = UnifiedConfig()
def get_config() -> UnifiedConfig:
    """Return the shared module-level ``config`` object.

    No new instance is created; every caller receives the same
    ``UnifiedConfig`` that was built when this module was imported.
    """
    return config
def validate_environment() -> bool:
    """Report whether every critical component is configured.

    Only a subset of the ``validate_all()`` results is treated as critical:
    Azure OpenAI and PostgreSQL for NER and RAG, and Azure Document
    Intelligence for OCR. The NER ``deepseek`` and ``azure_storage`` results
    are not consulted here.

    Returns:
        True when all critical checks passed, otherwise False.
    """
    results = config.validate_all()

    # (service, component) pairs that must all be valid.
    critical_keys = (
        ("ner", "azure_openai"),
        ("ner", "postgres"),
        ("ocr", "azure_document_intelligence"),
        ("rag", "azure_openai"),
        ("rag", "postgres"),
    )
    statuses = [results[service][component] for service, component in critical_keys]
    return all(statuses)
if __name__ == "__main__":
    # Test configuration loading and validation: print the summary, then
    # report whether the critical components passed validation.
    print("🧪 Testing Configuration Loading")
    print("=" * 40)
    try:
        config.print_configuration_summary()
        if validate_environment():
            print("\n🎉 Environment validation passed!")
            print("All critical services are properly configured.")
        else:
            print("\n❌ Environment validation failed!")
            print("Some critical services are not properly configured.")
            print("Please check your .env file and update missing values.")
    except Exception as e:
        # Top-level boundary of the script: report the failure to the user and
        # log it with the traceback so the cause is not lost.
        print(f"\n❌ Configuration loading failed: {e}")
        logger.exception("Configuration error: %s", e)