|
|
"""Configuration loader for YAML settings.""" |
|
|
|
|
|
import yaml |
|
|
import json |
|
|
from pathlib import Path |
|
|
from typing import Dict, Any, Optional |
|
|
from dotenv import load_dotenv |
|
|
import os |
|
|
|
|
|
load_dotenv() |
|
|
|
|
|
def load_config(config_path: str = None) -> Dict[str, Any]: |
|
|
""" |
|
|
Load configuration from YAML file. |
|
|
|
|
|
Args: |
|
|
config_path: Path to config file. If None, uses default settings.yaml |
|
|
|
|
|
Returns: |
|
|
Dictionary containing configuration settings |
|
|
""" |
|
|
if config_path is None: |
|
|
|
|
|
config_path = Path(__file__).parent / "settings.yaml" |
|
|
|
|
|
config_path = Path(config_path) |
|
|
|
|
|
if not config_path.exists(): |
|
|
raise FileNotFoundError(f"Configuration file not found: {config_path}") |
|
|
|
|
|
with open(config_path, 'r', encoding='utf-8') as f: |
|
|
content = f.read() |
|
|
|
|
|
|
|
|
import os |
|
|
import re |
|
|
|
|
|
def replace_env_vars(match): |
|
|
env_var = match.group(1) |
|
|
return os.getenv(env_var, match.group(0)) |
|
|
|
|
|
|
|
|
content = re.sub(r'\$\{([^}]+)\}', replace_env_vars, content) |
|
|
|
|
|
config = yaml.safe_load(content) |
|
|
|
|
|
|
|
|
config = _override_with_env_vars(config) |
|
|
|
|
|
return config |
|
|
|
|
|
|
|
|
def _override_with_env_vars(config: Dict[str, Any]) -> Dict[str, Any]: |
|
|
"""Override config values with environment variables where available.""" |
|
|
|
|
|
|
|
|
env_mappings = { |
|
|
'QDRANT_URL': ['qdrant', 'url'], |
|
|
'QDRANT_COLLECTION': ['qdrant', 'collection_name'], |
|
|
'QDRANT_API_KEY': ['qdrant', 'api_key'], |
|
|
'RETRIEVER_MODEL': ['retriever', 'model'], |
|
|
'RANKER_MODEL': ['ranker', 'model'], |
|
|
'READER_TYPE': ['reader', 'default_type'], |
|
|
'MAX_TOKENS': ['reader', 'max_tokens'], |
|
|
'MISTRAL_API_KEY': ['reader', 'MISTRAL', 'api_key'], |
|
|
'OPENAI_API_KEY': ['reader', 'OPENAI', 'api_key'], |
|
|
'NEBIUS_API_KEY': ['reader', 'INF_PROVIDERS', 'api_key'], |
|
|
'NVIDIA_SERVER_API_KEY': ['reader', 'NVIDIA', 'api_key'], |
|
|
'SERVERLESS_API_KEY': ['reader', 'SERVERLESS', 'api_key'], |
|
|
'DEDICATED_API_KEY': ['reader', 'DEDICATED', 'api_key'], |
|
|
'OPENROUTER_API_KEY': ['reader', 'OPENROUTER', 'api_key'], |
|
|
} |
|
|
|
|
|
for env_var, config_path in env_mappings.items(): |
|
|
env_value = os.getenv(env_var) |
|
|
if env_value: |
|
|
|
|
|
current = config |
|
|
for key in config_path[:-1]: |
|
|
if key not in current: |
|
|
current[key] = {} |
|
|
current = current[key] |
|
|
|
|
|
|
|
|
final_key = config_path[-1] |
|
|
if final_key in ['top_k', 'max_tokens', 'num_predict']: |
|
|
current[final_key] = int(env_value) |
|
|
elif final_key in ['normalize', 'prefer_grpc']: |
|
|
current[final_key] = env_value.lower() in ('true', '1', 'yes') |
|
|
elif final_key == 'temperature': |
|
|
current[final_key] = float(env_value) |
|
|
else: |
|
|
current[final_key] = env_value |
|
|
|
|
|
return config |
|
|
|
|
|
|
|
|
def get_nested_config(config: Dict[str, Any], path: str, default=None): |
|
|
""" |
|
|
Get a nested configuration value using dot notation. |
|
|
|
|
|
Args: |
|
|
config: Configuration dictionary |
|
|
path: Dot-separated path (e.g., 'reader.MISTRAL.model') |
|
|
default: Default value if path not found |
|
|
|
|
|
Returns: |
|
|
Configuration value or default |
|
|
""" |
|
|
keys = path.split('.') |
|
|
current = config |
|
|
|
|
|
try: |
|
|
for key in keys: |
|
|
current = current[key] |
|
|
return current |
|
|
except (KeyError, TypeError): |
|
|
return default |
|
|
|
|
|
|
|
|
def load_collections_mapping() -> Dict[str, Dict[str, str]]: |
|
|
"""Load collections mapping from JSON file.""" |
|
|
collections_file = Path(__file__).parent / "collections.json" |
|
|
|
|
|
if not collections_file.exists(): |
|
|
|
|
|
return { |
|
|
"docling": { |
|
|
"model": "sentence-transformers/all-MiniLM-L6-v2", |
|
|
"description": "Default collection" |
|
|
} |
|
|
} |
|
|
|
|
|
with open(collections_file, 'r') as f: |
|
|
return json.load(f) |
|
|
|
|
|
|
|
|
def get_embedding_model_for_collection(collection_name: str) -> Optional[str]: |
|
|
"""Get embedding model for a specific collection name.""" |
|
|
collections = load_collections_mapping() |
|
|
|
|
|
if collection_name in collections: |
|
|
return collections[collection_name]["model"] |
|
|
|
|
|
|
|
|
if "modernbert" in collection_name.lower(): |
|
|
return "Akryl/modernbert-embed-base-akryl-matryoshka" |
|
|
elif "minilm" in collection_name.lower(): |
|
|
return "sentence-transformers/all-MiniLM-L6-v2" |
|
|
elif "mpnet" in collection_name.lower(): |
|
|
return "sentence-transformers/all-mpnet-base-v2" |
|
|
elif "bge" in collection_name.lower(): |
|
|
return "BAAI/bge-m3" |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
def get_collection_info(collection_name: str) -> Dict[str, str]: |
|
|
"""Get full collection information including model and description.""" |
|
|
collections = load_collections_mapping() |
|
|
|
|
|
if collection_name in collections: |
|
|
return collections[collection_name] |
|
|
|
|
|
|
|
|
model = get_embedding_model_for_collection(collection_name) |
|
|
return { |
|
|
"model": model or "unknown", |
|
|
"description": f"Auto-inferred collection: {collection_name}" |
|
|
} |
|
|
|