akryldigital's picture
Pilot (#2)
92633a7 verified
raw
history blame
5.77 kB
"""Configuration loader for YAML settings."""
import yaml
import json
from pathlib import Path
from typing import Dict, Any, Optional
from dotenv import load_dotenv
import os
load_dotenv()
def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Load configuration from a YAML file.

    ``${VAR}`` placeholders in the raw file text are replaced with the value
    of the matching environment variable before the YAML is parsed; a
    placeholder whose variable is not set is left in the text unchanged.
    The parsed result is then passed through ``_override_with_env_vars``.

    Args:
        config_path: Path to config file. If None, uses ``settings.yaml``
            in the same directory as this module.

    Returns:
        Dictionary containing configuration settings. An empty (or
        comment-only) file yields a dictionary built only from the
        environment overrides.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
    """
    import re  # local import: ``re`` is not part of the module-level imports

    if config_path is None:
        # Default to settings.yaml in the same directory as this file
        config_path = Path(__file__).parent / "settings.yaml"
    config_path = Path(config_path)

    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    with open(config_path, 'r', encoding='utf-8') as f:
        content = f.read()

    def replace_env_vars(match):
        # Fall back to the original "${VAR}" text when the variable is unset
        return os.getenv(match.group(1), match.group(0))

    # Replace ${VAR} patterns with environment variables
    content = re.sub(r'\$\{([^}]+)\}', replace_env_vars, content)

    config = yaml.safe_load(content)
    if config is None:
        # safe_load returns None for an empty document; normalise it so the
        # override step and callers always receive a dictionary.
        config = {}

    # Override with environment variables if they exist
    return _override_with_env_vars(config)
def _override_with_env_vars(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Override config values with environment variables where available.

    Each known environment variable maps to a nested key path in the
    configuration. When the variable is set to a non-empty value, the value
    is written at that path, creating intermediate dictionaries as needed.
    An intermediate node that exists but is not a dictionary (for example a
    YAML key with no value) is replaced by a new dictionary so the override
    can still be applied.

    Args:
        config: Configuration dictionary; it is modified in place. ``None``
            is treated as an empty configuration.

    Returns:
        The same dictionary that was passed in (or a new one when ``config``
        was ``None``), with overrides applied.

    Raises:
        ValueError: If a variable that must be numeric holds a value that
            cannot be converted.
    """
    if config is None:
        config = {}

    # Map environment variables to config paths
    env_mappings = {
        'QDRANT_URL': ['qdrant', 'url'],
        'QDRANT_COLLECTION': ['qdrant', 'collection_name'],
        'QDRANT_API_KEY': ['qdrant', 'api_key'],
        'RETRIEVER_MODEL': ['retriever', 'model'],
        'RANKER_MODEL': ['ranker', 'model'],
        'READER_TYPE': ['reader', 'default_type'],
        'MAX_TOKENS': ['reader', 'max_tokens'],
        'MISTRAL_API_KEY': ['reader', 'MISTRAL', 'api_key'],
        'OPENAI_API_KEY': ['reader', 'OPENAI', 'api_key'],
        'NEBIUS_API_KEY': ['reader', 'INF_PROVIDERS', 'api_key'],
        'NVIDIA_SERVER_API_KEY': ['reader', 'NVIDIA', 'api_key'],
        'SERVERLESS_API_KEY': ['reader', 'SERVERLESS', 'api_key'],
        'DEDICATED_API_KEY': ['reader', 'DEDICATED', 'api_key'],
        'OPENROUTER_API_KEY': ['reader', 'OPENROUTER', 'api_key'],
    }

    # Type rules keyed by the final path segment. With the mapping above only
    # 'max_tokens' is currently reachable; the other rules are kept so new
    # mappings pick up the right conversion automatically.
    int_keys = ('top_k', 'max_tokens', 'num_predict')
    bool_keys = ('normalize', 'prefer_grpc')
    float_keys = ('temperature',)

    for env_var, config_path in env_mappings.items():
        env_value = os.getenv(env_var)
        if not env_value:
            # Unset and empty variables never override the file value
            continue

        *parent_keys, final_key = config_path

        # Navigate to the nested config location
        current = config
        for key in parent_keys:
            if not isinstance(current.get(key), dict):
                current[key] = {}
            current = current[key]

        # Set the final value, converting to appropriate type
        if final_key in int_keys:
            try:
                current[final_key] = int(env_value)
            except ValueError as err:
                raise ValueError(
                    f"Environment variable {env_var} must be an integer, "
                    f"got {env_value!r}"
                ) from err
        elif final_key in bool_keys:
            current[final_key] = env_value.lower() in ('true', '1', 'yes')
        elif final_key in float_keys:
            try:
                current[final_key] = float(env_value)
            except ValueError as err:
                raise ValueError(
                    f"Environment variable {env_var} must be a number, "
                    f"got {env_value!r}"
                ) from err
        else:
            current[final_key] = env_value

    return config
def get_nested_config(config: Dict[str, Any], path: str, default=None):
    """
    Look up a value inside nested configuration data by dotted path.

    Args:
        config: Configuration dictionary
        path: Dot-separated sequence of keys (e.g., 'reader.MISTRAL.model')
        default: Value handed back when any step of the path cannot be
            resolved

    Returns:
        The value found at the end of the path, or ``default`` when a key is
        missing or an intermediate value cannot be indexed by that key
    """
    node = config
    for segment in path.split('.'):
        try:
            node = node[segment]
        except (KeyError, TypeError):
            # Missing key, or the current node does not accept this key
            return default
    return node
def load_collections_mapping() -> Dict[str, Dict[str, str]]:
    """
    Load collections mapping from JSON file.

    Reads ``collections.json`` located in the same directory as this module.
    The file is decoded as UTF-8 regardless of the platform's locale
    encoding.

    Returns:
        The parsed JSON content, or a built-in default mapping with a single
        ``docling`` entry when the file does not exist.
    """
    collections_file = Path(__file__).parent / "collections.json"

    if not collections_file.exists():
        # Return default mapping if file doesn't exist
        return {
            "docling": {
                "model": "sentence-transformers/all-MiniLM-L6-v2",
                "description": "Default collection"
            }
        }

    # Explicit encoding: without it, open() falls back to the locale
    # encoding, which can mis-decode non-ASCII text on some platforms.
    with open(collections_file, 'r', encoding='utf-8') as f:
        return json.load(f)
def get_embedding_model_for_collection(collection_name: str) -> Optional[str]:
    """
    Resolve the embedding model associated with a collection name.

    An exact entry in the collections mapping wins. Otherwise the model is
    guessed from well-known substrings of the (case-insensitive) name.

    Args:
        collection_name: Name of the collection to look up

    Returns:
        The model identifier, or None when the name is neither mapped nor
        matches any known pattern
    """
    known = load_collections_mapping()
    if collection_name in known:
        return known[collection_name]["model"]

    # Substring -> model, checked in this order; the first hit wins
    name_patterns = (
        ("modernbert", "Akryl/modernbert-embed-base-akryl-matryoshka"),
        ("minilm", "sentence-transformers/all-MiniLM-L6-v2"),
        ("mpnet", "sentence-transformers/all-mpnet-base-v2"),
        ("bge", "BAAI/bge-m3"),
    )
    lowered = collection_name.lower()
    for marker, model_name in name_patterns:
        if marker in lowered:
            return model_name
    return None
def get_collection_info(collection_name: str) -> Dict[str, str]:
    """
    Return the model and description recorded for a collection.

    Args:
        collection_name: Name of the collection to describe

    Returns:
        The entry from the collections mapping when one exists. Otherwise a
        synthesized entry whose model comes from
        ``get_embedding_model_for_collection`` (or "unknown" when that
        yields nothing) and whose description marks it as auto-inferred
    """
    mapping = load_collections_mapping()

    if collection_name not in mapping:
        # Unknown collection: build an entry from the inferred model
        inferred = get_embedding_model_for_collection(collection_name)
        description = f"Auto-inferred collection: {collection_name}"
        return {
            "model": inferred if inferred else "unknown",
            "description": description
        }

    return mapping[collection_name]