Spaces:
Sleeping
Sleeping
"""Typed configuration objects used across the framework.""" | |
from dataclasses import dataclass, field | |
from pathlib import Path | |
from typing import Optional, Literal, Union | |
@dataclass
class LoggingConfig:
    """Logging configuration (rotating file + console).

    Declared as a dataclass so every field can be overridden by keyword at
    construction time, e.g. ``LoggingConfig(level="DEBUG")``.
    """

    log_dir: Path = Path("logs")
    level: str = "INFO"  # DEBUG | INFO | WARNING | ERROR | CRITICAL
    max_mb: int = 5  # per-file size before rotation
    backups: int = 5  # number of rotated files to keep
@dataclass
class CrossEncoderConfig:
    """Configuration for an optional cross-encoder re-ranker.

    Declared as a dataclass so every field can be overridden by keyword at
    construction time. The re-ranker is disabled by default (``enable=False``).
    """

    enable: bool = False  # master switch
    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    device: str = "cpu"
    max_length: int = 512  # truncation length
    first_stage_k: int = 50  # how many docs to pass to re-ranker
    final_k: Optional[int] = None  # override PipelineConfig.retriever.top_k
@dataclass
class RetrieverConfig:
    """Configuration for a retriever back-end.

    Declared as a dataclass so that fields can be set by keyword and so that
    ``__post_init__`` actually runs after construction; without the decorator
    the legacy ``index_path`` alias would never be applied.

    ``index_path`` is a legacy alias for ``bm25_idx``. When a truthy
    ``index_path`` is given it is copied into ``bm25_idx``, replacing any
    value passed for ``bm25_idx`` directly.
    """

    name: Literal["bm25", "dense", "hybrid"] = "bm25"
    top_k: int = 5

    # For backward compatibility with tests: allow index_path alias for sparse
    index_path: Optional[Union[str, Path]] = None  # alias for bm25_idx

    # Specific to BM25
    bm25_idx: Optional[Union[str, Path]] = None
    doc_store: Optional[Union[str, Path]] = None

    # For dense-only
    faiss_index: Optional[Union[str, Path]] = None
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedder_cache: Optional[Union[str, Path]] = None
    device: str = "cpu"

    # For hybrid only
    alpha: float = 0.5  # sparse ↔ dense weight

    def __post_init__(self):
        """Apply the legacy ``index_path`` alias to ``bm25_idx``."""
        # Truthiness check on purpose: None and "" both leave bm25_idx as is.
        if self.index_path:
            self.bm25_idx = self.index_path
@dataclass
class GeneratorConfig:
    """Configuration for the text generator.

    Declared as a dataclass so every field can be overridden by keyword at
    construction time, e.g. ``GeneratorConfig(max_new_tokens=64)``.
    """

    model_name: str = "google/flan-t5-base"
    device: str = "cpu"
    max_new_tokens: int = 256
    temperature: float = 0.0
@dataclass
class StatsConfig:
    """Configuration for statistical tests & robustness analyses.

    Declared as a dataclass so every field can be overridden by keyword at
    construction time. Fields are grouped by the research questions (RQ) they
    are labelled with below.
    """

    # Correlation (RQ1 & RQ2)
    correlation_method: Literal["spearman", "kendall"] = "spearman"
    n_boot: int = 1000  # bootstrap replicates for CIs
    ci: float = 0.95  # confidence level (e.g. 0.95 = 95 %)

    # Significance tests (RQ2)
    wilcoxon_alternative: Literal["two-sided", "greater", "less"] = "two-sided"
    multiple_correction: Literal["holm-bonferroni", "none"] = "holm-bonferroni"
    alpha: float = 0.05  # family-wise error rate

    # Robustness / sensitivity (RQ3 & RQ4)
    compute_effect_size: bool = True
    n_permutations: int = 1000
    failure_threshold: float = 0.0
@dataclass
class PipelineConfig:
    """Top-level pipeline configuration.

    Aggregates one sub-configuration per pipeline component. The class must be
    a dataclass: ``field(default_factory=...)`` is only interpreted by the
    ``@dataclass`` machinery, which builds a fresh sub-config for every
    ``PipelineConfig`` instance instead of sharing one object between them.
    """

    logging: LoggingConfig = field(default_factory=LoggingConfig)
    reranker: CrossEncoderConfig = field(default_factory=CrossEncoderConfig)
    retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    stats: StatsConfig = field(default_factory=StatsConfig)