Spaces:
Sleeping
Sleeping
File size: 3,133 Bytes
8521f60 27d4b0c 8521f60 fc20fed 8521f60 e8c3964 fc20fed e8c3964 fc20fed bdb49ae fc20fed bdb49ae fc20fed 8521f60 fc20fed 8521f60 cdf4160 fc20fed 79bdbbe fc20fed cdf4160 fc20fed 79bdbbe 8521f60 bdb49ae fc20fed bdb49ae fc20fed bdb49ae fc20fed bdb49ae fc20fed bdb49ae 8521f60 fc20fed 27d4b0c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
"""Typed configuration objects used across the framework."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, Literal, Union
@dataclass
class LoggingConfig:
    """Settings for logging (rotating file + console).

    Attributes:
        log_dir: Log directory; defaults to the relative path ``logs``.
        level: Level name, one of DEBUG, INFO, WARNING, ERROR or CRITICAL.
        max_mb: Per-file size before rotation.
        backups: Number of rotated files to keep.
    """

    log_dir: Path = Path("logs")
    level: str = "INFO"
    max_mb: int = 5
    backups: int = 5
@dataclass
class CrossEncoderConfig:
    """Settings for the optional cross-encoder re-ranking stage.

    Attributes:
        enable: Master switch; the re-ranker is off by default.
        model_name: Identifier of the cross-encoder model.
        device: Device string, ``"cpu"`` by default.
        max_length: Truncation length.
        first_stage_k: How many documents are handed to the re-ranker.
        final_k: When set, overrides ``PipelineConfig.retriever.top_k``.
    """

    enable: bool = False
    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    device: str = "cpu"
    max_length: int = 512
    first_stage_k: int = 50
    final_k: Optional[int] = None
@dataclass
class RetrieverConfig:
    """Settings for a retriever back-end (``bm25``, ``dense`` or ``hybrid``).

    Attributes:
        name: Which back-end to use.
        top_k: Number of results to retrieve.
        index_path: Legacy alias for ``bm25_idx``, kept for backward
            compatibility with tests.
        bm25_idx: BM25-specific index location.
        doc_store: BM25-specific document store location.
        faiss_index: Dense-only FAISS index location.
        model_name: Dense-only embedding model identifier.
        embedder_cache: Dense-only embedder cache location.
        device: Device string, ``"cpu"`` by default.
        alpha: Hybrid-only sparse/dense weight.
    """

    name: Literal["bm25", "dense", "hybrid"] = "bm25"
    top_k: int = 5

    # Legacy spelling; copied onto ``bm25_idx`` in ``__post_init__``.
    index_path: Optional[Union[str, Path]] = None

    # BM25 settings.
    bm25_idx: Optional[Union[str, Path]] = None
    doc_store: Optional[Union[str, Path]] = None

    # Dense settings.
    faiss_index: Optional[Union[str, Path]] = None
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedder_cache: Optional[Union[str, Path]] = None
    device: str = "cpu"

    # Hybrid setting.
    alpha: float = 0.5

    def __post_init__(self):
        """Copy a truthy legacy ``index_path`` onto ``bm25_idx``.

        A falsy ``index_path`` (``None`` or an empty string) leaves
        ``bm25_idx`` untouched. When both are supplied, ``index_path`` wins.
        """
        legacy_path = self.index_path
        if not legacy_path:
            return
        self.bm25_idx = legacy_path
@dataclass
class GeneratorConfig:
    """Settings for the text generator.

    Attributes:
        model_name: Identifier of the generator model.
        device: Device string, ``"cpu"`` by default.
        max_new_tokens: Value for the generator's new-token limit.
        temperature: Sampling temperature; defaults to ``0.0``.
    """

    model_name: str = "google/flan-t5-base"
    device: str = "cpu"
    max_new_tokens: int = 256
    temperature: float = 0.0
@dataclass
class StatsConfig:
    """Settings for statistical tests and robustness analyses.

    Attributes:
        correlation_method: Correlation method, ``spearman`` or ``kendall``.
        n_boot: Bootstrap replicates used for confidence intervals.
        ci: Confidence level (e.g. ``0.95`` means 95 %).
        wilcoxon_alternative: Alternative hypothesis for the Wilcoxon test.
        multiple_correction: Multiple-comparison correction, or ``none``.
        alpha: Family-wise error rate.
        compute_effect_size: Effect-size toggle; on by default.
        n_permutations: Permutation count; defaults to 1000.
        failure_threshold: Failure threshold; defaults to ``0.0``.
    """

    # --- Correlation (RQ1 & RQ2) ---
    correlation_method: Literal["spearman", "kendall"] = "spearman"
    n_boot: int = 1000
    ci: float = 0.95

    # --- Significance tests (RQ2) ---
    wilcoxon_alternative: Literal["two-sided", "greater", "less"] = "two-sided"
    multiple_correction: Literal["holm-bonferroni", "none"] = "holm-bonferroni"
    alpha: float = 0.05

    # --- Robustness / sensitivity (RQ3 & RQ4) ---
    compute_effect_size: bool = True
    n_permutations: int = 1000
    failure_threshold: float = 0.0
@dataclass
class PipelineConfig:
    """Top-level configuration that groups every per-component config.

    Each field uses ``default_factory``, so every ``PipelineConfig`` instance
    gets its own freshly constructed sub-config rather than a shared one.

    Attributes:
        logging: Logging settings.
        reranker: Cross-encoder re-ranker settings.
        retriever: Retriever back-end settings.
        generator: Text generator settings.
        stats: Statistical test settings.
    """

    logging: LoggingConfig = field(default_factory=LoggingConfig)
    reranker: CrossEncoderConfig = field(default_factory=CrossEncoderConfig)
    retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    stats: StatsConfig = field(default_factory=StatsConfig)
|