Spaces:
Sleeping
Sleeping
| """The bare resolver. Matches a raw value to a canonical id via the | |
| strategy chain (exact → normalized → fuzzy → no_match), and — when | |
| given a `CanonicalStore` — enriches the result with the matched | |
| canonical's metadata, parent edges, model-specific lineage fields, | |
| and quantized-chain root collapse. | |
| The enrichment matches the HTTP API's response shape exactly. Callers | |
| using the resolver standalone get the same `ResolutionResult` they'd | |
| get back from `POST /api/v1/resolve`.""" | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from typing import Optional | |
| from eval_entity_resolver.alias_store import AliasStore | |
| from eval_entity_resolver.canonical_store import CanonicalStore | |
| from eval_entity_resolver.models import ResolutionResult, ResolverConfig | |
| from eval_entity_resolver.strategies.exact import exact_match | |
| from eval_entity_resolver.strategies.normalized import normalized_match | |
| from eval_entity_resolver.strategies.fuzzy import fuzzy_match | |
class Resolver:
    """Resolve raw values to canonical ids and optionally enrich the result.

    Matching runs through the strategy chain exact -> normalized -> fuzzy ->
    no_match against an `AliasStore`. When a `CanonicalStore` is attached,
    every successful match is enriched with the matched canonical's
    review status, parent edge and entity-type-specific fields.
    """

    # Fixed confidence reported for a normalized-strategy match.
    _NORMALIZED_CONFIDENCE = 0.95

    def __init__(
        self,
        store: AliasStore,
        config: Optional[ResolverConfig] = None,
        canonical_store: Optional[CanonicalStore] = None,
    ) -> None:
        """Create a resolver.

        `store` is required (alias matching is the resolver's core job).
        `config` falls back to a default `ResolverConfig()` when omitted.
        `canonical_store` is optional — when provided, results are
        enriched with parent / lineage / metadata fields. Without it,
        only the basic match fields (canonical_id, strategy, confidence)
        are populated.
        """
        self.store = store
        self.config = config or ResolverConfig()
        self.canonical_store = canonical_store

    @classmethod
    def from_parquet(
        cls,
        path: str | Path,
        config: Optional[ResolverConfig] = None,
    ) -> "Resolver":
        """Load both alias and canonical stores from a parquet directory
        (e.g. `./fixtures/`) and return a fully-enriching resolver. This
        is the recommended convenience for callers who want the same
        response shape as the HTTP API."""
        return cls(
            AliasStore.from_parquet(path),
            config=config,
            canonical_store=CanonicalStore.from_parquet(path),
        )

    @classmethod
    def from_hf(
        cls,
        repo_id: str,
        config: Optional[ResolverConfig] = None,
    ) -> "Resolver":
        """Load both stores from a HF Dataset repo and return a
        fully-enriching resolver."""
        return cls(
            AliasStore.from_hf(repo_id),
            config=config,
            canonical_store=CanonicalStore.from_hf(repo_id),
        )

    def resolve(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str] = None,
    ) -> ResolutionResult:
        """Run the strategy chain for `raw_value` and return the first hit.

        Strategies are tried in order: exact (confidence 1.0), normalized
        (confidence 0.95, attempted only when that meets
        `self.config.threshold`), then fuzzy (confidence reported by
        `fuzzy_match`). If none matches, a result with
        `canonical_id=None`, strategy `"no_match"` and confidence 0.0 is
        returned. Matches are passed through `_enrich`.
        """
        # 1. Exact
        canonical_id = exact_match(raw_value, entity_type, source_config, self.store)
        if canonical_id is not None:
            return self._enrich(
                raw_value, entity_type, source_config, canonical_id, "exact", 1.0
            )

        # 2. Normalized — skipped entirely when its fixed confidence would
        #    fall below the configured threshold.
        normalized_confidence = self._NORMALIZED_CONFIDENCE
        if normalized_confidence >= self.config.threshold:
            canonical_id = normalized_match(
                raw_value, entity_type, self.store, source_config
            )
            if canonical_id is not None:
                return self._enrich(
                    raw_value, entity_type, source_config,
                    canonical_id, "normalized", normalized_confidence,
                )

        # 3. Fuzzy
        canonical_id, confidence = fuzzy_match(
            raw_value, entity_type, self.config.threshold, self.store, source_config
        )
        if canonical_id is not None:
            return self._enrich(
                raw_value, entity_type, source_config,
                canonical_id, "fuzzy", confidence,
            )

        # 4. No match
        return ResolutionResult(
            raw_value=raw_value,
            entity_type=entity_type,
            source_config=source_config,
            canonical_id=None,
            strategy="no_match",
            confidence=0.0,
        )

    # ------------------------------------------------------------------
    # Enrichment (no-op when no canonical_store is attached)
    # ------------------------------------------------------------------

    def build_result(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str],
        canonical_id: str,
        strategy: str,
        confidence: float,
    ) -> ResolutionResult:
        """Construct an enriched `ResolutionResult` for a canonical_id
        the caller already knows — useful for callers that bypass the
        strategy chain (e.g. an alias-table cache hit, an auto-created
        draft) but want the same rich response shape. Identical to the
        enrichment that happens inside `resolve()`."""
        return self._enrich(
            raw_value, entity_type, source_config, canonical_id, strategy, confidence
        )

    def _enrich(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str],
        matched_canonical_id: str,
        strategy: str,
        confidence: float,
    ) -> ResolutionResult:
        """Look up the matched canonical's row and populate the rich
        response fields. When no canonical_store is attached, the rich
        fields are not set and the result has just the basic match info."""
        # Fields shared by every result shape built below.
        common = {
            "raw_value": raw_value,
            "entity_type": entity_type,
            "source_config": source_config,
            "strategy": strategy,
            "confidence": confidence,
        }

        cs = self.canonical_store
        if cs is None:
            return ResolutionResult(canonical_id=matched_canonical_id, **common)

        matched_entity = cs.lookup(entity_type, matched_canonical_id)
        review_status = matched_entity.get("review_status") if matched_entity else None

        if entity_type == "model":
            fields = cs.model_metadata_fields(matched_canonical_id, matched_entity)
            # If the response collapses to a different canonical (root),
            # surface THAT canonical's review_status — keeps the response
            # internally consistent. Falls back to the matched entity's
            # status when the root has none.
            if fields["canonical_id"] != matched_canonical_id:
                root_entity = cs.lookup("model", fields["canonical_id"])
                if root_entity:
                    review_status = root_entity.get("review_status") or review_status
            return ResolutionResult(
                canonical_id=fields["canonical_id"],
                review_status=review_status,
                parent_canonical_id=cs.parent_canonical_id("model", matched_entity),
                resolved_leaf_id=fields["resolved_leaf_id"],
                root_model_id=fields["root_model_id"],
                lineage_origin_org_id=fields["lineage_origin_org_id"],
                parents=fields["parents"],
                open_weights=fields["open_weights"],
                release_date=fields["release_date"],
                params_billions=fields["params_billions"],
                **common,
            )

        # Benchmark: fill in hierarchy-alignment fields (family_key,
        # category) by walking canonical_families. composite_keys stays
        # empty here — see CanonicalStore.benchmark_family_enrichment for
        # why composite computation belongs in the producer.
        if entity_type == "benchmark":
            fam = cs.benchmark_family_enrichment(matched_canonical_id)
            return ResolutionResult(
                canonical_id=matched_canonical_id,
                review_status=review_status,
                parent_canonical_id=cs.parent_canonical_id(entity_type, matched_entity),
                family_key=fam["family_key"],
                category=fam["category"],
                composite_keys=fam["composite_keys"],
                **common,
            )

        # Other non-model types (metric, harness, org): only
        # parent_canonical_id and review_status are meaningful.
        return ResolutionResult(
            canonical_id=matched_canonical_id,
            review_status=review_status,
            parent_canonical_id=cs.parent_canonical_id(entity_type, matched_entity),
            **common,
        )