Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """Dataset-specific logic for the Text2SPARQL repair pipeline. | |
| Handles loading dataset configs, KG profiles, and endpoint URLs. | |
| All dataset-specific hacks remain inside this file. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| from pathlib import Path | |
| from .config import RuntimeConfig | |
| from .models import DatasetConfig | |
| logger = logging.getLogger(__name__) | |
def load_dataset_config(dataset_id: str, config: RuntimeConfig) -> DatasetConfig:
    """Build the ``DatasetConfig`` registered under ``dataset_id``.

    The raw entry is looked up in ``config.datasets``. ``endpoint_url`` is
    read with a plain subscript, so an entry without it propagates the
    mapping's lookup error (``KeyError`` for a plain dict). The remaining
    fields are optional and fall back to: ``kg_profile_path=""``,
    ``default_prefixes={}``, ``mode="dbpedia"``.

    Args:
        dataset_id: Identifier of the dataset to look up.
        config: Runtime configuration whose ``datasets`` mapping holds the
            raw per-dataset settings.

    Returns:
        A ``DatasetConfig`` populated from the raw entry.

    Raises:
        ValueError: If ``dataset_id`` is not a key of ``config.datasets``;
            the message lists the identifiers that are available.
    """
    registry = config.datasets

    if dataset_id in registry:
        entry = registry[dataset_id]
        return DatasetConfig(
            dataset_id=dataset_id,
            endpoint_url=entry["endpoint_url"],
            kg_profile_path=entry.get("kg_profile_path", ""),
            default_prefixes=entry.get("default_prefixes", {}),
            mode=entry.get("mode", "dbpedia"),
        )

    # Unknown identifier: report what *is* configured to ease debugging.
    known_ids = list(registry.keys())
    message = (
        f"Unknown dataset_id: {dataset_id!r}. "
        f"Available datasets: {known_ids}"
    )
    raise ValueError(message)
def load_kg_profile(dataset: DatasetConfig) -> dict:
    """Load the KG profile for a dataset.

    The profile is a JSON object read from ``dataset.kg_profile_path``. This
    function never raises for a missing or broken profile; it logs the
    problem and returns an empty profile instead.

    Args:
        dataset: The dataset configuration; ``kg_profile_path``,
            ``dataset_id`` and ``mode`` are read from it.

    Returns:
        The parsed profile, with the keys ``entities``, ``properties`` and
        ``classes`` guaranteed to exist (missing ones are filled with empty
        lists). When the profile is unset, not a regular file, unreadable,
        not valid UTF-8 JSON, or not a JSON object, an empty profile is
        returned that additionally carries a ``metadata`` dict with the
        dataset's ``dataset_id`` and ``mode``.
    """
    empty_profile = {
        "entities": [],
        "properties": [],
        "classes": [],
        "metadata": {
            "dataset_id": dataset.dataset_id,
            "mode": dataset.mode,
        },
    }

    # An unset path must count as "no profile": Path("") normalizes to the
    # current directory, which exists but can never be opened as a file.
    raw_path = dataset.kg_profile_path
    profile_path = Path(raw_path) if raw_path else None
    if profile_path is None or not profile_path.is_file():
        logger.warning(
            "KG profile not found at %s — using empty profile. "
            "Context building will rely on prefix hints only.",
            dataset.kg_profile_path,
        )
        return empty_profile

    try:
        # Decode explicitly as UTF-8 so labels do not depend on the locale.
        with open(profile_path, "r", encoding="utf-8") as f:
            profile = json.load(f)
    except (ValueError, OSError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.error("Failed to load KG profile from %s: %s", profile_path, exc)
        return empty_profile

    if not isinstance(profile, dict):
        # Valid JSON, but not an object: the required keys cannot be added.
        logger.error(
            "Failed to load KG profile from %s: %s",
            profile_path,
            f"expected a JSON object, got {type(profile).__name__}",
        )
        return empty_profile

    # Ensure required keys exist
    for key in ("entities", "properties", "classes"):
        profile.setdefault(key, [])
    return profile
def get_endpoint_url(dataset: DatasetConfig) -> str:
    """Return the SPARQL endpoint URL stored on a dataset configuration.

    Args:
        dataset: The dataset configuration to read from.

    Returns:
        The value of ``dataset.endpoint_url``, unchanged.
    """
    endpoint = dataset.endpoint_url
    return endpoint