#!/usr/bin/env python3
"""
Serve a Hugging Face dataset.

The dataset to serve is configured through environment variables
(see `HFSettings.from_environ`) and is displayed with Renumics Spotlight.
"""

import dataclasses
import os
from typing import Optional

import datasets
import huggingface_hub
from renumics import spotlight  # type: ignore


def login() -> None:
    """
    Login to Hugging Face Hub.

    The login is only attempted when the `HF_TOKEN` environment variable is
    set to a non-empty value; otherwise this function does nothing.
    """
    if token := os.environ.get("HF_TOKEN"):
        huggingface_hub.login(token)


@dataclasses.dataclass
class HFSettings:
    """
    Hugging Face settings.

    `dataset` is mandatory. The remaining fields are optional and default to
    `None`; they are passed on to `datasets.load_dataset` by the script entry
    point.
    """

    # Name of the dataset to load.
    dataset: str
    # Subset (configuration) name, shared by the dataset and its enrichment.
    subset: Optional[str] = None
    # Split name, shared by the dataset and its enrichment.
    split: Optional[str] = None
    # Revision of the main dataset.
    revision: Optional[str] = None
    # Name of an optional enrichment dataset whose columns are appended.
    enrichment: Optional[str] = None
    # Revision of the enrichment dataset.
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Parse Hugging Face settings from environment.

        Reads `HF_DATASET`, `HF_SUBSET`, `HF_SPLIT`, `HF_REVISION`,
        `HF_ENRICHMENT` and `HF_ENRICHMENT_REVISION`. Empty values are
        treated the same as unset variables.

        Raises:
            RuntimeError: If `HF_DATASET` is unset or empty.
        """
        # `or None` normalizes empty strings to `None`.
        dataset = os.environ.get("HF_DATASET") or None
        if dataset is None:
            raise RuntimeError(
                "Desired Hugging Face dataset must be set as `HF_DATASET` "
                "environment variable."
            )
        return cls(
            dataset,
            os.environ.get("HF_SUBSET") or None,
            os.environ.get("HF_SPLIT") or None,
            os.environ.get("HF_REVISION") or None,
            os.environ.get("HF_ENRICHMENT") or None,
            os.environ.get("HF_ENRICHMENT_REVISION") or None,
        )

    def __str__(self) -> str:
        return (
            f"{self.dataset}[subset={self.subset},split={self.split},"
            f"revision={self.revision}]"
        )


def main() -> None:
    """
    Load and serve the given Hugging Face dataset.

    Steps: login to the Hub (if a token is configured), load the dataset
    described by the environment, optionally append the columns of an
    enrichment dataset, mark embedding columns, and start Spotlight.

    Raises:
        RuntimeError: If `HF_DATASET` is not set, or if the enrichment
            dataset has a different number of rows than the main dataset.
        TypeError: If the loaded object is not a `datasets.Dataset`.
    """
    login()

    hf_settings = HFSettings.from_environ()
    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )

    # Validate the type *before* any `Dataset`-only API is used below
    # (`len` as row count, `.split`, `.features`). Otherwise a missing split
    # surfaces as an unrelated error instead of this hint.
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use environment variables `HF_SUBSET` and `HF_SPLIT` respective)?"
        )

    if hf_settings.enrichment is not None:
        # The enrichment is loaded with the same subset and split as the main
        # dataset, but with its own revision.
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"mismatches length of the original dataset ({len(ds)})"
            )
        # axis=1 appends the enrichment columns next to the original ones.
        ds = datasets.concatenate_datasets(
            [ds, ds_enrichment], split=ds.split, axis=1
        )

    # Sequence columns whose name contains "embedding" are shown as embeddings.
    dtypes = {
        col: spotlight.dtypes.embedding_dtype
        for col in ds.column_names
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence)
    }

    print(f"Serving Hugging Face dataset {hf_settings}.")
    spotlight.show(
        ds,
        host="0.0.0.0",
        no_ssl=True,
        port=7860,
        wait="forever",
        dtype=dtypes,
        layout="spotlight-layout.json",
        analyze=False,
    )


if __name__ == "__main__":
    main()