"""
Serve a Hugging Face dataset with Renumics Spotlight.
"""

import dataclasses
import os
from typing import Optional

import datasets
import huggingface_hub
from renumics import spotlight


def login() -> None:
    """
    Login to Hugging Face Hub.
    """
    if token := os.environ.get("HF_TOKEN"):
        huggingface_hub.login(token)


@dataclasses.dataclass
class HFSettings:
    """
    Hugging Face settings.
    """
    dataset: str
    subset: Optional[str] = None
    split: Optional[str] = None
    revision: Optional[str] = None

    enrichment: Optional[str] = None
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Parse Hugging Face settings from the environment.
        """
        dataset = os.environ.get("HF_DATASET") or None
        if dataset is None:
            raise RuntimeError(
                "Desired Hugging Face dataset must be set as the `HF_DATASET` "
                "environment variable."
            )
        return cls(
            dataset,
            os.environ.get("HF_SUBSET") or None,
            os.environ.get("HF_SPLIT") or None,
            os.environ.get("HF_REVISION") or None,
            os.environ.get("HF_ENRICHMENT") or None,
            os.environ.get("HF_ENRICHMENT_REVISION") or None,
        )

    def __str__(self) -> str:
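        # Renders e.g. "mnist[subset=None,split=train,revision=None]"
        # (the dataset name here is purely illustrative).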
        return f"{self.dataset}[subset={self.subset},split={self.split},revision={self.revision}]"


if __name__ == "__main__":
    # Load and serve the given Hugging Face dataset.
    login()

    hf_settings = HFSettings.from_environ()
    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )
    if not isinstance(ds, datasets.Dataset):
        # Without a split, `load_dataset` returns a `DatasetDict` rather than
        # a `Dataset`; fail early with a clear message instead of in the
        # processing steps below.
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use the environment variables `HF_SUBSET` and `HF_SPLIT`, respectively)?"
        )

    if hf_settings.enrichment is not None:
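        # The enrichment dataset is expected to provide additional columns
        # (e.g. precomputed embeddings) for the same rows, so it is
        # concatenated column-wise and must have the same number of rows.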
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"does not match the length of the original dataset ({len(ds)})."
            )
        ds = datasets.concatenate_datasets(
            [ds, ds_enrichment], split=ds.split, axis=1
        )

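    # Heuristic: show sequence columns whose name contains "embedding" as
    # Spotlight embeddings.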
    dtypes = {}
    for col in ds.column_names:
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence):
            dtypes[col] = spotlight.dtypes.embedding_dtype

    print(f"Serving Hugging Face dataset {hf_settings}.")
    spotlight.show(
        ds,
        host="0.0.0.0",
        no_ssl=True,
        port=7860,
        wait="forever",
        dtype=dtypes,
        layout="spotlight-layout.json",
        analyze=False,
    )