#!/usr/bin/env python3
"""
Serve a Hugging Face dataset.
"""
import dataclasses
import os
from typing import Optional
import datasets
import huggingface_hub
from renumics import spotlight # type: ignore
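
# Configuration is read from environment variables (see `HFSettings` below):
#   HF_TOKEN               - optional access token used to log in to the Hub
#   HF_DATASET             - required dataset repository id
#   HF_SUBSET              - optional subset (configuration) name
#   HF_SPLIT               - optional split name
#   HF_REVISION            - optional dataset revision
#   HF_ENRICHMENT          - optional enrichment dataset repository id
#   HF_ENRICHMENT_REVISION - optional revision of the enrichment dataset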


def login() -> None:
    """
    Log in to the Hugging Face Hub if an access token is provided.
    """
    if token := os.environ.get("HF_TOKEN"):
        huggingface_hub.login(token)


@dataclasses.dataclass
class HFSettings:
    """
    Hugging Face settings.
    """

    dataset: str
    subset: Optional[str] = None
    split: Optional[str] = None
    revision: Optional[str] = None
    enrichment: Optional[str] = None
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Parse Hugging Face settings from the environment.
        """
        dataset = os.environ.get("HF_DATASET") or None
        if dataset is None:
            raise RuntimeError(
                "The desired Hugging Face dataset must be set via the "
                "`HF_DATASET` environment variable."
            )
        return cls(
            dataset,
            os.environ.get("HF_SUBSET") or None,
            os.environ.get("HF_SPLIT") or None,
            os.environ.get("HF_REVISION") or None,
            os.environ.get("HF_ENRICHMENT") or None,
            os.environ.get("HF_ENRICHMENT_REVISION") or None,
        )

    def __str__(self) -> str:
        return f"{self.dataset}[subset={self.subset},split={self.split},revision={self.revision}]"


if __name__ == "__main__":
    """
    Load and serve the given Hugging Face dataset.
    """
    login()
    hf_settings = HFSettings.from_environ()

    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )
    # Without an explicit split, `datasets.load_dataset` returns a `DatasetDict`
    # instead of a single `datasets.Dataset`. Fail early with a helpful message,
    # since the code below relies on single-dataset attributes.
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use environment variables `HF_SUBSET` and `HF_SPLIT`, respectively)?"
        )

    # Optionally join an enrichment dataset column-wise onto the original one.
    if hf_settings.enrichment is not None:
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"mismatches the length of the original dataset ({len(ds)})."
            )
        # `axis=1` concatenates column-wise, so both datasets must contain the
        # same number of row-aligned examples.
        ds = datasets.concatenate_datasets(
            [ds, ds_enrichment], split=ds.split, axis=1
        )

    # Mark sequence columns that look like embeddings so that Spotlight
    # displays them with its embedding dtype.
    dtypes = {}
    for col in ds.column_names:
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence):
            dtypes[col] = spotlight.dtypes.embedding_dtype
print(f"Serving Hugging Face dataset {hf_settings}.")
spotlight.show(
ds, host="0.0.0.0", no_ssl=True, port=7860, wait="forever", dtype=dtypes, layout="spotlight-layout.json", analyze=False
)
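
# Example invocation (dataset id, split, and script name are illustrative):
#   HF_DATASET=mnist HF_SPLIT=train python serve.py
# Spotlight then serves the dataset on 0.0.0.0:7860, the default application
# port expected by Hugging Face Spaces.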