Spaces:
Runtime error
Runtime error
neindochoh
committed on
Commit
•
2a002f5
1
Parent(s):
b06dffc
Upload folder using huggingface_hub
Browse files
- Dockerfile +20 -0
- README.md +9 -5
- run.py +103 -0
Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Image that installs Renumics Spotlight and runs `run.py` as its entry point.
FROM python:3.10

# Spotlight release to install; can be overridden at build time.
ARG SPOTLIGHT_VERSION=1.5.0rc1

# Create an unprivileged user (UID 1000) and switch to it for all later steps.
RUN useradd -m -u 1000 user
USER user

# Home directory of that user; its local bin directory is put first on PATH
# (presumably so that scripts installed by pip for this user are found).
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Expose the build argument to the running container as well.
ENV SPOTLIGHT_VERSION=$SPOTLIGHT_VERSION

# Upgrade the packaging tools first, then install the pinned Spotlight release.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --no-cache-dir --upgrade "renumics-spotlight==${SPOTLIGHT_VERSION}"

# The script is copied executable because it is invoked directly below.
COPY --chown=user --chmod=0755 run.py .

CMD ["./run.py"]
|
README.md
CHANGED
@@ -1,10 +1,14 @@
|
|
1 |
---
|
2 |
-
title: Spotlight
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
|
|
|
|
|
|
7 |
pinned: false
|
|
|
8 |
---
|
9 |
|
10 |
-
|
|
|
1 |
---
|
2 |
+
title: Spotlight boolq
|
3 |
+
emoji: 🔬
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: green
|
6 |
sdk: docker
|
7 |
+
app_port: 7860
|
8 |
+
datasets: [boolq, renumics/spotlight-boolq-enrichment]
|
9 |
+
tags: [renumics, spotlight, EDA, enriched, data-centric-ai, viewer]
|
10 |
pinned: false
|
11 |
+
license: mit
|
12 |
---
|
13 |
|
14 |
+
# Explore boolq with [Renumics Spotlight](https://github.com/renumics/spotlight)!
|
run.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Serve a Hugging Face dataset.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import dataclasses
|
7 |
+
import os
|
8 |
+
from typing import Optional
|
9 |
+
|
10 |
+
import datasets
|
11 |
+
import huggingface_hub
|
12 |
+
from renumics import spotlight # type: ignore
|
13 |
+
|
14 |
+
|
15 |
+
def login() -> None:
    """
    Log in to the Hugging Face Hub if a token is configured.

    The token is taken from the `HF_TOKEN` environment variable. When the
    variable is unset or empty, no login is attempted.
    """
    hub_token = os.environ.get("HF_TOKEN")
    if not hub_token:
        return
    huggingface_hub.login(hub_token)
|
21 |
+
|
22 |
+
|
23 |
+
@dataclasses.dataclass
class HFSettings:
    """
    Hugging Face settings read from the environment.

    Every field except `dataset` is optional and defaults to `None`.
    """

    # Value of `HF_DATASET` (required).
    dataset: str
    # Value of `HF_SUBSET`.
    subset: Optional[str] = None
    # Value of `HF_SPLIT`.
    split: Optional[str] = None
    # Value of `HF_REVISION`.
    revision: Optional[str] = None

    # Value of `HF_ENRICHMENT`.
    enrichment: Optional[str] = None
    # Value of `HF_ENRICHMENT_REVISION`.
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Parse Hugging Face settings from environment.

        Surrounding whitespace is stripped from every value, and a variable
        that is unset, empty or whitespace-only is treated as not given.

        Raises:
            RuntimeError: If `HF_DATASET` is not given.
        """

        def read(name: str) -> Optional[str]:
            # Strip first so that a value such as "  " counts as missing
            # instead of being used as a (meaningless) setting.
            return os.environ.get(name, "").strip() or None

        dataset = read("HF_DATASET")
        if dataset is None:
            raise RuntimeError(
                "Desired Hugging Face dataset must be set as `HF_DATASET` "
                "environment variable."
            )
        return cls(
            dataset,
            read("HF_SUBSET"),
            read("HF_SPLIT"),
            read("HF_REVISION"),
            read("HF_ENRICHMENT"),
            read("HF_ENRICHMENT_REVISION"),
        )

    def __str__(self) -> str:
        """Return a short description of the dataset, subset, split and revision."""
        return f"{self.dataset}[subset={self.subset},split={self.split},revision={self.revision}]"
|
59 |
+
|
60 |
+
|
61 |
+
if __name__ == "__main__":
    # Load and serve the given Hugging Face dataset.
    login()

    hf_settings = HFSettings.from_environ()
    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )
    # Check the loaded type immediately. Everything below (`len(ds)`,
    # `ds.split`, `ds.features`) assumes a single `datasets.Dataset`, so
    # checking later would let a missing subset/split surface as an unrelated
    # error from the enrichment branch instead of this explanation.
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use environment variables `HF_SUBSET` and `HF_SPLIT` respective)?"
        )

    if hf_settings.enrichment is not None:
        # The enrichment is loaded with the same subset and split as the
        # original dataset, but with its own revision.
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        # Columns are joined side by side (axis=1), so row counts must match.
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"mismatches length of the original dataset ({len(ds)})"
            )
        ds = datasets.concatenate_datasets([ds, ds_enrichment], split=ds.split, axis=1)

    # Sequence-typed columns whose name contains "embedding" are shown as
    # embeddings in Spotlight.
    dtypes = {
        col: spotlight.dtypes.embedding_dtype
        for col in ds.column_names
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence)
    }

    print(f"Serving Hugging Face dataset {hf_settings}.")
    spotlight.show(
        ds, host="0.0.0.0", port=7860, wait="forever", dtype=dtypes, analyze=True
    )
|