neindochoh committed on
Commit
a78959a
1 Parent(s): 4bfbb0b

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. Dockerfile +20 -0
  2. README.md +7 -3
  3. run.py +127 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image that serves a Hugging Face dataset with Renumics Spotlight via run.py.
FROM python:3.10

# Spotlight release to install; can be overridden at build time.
ARG SPOTLIGHT_VERSION=1.5.0

# Create and switch to an unprivileged user with UID 1000.
RUN useradd -m -u 1000 user

USER user

# Put the user's local bin directory first on PATH.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Expose the build argument as an environment variable in the final image.
ENV SPOTLIGHT_VERSION=$SPOTLIGHT_VERSION
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
RUN pip install --no-cache-dir --upgrade "renumics-spotlight==${SPOTLIGHT_VERSION}"

# run.py is copied as an executable because CMD invokes it directly.
COPY --chown=user --chmod=0755 run.py .

CMD ["./run.py"]
README.md CHANGED
@@ -1,10 +1,14 @@
1
  ---
2
- title: Spotlight-vikp-textbook Quality Programming
3
- emoji: 🏢
4
  colorFrom: indigo
5
  colorTo: green
6
  sdk: docker
 
 
 
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ # title:
3
+ emoji: 🔬
4
  colorFrom: indigo
5
  colorTo: green
6
  sdk: docker
7
+ app_port: 7860
8
+ datasets: [vikp/textbook_quality_programming, renumics/spotlight-vikp-textbook_quality_programming-enrichment]
9
+ tags: [renumics, spotlight, EDA, enriched, data-centric-ai, viewer]
10
  pinned: false
11
+ license: mit
12
  ---
13
 
14
+ # Explore vikp/textbook_quality_programming with [Renumics Spotlight](https://github.com/renumics/spotlight)!
run.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Serve a Hugging Face dataset.
4
+ """
5
+
6
+ import dataclasses
7
+ import os
8
+ from typing import Optional
9
+
10
+ import datasets
11
+ import huggingface_hub
12
+ from renumics import spotlight # type: ignore
13
+
14
+
15
def login() -> None:
    """
    Login to Hugging Face Hub.

    The access token is read from the `HF_TOKEN` environment variable and
    passed to `huggingface_hub.login`. Nothing happens when the variable is
    unset, empty or contains only whitespace.
    """
    # Strip surrounding whitespace (e.g. a trailing newline from a pasted
    # secret) so that a blank value is treated like a missing token.
    if token := os.environ.get("HF_TOKEN", "").strip():
        huggingface_hub.login(token)
21
+
22
+
23
@dataclasses.dataclass
class HFSettings:
    """
    Hugging Face settings.

    Describes which dataset to load and, optionally, which enrichment dataset
    to load alongside it. Use `from_environ` to build an instance from the
    `HF_*` environment variables.
    """

    # Name of the dataset to load (required).
    dataset: str
    # Optional subset, split and revision of the dataset.
    subset: Optional[str] = None
    split: Optional[str] = None
    revision: Optional[str] = None

    # Optional name and revision of an additional enrichment dataset.
    enrichment: Optional[str] = None
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Parse Hugging Face settings from environment.

        Reads `HF_DATASET`, `HF_SUBSET`, `HF_SPLIT`, `HF_REVISION`,
        `HF_ENRICHMENT` and `HF_ENRICHMENT_REVISION`. Empty variables are
        treated as unset.

        Raises:
            RuntimeError: If `HF_DATASET` is unset or empty.
        """

        def env(name: str) -> Optional[str]:
            # An empty string counts as "not set".
            return os.environ.get(name) or None

        dataset = env("HF_DATASET")
        if dataset is None:
            raise RuntimeError(
                "Desired Hugging Face dataset must be set as `HF_DATASET` "
                "environment variable."
            )
        # Keyword arguments keep the mapping correct even if the dataclass
        # fields are ever reordered.
        return cls(
            dataset=dataset,
            subset=env("HF_SUBSET"),
            split=env("HF_SPLIT"),
            revision=env("HF_REVISION"),
            enrichment=env("HF_ENRICHMENT"),
            enrichment_revision=env("HF_ENRICHMENT_REVISION"),
        )

    def __str__(self) -> str:
        """Return a short description of the dataset, subset, split and revision."""
        return f"{self.dataset}[subset={self.subset},split={self.split},revision={self.revision}]"
59
+
60
+
61
def main() -> None:
    """
    Load and serve the given Hugging Face dataset.

    Logs in to the Hugging Face Hub, loads the dataset described by the
    `HF_*` environment variables, optionally appends the columns of an
    enrichment dataset, and serves the result with Spotlight on port 7860.

    Raises:
        TypeError: If the loaded object is not a `datasets.Dataset`.
        RuntimeError: If the enrichment dataset has a different length than
            the original dataset.
    """
    login()

    hf_settings = HFSettings.from_environ()
    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )
    # Validate the type before `ds` is used below (`len`, `.split`,
    # `.features`); otherwise a missing subset/split would surface as an
    # unrelated error instead of this hint.
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use environment variables `HF_SUBSET` and `HF_SPLIT` respective)?"
        )

    if hf_settings.enrichment is not None:
        # The enrichment is loaded with the same subset and split as the
        # original dataset, but with its own revision.
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"mismatches length of the original dataset ({len(ds)})"
            )
        # axis=1 appends the enrichment columns to the original rows.
        ds = datasets.concatenate_datasets([ds, ds_enrichment], split=ds.split, axis=1)

    # Sequence columns whose name contains "embedding" are shown as embeddings.
    dtypes = {
        col: spotlight.dtypes.embedding_dtype
        for col in ds.column_names
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence)
    }

    layout = spotlight.layout.split(
        spotlight.layout.split(
            spotlight.layout.tab(spotlight.layout.table(), weight=4),
            spotlight.layout.tab(
                spotlight.layout.similaritymap(),
                spotlight.layout.scatterplot(),
                weight=3,
            ),
            spotlight.layout.tab(
                spotlight.layout.histogram(), spotlight.layout.metric(), weight=3
            ),
            weight=5,
        ),
        spotlight.layout.tab(spotlight.layout.inspector(), weight=3),
        orientation="vertical",
    )

    print(f"Serving Hugging Face dataset {hf_settings}.")
    spotlight.show(
        ds,
        host="0.0.0.0",
        port=7860,
        wait="forever",
        dtype=dtypes,
        layout=layout,
        analyze=True,
        no_browser=True,
    )


if __name__ == "__main__":
    main()