Spaces:
Runtime error
Runtime error
add option to download
Browse files
app.py
CHANGED
@@ -39,7 +39,23 @@ desc2opt = {v: k for k, v in opt2desc.items()}
|
|
39 |
optimization_options = list(opt2desc.values())
|
40 |
|
41 |
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
ds_name,
|
44 |
ds_config,
|
45 |
column_name,
|
@@ -84,14 +100,10 @@ with gr.Blocks(title="Bulk embeddings") as demo:
|
|
84 |
"""
|
85 |
This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
|
86 |
articles -- taking about __ hours and costing approximately $__.
|
87 |
-
|
88 |
-
|
89 |
This utilizes state-of-the-art open-source embedding models, \
|
90 |
and optimizes them for inference using Hugging Face [optimum](https://github.com/huggingface/optimum). There are various \
|
91 |
levels of optimizations that can be applied - the quality of the embeddings will degrade as the optimizations increase.
|
92 |
-
|
93 |
Currently available options: O2/O3/O4 on T4/A10 GPUs using onnx runtime.
|
94 |
-
|
95 |
Future options:
|
96 |
- OpenVino for CPU inference
|
97 |
- TensorRT for GPU inference
|
@@ -100,22 +112,16 @@ with gr.Blocks(title="Bulk embeddings") as demo:
|
|
100 |
- Text splitting options
|
101 |
- More control about which rows to embed (skip some, stop early)
|
102 |
- Dynamic padding
|
103 |
-
|
104 |
## Steps
|
105 |
-
|
106 |
1. Upload the dataset to the Hugging Face Hub.
|
107 |
2. Enter dataset details into the form below.
|
108 |
3. Choose a model. These are taken from the top of the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
|
109 |
4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
|
110 |
5. Choose a name for the new dataset.
|
111 |
6. Hit run!
|
112 |
-
|
113 |
-
|
114 |
### Note:
|
115 |
-
|
116 |
If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
|
117 |
O4 requires the tokenized documents to be padded to max length.
|
118 |
-
|
119 |
"""
|
120 |
)
|
121 |
|
@@ -172,12 +178,25 @@ with gr.Blocks(title="Bulk embeddings") as demo:
|
|
172 |
)
|
173 |
|
174 |
with gr.Row():
|
175 |
-
|
|
|
|
|
176 |
|
177 |
last = gr.Textbox(value="")
|
178 |
|
179 |
-
|
180 |
-
fn=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
inputs=[
|
182 |
ds_name,
|
183 |
ds_config,
|
@@ -194,4 +213,4 @@ with gr.Blocks(title="Bulk embeddings") as demo:
|
|
194 |
|
195 |
|
196 |
if __name__ == "__main__":
|
197 |
-
demo.queue(concurrency_count=20).launch(show_error=True)
|
|
|
39 |
optimization_options = list(opt2desc.values())
|
40 |
|
41 |
|
42 |
+
|
43 |
+
def download(
    ds_name,
    ds_config,
    ds_split,
    progress=gr.Progress(),
):
    """Load a dataset and report how many items it contains.

    Args:
        ds_name: Forwarded as the first argument to ``load_hf_dataset``.
        ds_config: Forwarded as the second argument to ``load_hf_dataset``.
        ds_split: Forwarded as the third argument to ``load_hf_dataset``.
        progress: Progress callback (a ``gr.Progress`` instance by default).
            When it is not ``None`` it is called once, before loading, with
            ``0.5`` and the message ``"Loading dataset..."``.

    Returns:
        A status string of the form ``"Downloaded! It has <N> docs."`` where
        ``<N>`` is ``len()`` of the object returned by ``load_hf_dataset``.
    """
    # Report progress up front; the load itself gives no intermediate feedback here.
    if progress is not None:
        progress(0.5, "Loading dataset...")

    dataset = load_hf_dataset(ds_name, ds_config, ds_split)
    doc_count = len(dataset)

    return f"Downloaded! It has {doc_count} docs."
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
def embed(
|
59 |
ds_name,
|
60 |
ds_config,
|
61 |
column_name,
|
|
|
100 |
"""
|
101 |
This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
|
102 |
articles -- taking about __ hours and costing approximately $__.
|
|
|
|
|
103 |
This utilizes state-of-the-art open-source embedding models, \
|
104 |
and optimizes them for inference using Hugging Face [optimum](https://github.com/huggingface/optimum). There are various \
|
105 |
levels of optimizations that can be applied - the quality of the embeddings will degrade as the optimizations increase.
|
|
|
106 |
Currently available options: O2/O3/O4 on T4/A10 GPUs using onnx runtime.
|
|
|
107 |
Future options:
|
108 |
- OpenVino for CPU inference
|
109 |
- TensorRT for GPU inference
|
|
|
112 |
- Text splitting options
|
113 |
- More control about which rows to embed (skip some, stop early)
|
114 |
- Dynamic padding
|
|
|
115 |
## Steps
|
|
|
116 |
1. Upload the dataset to the Hugging Face Hub.
|
117 |
2. Enter dataset details into the form below.
|
118 |
3. Choose a model. These are taken from the top of the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
|
119 |
4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
|
120 |
5. Choose a name for the new dataset.
|
121 |
6. Hit run!
|
|
|
|
|
122 |
### Note:
|
|
|
123 |
If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
|
124 |
O4 requires the tokenized documents to be padded to max length.
|
|
|
125 |
"""
|
126 |
)
|
127 |
|
|
|
178 |
)
|
179 |
|
180 |
with gr.Row():
|
181 |
+
|
182 |
+
download_btn = gr.Button(value="Download dataset!")
|
183 |
+
embed_btn = gr.Button(value="Embed texts!")
|
184 |
|
185 |
last = gr.Textbox(value="")
|
186 |
|
187 |
+
# Wire the download button to `download`. Gradio passes the `inputs` components
# to `fn` positionally, so the list must line up with
# download(ds_name, ds_config, ds_split). `column_name` is not a parameter of
# `download`; listing it shifted `column_name` into the `ds_split` slot and left
# the real split without a matching parameter, so it is not passed here.
download_btn.click(
    fn=download,
    inputs=[
        ds_name,
        ds_config,
        ds_split,
    ],
    # The status string returned by `download` is shown in the `last` textbox.
    outputs=last,
)
|
197 |
+
|
198 |
+
embed_btn.click(
|
199 |
+
fn=embed,
|
200 |
inputs=[
|
201 |
ds_name,
|
202 |
ds_config,
|
|
|
213 |
|
214 |
|
215 |
if __name__ == "__main__":
    # Enable the request queue first, then start the app with errors surfaced
    # in the UI and debug mode switched on.
    queued_demo = demo.queue(concurrency_count=20)
    queued_demo.launch(show_error=True, debug=True)
|