nbroad HF staff committed on
Commit
595c4bf
·
1 Parent(s): 9f6b9a6

add option to download

Browse files
Files changed (1) hide show
  1. app.py +34 -15
app.py CHANGED
@@ -39,7 +39,23 @@ desc2opt = {v: k for k, v in opt2desc.items()}
39
  optimization_options = list(opt2desc.values())
40
 
41
 
42
- def run(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  ds_name,
44
  ds_config,
45
  column_name,
@@ -84,14 +100,10 @@ with gr.Blocks(title="Bulk embeddings") as demo:
84
  """
85
  This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
86
  articles -- taking about __ hours and costing approximately $__.
87
-
88
-
89
  This utilizes state-of-the-art open-source embedding models, \
90
  and optimizes them for inference using Hugging Face [optimum](https://github.com/huggingface/optimum). There are various \
91
  levels of optimizations that can be applied - the quality of the embeddings will degrade as the optimizations increase.
92
-
93
  Currently available options: O2/O3/O4 on T4/A10 GPUs using onnx runtime.
94
-
95
  Future options:
96
  - OpenVino for CPU inference
97
  - TensorRT for GPU inference
@@ -100,22 +112,16 @@ with gr.Blocks(title="Bulk embeddings") as demo:
100
  - Text splitting options
101
  - More control about which rows to embed (skip some, stop early)
102
  - Dynamic padding
103
-
104
  ## Steps
105
-
106
  1. Upload the dataset to the Hugging Face Hub.
107
  2. Enter dataset details into the form below.
108
  3. Choose a model. These are taken from the top of the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
109
  4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
110
  5. Choose a name for the new dataset.
111
  6. Hit run!
112
-
113
-
114
  ### Note:
115
-
116
  If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
117
  O4 requires the tokenized documents to be padded to max length.
118
-
119
  """
120
  )
121
 
@@ -172,12 +178,25 @@ with gr.Blocks(title="Bulk embeddings") as demo:
172
  )
173
 
174
  with gr.Row():
175
- btn = gr.Button(value="Embed texts!")
 
 
176
 
177
  last = gr.Textbox(value="")
178
 
179
- btn.click(
180
- fn=run,
 
 
 
 
 
 
 
 
 
 
 
181
  inputs=[
182
  ds_name,
183
  ds_config,
@@ -194,4 +213,4 @@ with gr.Blocks(title="Bulk embeddings") as demo:
194
 
195
 
196
  if __name__ == "__main__":
197
- demo.queue(concurrency_count=20).launch(show_error=True)
 
39
  optimization_options = list(opt2desc.values())
40
 
41
 
42
+
43
def download(
    ds_name,
    ds_config,
    ds_split,
    progress=gr.Progress(),
):
    """Fetch the requested dataset split from the Hub and report its size.

    Args:
        ds_name: Hub id of the dataset to load.
        ds_config: Dataset configuration name.
        ds_split: Which split to download.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        A short status string with the number of documents loaded.
    """
    # Emit a single coarse tick before the (potentially slow) download begins.
    if progress is not None:
        progress(0.5, "Loading dataset...")

    dataset = load_hf_dataset(ds_name, ds_config, ds_split)
    return f"Downloaded! It has {len(dataset)} docs."
54
+
55
+
56
+
57
+
58
+ def embed(
59
  ds_name,
60
  ds_config,
61
  column_name,
 
100
  """
101
  This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
102
  articles -- taking about __ hours and costing approximately $__.
 
 
103
  This utilizes state-of-the-art open-source embedding models, \
104
  and optimizes them for inference using Hugging Face [optimum](https://github.com/huggingface/optimum). There are various \
105
  levels of optimizations that can be applied - the quality of the embeddings will degrade as the optimizations increase.
 
106
  Currently available options: O2/O3/O4 on T4/A10 GPUs using onnx runtime.
 
107
  Future options:
108
  - OpenVino for CPU inference
109
  - TensorRT for GPU inference
 
112
  - Text splitting options
113
  - More control about which rows to embed (skip some, stop early)
114
  - Dynamic padding
 
115
  ## Steps
 
116
  1. Upload the dataset to the Hugging Face Hub.
117
  2. Enter dataset details into the form below.
118
  3. Choose a model. These are taken from the top of the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
119
  4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
120
  5. Choose a name for the new dataset.
121
  6. Hit run!
 
 
122
  ### Note:
 
123
  If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
124
  O4 requires the tokenized documents to be padded to max length.
 
125
  """
126
  )
127
 
 
178
  )
179
 
180
  with gr.Row():
181
+
182
+ download_btn = gr.Button(value="Download dataset!")
183
+ embed_btn = gr.Button(value="Embed texts!")
184
 
185
  last = gr.Textbox(value="")
186
 
187
download_btn.click(
    fn=download,
    # download() accepts exactly (ds_name, ds_config, ds_split) plus the
    # Gradio-injected progress tracker. `column_name` was previously passed
    # here as a fourth input, which misaligns the positional binding
    # (ds_split would receive the column_name value). It is removed.
    inputs=[
        ds_name,
        ds_config,
        ds_split,
    ],
    outputs=last,
)
197
+
198
+ embed_btn.click(
199
+ fn=embed,
200
  inputs=[
201
  ds_name,
202
  ds_config,
 
213
 
214
 
215
  if __name__ == "__main__":
216
+ demo.queue(concurrency_count=20).launch(show_error=True, debug=True)