nbroad (HF staff) committed
Commit f7ff38b · 1 parent: 2492d76

Update app.py

Files changed (1): app.py (+42, -18)
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 
-from utils import load_hf_dataset, get_model_and_tokenizer, batch_embed, download_wikipedia
-
+from data import download_dataset, tokenize_dataset, load_tokenized_dataset
+from infer import get_model_and_tokenizer, batch_embed
 
 # TODO: add instructor models
 # "hkunlp/instructor-xl",
@@ -40,23 +40,35 @@ optimization_options = list(opt2desc.values())
 
 
 
-def download(
+def download_and_tokenize(
     ds_name,
     ds_config,
+    column_name,
     ds_split,
+    model_choice,
+    opt_desc,
     num2skip,
     num2embed,
-    progress=gr.Progress(),
+    progress=gr.Progress(track_tqdm=True),
 ):
-    if progress is not None:
-        progress(0.5, "Loading dataset...")
 
-    if ds_name == "wikipedia":
-        ds = download_wikipedia(ds_name, ds_config, num2skip, num2embed)
-    else:
-        ds = load_hf_dataset(ds_name, ds_config, ds_split)
+    num_samples = download_dataset(ds_name, ds_config, ds_split, num2skip, num2embed)
+
+    opt_level = desc2opt[opt_desc]
+
+    model_name = model_choice.split()[0]
 
-    return f"Downloaded! It has {len(ds)} docs."
+    tokenize_dataset(
+        ds_name=ds_name,
+        ds_config=ds_config,
+        model_name=model_name,
+        opt_level=opt_level,
+        column_name=column_name,
+        num2skip=num2skip,
+        num2embed=num2embed,
+    )
+
+    return f"Downloaded! It has {len(num_samples)} docs."
 
 
 
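The new `download_and_tokenize` handler delegates to `download_dataset` and `tokenize_dataset` from the new `data` module, whose bodies are not part of this diff. A minimal sketch of what those helpers could look like, assuming they wrap `datasets.load_dataset` and a `transformers` tokenizer; the cache paths and the padding policy keyed off `opt_level` are assumptions, not the committed implementation:

```python
# Hypothetical sketch of the helpers imported from data.py. The names and
# signatures mirror the call sites in app.py; cache paths and the padding
# policy keyed off opt_level are assumptions, not the committed code.
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer


def download_dataset(ds_name, ds_config, ds_split, num2skip, num2embed):
    # Load the requested split and slice out the window of rows to embed.
    ds = load_dataset(ds_name, ds_config, split=ds_split)
    end = len(ds) if num2embed == -1 else min(num2skip + num2embed, len(ds))
    ds = ds.select(range(num2skip, end))
    ds.save_to_disk("downloaded_ds")  # assumed on-disk cache
    return ds                         # app.py calls len() on the return value


def tokenize_dataset(ds_name, ds_config, model_name, opt_level, column_name, num2skip, num2embed):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ds = load_from_disk("downloaded_ds")
    # Per the UI note, O4 needs max-length padding; how opt_level encodes O1-O4 is a guess here.
    padding = "max_length" if str(opt_level).endswith("4") else False
    ds = ds.map(
        lambda batch: tokenizer(batch[column_name], truncation=True, padding=padding),
        batched=True,
    )
    ds.save_to_disk("tokenized_ds")  # assumed location read back by load_tokenized_dataset
```

Whatever the real implementation, the call sites above only require that `download_dataset` return something with a length and that `tokenize_dataset` persist its output where `load_tokenized_dataset` can find it.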
 
@@ -71,11 +83,10 @@ def embed(
     new_dataset_id,
     num2skip,
     num2embed,
-    progress=gr.Progress(),
+    progress=gr.Progress(track_tqdm=True),
 ):
-    if progress is not None:
-        progress(0.5, "Loading dataset...")
-    ds = load_hf_dataset(ds_name, ds_config, ds_split)
+
+    ds = load_tokenized_dataset(ds_name, ds_config, ds_split)
 
     opt_level = desc2opt[opt_desc]
 
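Both handlers now pass `gr.Progress(track_tqdm=True)`, so Gradio mirrors any tqdm progress bars created inside the function (for example in `batch_embed`) instead of relying on manual `progress(...)` calls, which is why the explicit `progress(0.5, "Loading dataset...")` lines could be dropped. A small standalone illustration of the pattern (the loop and wiring here are illustrative, not code from this Space):

```python
import time
import gradio as gr
from tqdm.auto import tqdm


def long_job(n, progress=gr.Progress(track_tqdm=True)):
    # With track_tqdm=True, this tqdm loop is surfaced in the Gradio UI
    # without any explicit progress(...) calls inside the function.
    for _ in tqdm(range(int(n)), desc="Working"):
        time.sleep(0.01)
    return f"Processed {n} items"


with gr.Blocks() as demo:
    n = gr.Number(value=100, label="Items")
    out = gr.Textbox()
    gr.Button("Run").click(long_job, inputs=n, outputs=out)

# demo.launch()
```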
 
@@ -104,6 +115,9 @@ def embed(
 with gr.Blocks(title="Bulk embeddings") as demo:
     gr.Markdown(
         """
+# Bulk Embeddings
+
+
 This Space allows you to embed a large dataset easily. For instance, this can easily create vectors for Wikipedia \
 articles -- taking about __ hours and costing approximately $__.
 This utilizes state-of-the-art open-source embedding models, \
@@ -118,6 +132,7 @@ with gr.Blocks(title="Bulk embeddings") as demo:
 - Text splitting options
 - More control about which rows to embed (skip some, stop early)
 - Dynamic padding
+
 ## Steps
 1. Upload the dataset to the Hugging Face Hub.
 2. Enter dataset details into the form below.
@@ -125,6 +140,7 @@ with gr.Blocks(title="Bulk embeddings") as demo:
 4. Enter optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
 5. Choose a name for the new dataset.
 6. Hit run!
+
 ### Note:
 If you have short documents, O3 will be faster than O4. If you have long documents, O4 will be faster than O3. \
 O4 requires the tokenized documents to be padded to max length.
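The O3/O4 note maps onto ONNX Runtime optimization levels in Optimum: O4 adds fp16 and, as the note says, pairs with inputs padded to max length, while O3 still works with dynamically padded batches. A hedged sketch of exporting and optimizing an embedding model with Optimum's ONNX Runtime tooling; the model id and save directory are placeholders, and this is not necessarily how `get_model_and_tokenizer` does it:

```python
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer, AutoOptimizationConfig
from transformers import AutoTokenizer

model_id = "BAAI/bge-small-en-v1.5"  # placeholder; any exportable embedding model

# Export the model to ONNX.
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Graph optimization: O3 suits dynamically padded (mostly short) inputs;
# AutoOptimizationConfig.O4() adds fp16 (GPU-only) and pairs with max-length padding.
optimizer = ORTOptimizer.from_pretrained(model)
optimizer.optimize(save_dir="onnx-optimized", optimization_config=AutoOptimizationConfig.O3())
```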
@@ -170,7 +186,7 @@ with gr.Blocks(title="Bulk embeddings") as demo:
         num2skip = gr.Slider(
             value=0,
             minimum=0,
-            maximum=10_000_000,
+            maximum=100_000_000,
             step=1,
             label="Number of rows to skip",
         )
@@ -178,14 +194,22 @@ with gr.Blocks(title="Bulk embeddings") as demo:
         num2embed = gr.Slider(
             value=30000,
             minimum=-1,
-            maximum=10_000_000,
+            maximum=100_000_000,
             step=1,
             label="Number of rows to embed (-1 = all)",
         )
 
+        num2upload = gr.Slider(
+            value=10000,
+            minimum=1000,
+            maximum=100000,
+            step=1000,
+            label="Chunk size for uploading",
+        )
+
     with gr.Row():
 
-        download_btn = gr.Button(value="Download dataset!")
+        download_btn = gr.Button(value="Download and tokenize dataset!")
         embed_btn = gr.Button(value="Embed texts!")
 
     last = gr.Textbox(value="")
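The new "Chunk size for uploading" slider suggests embedded rows are pushed to the Hub in fixed-size pieces rather than as a single commit. One common way to do that with `huggingface_hub`, shown here as a generic sketch rather than the code behind this Space:

```python
from huggingface_hub import HfApi
from datasets import Dataset


def push_in_chunks(ds: Dataset, repo_id: str, chunk_size: int = 10_000):
    """Upload a dataset to the Hub as one parquet file per chunk of rows."""
    api = HfApi()
    api.create_repo(repo_id, repo_type="dataset", exist_ok=True)
    for start in range(0, len(ds), chunk_size):
        shard = ds.select(range(start, min(start + chunk_size, len(ds))))
        local_path = f"shard_{start:09d}.parquet"
        shard.to_parquet(local_path)
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"data/{local_path}",
            repo_id=repo_id,
            repo_type="dataset",
        )
```

Uploading per chunk keeps memory bounded and means a failed upload only loses the current shard.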
 