edbeeching committed on
Commit
08390cd
·
1 Parent(s): 860c08d

add load dataset button

Browse files
Files changed (1) hide show
  1. app.py +79 -2
app.py CHANGED
@@ -198,6 +198,75 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
198
  return request
199
 
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  def add_request_to_db(request: GenerationRequest):
202
  url: str = os.getenv("SUPABASE_URL")
203
  key: str = os.getenv("SUPABASE_KEY")
@@ -342,8 +411,9 @@ def main():
342
  with gr.Row():
343
  with gr.Column():
344
  input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
345
- prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question", value=None, interactive=False, info="Click Load Info to populate")
346
-
 
347
  with gr.Column():
348
  output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
349
 
@@ -519,6 +589,13 @@ def main():
519
  except Exception as e:
520
  return f"Error: {str(e)}"
521
 
 
 
 
 
 
 
 
522
  submit_btn.click(
523
  submit_request,
524
  inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
 
198
  return request
199
 
200
 
201
def load_dataset_info(dataset_name, dataset_token=None):
    """Load dataset metadata and build the updates for the dataset form.

    Looks up the configs of ``dataset_name`` with ``get_dataset_infos`` and,
    for the first config, derives the available splits and columns. A prompt
    column is pre-selected when a column name contains one of the keywords
    ``prompt``, ``text``, ``question`` or ``input``; otherwise the first
    column is used.

    Args:
        dataset_name: Dataset identifier, e.g. ``"simplescaling/s1K-1.1"``.
            ``None``, empty and whitespace-only values are treated as
            "no name entered".
        dataset_token: Optional token forwarded to ``get_dataset_infos``.

    Returns:
        A 6-tuple of, in order: the config update, the split update, the
        prompt-column update, the output-dataset-name update, the
        num-output-samples update, and a status message string.
        Failures are never raised to the caller; they are reported through
        the status message with all widgets disabled.
    """
    # Normalise before any use: the original called ``.strip()`` on the raw
    # value outside the ``try``, which crashes on ``None``, and then passed
    # the untrimmed name on to the lookup.
    dataset_name = (dataset_name or "").strip()
    if not dataset_name:
        # NOTE(review): this branch enables the output name box while the
        # error branch below disables it -- kept as in the original; confirm
        # which is intended.
        return (
            gr.update(choices=[], value=None),  # config
            gr.update(choices=[], value=None),  # split
            gr.update(choices=[], value=None),  # prompt_column
            gr.update(value="", interactive=True),  # output_dataset_name
            gr.update(interactive=False),  # num_output_samples
            "Please enter a dataset name first.",
        )

    prompt_keywords = ("prompt", "text", "question", "input")

    try:
        dataset_infos = get_dataset_infos(dataset_name, token=dataset_token)
        if not dataset_infos:
            raise ValueError("No configs found for this dataset")

        # ``dataset_infos`` is non-empty here, so a first config always exists.
        config_choices = list(dataset_infos)
        default_config = config_choices[0]
        config_info = dataset_infos[default_config]

        # ``splits`` / ``features`` can be missing (None) for datasets that
        # ship without metadata; treat that as "nothing to choose from"
        # instead of failing the whole load.
        split_choices = list(config_info.splits or {})
        column_choices = list(config_info.features or {})

        default_split = split_choices[0] if split_choices else None
        # First column whose name looks like a prompt, else the first column.
        default_column = next(
            (
                col
                for col in column_choices
                if any(keyword in col.lower() for keyword in prompt_keywords)
            ),
            column_choices[0] if column_choices else None,
        )

        # Suggest an output name from the repo part of "org/repo"; a trailing
        # slash must not produce an empty base name.
        dataset_base_name = dataset_name.rstrip("/").split("/")[-1]
        suggested_output_name = f"{dataset_base_name}-synthetic"

        return (
            gr.update(choices=config_choices, value=default_config, interactive=True),  # config
            gr.update(choices=split_choices, value=default_split, interactive=True),  # split
            gr.update(choices=column_choices, value=default_column, interactive=True),  # prompt_column
            gr.update(value=suggested_output_name, interactive=True),  # output_dataset_name
            gr.update(interactive=True),  # num_output_samples
            f"✅ Dataset info loaded successfully! Found {len(config_choices)} config(s), "
            f"{len(split_choices)} split(s), and {len(column_choices)} column(s).",
        )

    except Exception as e:
        # UI boundary: any lookup failure (unknown dataset, auth, network) is
        # shown to the user rather than propagated into the event handler.
        return (
            gr.update(choices=[], value=None, interactive=False),  # config
            gr.update(choices=[], value=None, interactive=False),  # split
            gr.update(choices=[], value=None, interactive=False),  # prompt_column
            gr.update(value="", interactive=False),  # output_dataset_name
            gr.update(interactive=False),  # num_output_samples
            f"❌ Error loading dataset info: {str(e)}",
        )
268
+
269
+
270
  def add_request_to_db(request: GenerationRequest):
271
  url: str = os.getenv("SUPABASE_URL")
272
  key: str = os.getenv("SUPABASE_KEY")
 
411
  with gr.Row():
412
  with gr.Column():
413
  input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
414
+ load_info_btn = gr.Button("πŸ“Š Load Dataset Info", size="sm", variant="secondary")
415
+ load_info_status = gr.Markdown("", visible=True)
416
+
417
  with gr.Column():
418
  output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
419
 
 
589
  except Exception as e:
590
  return f"Error: {str(e)}"
591
 
592
+ # Wire up the Load Dataset Info button
593
+ load_info_btn.click(
594
+ load_dataset_info,
595
+ inputs=[input_dataset_name],
596
+ outputs=[input_dataset_config, input_dataset_split, prompt_column, output_dataset_name, num_output_samples, load_info_status]
597
+ )
598
+
599
  submit_btn.click(
600
  submit_request,
601
  inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,