Spaces:
Running
Running
edbeeching
committed on
Commit
·
08390cd
1
Parent(s):
860c08d
add load dataset button
Browse files
app.py
CHANGED
|
@@ -198,6 +198,75 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
|
|
| 198 |
return request
|
| 199 |
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
def add_request_to_db(request: GenerationRequest):
|
| 202 |
url: str = os.getenv("SUPABASE_URL")
|
| 203 |
key: str = os.getenv("SUPABASE_KEY")
|
|
@@ -342,8 +411,9 @@ def main():
|
|
| 342 |
with gr.Row():
|
| 343 |
with gr.Column():
|
| 344 |
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 345 |
-
|
| 346 |
-
|
|
|
|
| 347 |
with gr.Column():
|
| 348 |
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
|
| 349 |
|
|
@@ -519,6 +589,13 @@ def main():
|
|
| 519 |
except Exception as e:
|
| 520 |
return f"Error: {str(e)}"
|
| 521 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
submit_btn.click(
|
| 523 |
submit_request,
|
| 524 |
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
|
|
|
| 198 |
return request
|
| 199 |
|
| 200 |
|
| 201 |
+
def load_dataset_info(dataset_name, dataset_token=None):
    """Load dataset metadata from the Hub and return Gradio updates for the form.

    Args:
        dataset_name: Hub dataset id, e.g. "simplescaling/s1K-1.1".
        dataset_token: optional auth token forwarded to ``get_dataset_infos``
            (needed for gated/private datasets).

    Returns:
        A 6-tuple of ``gr.update`` objects for (config, split, prompt_column,
        output_dataset_name, num_output_samples) followed by a status message
        string for the ``load_info_status`` Markdown component.
    """
    # Guard against None (untouched Textbox) and whitespace-only input
    # before making any network call.
    if not dataset_name or not dataset_name.strip():
        return (
            gr.update(choices=[], value=None),      # config
            gr.update(choices=[], value=None),      # split
            gr.update(choices=[], value=None),      # prompt_column
            gr.update(value="", interactive=True),  # output_dataset_name
            gr.update(interactive=False),           # num_output_samples
            "Please enter a dataset name first."
        )

    try:
        # Query the Hub for per-config metadata (splits, features).
        dataset_infos = get_dataset_infos(dataset_name, token=dataset_token)

        if not dataset_infos:
            raise Exception("No configs found for this dataset")

        # Available configs; the first one is the default selection.
        config_choices = list(dataset_infos.keys())
        default_config = config_choices[0] if config_choices else None

        # Get splits and features for the default config.
        if default_config:
            config_info = dataset_infos[default_config]
            split_choices = list(config_info.splits.keys())
            default_split = split_choices[0] if split_choices else None

            # Heuristic: prefer a column whose name suggests it holds the
            # prompt text; otherwise fall back to the first column.
            column_choices = list(config_info.features.keys())
            default_column = None
            for col in column_choices:
                if any(keyword in col.lower() for keyword in ['prompt', 'text', 'question', 'input']):
                    default_column = col
                    break
            if not default_column and column_choices:
                default_column = column_choices[0]
        else:
            split_choices = []
            column_choices = []
            default_split = None
            default_column = None

        # Suggest an output repo name derived from the input dataset's base name.
        dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
        suggested_output_name = f"{dataset_base_name}-synthetic"

        return (
            gr.update(choices=config_choices, value=default_config, interactive=True),   # config
            gr.update(choices=split_choices, value=default_split, interactive=True),     # split
            gr.update(choices=column_choices, value=default_column, interactive=True),   # prompt_column
            gr.update(value=suggested_output_name, interactive=True),                    # output_dataset_name
            gr.update(interactive=True),                                                 # num_output_samples
            # NOTE(review): leading emoji reconstructed from a garbled scrape —
            # presumed to be a check mark; confirm against the deployed Space.
            f"✅ Dataset info loaded successfully! Found {len(config_choices)} config(s), {len(split_choices)} split(s), and {len(column_choices)} column(s)."
        )

    except Exception as e:
        # On any failure, clear and lock the dependent widgets and surface
        # the error message to the user.
        return (
            gr.update(choices=[], value=None, interactive=False),  # config
            gr.update(choices=[], value=None, interactive=False),  # split
            gr.update(choices=[], value=None, interactive=False),  # prompt_column
            gr.update(value="", interactive=False),                # output_dataset_name
            gr.update(interactive=False),                          # num_output_samples
            f"❌ Error loading dataset info: {str(e)}"
        )
|
| 268 |
+
|
| 269 |
+
|
| 270 |
def add_request_to_db(request: GenerationRequest):
|
| 271 |
url: str = os.getenv("SUPABASE_URL")
|
| 272 |
key: str = os.getenv("SUPABASE_KEY")
|
|
|
|
| 411 |
with gr.Row():
|
| 412 |
with gr.Column():
|
| 413 |
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 414 |
+
load_info_btn = gr.Button("🔍 Load Dataset Info", size="sm", variant="secondary")
|
| 415 |
+
load_info_status = gr.Markdown("", visible=True)
|
| 416 |
+
|
| 417 |
with gr.Column():
|
| 418 |
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
|
| 419 |
|
|
|
|
| 589 |
except Exception as e:
|
| 590 |
return f"Error: {str(e)}"
|
| 591 |
|
| 592 |
+
# Wire up the Load Dataset Info button
|
| 593 |
+
load_info_btn.click(
|
| 594 |
+
load_dataset_info,
|
| 595 |
+
inputs=[input_dataset_name],
|
| 596 |
+
outputs=[input_dataset_config, input_dataset_split, prompt_column, output_dataset_name, num_output_samples, load_info_status]
|
| 597 |
+
)
|
| 598 |
+
|
| 599 |
submit_btn.click(
|
| 600 |
submit_request,
|
| 601 |
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|