File size: 21,904 Bytes
b6d1901
 
 
 
 
 
 
 
 
ab5bf76
 
b6d1901
8854100
 
 
f32647d
ac0089d
8854100
 
b6d1901
ab5bf76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6d1901
 
 
 
 
 
 
0fb1b95
 
f32647d
b6d1901
 
 
 
 
f32647d
b6d1901
 
 
 
8854100
b6d1901
 
 
 
 
 
 
 
 
 
 
 
 
f32647d
 
 
b6d1901
0fb1b95
b6d1901
 
 
 
 
 
 
 
 
 
ab5bf76
 
 
00f53b5
ab5bf76
 
 
 
 
 
5fad7f1
0fb1b95
002e03d
 
0fb1b95
 
 
002e03d
 
 
 
0fb1b95
b6d1901
 
 
 
 
5fad7f1
 
 
 
 
 
 
 
 
 
 
 
b6d1901
 
5fad7f1
 
 
 
 
b6d1901
8854100
5fad7f1
b6d1901
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d89dcd
 
b6d1901
 
 
 
 
 
8854100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f32647d
00f53b5
 
8854100
 
 
 
 
b6d1901
 
ab5bf76
 
002e03d
 
 
 
 
cc8e33a
002e03d
b6d1901
ab5bf76
002e03d
0fb1b95
 
 
 
ab5bf76
 
f32647d
ab5bf76
 
 
 
 
 
0fb1b95
f32647d
ab5bf76
 
 
 
 
 
 
 
 
 
67c4080
 
 
 
ab5bf76
 
0fb1b95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
002e03d
cc8e33a
ab5bf76
0fb1b95
 
 
 
 
 
002e03d
0fb1b95
 
ab5bf76
0fb1b95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab5bf76
0fb1b95
 
 
 
 
 
 
 
 
 
 
 
 
f32647d
ab5bf76
0fb1b95
ab5bf76
 
5fad7f1
ab5bf76
 
b6d1901
ab5bf76
 
 
 
 
 
 
 
 
 
 
 
5fad7f1
ab5bf76
 
 
 
 
 
 
 
 
 
 
 
 
0fb1b95
ab5bf76
b6d1901
ab5bf76
 
 
 
 
 
 
0fb1b95
ab5bf76
 
b6d1901
0fb1b95
 
cc8e33a
0fb1b95
 
 
 
ab5bf76
0fb1b95
 
 
002e03d
0fb1b95
002e03d
 
 
 
cc8e33a
002e03d
 
 
cc8e33a
 
 
 
002e03d
 
cc8e33a
002e03d
cc8e33a
5fad7f1
 
002e03d
 
 
cc8e33a
5fad7f1
b6d1901
 
ab5bf76
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
import gradio as gr

from dataclasses import dataclass
import os
from supabase import create_client, Client
from supabase.client import ClientOptions
from enum import Enum
from datasets import get_dataset_infos
from transformers import AutoConfig
from huggingface_hub import whoami
from typing import Optional, List, Tuple, Union

"""
 Still TODO:
 - validate the user is PRO
 - check the output dataset token is valid (hardcoded for now as a secret)
 - validate max model params
"""


def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
    """Return True if *token* belongs to a Hugging Face PRO user or a member
    of an enterprise organization; False otherwise (including on any error)."""
    if not token:
        return False

    # Normalize to the raw token string; reject unsupported token types.
    if isinstance(token, gr.OAuthToken):
        raw_token = token.token
    elif isinstance(token, str):
        raw_token = token
    else:
        return False

    try:
        info = whoami(token=raw_token)
        # PRO flag on the account itself, or enterprise flag on any org.
        if info.get("isPro", False):
            return True
        return any(org.get("isEnterprise", False) for org in info.get("orgs", []))
    except Exception as e:
        print(f"Could not verify user's PRO/Enterprise status: {e}")
        return False



class GenerationStatus(Enum):
    """Lifecycle states of a generation request.

    String-valued so that ``.value`` can be written directly to the
    database row (see ``add_request_to_db``).
    """

    PENDING = "PENDING"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"


MAX_SAMPLES_PRO = 10000  # max number of samples for PRO/Enterprise users
MAX_SAMPLES_FREE = 100   # max number of samples for free users
MAX_TOKENS = 8192  # hard cap on generated tokens per sample (enforced in validate_request)
MAX_MODEL_PARAMS = 20_000_000_000  # 20 billion parameters (for now) -- NOTE: not enforced anywhere in this file yet (see TODO at top)

@dataclass
class GenerationRequest:
    """A single synthetic-data generation job.

    Mirrors a row of the Supabase ``gen-requests`` table (see
    ``add_request_to_db``); validated and possibly mutated by
    ``validate_request`` before being persisted.
    """

    id: str  # empty on submission; generated by the database
    created_at: str  # empty on submission; set by the database
    status: GenerationStatus
    input_dataset_name: str
    input_dataset_config: str
    input_dataset_split: str  # may be rewritten to "split[:N]" during validation
    output_dataset_name: str  # full repo id, prefixed with the org by the caller
    prompt_column: str
    model_name_or_path: str
    model_revision: str
    model_token: str | None  # currently unsupported; always falsy
    system_prompt: str | None
    max_tokens: int
    temperature: float
    top_k: int
    top_p: float
    input_dataset_token: str | None  # currently unsupported; always None
    output_dataset_token: str  # service-owned write token (env secret)
    username: str
    email: str  # used for completion notification (still TODO)
    num_output_examples: int  # 0 means "use the whole input split"
    private: bool = False
    num_retries: int = 0

def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
    """Validate a generation request end to end.

    Checks, in order: input dataset/split accessibility, requested sample
    count (per-tier caps), prompt column, output dataset name and
    availability, model accessibility and context window, sampling
    parameters, and the email address.

    Returns the request, possibly mutated: ``num_output_examples`` is filled
    in when 0, and ``input_dataset_split`` may be rewritten to a slice.

    Raises:
        Exception: with a user-facing message on any validation failure.
    """
    # Input dataset must exist and be readable with the provided token.
    try:
        input_dataset_info = get_dataset_infos(
            request.input_dataset_name, token=request.input_dataset_token
        )[request.input_dataset_config]
    except Exception as e:
        raise Exception(f"Dataset {request.input_dataset_name} does not exist or cannot be accessed with the provided token.") from e

    # The requested split must exist in the dataset.
    if request.input_dataset_split not in input_dataset_info.splits:
        raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")

    # 0 means "use the whole split"; otherwise slice the split down to the
    # requested number of examples.
    if request.num_output_examples == 0:
        request.num_output_examples = input_dataset_info.splits[request.input_dataset_split].num_examples
    else:
        if request.num_output_examples > input_dataset_info.splits[request.input_dataset_split].num_examples:
            raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the number of examples in the input dataset split {input_dataset_info.splits[request.input_dataset_split].num_examples}.")
        request.input_dataset_split = f"{request.input_dataset_split}[:{request.num_output_examples}]"

    # Check user tier and apply appropriate limits.
    # Anonymous users (oauth_token is None) are treated as free tier.
    is_pro = verify_pro_status(oauth_token) if oauth_token else False
    max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE

    if request.num_output_examples > max_samples:
        if oauth_token is None:
            user_tier = "anonymous"
        else:
            user_tier = "PRO/Enterprise" if is_pro else "free"
        raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")

    # The prompt column must exist in the dataset.
    if request.prompt_column not in input_dataset_info.features:
        raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(input_dataset_info.features.keys())}")

    # The caller prefixes the name with 'synthetic-data-universe/', so a valid
    # full repo id contains exactly one '/'.
    if request.output_dataset_name.count("/") != 1:
        raise Exception("Output dataset name must be in the format 'dataset_name', e.g., 'my-dataset'. The dataset will be created under the org 'synthetic-data-universe/my-dataset'.")

    # The output dataset must NOT already exist.
    # BUG FIX: previously the "already exists" raise lived inside the same
    # try whose `except Exception: pass` swallowed it, so duplicates were
    # never rejected. Raise outside the probe instead.
    output_exists = True
    try:
        get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
    except Exception:
        output_exists = False  # lookup failed: dataset does not exist, which is what we want
    if output_exists:
        raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")

    # The model must exist and be public; token=False forces anonymous access
    # so private/gated models are rejected here.
    try:
        model_config = AutoConfig.from_pretrained(
            request.model_name_or_path,
            revision=request.model_revision,
            force_download=True,
            token=False,
        )
    except Exception as e:
        print(e)
        raise Exception(f"Model {request.model_name_or_path} revision {request.model_revision} does not exist or cannot be accessed. The model may be private or gated, which is not supported at this time.") from e

    # Context-window check: the model must fit the requested max tokens, and
    # the request must stay under the service-wide MAX_TOKENS cap.
    if model_config.max_position_embeddings < request.max_tokens:
        raise Exception(f"Model {request.model_name_or_path} max position embeddings {model_config.max_position_embeddings} is less than the requested max tokens {request.max_tokens}.")
    if request.max_tokens > MAX_TOKENS:
        raise Exception(f"Requested max tokens {request.max_tokens} exceeds the limit of {MAX_TOKENS}.")

    # Sampling parameters must be within sane bounds.
    if request.temperature < 0.0 or request.temperature > 2.0:
        raise Exception("Temperature must be between 0.0 and 2.0")
    if request.top_k < 1 or request.top_k > 100:
        raise Exception("Top K must be between 1 and 100")
    if request.top_p < 0.0 or request.top_p > 1.0:
        raise Exception("Top P must be between 0.0 and 1.0")

    # Minimal email sanity check.
    # TODO: use py3-validate-email https://stackoverflow.com/questions/8022530/how-to-check-for-valid-email-address
    if "@" not in request.email or "." not in request.email.split("@")[-1]:
        raise Exception("Invalid email address")

    return request


def add_request_to_db(request: GenerationRequest):
    """Insert *request* as a new row in the Supabase ``gen-requests`` table.

    ``id`` and ``created_at`` are intentionally omitted from the payload:
    the database generates them.

    Raises:
        Exception: "Failed to add request to database" on any failure; the
            underlying error is chained (``from e``) so the real cause is
            not lost.
    """
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")

    # Fail fast on missing configuration instead of letting create_client
    # blow up on a None URL with the cause swallowed below.
    if not url or not key:
        raise Exception("Failed to add request to database")

    try:
        supabase: Client = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            )
        )

        data = {
            "status": request.status.value,
            "input_dataset_name": request.input_dataset_name,
            "input_dataset_config": request.input_dataset_config,
            "input_dataset_split": request.input_dataset_split,
            "output_dataset_name": request.output_dataset_name,
            "prompt_column": request.prompt_column,
            "model_name_or_path": request.model_name_or_path,
            "model_revision": request.model_revision,
            "model_token": request.model_token,
            "system_prompt": request.system_prompt,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
            "top_k": request.top_k,
            "top_p": request.top_p,
            "input_dataset_token": request.input_dataset_token,
            "output_dataset_token": request.output_dataset_token,
            "username": request.username,
            "email": request.email,
            "num_output_examples": request.num_output_examples,
            "private": request.private,
        }

        supabase.table("gen-requests").insert(data).execute()
    except Exception as e:
        # BUG FIX: chain the original error so failures are debuggable.
        raise Exception("Failed to add request to database") from e


def main():
    """Build and launch the Gradio UI for submitting synthetic-data
    generation requests (form -> validate_request -> add_request_to_db)."""
    with gr.Blocks(title="Synthetic Data Generation") as demo:
        gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign in for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
        
        # Add sign-in button at the top
        with gr.Row():
            gr.Markdown("")  # Empty space for alignment
            login_button = gr.LoginButton(value="πŸ”‘ Sign in", size="sm")
            gr.Markdown("")  # Empty space for alignment
        
        pro_message = gr.Markdown(visible=False)
        main_interface = gr.Column(visible=True)
        
        # Store the current oauth token for use in submit_request
        current_oauth_token = gr.State(None)
        
        with main_interface:
            with gr.Group():
                with gr.Row():
                    gr.Markdown("# Synthetic Data Generation Request")    
                with gr.Row():
                    gr.Markdown("""
                    Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
                    Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
                    """)
            with gr.Accordion("How it works", open=False):
                with gr.Row():
                    gr.Markdown("""
                    **How it works:**
                    1. Provide an input dataset with prompts
                    2. Select a public language model for generation
                    3. Configure generation parameters
                    4. Submit your request.
                    """)
                    gr.Markdown("""              
                    
                    **Requirements:**
                    - Input dataset must be publicly accessible
                    - Model must be publicly accessible (and not gated)
                    - Maximum 10,000 samples per dataset
                    - Maximum of 8192 generated tokens
                    """)
            
            with gr.Tabs():
                with gr.TabItem("Generate Synthetic Data"):
                    with gr.Group():
                        gr.Markdown("##  Model information")
                        with gr.Column():
                            with gr.Row():
                                model_name_or_path = gr.Dropdown(
                                    choices=[
                                        "microsoft/Phi-3.5-mini-instruct",
                                        "Qwen/Qwen2.5-7B-Instruct", 
                                        "meta-llama/Llama-3.2-8B-Instruct",
                                        "mistralai/Mistral-7B-Instruct-v0.3",
                                        "google/gemma-2-9b-it",
                                        "microsoft/DialoGPT-medium",
                                        "HuggingFaceH4/zephyr-7b-beta",
                                        "teknium/OpenHermes-2.5-Mistral-7B",
                                        "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
                                        "01-ai/Yi-34B-Chat"
                                    ],
                                    label="Select Model",
                                    value="microsoft/Phi-3.5-mini-instruct",
                                    info="Choose from popular instruction-tuned models under 40B parameters"
                                )
                            # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
                    with gr.Group():
                        gr.Markdown("##  Dataset information")
                        # Dynamic user limit info - default to anonymous user
                        user_limit_info = gr.Markdown(value="πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
                        with gr.Row():
                            with gr.Column():
                                input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
                                prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
                                
                            with gr.Column():
                                output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
                                num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1)
                    
                    with gr.Accordion("Advanced Options", open=False):
                        with gr.Row():
                            input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
                            input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
                            model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
                        
                        with gr.Group():
                            gr.Markdown("### Generation Parameters")
                            with gr.Row():
                                with gr.Column():
                                    with gr.Row():
                                        max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
                                        temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
                                    with gr.Row():
                                        top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
                                        top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
                                    with gr.Row():
                                        system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")

                    with gr.Group():
                        gr.Markdown("##  User Information, for notification when your job is completed (still TODO)")
                        with gr.Row():
                            with gr.Column():
                                with gr.Row():
                                    email = gr.Textbox(label="Email", placeholder="your.email@example.com")
                                # with gr.Row():
                                    # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
                                    # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")

                    submit_btn = gr.Button("Submit Generation Request", variant="primary")
                    output_status = gr.Textbox(label="Status", interactive=False)
                
                with gr.TabItem("Coming Soon"):
                    gr.Markdown("## New features coming soon!")
                    gr.Markdown("This tab will contain additional functionality in future updates.")

            def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
                                max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
                """Assemble a GenerationRequest from the form fields, validate
                it, persist it, and return a status string for the UI."""

                MASTER_ORG = "synthetic-data-universe/"
                model_token = False # This is currently not supported
                input_dataset_token = None # This is currently not supported
                output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")
                
                try:
                    request = GenerationRequest(
                        id="",  # Will be generated when adding to the database
                        created_at="",  # Will be set when adding to the database
                        status=GenerationStatus.PENDING,
                        input_dataset_name=input_dataset_name,
                        input_dataset_split=input_split,
                        input_dataset_config=input_dataset_config,
                        output_dataset_name=MASTER_ORG + output_dataset_name,
                        prompt_column=prompt_col,
                        model_name_or_path=model_name,
                        model_revision=model_rev,
                        model_token=model_token,
                        system_prompt=sys_prompt if sys_prompt else None,
                        max_tokens=int(max_tok),
                        temperature=temp,
                        top_k=int(top_k_val),
                        top_p=top_p_val,
                        input_dataset_token=input_dataset_token if input_dataset_token else None,
                        output_dataset_token=output_dataset_token,
                        num_output_examples=num_output_samples,  # will be set after validating the input dataset
                        username="user",
                        email=email_addr
                    )
                    
                    # check the input dataset exists and can be accessed with the provided token
                    request = validate_request(request, oauth_token)
                    add_request_to_db(request)

                    return "Request submitted successfully!"
                except Exception as e:
                    return f"Error: {str(e)}"
            
            submit_btn.click(
                submit_request,
                inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
                        model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples, current_oauth_token],
                outputs=output_status
            )
        
        def update_user_limits(oauth_token):
            """Return the sample-limit banner text for the current user tier."""
            if oauth_token is None:
                return "πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples)."
            
            is_pro = verify_pro_status(oauth_token)
            if is_pro:
                return "✨ **PRO User**: You can generate up to 10,000 samples per request."
            else:
                return "πŸ‘€ **Free User**: You can generate up to 100 samples per request. [Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) for 10,000 samples."

        def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
            """Refresh UI state on page load: interface visibility, the limit
            banner, the sample-slider maximum, and the login-button label,
            all driven by the current OAuth session (if any)."""
            # Always show the interface, whether user is logged in or not
            limit_msg = update_user_limits(oauth_token)
            
            # Update slider maximum based on user tier
            if oauth_token is None:
                max_samples = MAX_SAMPLES_FREE
                button_text = "πŸ”‘ Sign in for PRO benefits"
            else:
                is_pro = verify_pro_status(oauth_token)
                max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
                if is_pro:
                    button_text = f"✨ Signed in as PRO ({profile.name if profile else 'User'})"
                else:
                    button_text = f"πŸ‘€ Signed in as {profile.name if profile else 'User'}"
            
            slider_update = gr.update(maximum=max_samples)
            button_update = gr.update(value=button_text)
            
            return gr.update(visible=True), gr.update(visible=False), oauth_token, limit_msg, slider_update, button_update


        # Handle login state changes - LoginButton automatically handles auth state changes
        # The demo.load will handle both initial load and auth changes
        
        demo.load(control_access, inputs=None, outputs=[main_interface, pro_message, current_oauth_token, user_limit_info, num_output_samples, login_button])
        demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)

# Launch the Gradio app only when run as a script, not on import.
if __name__ == "__main__":
    main()