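# LLM DataGen: Gradio app that generates and streams synthetic dataset files in JSON Lines format.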
import io
import time
from urllib.parse import urlparse, parse_qs

import gradio as gr
import pandas as pd
import spaces

from generate import stream_jsonl_file

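# Generation defaults and the per-request cap enforced in stream_output.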
MAX_SIZE = 20
DEFAULT_SEED = 42
DEFAULT_SIZE = 3

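# Request a GPU from the Spaces runtime for up to 120 seconds per call.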
@spaces.GPU(duration=120)
def stream_output(filename: str):
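    """Stream a generated JSONL dataset for the requested filename.

    The filename may carry URL-style query parameters (prompt, columns, size, seed),
    e.g. "common_first_names.jsonl?columns=first_name,popularity&size=10".
    Yields (dataframe, file content as fenced JSON, status message) after each streamed chunk.
    """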
    parsed_filename = urlparse(filename)
    filename = parsed_filename.path
    params = parse_qs(parsed_filename.query)
    prompt = params["prompt"][0] if "prompt" in params else ""
    columns = (
        [column.strip() for column in params["columns"][0].split(",") if column.strip()]
        if "columns" in params else []
    )
    size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
    seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
    if size > MAX_SIZE:
        yield None, None, f"Error: Maximum size is {MAX_SIZE}"
        return
    content = ""
    start_time = time.time()
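    # Accumulate the streamed JSONL content and re-parse it after every chunk
    # so the dataframe and file preview update live in the UI.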
    for i, chunk in enumerate(stream_jsonl_file(
        filename=filename,
        prompt=prompt,
        columns=columns,
        seed=seed,
        size=size,
    )):
        content += chunk
        df = pd.read_json(io.StringIO(content), lines=True)
        state_msg = (
            f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
            if i + 1 == size else
            f"⚙️ Generating... [{i + 1}/{size}]"
        )
        yield df, "```json\n" + content + "\n```", state_msg

title = "LLM DataGen"
description = "Generate and stream synthetic dataset files in JSON Lines format"
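# Example filenames; optional query parameters (prompt, columns, size, seed) are parsed by stream_output.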
examples = [
    "movies_data.jsonl",
    "dungeon_and_dragon_characters.jsonl"
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "common_first_names.jsonl?columns=first_name,popularity&size=10",
]

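# UI: filename input with examples, a generate button, a status line, and two tabs
# showing the parsed dataset and the raw file content.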
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0])
    gr.Examples(examples, filename_comp)
    generate_button = gr.Button("Generate dataset")
    state_msg_comp = gr.Markdown("🔥 Ready to generate")
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        file_content_comp = gr.Markdown()

    generate_button.click(stream_output, filename_comp, [dataframe_comp, file_content_comp, state_msg_comp])


demo.launch()