LLM_DataGen / gradio_app.py
lhoestq's picture
lhoestq HF staff
test
61755fe
raw
history blame
No virus
2.4 kB
import time
import gradio as gr
import io
import pandas as pd
import spaces
from generate import stream_file
@spaces.GPU(duration=120)
def stream_output(filename: str):
    """Stream a generated dataset to the UI as it is produced.

    A trailing ``.jsonl`` extension is stripped from *filename* before it is
    handed to ``stream_file``. Every chunk received is appended to the text
    accumulated so far, and the whole text is re-parsed as JSON Lines so the
    table shown to the user always reflects everything generated up to now.

    Args:
        filename: Name of the dataset file to generate, with or without the
            ``.jsonl`` extension.

    Yields:
        A ``(dataframe, markdown, status)`` tuple after each chunk: the rows
        parsed so far, the raw content wrapped in a ``json`` fenced code
        block, and a progress/completion message.
    """
    suffix = ".jsonl"
    if filename.endswith(suffix):
        filename = filename[:-len(suffix)]

    content = ""
    size = 3
    start_time = time.time()
    chunks = stream_file(
        filename=filename,
        prompt="",
        columns=[],
        seed=42,
        size=size,
    )
    for i, chunk in enumerate(chunks):
        content += chunk
        # NOTE(review): assumes every chunk ends on a complete JSON line;
        # a partial line would make read_json raise - confirm in stream_file.
        df = pd.read_json(io.StringIO(content), lines=True)
        # NOTE(review): the "done" message is only shown if stream_file
        # yields exactly `size` chunks (presumably one per sample) - confirm.
        if i + 1 == size:
            state_msg = f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
        else:
            state_msg = f"⚙️ Generating... [{i + 1}/{size}]"
        yield df, "```json\n" + content + "\n```", state_msg
def test(filename: str):
    """Yield fake streaming output with the same shape as ``stream_output``.

    Produces ten placeholder rows, one every 0.1 s, without calling
    ``stream_file``. Unlike ``stream_output``, the file name must end
    with ``.jsonl``.

    Args:
        filename: Requested dataset file name; echoed into every fake row.

    Yields:
        ``(dataframe, markdown, status)`` tuples. For an invalid file name a
        single ``(None, "", error_message)`` tuple is yielded, so the error
        lands in the status slot like every other status message.
    """
    # Local import keeps this block self-contained (json is not imported
    # at the top of the file).
    import json

    if not filename.endswith(".jsonl"):
        yield None, "", "❌ 404: File name must end with .jsonl"
        return

    content = ""
    size = 10
    start_time = time.time()
    for i in range(size):
        # json.dumps escapes quotes/backslashes in the file name, so the
        # line stays valid JSON whatever the user typed.
        row = {"i": i, "filename": filename}
        content += json.dumps(row, ensure_ascii=False) + "\n"
        df = pd.read_json(io.StringIO(content), lines=True)
        if i + 1 == size:
            state_msg = f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
        else:
            state_msg = f"⚙️ Generating... [{i + 1}/{size}]"
        yield df, "```json\n" + content + "\n```", state_msg
        # Pause between rows to imitate generation latency.
        time.sleep(0.1)
# Static page text and the example file names offered to the user.
title = "LLM DataGen"
description = "Generate and stream synthetic dataset files in JSON Lines format"
examples = [
    "movies_data.jsonl",
    "common_first_names.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "dungeon_and_dragon_characters.jsonl",
]

# UI layout: a file-name box with examples, a generate button, a status line,
# and two tabs showing the result as a table and as raw JSON Lines text.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0])
    gr.Examples(examples, filename_comp)
    generate_button = gr.Button("Generate dataset")
    state_msg_comp = gr.Markdown("🔥 Ready to generate")
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        with gr.Blocks(fill_height=True):
            with gr.Row():
                file_content_comp = gr.Markdown()

    # The output order must match the tuples yielded by stream_output:
    # (dataframe, file content markdown, status message).
    generate_button.click(
        stream_output,
        filename_comp,
        [dataframe_comp, file_content_comp, state_msg_comp],
    )

demo.launch()