Spaces:

carosh
/

cli-1m-explorer

Running

App Files Files Community

cli-1m-explorer / app.py

kobi-kadosh

fix: move demo to top-level, lazy-load dataset on button click

d28d1c7 verified 3 days ago

raw

history blame contribute delete

4.03 kB

	"""CLI-1M Dataset Explorer — carosh/cli-1m

	Random-row viewer with bucket / shell / language filters.
	Deploy to HuggingFace Spaces (CPU Free tier).
	"""

	import random
	import gradio as gr
	from datasets import load_dataset

	# Dataset revision requested from the Hub; also shown in the page header.
	_REVISION = "v1.0-rc1"
	# Module-level cache for the loaded split; stays None until the first query.
	_DS = None

	# Dropdown choices. "(any)" is the sentinel meaning "do not filter on this".
	SHELL_OPTS = [
	    "(any)",
	    "bash",
	    "zsh",
	    "fish",
	    "powershell",
	    "nu",
	    "oils-osh",
	]
	LANG_OPTS = [
	    "(any)",
	    "en",
	    "zh",
	    "de",
	    "es",
	    "fr",
	    "ja",
	    "it",
	    "pt",
	    "ru",
	    "ar",
	    "hi",
	    "ko",
	    "he",
	]

	# Bucket names are hard-coded so startup does not have to scan the dataset.
	BUCKET_OPTS = [
	    "(any)",
	    "devops",
	    "cloud",
	    "database",
	    "security",
	    "pkg_mgmt",
	    "finance_web3",
	    "bio_science",
	    "data_ml",
	    "network",
	    "media",
	    "editor_term",
	    "editor_writer",
	    "lang_tool",
	    "mobile_embed",
	    "modern_unix",
	    "systems",
	    "web_api",
	    "misc",
	]


	def _load():
	    """Return the dataset split, loading it on the first call only.

	    The result of ``load_dataset`` is stored in the module-level ``_DS`` and
	    handed back unchanged on every later call, so the load happens at most
	    once per process (unless it raises, in which case ``_DS`` stays ``None``
	    and the next call tries again).
	    """
	    global _DS
	    if _DS is not None:
	        return _DS

	    # `sample` config, `train` split, pinned to the revision declared above.
	    _DS = load_dataset(
	        "carosh/cli-1m",
	        name="sample",
	        revision=_REVISION,
	        split="train",
	    )
	    return _DS


	def _bucket_labels(raw):
	    """Return a row's ``bucket`` value as a list of labels.

	    A list or tuple is copied, a bare string becomes a one-item list, and
	    anything else (including ``None``) yields an empty list.
	    """
	    # NOTE(review): the column schema is not visible here; lists are what the
	    # original code expected, the string case is accepted defensively.
	    if isinstance(raw, str):
	        return [raw]
	    if isinstance(raw, (list, tuple)):
	        return list(raw)
	    return []


	def _rng_from_seed(seed):
	    """Build the sampling RNG; ``None`` or a blank seed means unseeded."""
	    # A cleared numeric field can arrive as None; str(None) is the truthy
	    # "None", so None must be tested explicitly before the blank check.
	    if seed is None or not str(seed).strip():
	        return random.Random()
	    return random.Random(int(seed))


	def _first_content(messages, role):
	    """Return the content of the first message with ``role``, or ``""``."""
	    for msg in messages:
	        if msg.get("role") == role:
	            return msg.get("content") or ""
	    return ""


	def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
	    """Pick random rows matching the filters and render them as Markdown.

	    Args:
	        shell_filter: Value each row's ``shell`` must equal; ``"(any)"``
	            disables this filter.
	        lang_filter: Value each row's ``language`` must equal; ``"(any)"``
	            disables this filter.
	        bucket_filter: Label that must appear in each row's ``bucket``;
	            ``"(any)"`` disables this filter.
	        n_rows: Maximum number of rows to show (converted with ``int``).
	        seed: Seed for the sampling RNG (converted with ``int``). ``None``
	            or a blank value selects an unseeded, non-reproducible RNG.

	    Returns:
	        A ``(markdown, count)`` tuple of strings. ``count`` is the number of
	        matching rows formatted with thousands separators, ``"0"`` when
	        nothing matches, and ``""`` when the dataset could not be loaded
	        (``markdown`` then carries the error message).
	    """
	    try:
	        ds = _load()
	    except Exception as e:  # UI boundary: report the failure instead of crashing
	        return f"Error loading dataset: {e}", ""

	    use_shell = shell_filter != "(any)"
	    use_lang = lang_filter != "(any)"
	    use_bucket = bucket_filter != "(any)"

	    def _keep(r):
	        # All active filters are ANDed, so one pass replaces chained filters.
	        if use_shell and r["shell"] != shell_filter:
	            return False
	        if use_lang and r["language"] != lang_filter:
	            return False
	        if use_bucket and bucket_filter not in _bucket_labels(r["bucket"]):
	            return False
	        return True

	    # Skip the scan entirely when every dropdown is left on "(any)".
	    filtered = ds.filter(_keep) if (use_shell or use_lang or use_bucket) else ds

	    total = len(filtered)
	    if total == 0:
	        return "No rows match the selected filters.", "0"

	    rng = _rng_from_seed(seed)
	    n = min(int(n_rows), total)
	    indices = rng.sample(range(total), n)
	    rows = filtered.select(indices)

	    parts = [f"{total:,} rows match — showing {n}\n"]
	    for i, row in enumerate(rows):
	        msgs = row.get("messages") or []
	        user_msg = _first_content(msgs, "user")
	        assistant_msg = _first_content(msgs, "assistant")
	        bucket = ", ".join(str(b) for b in _bucket_labels(row.get("bucket")))
	        # `or` (not a .get default) so a present-but-null shell still
	        # produces a usable fence language.
	        fence_lang = row.get("shell") or "bash"
	        parts.append(
	            f"---\nRow {i+1} · `shell={row.get('shell')}` · "
	            f"`lang={row.get('language')}` · `bucket={bucket}`\n\n"
	            f"User: {user_msg}\n\n"
	            f"```{fence_lang}\n{assistant_msg}\n```\n"
	        )
	    return "\n".join(parts), f"{total:,}"


	# UI definition. Components render in the order they are created, so the
	# statement order below is the page layout from top to bottom.
	with gr.Blocks(theme=gr.themes.Soft(), title="CLI-1M Explorer") as demo:
	    # Page header: title, dataset link, pinned revision, short instructions.
	    gr.Markdown(
	        value=(
	            "# CLI-1M Dataset Explorer\n"
	            "Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
	            f"— `sample` config (50k stratified rows), revision `{_REVISION}`\n\n"
	            "Filter by shell, language, or industry bucket, then click Sample rows."
	        )
	    )

	    # First row: the three filters, each defaulting to the "(any)" sentinel.
	    with gr.Row():
	        shell_dd = gr.Dropdown(choices=SHELL_OPTS, value="(any)", label="Shell")
	        lang_dd = gr.Dropdown(choices=LANG_OPTS, value="(any)", label="Language")
	        bucket_dd = gr.Dropdown(
	            choices=BUCKET_OPTS, value="(any)", label="Industry bucket"
	        )

	    # Second row: how many rows to draw and the seed used to draw them.
	    with gr.Row():
	        n_rows = gr.Slider(
	            minimum=1, maximum=20, value=5, step=1, label="Rows to show"
	        )
	        seed = gr.Number(value=42, label="Random seed (blank = random)")

	    sample_btn = gr.Button(value="Sample rows", variant="primary")
	    match_count = gr.Textbox(label="Matching rows", interactive=False)
	    output = gr.Markdown()

	    # sample_rows returns (markdown, count), matching the outputs order here.
	    sample_btn.click(
	        sample_rows,
	        [shell_dd, lang_dd, bucket_dd, n_rows, seed],
	        [output, match_count],
	    )

	    # Page footer: related links and the licence.
	    gr.Markdown(
	        value=(
	            "---\n"
	            "Links: "
	            "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
	            "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
	            "[Source repo](https://github.com/wildcard/caro-eval) · "
	            "Apache-2.0"
	        )
	    )


	# Start the Gradio server only when this file is executed as a script;
	# importing the module just builds `demo` without launching it.
	if __name__ == "__main__":
	    demo.launch()