Spaces:

severo
/

find_script_based_datasets_dependencies

Runtime error

App Files Files Community

severo HF Staff committed on Mar 26, 2024

Commit

418a37b

1 Parent(s): 47c9fe2

create app

Browse files

Files changed (4) hide show

.gitignore +1 -0
app.py +67 -0
poetry.lock +0 -0
pyproject.toml +18 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import gradio as gr
+from huggingface_hub import hf_hub_download
+import subprocess
+def get_dataset_dependencies(dataset: str) -> set[str]:
+    script_name = dataset.split("/")[-1] + ".py"
+    input_file = hf_hub_download(repo_id=dataset, filename=script_name, repo_type="dataset")
+    result = subprocess.run(["findpydeps", "-i", input_file, "--no-header"], capture_output=True, text=True)
+    return set(d for d in result.stdout.split("\n") if d)
+def update(datasets: str):
+    all_dependencies = set()
+    for dataset in datasets.split("\n"):
+        dataset = dataset.strip()
+        print(dataset)
+        if not dataset:
+            continue
+        try:
+            dependencies = get_dataset_dependencies(dataset)
+            print(f"Dependencies for {dataset} processed: {len(dependencies)}")
+            all_dependencies.update(dependencies)
+        except Exception as e:
+            print(f"Error processing {dataset}: {e}")
+            continue
+    return "\n".join(sorted(list(all_dependencies)))
+# Build the Gradio UI: a textbox of dataset names, a textbox for the resulting
+# dependencies, a "Run" button and a few example inputs.
+with gr.Blocks() as demo:
+    gr.Markdown("""# Script-based dataset dependencies
+Paste a list of newline-separated dataset names, and then click **Run** to see the list of dependencies in their scripts.
+""")
+    # The input and output textboxes are declared inside the same Row.
+    with gr.Row():
+        inp = gr.Textbox(placeholder="mnist\ncifar10", label="Datasets", lines=10, max_lines=10)
+        out = gr.Textbox(label="Dependencies", lines=10, max_lines=10, show_copy_button=True)
+    btn = gr.Button("Run")
+    # Example inputs: each entry is one newline-separated list of dataset names.
+    examples = ["mnist\ncifar10", "mnist", """espnet/yodas
+gaia-benchmark/GAIA
+google/fleurs
+mozilla-foundation/common_voice_1_0
+mozilla-foundation/common_voice_10_0
+mozilla-foundation/common_voice_11_0
+mozilla-foundation/common_voice_12_0
+mozilla-foundation/common_voice_13_0
+mozilla-foundation/common_voice_14_0
+mozilla-foundation/common_voice_15_0
+mozilla-foundation/common_voice_16_0
+mozilla-foundation/common_voice_16_1
+mozilla-foundation/common_voice_2_0
+mozilla-foundation/common_voice_3_0
+mozilla-foundation/common_voice_4_0
+mozilla-foundation/common_voice_5_0
+mozilla-foundation/common_voice_5_1
+mozilla-foundation/common_voice_6_0
+mozilla-foundation/common_voice_6_1
+mozilla-foundation/common_voice_7_0
+mozilla-foundation/common_voice_8_0
+mozilla-foundation/common_voice_9_0
+poloclub/diffusiondb
+pufanyi/MIMICIT
+speechcolab/gigaspeech
+togethercomputer/RedPajama-Data-1T
+togethercomputer/RedPajama-Data-V2
+"""            ]
+    # The examples are attached to the input textbox.
+    gr.Examples(examples, inp, label="Example Datasets", )
+    # Clicking "Run" calls update() with the input text and sends its return
+    # value to the output textbox.
+    btn.click(fn=update, inputs=inp, outputs=out)
+# NOTE: launch() runs whenever this module is executed or imported; there is no
+# `if __name__ == "__main__"` guard.
+demo.launch()

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,18 @@

+# Poetry project metadata for the Space.
+[tool.poetry]
+name = "find-script-based-datasets-dependencies"
+version = "0.1.0"
+description = ""
+authors = ["Sylvain Lesage <sylvain.lesage@huggingface.co>"]
+# NOTE(review): README.md is not among the files added by this commit; confirm it exists in the repository.
+readme = "README.md"
+# Runtime dependencies of app.py.
+[tool.poetry.dependencies]
+# app.py uses the built-in generic `set[str]` in an annotation, which needs Python 3.9+.
+python = "^3.9"
+# Pinned to an exact version (no caret range).
+gradio = "4.23.0"
+# app.py shells out to the `findpydeps` command.
+findpydeps = "^0.2.6"
+# NOTE(review): app.py does not import pip directly; presumably required by findpydeps at runtime — confirm.
+pip = "^24.0"
+# Provides `hf_hub_download`, used by app.py to fetch dataset scripts.
+huggingface-hub = "^0.22.1"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"