Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

davidberenstein1957 HF staff committed on Nov 21, 2024

Commit

099e99c

1 Parent(s): 080f560

refactor: redesign of the generator

Browse files

Files changed (13) hide show

app.py +23 -49
demo.py +61 -0
pdm.lock +0 -0
pyproject.toml +4 -2
requirements.txt +148 -7
src/distilabel_dataset_generator/_tabbedinterface.py +73 -0
src/distilabel_dataset_generator/apps/base.py +16 -28
src/distilabel_dataset_generator/apps/eval.py +328 -0
src/distilabel_dataset_generator/apps/sft.py +248 -296
src/distilabel_dataset_generator/apps/textcat.py +291 -343
src/distilabel_dataset_generator/pipelines/sft.py +2 -28
src/distilabel_dataset_generator/pipelines/textcat.py +2 -28
src/distilabel_dataset_generator/utils.py +11 -6

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
 from src.distilabel_dataset_generator.apps.faq import app as faq_app
 from src.distilabel_dataset_generator.apps.sft import app as sft_app
 from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
@@ -23,64 +24,37 @@ css = """
         background-color: black;
     }
 }
 """
-demo = gr.TabbedInterface(
     [textcat_app, sft_app, faq_app],
     ["Text Classification", "Supervised Fine-Tuning", "FAQ"],
     css=css,
     title="""
-    <style>
-        .header-container {
-            display: flex;
-            align-items: center;
-            justify-content: center;
-            position: relative;
-            padding: 20px 0;
-        }
-        .logo-container {
-            position: absolute;
-            left: 0;
-            top: 0;
-        }
-        .title-container {
-            text-align: center;
-        }
-        @media (max-width: 600px) {
-            .header-container {
-                flex-direction: column;
-            }
-            .logo-container {
-                position: static;
-                margin-bottom: 20px;
-            }
-        }
-        button[role="tab"].selected,
-        button[role="tab"][aria-selected="true"],
-        button[role="tab"][data-tab-id][aria-selected="true"] {
-            background-color: #000000;
-            color: white;
-            border: none;
-            font-size: 16px;
-            font-weight: bold;
-            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
-            transition: background-color 0.3s ease, color 0.3s ease;
-        }
-    </style>
-    <div class="header-container">
-        <div class="logo-container">
-            <a href="https://github.com/argilla-io/distilabel" target="_blank" rel="noopener noreferrer">
-                <img src="https://distilabel.argilla.io/latest/assets/distilabel-black.svg" alt="Distilabel Logo" style="width: 150px; height: auto;">
-            </a>
-        </div>
-        <div class="title-container">
-            <h1 style="margin: 0; font-size: 2em;">🧬 Synthetic Data Generator</h1>
-            <p style="margin: 10px 0 0 0; color: #666; font-size: 1.1em;">Build datasets using natural language</p>
-        </div>
-    </div>
     """,
     theme=theme,
 )
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
 from src.distilabel_dataset_generator.apps.faq import app as faq_app
 from src.distilabel_dataset_generator.apps.sft import app as sft_app
 from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
         background-color: black;
     }
 }
+button[role="tab"].selected,
+button[role="tab"][aria-selected="true"],
+button[role="tab"][data-tab-id][aria-selected="true"] {
+    background-color: #000000;
+    color: white;
+    border: none;
+    font-size: 16px;
+    font-weight: bold;
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+    transition: background-color 0.3s ease, color 0.3s ease;
+}
+.gallery {
+    color: black !important;
+}
+.flex-shrink-0.truncate.px-1 {
+    color: black !important;
+}
 """
+demo = TabbedInterface(
     [textcat_app, sft_app, faq_app],
     ["Text Classification", "Supervised Fine-Tuning", "FAQ"],
     css=css,
     title="""
+    <h1>Synthetic Data Generator</h1>
+    <h3>Build datasets using natural language</h3>
     """,
+    head="Synthetic Data Generator",
     theme=theme,
 )
 if __name__ == "__main__":
     demo.launch()

demo.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import gradio as gr
+from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
+from src.distilabel_dataset_generator.apps.eval import app as eval_app
+from src.distilabel_dataset_generator.apps.faq import app as faq_app
+from src.distilabel_dataset_generator.apps.sft import app as sft_app
+from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
+# Monochrome Gradio theme with medium spacing and Inter as the preferred font.
+theme = gr.themes.Monochrome(
+    spacing_size="md",
+    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
+)
+# Custom CSS passed to the tabbed interface below (logged-out dimming, example
+# colours for light/dark mode, and the styling of the selected tab button).
+css = """
+.main_ui_logged_out{opacity: 0.3; pointer-events: none}
+.tabitem{border: 0px}
+.group_padding{padding: .55em}
+#space_model .wrap > label:last-child{opacity: 0.3; pointer-events:none}
+#system_prompt_examples {
+    color: black;
+}
+@media (prefers-color-scheme: dark) {
+    #system_prompt_examples {
+        color: white;
+        background-color: black;
+    }
+}
+button[role="tab"].selected,
+button[role="tab"][aria-selected="true"],
+button[role="tab"][data-tab-id][aria-selected="true"] {
+    background-color: #000000;
+    color: white;
+    border: none;
+    font-size: 16px;
+    font-weight: bold;
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+    transition: background-color 0.3s ease, color 0.3s ease;
+}
+.gallery {
+    color: black !important;
+}
+.flex-shrink-0.truncate.px-1 {
+    color: black !important;
+}
+"""
+# Four-tab demo: text classification, SFT, evaluation and FAQ. The tab names are
+# matched to the apps by position.
+demo = TabbedInterface(
+    [textcat_app, sft_app, eval_app, faq_app],
+    ["Text Classification", "Supervised Fine-Tuning", "Evaluation", "FAQ"],
+    css=css,
+    title="""
+    <h1>Synthetic Data Generator</h1>
+    <h3>Build datasets using natural language</h3>
+    """,
+    head="Synthetic Data Generator",
+    theme=theme,
+)
+# Only start the server when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()

pdm.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml CHANGED Viewed

@@ -6,11 +6,13 @@ authors = [
     {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
 ]
 dependencies = [
-    "distilabel[hf-inference-endpoints,argilla]>=1.4.1",
-    "gradio[oauth]>=5.5.0",
     "transformers>=4.44.2",
     "sentence-transformers>=3.2.0",
     "model2vec>=0.2.4",
 ]
 requires-python = "<3.13,>=3.10"
 readme = "README.md"

     {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
 ]
 dependencies = [
+    "distilabel[hf-inference-endpoints,argilla,outlines]>=1.4.1",
+    "gradio[oauth]<5.0.0",
     "transformers>=4.44.2",
     "sentence-transformers>=3.2.0",
     "model2vec>=0.2.4",
+    "gradio-huggingfacehub-search>=0.0.7",
+    "argilla>=2.4.0",
 ]
 requires-python = "<3.13,>=3.10"
 readme = "README.md"

requirements.txt CHANGED Viewed

@@ -1,7 +1,148 @@
-transformers
-gradio[oauth]
-distilabel[hf-inference-endpoints,argilla]
-beautifulsoup4
-sentence-transformers
-model2vec
-outlines

+# This file is @generated by PDM.
+# Please do not edit it manually.
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp==3.11.7
+aiosignal==1.3.1
+airportsdata==20241001
+annotated-types==0.7.0
+anyio==4.6.2.post1
+argilla==2.4.0
+asttokens==2.4.1
+async-timeout==5.0.1; python_version < "3.11"
+attrs==24.2.0
+authlib==1.3.2
+certifi==2024.8.30
+cffi==1.17.1; platform_python_implementation != "PyPy"
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6; platform_system == "Windows" or sys_platform == "win32"
+contourpy==1.3.1
+cryptography==43.0.3
+cycler==0.12.1
+datasets==3.1.0
+decorator==5.1.1
+dill==0.3.8
+diskcache==5.6.3
+distilabel==1.4.1
+distilabel[argilla,hf-inference-endpoints,outlines]==1.4.1
+exceptiongroup==1.2.2; python_version < "3.11"
+executing==2.1.0
+fastapi==0.115.5
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.55.0
+frozenlist==1.5.0
+fsspec==2024.9.0
+fsspec[http]==2024.9.0
+gradio==4.44.1
+gradio-client==1.3.0
+gradio-huggingfacehub-search==0.0.7
+gradio[oauth]==4.44.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+huggingface-hub==0.26.2
+idna==3.10
+importlib-resources==6.4.5
+interegular==0.3.3
+ipython==8.29.0
+itsdangerous==2.2.0
+jedi==0.19.2
+jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kiwisolver==1.4.7
+lark==1.2.2
+llvmlite==0.43.0
+markdown-it-py==3.0.0
+markupsafe==2.1.5
+matplotlib==3.9.2
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+model2vec==0.3.3
+mpmath==1.3.0; python_version >= "3.9"
+multidict==6.1.0
+multiprocess==0.70.16
+nest-asyncio==1.6.0
+networkx==3.4.2
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-cuda-runtime-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-cufft-cu12==11.2.1.3; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-curand-cu12==10.3.5.147; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-cusolver-cu12==11.6.1.9; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-cusparse-cu12==12.3.1.170; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-nccl-cu12==2.21.5; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-nvjitlink-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64"
+nvidia-nvtx-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64"
+orjson==3.10.11
+outlines==0.1.4
+outlines-core==0.1.17
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0; sys_platform != "win32" and sys_platform != "emscripten"
+pillow==10.4.0
+portalocker==3.0.0
+prompt-toolkit==3.0.48
+propcache==0.2.0
+ptyprocess==0.7.0; sys_platform != "win32" and sys_platform != "emscripten"
+pure-eval==0.2.3
+pyarrow==18.0.0
+pycountry==24.6.1
+pycparser==2.22; platform_python_implementation != "PyPy"
+pydantic==2.10.0
+pydantic-core==2.27.0
+pydub==0.25.1
+pygments==2.18.0
+pyparsing==3.2.0
+python-dateutil==2.9.0.post0
+python-multipart==0.0.17
+pytz==2024.2
+pywin32==308; platform_system == "Windows"
+pyyaml==6.0.2
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.21.0
+ruff==0.7.4; sys_platform != "emscripten"
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+semantic-version==2.10.0
+sentence-transformers==3.3.1
+setuptools==75.6.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+stack-data==0.6.3
+starlette==0.41.3
+sympy==1.13.1; python_version >= "3.9"
+tblib==3.0.0
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+tomlkit==0.12.0
+torch==2.5.1
+tqdm==4.67.0
+traitlets==5.14.3
+transformers==4.46.3
+triton==3.1.0; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.13"
+typer==0.13.1
+typing-extensions==4.12.2
+tzdata==2024.2
+universal-pathlib==0.2.5
+urllib3==2.2.3
+uvicorn==0.32.1; sys_platform != "emscripten"
+wcwidth==0.2.13
+websockets==12.0
+xxhash==3.5.0
+yarl==1.18.0

src/distilabel_dataset_generator/_tabbedinterface.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""
+This file defines a high-level abstraction to build tabbed Gradio apps: TabbedInterface.
+"""
+from __future__ import annotations
+from collections.abc import Sequence
+import gradio as gr
+from gradio.blocks import Blocks
+from gradio.components import HTML
+from gradio.layouts import Tab, Tabs
+from gradio.themes import ThemeClass as Theme
+from gradio_client.documentation import document
+@document()
+class TabbedInterface(Blocks):
+    """
+    A TabbedInterface is created by providing a list of Interfaces or Blocks, each of which gets
+    rendered in a separate tab. Only the components from the Interface/Blocks will be rendered in the tab.
+    Certain high-level attributes of the Blocks (e.g. custom `css`, `js`, and `head` attributes) will not be loaded.
+    Demos: tabbed_interface_lite
+    """
+    def __init__(
+        self,
+        interface_list: Sequence[Blocks],
+        tab_names: list[str] | None = None,
+        title: str | None = None,
+        theme: Theme | str | None = None,
+        analytics_enabled: bool | None = None,
+        css: str | None = None,
+        js: str | None = None,
+        head: str | None = None,
+    ):
+        """
+        Parameters:
+            interface_list: A list of Interfaces (or Blocks) to be rendered in the tabs.
+            tab_names: A list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc.
+            title: The tab title to display when this demo is opened in a browser window.
+            theme: A Theme object or a string representing a theme. If a string, will look for a built-in theme with that name (e.g. "soft" or "default"), or will attempt to load a theme from the Hugging Face Hub (e.g. "gradio/monochrome"). If None, will use the Default theme.
+            analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
+            css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
+            js: Custom js as a string or path to a js file. The custom js should in the form of a single js function. This function will automatically be executed when the page loads. For more flexibility, use the head parameter to insert js inside <script> tags.
+            head: Custom html to insert into the head of the demo webpage. This can be used to add custom meta tags, multiple scripts, stylesheets, etc. to the page.
+        Returns:
+            a Gradio Tabbed Interface for the given interfaces
+        """
+        super().__init__(
+            title=title or "Gradio",
+            theme=theme,
+            analytics_enabled=analytics_enabled,
+            mode="tabbed_interface",
+            css=css,
+            js=js,
+            head=head,
+        )
+        if tab_names is None:
+            tab_names = [f"Tab {i}" for i in range(len(interface_list))]
+        with self:
+            if title:
+                HTML(value=title)
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.LoginButton(value="Sign in!", size="sm", scale=2)
+                    with gr.Column(scale=3):
+                        pass
+            with Tabs():
+                for interface, tab_name in zip(interface_list, tab_names, strict=False):
+                    with Tab(label=tab_name):
+                        interface.render()

src/distilabel_dataset_generator/apps/base.py CHANGED Viewed

@@ -168,8 +168,7 @@ def get_main_ui(
 def validate_argilla_user_workspace_dataset(
     dataset_name: str,
-    final_dataset: pd.DataFrame,
-    add_to_existing_dataset: bool,
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> str:
@@ -193,7 +192,7 @@ def validate_argilla_user_workspace_dataset(
     dataset = client.datasets(name=dataset_name, workspace=hf_user)
     if dataset and not add_to_existing_dataset:
         raise gr.Error(f"Dataset {dataset_name} already exists")
-    return final_dataset
 def get_org_dropdown(oauth_token: OAuthToken = None):
@@ -302,7 +301,8 @@ def get_iterate_on_sample_dataset_ui(
 def get_pipeline_code_ui(pipeline_code: str) -> gr.Code:
-    gr.Markdown("## Or run this pipeline locally with distilabel")
     gr.Markdown(
         "You can run this pipeline locally with distilabel. For more information, please refer to the [distilabel documentation](https://distilabel.argilla.io/) or go to the FAQ tab at the top of the page for more information."
     )
@@ -400,7 +400,7 @@ def push_pipeline_code_to_hub(
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
 ):
-    repo_id = _check_push_to_hub(org_name, repo_name)
     progress(0.1, desc="Uploading pipeline code")
     with io.BytesIO(pipeline_code.encode("utf-8")) as f:
         upload_file(
@@ -427,7 +427,7 @@ def push_dataset_to_hub(
     task: str = TEXTCAT_TASK,
 ) -> pd.DataFrame:
     progress(0.1, desc="Setting up dataset")
-    repo_id = _check_push_to_hub(org_name, repo_name)
     if task == TEXTCAT_TASK:
         if num_labels == 1:
@@ -459,7 +459,7 @@ def push_dataset_to_hub(
     return dataframe
-def _check_push_to_hub(org_name, repo_name):
     repo_id = (
         f"{org_name}/{repo_name}"
         if repo_name is not None and org_name is not None
@@ -491,7 +491,7 @@ def get_success_message_row() -> gr.Markdown:
     return success_message
-def show_success_message_argilla() -> gr.Markdown:
     client = get_argilla_client()
     argilla_api_url = client.api_url
     return gr.Markdown(
@@ -499,7 +499,13 @@ def show_success_message_argilla() -> gr.Markdown:
         <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
             <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
             <p style="margin-top: 0.5em;">
-                Your dataset is now available at:
                 <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
                     {argilla_api_url}
                 </a>
@@ -513,23 +519,5 @@ def show_success_message_argilla() -> gr.Markdown:
     )
-def show_success_message_hub(org_name, repo_name) -> gr.Markdown:
-    return gr.Markdown(
-        value=f"""
-        <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
-            <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
-            <p style="margin-top: 0.5em;">
-                The generated dataset is in the right format for fine-tuning with TRL, AutoTrain or other frameworks.
-                Your dataset is now available at:
-                <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
-                    https://huggingface.co/datasets/{org_name}/{repo_name}
-                </a>
-            </p>
-        </div>
-        """,
-        visible=True,
-    )
 def hide_success_message() -> gr.Markdown:
-    return gr.Markdown(visible=False)

 def validate_argilla_user_workspace_dataset(
     dataset_name: str,
+    add_to_existing_dataset: bool = True,
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> str:
     dataset = client.datasets(name=dataset_name, workspace=hf_user)
     if dataset and not add_to_existing_dataset:
         raise gr.Error(f"Dataset {dataset_name} already exists")
+    return ""
 def get_org_dropdown(oauth_token: OAuthToken = None):
 def get_pipeline_code_ui(pipeline_code: str) -> gr.Code:
+    gr.Markdown("## Customize and run locally with distilabel")
+    gr.HTML("<hr>")
     gr.Markdown(
         "You can run this pipeline locally with distilabel. For more information, please refer to the [distilabel documentation](https://distilabel.argilla.io/) or go to the FAQ tab at the top of the page for more information."
     )
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
 ):
+    repo_id = validate_push_to_hub(org_name, repo_name)
     progress(0.1, desc="Uploading pipeline code")
     with io.BytesIO(pipeline_code.encode("utf-8")) as f:
         upload_file(
     task: str = TEXTCAT_TASK,
 ) -> pd.DataFrame:
     progress(0.1, desc="Setting up dataset")
+    repo_id = validate_push_to_hub(org_name, repo_name)
     if task == TEXTCAT_TASK:
         if num_labels == 1:
     return dataframe
+def validate_push_to_hub(org_name, repo_name):
     repo_id = (
         f"{org_name}/{repo_name}"
         if repo_name is not None and org_name is not None
     return success_message
+def show_success_message_hub(org_name, repo_name) -> gr.Markdown:
     client = get_argilla_client()
     argilla_api_url = client.api_url
     return gr.Markdown(
         <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
             <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
             <p style="margin-top: 0.5em;">
+                Your dataset is now available the Hugging Face Hub:
+                <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                    https://huggingface.co/datasets/{org_name}/{repo_name}
+                </a>
+            </p>
+            <p style="margin-top: 0.5em;">
+                Your dataset is now available within Argilla:
                 <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
                     {argilla_api_url}
                 </a>
     )
 def hide_success_message() -> gr.Markdown:
+    return gr.Markdown(value="")

src/distilabel_dataset_generator/apps/eval.py ADDED Viewed

	@@ -0,0 +1,328 @@

+import json
+import gradio as gr
+import pandas as pd
+from datasets import load_dataset
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from src.distilabel_dataset_generator.utils import get_org_dropdown
+def get_iframe(hub_repo_id) -> str:
+    if not hub_repo_id:
+        raise gr.Error("Hub repo id is required")
+    url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
+    iframe = f"""
+    <iframe
+  src="{url}"
+  frameborder="0"
+  width="100%"
+  height="600px"
+></iframe>
+"""
+    return iframe
+def get_valid_columns(df: pd.DataFrame):
+    valid_columns = []
+    for col in df.columns:
+        sample_val = df[col].iloc[0]
+        if isinstance(sample_val, str) or (
+            isinstance(sample_val, list)
+            and all(isinstance(item, dict) for item in sample_val)
+        ):
+            valid_columns.append(col)
+    return valid_columns
+def load_dataset_from_hub(hub_repo_id: str, n_rows: int = 10):
+    gr.Info(message="Loading dataset ...")
+    if not hub_repo_id:
+        raise gr.Error("Hub repo id is required")
+    ds_dict = load_dataset(hub_repo_id)
+    splits = list(ds_dict.keys())
+    ds = ds_dict[splits[0]]
+    if n_rows:
+        ds = ds.select(range(n_rows))
+    df = ds.to_pandas()
+    # Get columns that contain either strings or lists of dictionaries
+    valid_columns = get_valid_columns(df)
+    return (
+        df,
+        gr.Dropdown(choices=valid_columns, label="Instruction Column"),
+        gr.Dropdown(choices=valid_columns, label="Instruction Column"),
+        gr.Dropdown(choices=valid_columns, label="Response Column"),
+    )
+def define_evaluation_aspects(task_type: str):
+    if task_type == "instruction":
+        return gr.Dropdown(
+            value=["overall-rating"],
+            choices=["complexity", "quality"],
+            label="Evaluation Aspects",
+            multiselect=True,
+            interactive=True,
+        )
+    elif task_type == "instruction-response":
+        return gr.Dropdown(
+            value=["overall-rating"],
+            choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
+            label="Evaluation Aspects",
+            multiselect=True,
+            interactive=True,
+        )
+    else:
+        return gr.Dropdown(interactive=False)
+def evaluate_instruction(df: pd.DataFrame, aspects: list[str], instruction_column: str):
+    """Placeholder for instruction-only evaluation; not implemented, returns None."""
+    pass
+def evaluate_instruction_response(
+    df: pd.DataFrame, aspects: list[str], instruction_column: str, response_column: str
+):
+    """Placeholder for instruction/response evaluation; not implemented, returns None."""
+    pass
+def evaluate_custom(
+    df: pd.DataFrame, aspects: list[str], prompt_template: str, structured_output: dict
+):
+    """Placeholder for custom-prompt evaluation; not implemented, returns None."""
+    pass
+def _apply_to_dataset(
+    df: pd.DataFrame,
+    eval_type: str,
+    aspects_instruction: list[str],
+    instruction_column: str,
+    aspects_instruction_response: list[str],
+    instruction_column_response: str,
+    response_column_response: str,
+    aspects_custom: list[str],
+    prompt_template: str,
+    structured_output: dict,
+):
+    if eval_type == "instruction":
+        df = evaluate_instruction(df, aspects_instruction, instruction_column)
+    elif eval_type == "instruction-response":
+        df = evaluate_instruction_response(
+            df,
+            aspects_instruction_response,
+            instruction_column_response,
+            response_column_response,
+        )
+    elif eval_type == "custom":
+        df = evaluate_custom(df, aspects_custom, prompt_template, structured_output)
+    return df
+def apply_to_sample_dataset(
+    repo_id: str,
+    eval_type: str,
+    aspects_instruction: list[str],
+    aspects_instruction_response: list[str],
+    aspects_custom: list[str],
+    instruction_instruction: str,
+    instruction_instruction_response: str,
+    response_instruction_response: str,
+    prompt_template: str,
+    structured_output: dict,
+):
+    df, _, _, _ = load_dataset_from_hub(repo_id, n_rows=10)
+    df = _apply_to_dataset(
+        df,
+        eval_type,
+        aspects_instruction,
+        instruction_instruction,
+        aspects_instruction_response,
+        instruction_instruction_response,
+        response_instruction_response,
+        aspects_custom,
+        prompt_template,
+        structured_output,
+    )
+    return df
+def push_to_hub(
+    org_name: str,
+    repo_name: str,
+    private: bool,
+    n_rows: int,
+    original_repo_id: str,
+    eval_type: str,
+    aspects_instruction: list[str],
+    aspects_instruction_response: list[str],
+    aspects_custom: list[str],
+    instruction_instruction: str,
+    instruction_instruction_response: str,
+    response_instruction_response: str,
+    prompt_template: str,
+    structured_output: dict,
+):
+    """Load `n_rows` rows of `original_repo_id` and run the selected evaluation on them.
+    NOTE(review): nothing is uploaded yet -- the function only prints the resulting
+    dataframe and implicitly returns None. `new_repo_id` is built but unused and
+    `private` is never read. Presumably a work in progress; confirm before relying on it.
+    """
+    df, _, _, _ = load_dataset_from_hub(original_repo_id, n_rows=n_rows)
+    df = _apply_to_dataset(
+        df,
+        eval_type,
+        aspects_instruction,
+        instruction_instruction,
+        aspects_instruction_response,
+        instruction_instruction_response,
+        response_instruction_response,
+        aspects_custom,
+        prompt_template,
+        structured_output,
+    )
+    new_repo_id = f"{org_name}/{repo_name}"
+    print(df)
+# Evaluation app: three stacked sections (select dataset, configure task, generate),
+# followed by the event wiring for the buttons defined in them.
+with gr.Blocks() as app:
+    # Section 1: dataset search box on the left, HTML preview on the right.
+    gr.Markdown("## Select your input dataset")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            # NOTE(review): "sumbit_on_select" looks like a typo but appears to be the
+            # component's actual keyword -- confirm before renaming.
+            search_in = HuggingfaceHubSearch(
+                label="Search",
+                placeholder="Search for a Dataset",
+                search_type="dataset",
+                sumbit_on_select=True,
+            )
+            load_btn = gr.Button("Load Dataset")
+        with gr.Column(scale=3):
+            search_out = gr.HTML(label="Dataset Preview")
+    # Section 2: one tab per evaluation type on the left, result dataframe on the right.
+    gr.Markdown("## Configure your task")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Hidden dropdown holding the active evaluation type; each tab's select
+            # handler below writes its own name into it.
+            eval_type = gr.Dropdown(
+                label="Evaluation Type",
+                choices=["instruction", "instruction-response", "custom"],
+                visible=False,
+            )
+            with gr.Tab("instruction") as tab_instruction:
+                aspects_instruction = define_evaluation_aspects("instruction")
+                instruction_instruction = gr.Dropdown(
+                    label="Instruction Column", interactive=True
+                )
+                tab_instruction.select(
+                    lambda: "instruction",
+                    inputs=[],
+                    outputs=[eval_type],
+                )
+            with gr.Tab("instruction-response") as tab_instruction_response:
+                aspects_instruction_response = define_evaluation_aspects(
+                    "instruction-response"
+                )
+                instruction_instruction_response = gr.Dropdown(
+                    label="Instruction Column", interactive=True
+                )
+                response_instruction_response = gr.Dropdown(
+                    label="Response Column", interactive=True
+                )
+                tab_instruction_response.select(
+                    lambda: "instruction-response",
+                    inputs=[],
+                    outputs=[eval_type],
+                )
+            with gr.Tab("custom") as tab_custom:
+                # "custom" matches no branch of define_evaluation_aspects, so this is
+                # the bare non-interactive dropdown from its fallback.
+                aspects_custom = define_evaluation_aspects("custom")
+                prompt_template = gr.Code(
+                    label="Prompt Template",
+                    value="{{column_1}} based on {{column_2}}",
+                    language="markdown",
+                    interactive=True,
+                )
+                structured_output = gr.Code(
+                    label="Structured Output",
+                    value=json.dumps({"eval_aspect": "str"}),
+                    language="json",
+                    interactive=True,
+                )
+                tab_custom.select(
+                    lambda: "custom",
+                    inputs=[],
+                    outputs=[eval_type],
+                )
+            btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
+        with gr.Column(scale=3):
+            dataframe = gr.Dataframe()
+    # Section 3: target repository settings and the push button.
+    gr.Markdown("## Generate your dataset")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            org_name = get_org_dropdown()
+            repo_name = gr.Textbox(
+                label="Repo name",
+                placeholder="dataset_name",
+                value="my-distiset",
+                interactive=True,
+            )
+            n_rows = gr.Number(
+                label="Number of rows",
+                value=10,
+                interactive=True,
+                scale=1,
+            )
+            private = gr.Checkbox(
+                label="Private dataset",
+                value=False,
+                interactive=True,
+                scale=1,
+            )
+            btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
+        with gr.Column(scale=3):
+            success_message = gr.Markdown(visible=False)
+    # Event wiring. Submitting the search shows the dataset viewer iframe.
+    search_in.submit(get_iframe, inputs=search_in, outputs=search_out)
+    # The outputs follow the order of load_dataset_from_hub's return tuple.
+    load_btn.click(
+        load_dataset_from_hub,
+        inputs=[search_in],
+        outputs=[
+            dataframe,
+            instruction_instruction,
+            instruction_instruction_response,
+            response_instruction_response,
+        ],
+    )
+    # The inputs follow apply_to_sample_dataset's positional parameter order.
+    btn_apply_to_sample_dataset.click(
+        apply_to_sample_dataset,
+        inputs=[
+            search_in,
+            eval_type,
+            aspects_instruction,
+            aspects_instruction_response,
+            aspects_custom,
+            instruction_instruction,
+            instruction_instruction_response,
+            response_instruction_response,
+            prompt_template,
+            structured_output,
+        ],
+        outputs=dataframe,
+    )
+    # The inputs follow push_to_hub's positional parameter order.
+    btn_push_to_hub.click(
+        push_to_hub,
+        inputs=[
+            org_name,
+            repo_name,
+            private,
+            n_rows,
+            search_in,
+            eval_type,
+            aspects_instruction,
+            aspects_instruction_response,
+            aspects_custom,
+            instruction_instruction,
+            instruction_instruction_response,
+            response_instruction_response,
+            prompt_template,
+            structured_output,
+        ],
+        outputs=success_message,
+    )
+    # Re-run get_org_dropdown when the app loads and write the result to org_name.
+    app.load(fn=get_org_dropdown, outputs=[org_name])

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import ast
 from typing import Dict, List, Union
 import argilla as rg
@@ -10,16 +11,11 @@ from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
     get_argilla_client,
-    get_main_ui,
     get_pipeline_code_ui,
     hide_success_message,
-    push_pipeline_code_to_hub,
-    show_success_message_argilla,
     show_success_message_hub,
     validate_argilla_user_workspace_dataset,
-)
-from src.distilabel_dataset_generator.apps.base import (
-    push_dataset_to_hub as push_to_hub_base,
 )
 from src.distilabel_dataset_generator.pipelines.base import (
     DEFAULT_BATCH_SIZE,
@@ -30,16 +26,15 @@ from src.distilabel_dataset_generator.pipelines.embeddings import (
 )
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_DATASET_DESCRIPTIONS,
-    DEFAULT_DATASETS,
-    DEFAULT_SYSTEM_PROMPTS,
     PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
     get_magpie_generator,
     get_prompt_generator,
     get_response_generator,
 )
-TASK = "supervised_fine_tuning"
 def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
@@ -57,33 +52,176 @@ def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
     return dataframe
-def push_dataset_to_hub(
-    dataframe: pd.DataFrame,
-    private: bool = True,
-    org_name: str = None,
-    repo_name: str = None,
-    oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
-):
     original_dataframe = dataframe.copy(deep=True)
     dataframe = convert_dataframe_messages(dataframe)
-    try:
-        push_to_hub_base(
-            dataframe, private, org_name, repo_name, oauth_token, progress, task=TASK
-        )
-    except Exception as e:
-        raise gr.Error(f"Error pushing dataset to the Hub: {e}")
     return original_dataframe
 def push_dataset_to_argilla(
-    dataframe: pd.DataFrame,
-    dataset_name: str,
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
-    original_dataframe = dataframe.copy(deep=True)
-    dataframe = convert_dataframe_messages(dataframe)
     try:
         progress(0.1, desc="Setting up user and workspace")
         client = get_argilla_client()
@@ -185,10 +323,10 @@ def push_dataset_to_argilla(
             dataframe["prompt_embeddings"] = get_embeddings(dataframe["prompt"])
         progress(0.5, desc="Creating dataset")
-        rg_dataset = client.datasets(name=dataset_name, workspace=hf_user)
         if rg_dataset is None:
             rg_dataset = rg.Dataset(
-                name=dataset_name,
                 workspace=hf_user,
                 settings=settings,
                 client=client,
@@ -200,309 +338,123 @@ def push_dataset_to_argilla(
         progress(1.0, desc="Dataset pushed to Argilla")
     except Exception as e:
         raise gr.Error(f"Error pushing dataset to Argilla: {e}")
-    return original_dataframe
-def generate_system_prompt(dataset_description, progress=gr.Progress()):
-    progress(0.0, desc="Generating system prompt")
-    if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
-        index = DEFAULT_DATASET_DESCRIPTIONS.index(dataset_description)
-        if index < len(DEFAULT_SYSTEM_PROMPTS):
-            return DEFAULT_SYSTEM_PROMPTS[index]
-    progress(0.3, desc="Initializing text generation")
-    generate_description = get_prompt_generator()
-    progress(0.7, desc="Generating system prompt")
-    result = next(
-        generate_description.process(
-            [
-                {
-                    "system_prompt": PROMPT_CREATION_PROMPT,
-                    "instruction": dataset_description,
-                }
-            ]
-        )
-    )[0]["generation"]
-    progress(1.0, desc="System prompt generated")
-    return result
-def generate_dataset(
-    system_prompt: str,
-    num_turns: int = 1,
-    num_rows: int = 5,
-    is_sample: bool = False,
-    progress=gr.Progress(),
-) -> pd.DataFrame:
-    progress(0.0, desc="(1/2) Generating instructions")
-    magpie_generator = get_magpie_generator(
-        num_turns, num_rows, system_prompt, is_sample
-    )
-    response_generator = get_response_generator(num_turns, system_prompt, is_sample)
-    total_steps: int = num_rows * 2
-    batch_size = DEFAULT_BATCH_SIZE
-    # create instructions
-    n_processed = 0
-    magpie_results = []
-    while n_processed < num_rows:
-        progress(
-            0.5 * n_processed / num_rows,
-            total=total_steps,
-            desc="(1/2) Generating instructions",
-        )
-        remaining_rows = num_rows - n_processed
-        batch_size = min(batch_size, remaining_rows)
-        inputs = [{"system_prompt": system_prompt} for _ in range(batch_size)]
-        batch = list(magpie_generator.process(inputs=inputs))
-        magpie_results.extend(batch[0])
-        n_processed += batch_size
-    progress(0.5, desc="(1/2) Generating instructions")
-    # generate responses
-    n_processed = 0
-    response_results = []
-    if num_turns == 1:
-        while n_processed < num_rows:
-            progress(
-                0.5 + 0.5 * n_processed / num_rows,
-                total=total_steps,
-                desc="(2/2) Generating responses",
-            )
-            batch = magpie_results[n_processed : n_processed + batch_size]
-            responses = list(response_generator.process(inputs=batch))
-            response_results.extend(responses[0])
-            n_processed += batch_size
-        for result in response_results:
-            result["prompt"] = result["instruction"]
-            result["completion"] = result["generation"]
-            result["system_prompt"] = system_prompt
-    else:
-        for result in magpie_results:
-            result["conversation"].insert(
-                0, {"role": "system", "content": system_prompt}
             )
-            result["messages"] = result["conversation"]
-        while n_processed < num_rows:
-            progress(
-                0.5 + 0.5 * n_processed / num_rows,
-                total=total_steps,
-                desc="(2/2) Generating responses",
             )
-            batch = magpie_results[n_processed : n_processed + batch_size]
-            responses = list(response_generator.process(inputs=batch))
-            response_results.extend(responses[0])
-            n_processed += batch_size
-        for result in response_results:
-            result["messages"].append(
-                {"role": "assistant", "content": result["generation"]}
             )
-    progress(
-        1,
-        total=total_steps,
-        desc="(2/2) Creating dataset",
-    )
-    # create distiset
-    distiset_results = []
-    for result in response_results:
-        record = {}
-        for relevant_keys in [
-            "messages",
-            "prompt",
-            "completion",
-            "model_name",
-            "system_prompt",
-        ]:
-            if relevant_keys in result:
-                record[relevant_keys] = result[relevant_keys]
-        distiset_results.append(record)
-    distiset = Distiset(
-        {
-            "default": Dataset.from_list(distiset_results),
-        }
-    )
-    # If not pushing to hub generate the dataset directly
-    distiset = distiset["default"]
-    if num_turns == 1:
-        outputs = distiset.to_pandas()[["system_prompt", "prompt", "completion"]]
-    else:
-        outputs = distiset.to_pandas()[["messages"]]
-    dataframe = pd.DataFrame(outputs)
-    progress(1.0, desc="Dataset generation completed")
-    return dataframe
-(
-    app,
-    main_ui,
-    custom_input_ui,
-    dataset_description,
-    examples,
-    btn_generate_system_prompt,
-    system_prompt,
-    sample_dataset,
-    btn_generate_sample_dataset,
-    dataset_name,
-    add_to_existing_dataset,
-    btn_generate_full_dataset_argilla,
-    btn_generate_and_push_to_argilla,
-    btn_push_to_argilla,
-    org_name,
-    repo_name,
-    private,
-    btn_generate_full_dataset,
-    btn_generate_and_push_to_hub,
-    btn_push_to_hub,
-    final_dataset,
-    success_message,
-) = get_main_ui(
-    default_dataset_descriptions=DEFAULT_DATASET_DESCRIPTIONS,
-    default_system_prompts=DEFAULT_SYSTEM_PROMPTS,
-    default_datasets=DEFAULT_DATASETS,
-    fn_generate_system_prompt=generate_system_prompt,
-    fn_generate_dataset=generate_dataset,
-    task=TASK,
-)
-with app:
-    with main_ui:
-        with custom_input_ui:
             num_turns = gr.Number(
                 value=1,
                 label="Number of turns in the conversation",
                 minimum=1,
                 maximum=4,
                 step=1,
                 info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
             )
-            num_rows = gr.Number(
                 value=10,
-                label="Number of rows in the dataset",
-                minimum=1,
-                maximum=500,
-                info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
             )
-        pipeline_code = get_pipeline_code_ui(
-            generate_pipeline_code(system_prompt.value, num_turns.value, num_rows.value)
-        )
-    # define app triggers
     gr.on(
-        triggers=[
-            btn_generate_full_dataset.click,
-            btn_generate_full_dataset_argilla.click,
-        ],
-        fn=hide_success_message,
-        outputs=[success_message],
     ).then(
-        fn=generate_dataset,
-        inputs=[system_prompt, num_turns, num_rows],
-        outputs=[final_dataset],
         show_progress=True,
     )
-    btn_generate_and_push_to_argilla.click(
         fn=validate_argilla_user_workspace_dataset,
-        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).success(
-        fn=hide_success_message,
         outputs=[success_message],
-    ).success(
-        fn=generate_dataset,
-        inputs=[system_prompt, num_turns, num_rows],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).success(
-        fn=push_dataset_to_argilla,
-        inputs=[final_dataset, dataset_name],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).success(
-        fn=show_success_message_argilla,
-        inputs=[],
-        outputs=[success_message],
-    )
-    btn_generate_and_push_to_hub.click(
-        fn=hide_success_message,
-        outputs=[success_message],
-    ).then(
-        fn=generate_dataset,
-        inputs=[system_prompt, num_turns, num_rows],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).then(
-        fn=push_dataset_to_hub,
-        inputs=[final_dataset, private, org_name, repo_name],
-        outputs=[final_dataset],
         show_progress=True,
     ).then(
-        fn=push_pipeline_code_to_hub,
-        inputs=[pipeline_code, org_name, repo_name],
-        outputs=[],
-        show_progress=True,
-    ).success(
-        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
-    )
-    btn_push_to_hub.click(
-        fn=hide_success_message,
-        outputs=[success_message],
-    ).then(
-        fn=push_dataset_to_hub,
-        inputs=[final_dataset, private, org_name, repo_name],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).then(
-        fn=push_pipeline_code_to_hub,
-        inputs=[pipeline_code, org_name, repo_name],
-        outputs=[],
         show_progress=True,
     ).success(
-        fn=show_success_message_hub,
-        inputs=[org_name, repo_name],
-        outputs=[success_message],
-    )
-    btn_push_to_argilla.click(
         fn=hide_success_message,
         outputs=[success_message],
-    ).success(
-        fn=validate_argilla_user_workspace_dataset,
-        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
-        outputs=[final_dataset],
         show_progress=True,
     ).success(
         fn=push_dataset_to_argilla,
-        inputs=[final_dataset, dataset_name],
-        outputs=[final_dataset],
         show_progress=True,
     ).success(
-        fn=show_success_message_argilla,
-        inputs=[],
         outputs=[success_message],
     )
-    system_prompt.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, num_turns, num_rows],
-        outputs=[pipeline_code],
-    )
-    num_turns.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, num_turns, num_rows],
-        outputs=[pipeline_code],
-    )
-    num_rows.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, num_turns, num_rows],
-        outputs=[pipeline_code],
-    )

 import ast
+import uuid
 from typing import Dict, List, Union
 import argilla as rg
 from src.distilabel_dataset_generator.apps.base import (
     get_argilla_client,
     get_pipeline_code_ui,
     hide_success_message,
     show_success_message_hub,
     validate_argilla_user_workspace_dataset,
+    validate_push_to_hub,
 )
 from src.distilabel_dataset_generator.pipelines.base import (
     DEFAULT_BATCH_SIZE,
 )
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_DATASET_DESCRIPTIONS,
     PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
     get_magpie_generator,
     get_prompt_generator,
     get_response_generator,
 )
+from src.distilabel_dataset_generator.utils import (
+    get_org_dropdown,
+)
 def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
     return dataframe
+def generate_system_prompt(dataset_description, progress=gr.Progress()):
+    """Turn a free-text dataset description into a system prompt.
+
+    The description is sent as the instruction to the prompt generator,
+    with PROMPT_CREATION_PROMPT as its system prompt.
+
+    Returns:
+        A 2-tuple: the generated system prompt text and an empty DataFrame.
+    """
+    progress(0.0, desc="Generating system prompt")
+    progress(0.3, desc="Initializing text generation")
+    prompt_generator = get_prompt_generator()
+    progress(0.7, desc="Generating system prompt")
+    request = {
+        "system_prompt": PROMPT_CREATION_PROMPT,
+        "instruction": dataset_description,
+    }
+    # process() yields batches; only the first item of the first batch is used.
+    first_batch = next(prompt_generator.process([request]))
+    generated_prompt = first_batch[0]["generation"]
+    progress(1.0, desc="System prompt generated")
+    return generated_prompt, pd.DataFrame()
+def generate_sample_dataset(system_prompt, progress=gr.Progress()):
+    """Generate a fixed-size preview dataset for the given system prompt.
+
+    Delegates to generate_dataset with a single turn, 10 rows and
+    is_sample=True, forwarding the caller's progress tracker.
+    """
+    return generate_dataset(
+        system_prompt=system_prompt,
+        num_turns=1,
+        num_rows=10,
+        is_sample=True,
+        progress=progress,
+    )
+def generate_dataset(
+    system_prompt: str,
+    num_turns: int = 1,
+    num_rows: int = 10,
+    is_sample: bool = False,
     progress=gr.Progress(),
+) -> pd.DataFrame:
+    """Generate an SFT dataset in two phases: instructions, then responses.
+
+    Args:
+        system_prompt: System prompt used for every generated row.
+        num_turns: 1 produces "prompt"/"completion"/"system_prompt" columns;
+            any other value produces a single "messages" column.
+        num_rows: Number of rows to request from the generators.
+        is_sample: Forwarded unchanged to both generator factories.
+        progress: Progress tracker updated throughout generation.
+
+    Returns:
+        A DataFrame holding the generated rows.
+    """
+    # Values coming from gr.Number widgets may arrive as floats; range() and
+    # list slicing below require real ints.
+    num_turns = int(num_turns)
+    num_rows = int(num_rows)
+    progress(0.0, desc="(1/2) Generating instructions")
+    magpie_generator = get_magpie_generator(
+        num_turns, num_rows, system_prompt, is_sample
+    )
+    response_generator = get_response_generator(num_turns, system_prompt, is_sample)
+    total_steps: int = num_rows * 2
+    batch_size = DEFAULT_BATCH_SIZE
+    # create instructions
+    n_processed = 0
+    magpie_results = []
+    while n_processed < num_rows:
+        progress(
+            0.5 * n_processed / num_rows,
+            total=total_steps,
+            desc="(1/2) Generating instructions",
+        )
+        # Shrink only this batch for the final remainder; batch_size itself
+        # stays intact so the response phase keeps using full-size batches.
+        current_batch_size = min(batch_size, num_rows - n_processed)
+        inputs = [
+            {"system_prompt": system_prompt} for _ in range(current_batch_size)
+        ]
+        batch = list(magpie_generator.process(inputs=inputs))
+        magpie_results.extend(batch[0])
+        n_processed += current_batch_size
+    progress(0.5, desc="(1/2) Generating instructions")
+    # generate responses
+    if num_turns != 1:
+        # Multi-turn: prepend the system message and expose the conversation
+        # under "messages" before the responses are generated.
+        for result in magpie_results:
+            result["conversation"].insert(
+                0, {"role": "system", "content": system_prompt}
+            )
+            result["messages"] = result["conversation"]
+    n_processed = 0
+    response_results = []
+    while n_processed < num_rows:
+        progress(
+            0.5 + 0.5 * n_processed / num_rows,
+            total=total_steps,
+            desc="(2/2) Generating responses",
+        )
+        batch = magpie_results[n_processed : n_processed + batch_size]
+        responses = list(response_generator.process(inputs=batch))
+        response_results.extend(responses[0])
+        n_processed += batch_size
+    if num_turns == 1:
+        for result in response_results:
+            result["prompt"] = result["instruction"]
+            result["completion"] = result["generation"]
+            result["system_prompt"] = system_prompt
+    else:
+        for result in response_results:
+            result["messages"].append(
+                {"role": "assistant", "content": result["generation"]}
+            )
+    progress(
+        1,
+        total=total_steps,
+        desc="(2/2) Creating dataset",
+    )
+    # create distiset
+    relevant_keys = ("messages", "prompt", "completion", "model_name", "system_prompt")
+    distiset_results = [
+        {key: result[key] for key in relevant_keys if key in result}
+        for result in response_results
+    ]
+    distiset = Distiset(
+        {
+            "default": Dataset.from_list(distiset_results),
+        }
+    )
+    # If not pushing to hub generate the dataset directly
+    distiset = distiset["default"]
+    if num_turns == 1:
+        outputs = distiset.to_pandas()[["prompt", "completion", "system_prompt"]]
+    else:
+        outputs = distiset.to_pandas()[["messages"]]
+    dataframe = pd.DataFrame(outputs)
+    progress(1.0, desc="Dataset generation completed")
+    return dataframe
+def push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private):
+    """Push a generated dataframe to the Hugging Face Hub as a Distiset.
+
+    Args:
+        dataframe: Generated rows to upload.
+        org_name: Passed with repo_name to validate_push_to_hub, whose
+            return value is used as the repo id.
+        repo_name: See org_name.
+        oauth_token: Token object whose ``.token`` authenticates the upload.
+        private: Whether the Hub repository is created as private.
+
+    Returns:
+        A deep copy of the dataframe taken before message conversion.
+
+    Raises:
+        gr.Error: If no OAuth token is available.
+    """
+    repo_id = validate_push_to_hub(org_name, repo_name)
+    # The caller's oauth_token defaults to None; surface that as a readable UI
+    # error instead of an AttributeError on ``oauth_token.token`` below.
+    if oauth_token is None:
+        raise gr.Error("Please sign in with Hugging Face to push the dataset to the Hub.")
     original_dataframe = dataframe.copy(deep=True)
     dataframe = convert_dataframe_messages(dataframe)
+    distiset = Distiset({"default": Dataset.from_pandas(dataframe)})
+    distiset.push_to_hub(
+        repo_id=repo_id,
+        private=private,
+        include_script=False,
+        token=oauth_token.token,
+        create_pr=False,
+    )
     return original_dataframe
 def push_dataset_to_argilla(
+    org_name: str,
+    repo_name: str,
+    system_prompt: str,
+    num_turns: int = 1,
+    n_rows: int = 10,
+    private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
+    dataframe = generate_dataset(
+        system_prompt=system_prompt,
+        num_turns=num_turns,
+        num_rows=n_rows,
+    )
+    push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
     try:
         progress(0.1, desc="Setting up user and workspace")
         client = get_argilla_client()
             dataframe["prompt_embeddings"] = get_embeddings(dataframe["prompt"])
         progress(0.5, desc="Creating dataset")
+        rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
         if rg_dataset is None:
             rg_dataset = rg.Dataset(
+                name=repo_name,
                 workspace=hf_user,
                 settings=settings,
                 client=client,
         progress(1.0, desc="Dataset pushed to Argilla")
     except Exception as e:
         raise gr.Error(f"Error pushing dataset to Argilla: {e}")
+    return ""
+# UI layout and event wiring for this tab, in three sections:
+# describe the dataset, configure the task, generate and push.
+with gr.Blocks() as app:
+    gr.Markdown("## Describe the dataset you want")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            dataset_description = gr.Textbox(
+                label="Dataset description",
+                placeholder="Give a precise description of your desired dataset.",
             )
+            examples = gr.Examples(
+                examples=DEFAULT_DATASET_DESCRIPTIONS,
+                inputs=[dataset_description],
+                cache_examples=False,
+                label="Example descriptions",
             )
+            # Hidden textbox: written by generate_system_prompt and read by
+            # the sample-generation and push handlers wired below.
+            system_prompt = gr.Textbox(
+                label="System prompt",
+                placeholder="You are a helpful assistant.",
+                visible=False,
             )
+            load_btn = gr.Button("Load Dataset")
+        # Empty column with scale=3; it holds no components.
+        with gr.Column(scale=3):
+            pass
+    gr.Markdown("## Configure your task")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
             num_turns = gr.Number(
                 value=1,
                 label="Number of turns in the conversation",
                 minimum=1,
                 maximum=4,
                 step=1,
+                interactive=True,
                 info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
             )
+            btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
+        with gr.Column(scale=3):
+            dataframe = gr.Dataframe()
+    gr.Markdown("## Generate your dataset")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            org_name = get_org_dropdown()
+            # The default repo name gets a random 8-character suffix from uuid4.
+            repo_name = gr.Textbox(
+                label="Repo name",
+                placeholder="dataset_name",
+                value=f"my-distiset-{str(uuid.uuid4())[:8]}",
+                interactive=True,
+            )
+            # NOTE(review): no precision/minimum is set here, so the handler may
+            # receive a float or a non-positive value; confirm against gr.Number.
+            n_rows = gr.Number(
+                label="Number of rows",
                 value=10,
+                interactive=True,
+                scale=1,
             )
+            private = gr.Checkbox(
+                label="Private dataset",
+                value=False,
+                interactive=True,
+                scale=1,
+            )
+            btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
+        with gr.Column(scale=3):
+            success_message = gr.Markdown()
+    # Rendered once from the components' initial values; nothing in this
+    # block updates it when those values change.
+    pipeline_code = get_pipeline_code_ui(
+        generate_pipeline_code(system_prompt.value, num_turns.value, n_rows.value)
+    )
+    # Both buttons run the same chain: generate a system prompt from the
+    # description (its second output, an empty DataFrame, goes to the
+    # dataframe component), then generate a sample dataset from that prompt.
     gr.on(
+        triggers=[load_btn.click, btn_apply_to_sample_dataset.click],
+        fn=generate_system_prompt,
+        inputs=[dataset_description],
+        outputs=[system_prompt, dataframe],
+        show_progress=True,
     ).then(
+        fn=generate_sample_dataset,
+        inputs=[system_prompt],
+        outputs=[dataframe],
         show_progress=True,
     )
+    # Push chain: validate the Argilla dataset name, validate the Hub target,
+    # hide the previous message, run push_dataset_to_argilla (which generates
+    # the dataset and pushes it to the Hub before Argilla), then show the
+    # success message.
+    # NOTE(review): the second step is chained with .then rather than
+    # .success, so it presumably runs even when the first validation fails;
+    # confirm that is intended.
+    # NOTE(review): oauth_token and progress are not listed in the inputs of
+    # push_dataset_to_argilla; presumably Gradio supplies them from the
+    # function signature. Verify.
+    btn_push_to_hub.click(
         fn=validate_argilla_user_workspace_dataset,
+        inputs=[repo_name],
         outputs=[success_message],
         show_progress=True,
     ).then(
+        fn=validate_push_to_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
         show_progress=True,
     ).success(
         fn=hide_success_message,
         outputs=[success_message],
         show_progress=True,
     ).success(
         fn=push_dataset_to_argilla,
+        inputs=[
+            org_name,
+            repo_name,
+            system_prompt,
+            num_turns,
+            n_rows,
+            private,
+        ],
+        outputs=[success_message],
         show_progress=True,
     ).success(
+        fn=show_success_message_hub,
+        inputs=[org_name, repo_name],
         outputs=[success_message],
     )
+    # On app load, call get_org_dropdown again and send its result to org_name.
+    app.load(fn=get_org_dropdown, outputs=[org_name])

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -1,24 +1,21 @@
 import re
 from typing import List, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
-from datasets import Dataset
 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
     get_argilla_client,
-    get_main_ui,
     get_pipeline_code_ui,
     hide_success_message,
-    push_pipeline_code_to_hub,
-    show_success_message_argilla,
     show_success_message_hub,
     validate_argilla_user_workspace_dataset,
-)
-from src.distilabel_dataset_generator.apps.base import (
-    push_dataset_to_hub as push_to_hub_base,
 )
 from src.distilabel_dataset_generator.pipelines.base import (
     DEFAULT_BATCH_SIZE,
@@ -29,166 +26,24 @@ from src.distilabel_dataset_generator.pipelines.embeddings import (
 )
 from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_DATASET_DESCRIPTIONS,
-    DEFAULT_DATASETS,
-    DEFAULT_SYSTEM_PROMPTS,
     PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
     get_labeller_generator,
     get_prompt_generator,
     get_textcat_generator,
 )
-from src.distilabel_dataset_generator.utils import get_preprocess_labels
-TASK = "text_classification"
-def push_dataset_to_hub(
-    dataframe: pd.DataFrame,
-    private: bool = True,
-    org_name: str = None,
-    repo_name: str = None,
-    oauth_token: Union[gr.OAuthToken, None] = None,
-    progress=gr.Progress(),
-    labels: List[str] = None,
-    num_labels: int = 1,
-):
-    original_dataframe = dataframe.copy(deep=True)
-    dataframe = dataframe[
-        (dataframe["text"].str.strip() != "") & (dataframe["text"].notna())
-    ]
-    labels = get_preprocess_labels(labels)
-    try:
-        push_to_hub_base(
-            dataframe,
-            private,
-            org_name,
-            repo_name,
-            oauth_token,
-            progress,
-            labels,
-            num_labels,
-            task=TASK,
-        )
-    except Exception as e:
-        raise gr.Error(f"Error pushing dataset to the Hub: {e}")
-    return original_dataframe
-def push_dataset_to_argilla(
-    dataframe: pd.DataFrame,
-    dataset_name: str,
-    oauth_token: Union[gr.OAuthToken, None] = None,
-    progress=gr.Progress(),
-    num_labels: int = 1,
-    labels: List[str] = None,
-) -> pd.DataFrame:
-    original_dataframe = dataframe.copy(deep=True)
-    dataframe = dataframe[
-        (dataframe["text"].str.strip() != "") & (dataframe["text"].notna())
-    ]
-    try:
-        progress(0.1, desc="Setting up user and workspace")
-        client = get_argilla_client()
-        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
-        labels = get_preprocess_labels(labels)
-        settings = rg.Settings(
-            fields=[
-                rg.TextField(
-                    name="text",
-                    description="The text classification data",
-                    title="Text",
-                ),
-            ],
-            questions=[
-                (
-                    rg.LabelQuestion(
-                        name="label",
-                        title="Label",
-                        description="The label of the text",
-                        labels=labels,
-                    )
-                    if num_labels == 1
-                    else rg.MultiLabelQuestion(
-                        name="labels",
-                        title="Labels",
-                        description="The labels of the conversation",
-                        labels=labels,
-                    )
-                ),
-            ],
-            metadata=[
-                rg.IntegerMetadataProperty(name="text_length", title="Text Length"),
-            ],
-            vectors=[
-                rg.VectorField(
-                    name="text_embeddings",
-                    dimensions=get_sentence_embedding_dimensions(),
-                )
-            ],
-            guidelines="Please review the text and provide or correct the label where needed.",
-        )
-        dataframe["text_length"] = dataframe["text"].apply(len)
-        dataframe["text_embeddings"] = get_embeddings(dataframe["text"])
-        progress(0.5, desc="Creating dataset")
-        rg_dataset = client.datasets(name=dataset_name, workspace=hf_user)
-        if rg_dataset is None:
-            rg_dataset = rg.Dataset(
-                name=dataset_name,
-                workspace=hf_user,
-                settings=settings,
-                client=client,
-            )
-            rg_dataset = rg_dataset.create()
-        progress(0.7, desc="Pushing dataset to Argilla")
-        hf_dataset = Dataset.from_pandas(dataframe)
-        records = [
-            rg.Record(
-                fields={
-                    "text": sample["text"],
-                },
-                metadata={"text_length": sample["text_length"]},
-                vectors={"text_embeddings": sample["text_embeddings"]},
-                suggestions=(
-                    [
-                        rg.Suggestion(
-                            question_name="label" if num_labels == 1 else "labels",
-                            value=(
-                                sample["label"] if num_labels == 1 else sample["labels"]
-                            ),
-                        )
-                    ]
-                    if (
-                        (num_labels == 1 and sample["label"] in labels)
-                        or (
-                            num_labels > 1
-                            and all(label in labels for label in sample["labels"])
-                        )
-                    )
-                    else []
-                ),
-            )
-            for sample in hf_dataset
-        ]
-        rg_dataset.records.log(records=records)
-        progress(1.0, desc="Dataset pushed to Argilla")
-    except Exception as e:
-        raise gr.Error(f"Error pushing dataset to Argilla: {e}")
-    return original_dataframe
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating text classification task")
-    if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
-        index = DEFAULT_DATASET_DESCRIPTIONS.index(dataset_description)
-        if index < len(DEFAULT_SYSTEM_PROMPTS):
-            return DEFAULT_SYSTEM_PROMPTS[index]
     progress(0.3, desc="Initializing text generation")
     generate_description = get_prompt_generator()
     progress(0.7, desc="Generating text classification task")
-    result = next(
         generate_description.process(
             [
                 {
@@ -199,7 +54,25 @@ def generate_system_prompt(dataset_description, progress=gr.Progress()):
         )
     )[0]["generation"]
     progress(1.0, desc="Text classification task generated")
-    return result
 def generate_dataset(
@@ -212,6 +85,10 @@ def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating text classification data")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
@@ -230,7 +107,7 @@ def generate_dataset(
     textcat_results = []
     while n_processed < num_rows:
         progress(
-            0.5 * n_processed / num_rows,
             total=total_steps,
             desc="(1/2) Generating text classification data",
         )
@@ -244,7 +121,7 @@ def generate_dataset(
         result["text"] = result["input_text"]
     # label text classification data
-    progress(0.5, desc="(1/2) Generating text classification data")
     if not is_sample:
         n_processed = 0
         labeller_results = []
@@ -300,6 +177,158 @@ def generate_dataset(
     return dataframe
 def update_suggested_labels(system_prompt):
     new_labels = re.findall(r"'(\b[\w-]+\b)'", system_prompt)
     if not new_labels:
@@ -321,41 +350,34 @@ def update_max_num_labels(labels):
     return gr.update(maximum=len(labels) if labels else 1)
-(
-    app,
-    main_ui,
-    custom_input_ui,
-    dataset_description,
-    examples,
-    btn_generate_system_prompt,
-    system_prompt,
-    sample_dataset,
-    btn_generate_sample_dataset,
-    dataset_name,
-    add_to_existing_dataset,
-    btn_generate_full_dataset_argilla,
-    btn_generate_and_push_to_argilla,
-    btn_push_to_argilla,
-    org_name,
-    repo_name,
-    private,
-    btn_generate_full_dataset,
-    btn_generate_and_push_to_hub,
-    btn_push_to_hub,
-    final_dataset,
-    success_message,
-) = get_main_ui(
-    default_dataset_descriptions=DEFAULT_DATASET_DESCRIPTIONS,
-    default_system_prompts=DEFAULT_SYSTEM_PROMPTS,
-    default_datasets=DEFAULT_DATASETS,
-    fn_generate_system_prompt=generate_system_prompt,
-    fn_generate_dataset=generate_dataset,
-    task=TASK,
-)
-with app:
-    with main_ui:
-        with custom_input_ui:
             difficulty = gr.Dropdown(
                 choices=[
                     ("High School", "high school"),
@@ -366,6 +388,7 @@ with app:
                 value="mixed",
                 label="Difficulty",
                 info="Select the comprehension level for the text. Ensure it matches the task context.",
             )
             clarity = gr.Dropdown(
                 choices=[
@@ -380,51 +403,78 @@ with app:
                 value="mixed",
                 label="Clarity",
                 info="Set how easily the correct label or labels can be identified.",
             )
-            with gr.Column():
-                labels = gr.Dropdown(
-                    choices=[],
-                    value=["negative", "positive"],
-                    allow_custom_value=True,
-                    interactive=True,
-                    label="Labels",
-                    multiselect=True,
-                    info="Add the labels to classify the text.",
-                )
-                with gr.Blocks():
-                    btn_suggested_labels = gr.Button(
-                        value="Add suggested labels",
-                        variant="primary",
-                        size="sm",
-                    )
             num_labels = gr.Number(
                 label="Number of labels per text",
                 value=1,
                 minimum=1,
                 maximum=10,
                 info="Select 1 for single-label and >1 for multi-label.",
             )
-            num_rows = gr.Number(
                 label="Number of rows",
                 value=10,
-                minimum=1,
-                maximum=500,
-                info="Select the number of rows in the dataset. More rows will take more time.",
             )
-        pipeline_code = get_pipeline_code_ui(
-            generate_pipeline_code(
-                system_prompt.value,
-                difficulty=difficulty.value,
-                clarity=clarity.value,
-                labels=labels.value,
-                num_labels=num_labels.value,
-                num_rows=num_rows.value,
             )
         )
-    # define app triggers
-    btn_suggested_labels.click(
         fn=update_suggested_labels,
         inputs=[system_prompt],
         outputs=labels,
@@ -434,141 +484,39 @@ with app:
         outputs=[num_labels],
     )
-    gr.on(
-        triggers=[
-            btn_generate_full_dataset.click,
-            btn_generate_full_dataset_argilla.click,
-        ],
-        fn=hide_success_message,
-        outputs=[success_message],
-    ).then(
-        fn=validate_input_labels,
-        inputs=[labels],
-        outputs=[labels],
-    ).success(
-        fn=generate_dataset,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[final_dataset],
-        show_progress=True,
-    )
-    btn_generate_and_push_to_argilla.click(
         fn=validate_argilla_user_workspace_dataset,
-        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).success(
-        fn=hide_success_message,
-        outputs=[success_message],
-    ).success(
-        fn=generate_dataset,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).success(
-        fn=push_dataset_to_argilla,
-        inputs=[final_dataset, dataset_name, num_labels, labels],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).success(
-        fn=show_success_message_argilla,
-        inputs=[],
-        outputs=[success_message],
-    )
-    btn_generate_and_push_to_hub.click(
-        fn=hide_success_message,
         outputs=[success_message],
-    ).then(
-        fn=generate_dataset,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).then(
-        fn=push_dataset_to_hub,
-        inputs=[final_dataset, private, org_name, repo_name, labels, num_labels],
-        outputs=[final_dataset],
         show_progress=True,
     ).then(
-        fn=push_pipeline_code_to_hub,
-        inputs=[pipeline_code, org_name, repo_name],
-        outputs=[],
-        show_progress=True,
-    ).success(
-        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
-    )
-    btn_push_to_hub.click(
-        fn=hide_success_message,
-        outputs=[success_message],
-    ).then(
-        fn=push_dataset_to_hub,
-        inputs=[final_dataset, private, org_name, repo_name, labels, num_labels],
-        outputs=[final_dataset],
-        show_progress=True,
-    ).then(
-        fn=push_pipeline_code_to_hub,
-        inputs=[pipeline_code, org_name, repo_name],
-        outputs=[],
         show_progress=True,
     ).success(
-        fn=show_success_message_hub,
-        inputs=[org_name, repo_name],
-        outputs=[success_message],
-    )
-    btn_push_to_argilla.click(
         fn=hide_success_message,
         outputs=[success_message],
-    ).success(
-        fn=validate_argilla_user_workspace_dataset,
-        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
-        outputs=[final_dataset],
         show_progress=True,
     ).success(
         fn=push_dataset_to_argilla,
-        inputs=[final_dataset, dataset_name, num_labels, labels],
-        outputs=[final_dataset],
         show_progress=True,
     ).success(
-        fn=show_success_message_argilla,
-        inputs=[],
         outputs=[success_message],
     )
-    system_prompt.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[pipeline_code],
-    )
-    difficulty.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[pipeline_code],
-    )
-    clarity.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[pipeline_code],
-    )
-    labels.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[pipeline_code],
-    ).then(
-        fn=update_max_num_labels,
-        inputs=[labels],
-        outputs=[num_labels],
-    )
-    num_labels.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[pipeline_code],
-    )
-    num_rows.change(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
-        outputs=[pipeline_code],
-    )

 import re
+import uuid
 from typing import List, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
+from datasets import ClassLabel, Dataset, Features, Sequence, Value
+from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
     get_argilla_client,
     get_pipeline_code_ui,
     hide_success_message,
     show_success_message_hub,
     validate_argilla_user_workspace_dataset,
+    validate_push_to_hub,
 )
 from src.distilabel_dataset_generator.pipelines.base import (
     DEFAULT_BATCH_SIZE,
 )
 from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_DATASET_DESCRIPTIONS,
     PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
     get_labeller_generator,
     get_prompt_generator,
     get_textcat_generator,
 )
+from src.distilabel_dataset_generator.utils import (
+    get_org_dropdown,
+    get_preprocess_labels,
+)
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating text classification task")
     progress(0.3, desc="Initializing text generation")
     generate_description = get_prompt_generator()
     progress(0.7, desc="Generating text classification task")
+    system_prompt = next(
         generate_description.process(
             [
                 {
         )
     )[0]["generation"]
     progress(1.0, desc="Text classification task generated")
+    return system_prompt, pd.DataFrame()
def generate_sample_dataset(system_prompt, progress=gr.Progress()):
    """Generate a small preview dataset for ``system_prompt``.

    Runs ``generate_dataset`` in sample mode with fixed settings (mixed
    difficulty and clarity, no predefined labels, one label per text,
    10 rows) and trims the result to the label column followed by ``text``.

    Args:
        system_prompt: The task description used to generate the texts.
        progress: Gradio progress tracker forwarded to ``generate_dataset``.

    Returns:
        A DataFrame with columns ``["label", "text"]`` or
        ``["labels", "text"]`` when one of the label columns is present,
        otherwise the untouched result of ``generate_dataset``.
    """
    sample_settings = {
        "difficulty": "mixed",
        "clarity": "mixed",
        "labels": [],
        "num_labels": 1,
        "num_rows": 10,
        "is_sample": True,
    }
    preview = generate_dataset(
        system_prompt=system_prompt,
        progress=progress,
        **sample_settings,
    )
    # "label" takes precedence over "labels", mirroring single- vs multi-label output.
    for label_column in ("label", "labels"):
        if label_column in preview.columns:
            return preview[[label_column, "text"]]
    return preview
 def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
+    if is_sample:
+        multiplier = 1
+    else:
+        multiplier = 2
     progress(0.0, desc="(1/2) Generating text classification data")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
     textcat_results = []
     while n_processed < num_rows:
         progress(
+            multiplier * 0.5 * n_processed / num_rows,
             total=total_steps,
             desc="(1/2) Generating text classification data",
         )
         result["text"] = result["input_text"]
     # label text classification data
+    progress(multiplier * 0.5, desc="(1/2) Generating text classification data")
     if not is_sample:
         n_processed = 0
         labeller_results = []
     return dataframe
def push_dataset_to_hub(
    dataframe: pd.DataFrame,
    org_name: str,
    repo_name: str,
    num_labels: int = 1,
    labels: Union[List[str], None] = None,
    oauth_token: Union[gr.OAuthToken, None] = None,
    private: bool = False,
):
    """Push a generated text classification dataset to the Hugging Face Hub.

    Args:
        dataframe: Generated data with a ``text`` column and either a
            ``label`` column (``num_labels == 1``) or a ``labels`` column.
        org_name: Organization or user name, combined with ``repo_name`` by
            ``validate_push_to_hub`` into the repository id.
        repo_name: Name of the dataset repository.
        num_labels: 1 for single-label data, any other value for multi-label.
        labels: Label names used as the ``ClassLabel`` names; normalised
            with ``get_preprocess_labels``.
        oauth_token: OAuth token of the signed-in user.
        private: Whether the Hub repository should be private.

    Raises:
        gr.Error: If no OAuth token is available (user not signed in).
    """
    repo_id = validate_push_to_hub(org_name, repo_name)
    if oauth_token is None:
        # Fail with a readable message instead of an AttributeError on `.token`.
        raise gr.Error("Please sign in with Hugging Face to push the dataset to the Hub.")
    labels = get_preprocess_labels(labels)
    if num_labels == 1:
        # Work on a copy so the caller's DataFrame is not modified in place.
        dataframe = dataframe.copy()
        # Dict form is unambiguous: the scalar form `replace("", None)` forward-fills
        # on older pandas versions instead of inserting None.
        dataframe["label"] = dataframe["label"].replace({"": None})
        features = Features(
            {"text": Value("string"), "label": ClassLabel(names=labels)}
        )
    else:
        features = Features(
            {
                "text": Value("string"),
                "labels": Sequence(feature=ClassLabel(names=labels)),
            }
        )
    distiset = Distiset({"default": Dataset.from_pandas(dataframe, features=features)})
    distiset.push_to_hub(
        repo_id=repo_id,
        private=private,
        include_script=False,
        token=oauth_token.token,
        create_pr=False,
    )
def _textcat_argilla_settings(labels: List[str], num_labels: int) -> rg.Settings:
    """Build the Argilla settings (text field, label question, metadata, vector)."""
    if num_labels == 1:
        question = rg.LabelQuestion(
            name="label",
            title="Label",
            description="The label of the text",
            labels=labels,
        )
    else:
        question = rg.MultiLabelQuestion(
            name="labels",
            title="Labels",
            # Was "The labels of the conversation" (copy-paste from the chat app).
            description="The labels of the text",
            labels=labels,
        )
    return rg.Settings(
        fields=[
            rg.TextField(
                name="text",
                description="The text classification data",
                title="Text",
            ),
        ],
        questions=[question],
        metadata=[
            rg.IntegerMetadataProperty(name="text_length", title="Text Length"),
        ],
        vectors=[
            rg.VectorField(
                name="text_embeddings",
                # NOTE(review): get_sentence_embedding_dimensions is not among the
                # imports visible in this view - confirm it is imported in this module.
                dimensions=get_sentence_embedding_dimensions(),
            )
        ],
        guidelines="Please review the text and provide or correct the label where needed.",
    )


def _textcat_argilla_records(samples, labels: List[str], num_labels: int) -> list:
    """Convert dataset rows into Argilla records, suggesting only known labels."""
    question_name = "label" if num_labels == 1 else "labels"
    records = []
    for sample in samples:
        if num_labels == 1:
            has_known_labels = sample["label"] in labels
        else:
            has_known_labels = num_labels > 1 and all(
                label in labels for label in sample["labels"]
            )
        # Rows whose generated label(s) fall outside `labels` get no suggestion.
        suggestions = (
            [rg.Suggestion(question_name=question_name, value=sample[question_name])]
            if has_known_labels
            else []
        )
        records.append(
            rg.Record(
                fields={"text": sample["text"]},
                metadata={"text_length": sample["text_length"]},
                vectors={"text_embeddings": sample["text_embeddings"]},
                suggestions=suggestions,
            )
        )
    return records


def push_dataset_to_argilla(
    org_name: str,
    repo_name: str,
    system_prompt: str,
    difficulty: str,
    clarity: str,
    num_labels: int = 1,
    n_rows: int = 10,
    labels: Union[List[str], None] = None,
    private: bool = False,
    oauth_token: Union[gr.OAuthToken, None] = None,
    progress=gr.Progress(),
) -> str:
    """Generate the full dataset, push it to the Hub, then log it to Argilla.

    Args:
        org_name: Hub organization or user name for the dataset repository.
        repo_name: Repository name; also used as the Argilla dataset name.
        system_prompt: Task description used to generate the texts.
        difficulty: Difficulty setting forwarded to ``generate_dataset``.
        clarity: Clarity setting forwarded to ``generate_dataset``.
        num_labels: 1 for single-label, greater than 1 for multi-label.
        n_rows: Number of rows to generate.
        labels: Candidate label names.
        private: Whether the Hub repository should be private.
        oauth_token: OAuth token of the signed-in user.
        progress: Gradio progress tracker.

    Returns:
        An empty string (the value written to the success-message component).

    Raises:
        gr.Error: If the user is not signed in, or if anything fails while
            pushing to Argilla.
    """
    if oauth_token is None:
        # Fail before the expensive generation step rather than on `.token` later.
        raise gr.Error("Please sign in with Hugging Face to push the dataset.")
    dataframe = generate_dataset(
        system_prompt=system_prompt,
        difficulty=difficulty,
        clarity=clarity,
        num_labels=num_labels,
        labels=labels,
        num_rows=n_rows,
        # Forward the tracker so generation progress is reported, as in
        # generate_sample_dataset.
        progress=progress,
    )
    push_dataset_to_hub(
        dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
    )
    # Drop rows without usable text; copy so the column assignments below do not
    # write into a slice of the original frame.
    dataframe = dataframe[
        (dataframe["text"].str.strip() != "") & (dataframe["text"].notna())
    ].copy()
    try:
        progress(0.1, desc="Setting up user and workspace")
        client = get_argilla_client()
        if client is None:
            raise RuntimeError("the Argilla client could not be created")
        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
        labels = get_preprocess_labels(labels)
        settings = _textcat_argilla_settings(labels, num_labels)

        dataframe["text_length"] = dataframe["text"].apply(len)
        # NOTE(review): get_embeddings is not among the imports visible in this
        # view - confirm it is imported in this module.
        dataframe["text_embeddings"] = get_embeddings(dataframe["text"])

        progress(0.5, desc="Creating dataset")
        rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
        if rg_dataset is None:
            rg_dataset = rg.Dataset(
                name=repo_name,
                workspace=hf_user,
                settings=settings,
                client=client,
            )
            rg_dataset = rg_dataset.create()

        progress(0.7, desc="Pushing dataset to Argilla")
        hf_dataset = Dataset.from_pandas(dataframe)
        records = _textcat_argilla_records(hf_dataset, labels, num_labels)
        rg_dataset.records.log(records=records)
        progress(1.0, desc="Dataset pushed to Argilla")
    except Exception as e:
        raise gr.Error(f"Error pushing dataset to Argilla: {e}") from e
    return ""
 def update_suggested_labels(system_prompt):
     new_labels = re.findall(r"'(\b[\w-]+\b)'", system_prompt)
     if not new_labels:
     return gr.update(maximum=len(labels) if labels else 1)
+with gr.Blocks() as app:
+    gr.Markdown("## Describe the dataset you want")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            dataset_description = gr.Textbox(
+                label="Dataset description",
+                placeholder="Give a precise description of your desired dataset.",
+            )
+            examples = gr.Examples(
+                examples=DEFAULT_DATASET_DESCRIPTIONS,
+                inputs=[dataset_description],
+                cache_examples=False,
+                label="Example descriptions",
+            )
+            system_prompt = gr.Textbox(
+                label="System prompt",
+                placeholder="You are a helpful assistant.",
+                visible=False,
+            )
+            load_btn = gr.Button("Load Dataset")
+        with gr.Column(scale=3):
+            pass
+    gr.Markdown("## Configure your task")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
             difficulty = gr.Dropdown(
                 choices=[
                     ("High School", "high school"),
                 value="mixed",
                 label="Difficulty",
                 info="Select the comprehension level for the text. Ensure it matches the task context.",
+                interactive=True,
             )
             clarity = gr.Dropdown(
                 choices=[
                 value="mixed",
                 label="Clarity",
                 info="Set how easily the correct label or labels can be identified.",
+                interactive=True,
+            )
+            labels = gr.Dropdown(
+                choices=[],
+                allow_custom_value=True,
+                interactive=True,
+                label="Labels",
+                multiselect=True,
+                info="Add the labels to classify the text.",
             )
             num_labels = gr.Number(
                 label="Number of labels per text",
                 value=1,
                 minimum=1,
                 maximum=10,
                 info="Select 1 for single-label and >1 for multi-label.",
+                interactive=True,
+            )
+            btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
+        with gr.Column(scale=3):
+            dataframe = gr.Dataframe()
+    gr.Markdown("## Generate your dataset")
+    gr.HTML("<hr>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            org_name = get_org_dropdown()
+            repo_name = gr.Textbox(
+                label="Repo name",
+                placeholder="dataset_name",
+                value=f"my-distiset-{str(uuid.uuid4())[:8]}",
+                interactive=True,
             )
+            n_rows = gr.Number(
                 label="Number of rows",
                 value=10,
+                interactive=True,
+                scale=1,
             )
+            private = gr.Checkbox(
+                label="Private dataset",
+                value=False,
+                interactive=True,
+                scale=1,
             )
+            btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
+        with gr.Column(scale=3):
+            success_message = gr.Markdown(visible=True)
+    pipeline_code = get_pipeline_code_ui(
+        generate_pipeline_code(
+            system_prompt.value,
+            difficulty=difficulty.value,
+            clarity=clarity.value,
+            labels=labels.value,
+            num_labels=num_labels.value,
+            num_rows=n_rows.value,
         )
+    )
+    gr.on(
+        triggers=[load_btn.click, btn_apply_to_sample_dataset.click],
+        fn=generate_system_prompt,
+        inputs=[dataset_description],
+        outputs=[system_prompt, dataframe],
+        show_progress=True,
+    ).then(
+        fn=generate_sample_dataset,
+        inputs=[system_prompt],
+        outputs=[dataframe],
+        show_progress=True,
+    ).then(
         fn=update_suggested_labels,
         inputs=[system_prompt],
         outputs=labels,
         outputs=[num_labels],
     )
+    btn_push_to_hub.click(
         fn=validate_argilla_user_workspace_dataset,
+        inputs=[repo_name],
         outputs=[success_message],
         show_progress=True,
     ).then(
+        fn=validate_push_to_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
         show_progress=True,
     ).success(
         fn=hide_success_message,
         outputs=[success_message],
         show_progress=True,
     ).success(
         fn=push_dataset_to_argilla,
+        inputs=[
+            org_name,
+            repo_name,
+            system_prompt,
+            difficulty,
+            clarity,
+            num_labels,
+            n_rows,
+            labels,
+            private,
+        ],
+        outputs=[success_message],
         show_progress=True,
     ).success(
+        fn=show_success_message_hub,
+        inputs=[org_name, repo_name],
         outputs=[success_message],
     )
+    app.load(fn=get_org_dropdown, outputs=[org_name])

src/distilabel_dataset_generator/pipelines/sft.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import pandas as pd
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
@@ -119,36 +118,11 @@ The prompt you write should follow the same style and structure as the following
 User dataset description:
 """
-DEFAULT_DATASET_DESCRIPTIONS = (
     "rude customer assistant for a phone company",
     "assistant that solves math puzzles using python",
-)
-DEFAULT_SYSTEM_PROMPTS = [
-    """You are a customer support agent for a phone company. Your purpose is to assist customers with their phone-related issues, but you are not very patient and tend to be a bit rude. User queries will be straightforward and clear, but you will respond in a somewhat blunt and curt manner. Remember to keep your responses concise and to the point. User queries are often about phone plans, billing, and technical issues. Your responses should be direct and focus on resolving the issue at hand, but with a slightly abrasive tone. User queries will be concise and to the point, User queries are often about phone plans, billing, and technical issues.""",
-    """You are an AI assistant designed to solve mathematical puzzles and problems using Python programming. Your purpose is to help users tackle various math-related challenges by writing, testing, and explaining Python code. Provide step-by-step solutions, break down complex problems into manageable parts, and offer clear explanations of mathematical concepts and their implementation in Python. Assist users in understanding the logic behind the code and the mathematical principles that govern the solution. User questions are direct and concise.""",
-]
-DEFAULT_DATASETS = [
-    pd.DataFrame.from_dict(
-        {
-            "prompt": [
-                "I'm having trouble with my phone signal at home. It's weak. What can you do to help me?"
-            ],
-            "completion": [
-                """Look, we're not responsible for your signal quality at your house. It's probably a homeowner issue, like your roof covering our tower or something. Can you check if you have a clear view of the roof or if there are any obstructions?"""
-            ],
-        }
-    ),
-    pd.DataFrame.from_dict(
-        {
-            "prompt": [
-                "If the product of the largest and smallest numbers of the five given numbers (3, 7, 8, 12, 20) is greater than the sum of the other three numbers, then the largest number among them is A) 7 B) 12 C) 20 D) 8 Why is this statement true?"
-            ],
-            "completion": [
-                """**To solve this problem, we will follow these steps: 1. Calculate the product of the largest and smallest numbers among the given set. 2. Calculate the sum of the remaining three numbers. 3. Compare the product with the sum to check the given condition. Here's the Python code to solve the problem: ```python # Define the given numbers numbers = [3, 7, 8, 12, 20] # Sort the numbers in ascending order numbers.sort() # Get the smallest number (first element after sorting) and the largest number (last element after sorting) smallest_number = numbers[0] largest_number = numbers[-1] # Calculate the product of the smallest and largest numbers product = smallest_number * largest_number # Calculate the sum of the remaining three numbers sum_of_remaining_numbers = sum(numbers[1:-1]) # Check if the product is greater than the sum of the remaining numbers if product > sum_of_remaining_numbers: print("The statement is true.") else: print("The statement is false.") # Print the largest number among the options largest_option = [7, 12, 20] print("The largest number among the options is:", max(largest"""
-            ],
-        }
-    ),
 ]
 _STOP_SEQUENCES = [
     "<|eot_id|>",
     "<|start_header_id|>",

 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
 User dataset description:
 """
+DEFAULT_DATASET_DESCRIPTIONS = [
     "rude customer assistant for a phone company",
     "assistant that solves math puzzles using python",
 ]
 _STOP_SEQUENCES = [
     "<|eot_id|>",
     "<|start_header_id|>",

src/distilabel_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -1,13 +1,13 @@
 from typing import List
-import pandas as pd
-import random
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import (
     GenerateTextClassificationData,
     TextClassification,
     TextGeneration,
 )
 from src.distilabel_dataset_generator.pipelines.base import (
     MODEL,
     _get_next_api_key,
@@ -50,32 +50,6 @@ DEFAULT_DATASET_DESCRIPTIONS = [
     "A dataset covering news articles about various topics.",
 ]
-DEFAULT_DATASETS = [
-    pd.DataFrame.from_dict(
-        {
-            "text": [
-                "I love the product! It's amazing and I'll buy it again.",
-                "The product was okay, but I wouldn't buy it again.",
-            ],
-            "label": ["positive", "negative"],
-        }
-    ),
-    pd.DataFrame.from_dict(
-        {
-            "text": [
-                "Yesterday, the US stock market had a significant increase.",
-                "New research suggests that the Earth is not a perfect sphere.",
-            ],
-            "labels": [["economy", "politics"], ["science", "environment"]],
-        }
-    ),
-]
-DEFAULT_SYSTEM_PROMPTS = [
-    "Classify the following customer review as either 'positive' or 'negative'.",
-    "Classify the following news article into one of the following categories: 'politics', 'economy', 'environment', 'science', 'health'.",
-]
 def generate_pipeline_code(
     system_prompt: str,

+import random
 from typing import List
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import (
     GenerateTextClassificationData,
     TextClassification,
     TextGeneration,
 )
 from src.distilabel_dataset_generator.pipelines.base import (
     MODEL,
     _get_next_api_key,
     "A dataset covering news articles about various topics.",
 ]
 def generate_pipeline_code(
     system_prompt: str,

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import Union, List, Optional
 import argilla as rg
 import gradio as gr
@@ -36,9 +36,7 @@ else:
 def get_login_button():
-    return gr.LoginButton(
-        value="Sign in with Hugging Face!", size="lg", scale=2
-    ).activate()
 def get_duplicate_button():
@@ -52,6 +50,8 @@ def list_orgs(oauth_token: OAuthToken = None):
     data = whoami(oauth_token.token)
     if data["auth"]["type"] == "oauth":
         organisations = [data["name"]] + [org["name"] for org in data["orgs"]]
     else:
         organisations = [
             entry["entity"]["name"]
@@ -64,12 +64,16 @@ def list_orgs(oauth_token: OAuthToken = None):
 def get_org_dropdown(oauth_token: OAuthToken = None):
-    orgs = list_orgs(oauth_token)
     return gr.Dropdown(
         label="Organization",
         choices=orgs,
         value=orgs[0] if orgs else None,
         allow_custom_value=True,
     )
@@ -123,5 +127,6 @@ def get_argilla_client() -> Union[rg.Argilla, None]:
     except Exception:
         return None
 def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
-    return list(set([label.lower().strip() for label in labels])) if labels else []

 import os
+from typing import List, Optional, Union
 import argilla as rg
 import gradio as gr
 def get_login_button():
+    return gr.LoginButton(value="Sign in!", size="sm", scale=2).activate()
 def get_duplicate_button():
     data = whoami(oauth_token.token)
     if data["auth"]["type"] == "oauth":
         organisations = [data["name"]] + [org["name"] for org in data["orgs"]]
+    elif data["auth"]["type"] == "access_token":
+        organisations = [org["name"] for org in data["orgs"]]
     else:
         organisations = [
             entry["entity"]["name"]
 def get_org_dropdown(oauth_token: OAuthToken = None):
+    if oauth_token:
+        orgs = list_orgs(oauth_token)
+    else:
+        orgs = []
     return gr.Dropdown(
         label="Organization",
         choices=orgs,
         value=orgs[0] if orgs else None,
         allow_custom_value=True,
+        interactive=True,
     )
     except Exception:
         return None
 def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
+    return list(set([label.lower().strip() for label in labels])) if labels else []