Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

davidberenstein1957 HF staff committed on Oct 15, 2024

Commit

f1f92f7

1 Parent(s): 3445828

feat: Add Argilla review integration

Browse files

Files changed (7) hide show

app.py +5 -1
pdm.lock +0 -0
pyproject.toml +3 -2
requirements.txt +3 -2
src/distilabel_dataset_generator/apps/sft.py +264 -30
src/distilabel_dataset_generator/pipelines/embeddings.py +16 -0
src/distilabel_dataset_generator/utils.py +10 -0

app.py CHANGED Viewed

@@ -54,6 +54,10 @@ demo = gr.TabbedInterface(
                 margin-bottom: 20px;
             }
         }
     </style>
     <div class="header-container">
         <div class="logo-container">
@@ -62,7 +66,7 @@ demo = gr.TabbedInterface(
             </a>
         </div>
         <div class="title-container">
-            <h1 style="margin: 0; font-size: 2em;">🧬  Synthetic Data Generator</h1>
             <p style="margin: 10px 0 0 0; color: #666; font-size: 1.1em;">Build datasets using natural language</p>
         </div>
     </div>

                 margin-bottom: 20px;
             }
         }
+        button[role="tab"].selected {
+            color: black;
+            font-weight: bold;
+        }
     </style>
     <div class="header-container">
         <div class="logo-container">
             </a>
         </div>
         <div class="title-container">
+            <h1 style="margin: 0; font-size: 2em;">🧬 Synthetic Data Generator</h1>
             <p style="margin: 10px 0 0 0; color: #666; font-size: 1.1em;">Build datasets using natural language</p>
         </div>
     </div>

pdm.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml CHANGED Viewed

@@ -6,11 +6,12 @@ authors = [
     {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
 ]
 dependencies = [
-    "distilabel[hf-inference-endpoints] @ git+https://github.com/argilla-io/distilabel.git@develop",
     "gradio[oauth]<5,>=4.38",
     "transformers>=4.44.2",
 ]
-requires-python = ">=3.10"
 readme = "README.md"
 license = {text = "apache 2"}

     {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
 ]
 dependencies = [
+    "distilabel[hf-inference-endpoints,argilla]==1.4.0",
     "gradio[oauth]<5,>=4.38",
     "transformers>=4.44.2",
+    "sentence-transformers>=3.2.0",
 ]
+requires-python = "<3.13,>=3.10"
 readme = "README.md"
 license = {text = "apache 2"}

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 transformers
 gradio[oauth]
-distilabel[hf-inference-endpoints] @ git+https://github.com/argilla-io/distilabel.git@develop
-beautifulsoup4

 transformers
 gradio[oauth]
+distilabel[hf-inference-endpoints,argilla]
+beautifulsoup4
+sentence-transformers

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import io
-from typing import Union
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
@@ -8,7 +10,12 @@ from distilabel.distiset import Distiset
 from distilabel.steps.tasks.text_generation import TextGeneration
 from gradio.oauth import OAuthToken
 from huggingface_hub import upload_file
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_BATCH_SIZE,
     DEFAULT_DATASET_DESCRIPTIONS,
@@ -21,12 +28,21 @@ from src.distilabel_dataset_generator.pipelines.sft import (
     get_response_generator,
 )
 from src.distilabel_dataset_generator.utils import (
     get_login_button,
     get_org_dropdown,
     swap_visibilty,
 )
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating system prompt")
     if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
@@ -82,7 +98,7 @@ def generate_dataset(
     num_rows: int = 5,
     is_sample: bool = False,
     progress=gr.Progress(),
-):
     progress(0.0, desc="(1/2) Generating instructions")
     magpie_generator = get_magpie_generator(
         num_turns, num_rows, system_prompt, is_sample
@@ -191,7 +207,12 @@ def push_to_hub(
     repo_name: str = None,
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
-):
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
     distiset = Distiset(
@@ -208,7 +229,142 @@ def push_to_hub(
         create_pr=False,
     )
     progress(1.0, desc="Dataset pushed to hub")
-    return dataframe
 def upload_pipeline_code(
@@ -333,27 +489,47 @@ with gr.Blocks(
                     maximum=500,
                     info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
                 )
-            with gr.Row(variant="panel"):
-                org_name = get_org_dropdown()
-                repo_name = gr.Textbox(
-                    label="Repo name", placeholder="dataset_name", value="my-distiset"
-                )
-                private = gr.Checkbox(
-                    label="Private dataset",
-                    value=True,
-                    interactive=True,
-                    scale=0.5,
-                )
-            with gr.Row() as regenerate_row:
-                btn_generate_full_dataset = gr.Button(
-                    value="Generate", variant="primary", scale=2
-                )
-                btn_generate_and_push_to_hub = gr.Button(
-                    value="Generate and Push to Hub", variant="primary", scale=2
-                )
-                btn_push_to_hub = gr.Button(
-                    value="Push to Hub", variant="primary", scale=2
-                )
             with gr.Row():
                 final_dataset = gr.Dataframe(
                     value=DEFAULT_DATASETS[0],
@@ -365,7 +541,29 @@ with gr.Blocks(
             with gr.Row():
                 success_message = gr.Markdown(visible=False)
-    def show_success_message(org_name, repo_name):
         return gr.Markdown(
             value=f"""
             <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
@@ -378,7 +576,7 @@ with gr.Blocks(
                     </a>
                 </p>
             </div>
-        """,
             visible=True,
         )
@@ -407,8 +605,21 @@ with gr.Blocks(
         inputs=[sample_dataset],
         outputs=[final_dataset],
     )
-    btn_generate_full_dataset.click(
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
@@ -416,6 +627,15 @@ with gr.Blocks(
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[final_dataset],
         show_progress=True,
     )
     btn_generate_and_push_to_hub.click(
@@ -437,7 +657,7 @@ with gr.Blocks(
         outputs=[],
         show_progress=True,
     ).success(
-        fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
@@ -456,11 +676,25 @@ with gr.Blocks(
         outputs=[],
         show_progress=True,
     ).success(
-        fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
     system_prompt.change(
         fn=generate_pipeline_code,
         inputs=[system_prompt, num_turns, num_rows],

+import ast
 import io
+from typing import Dict, List, Union
+import argilla as rg
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
 from distilabel.steps.tasks.text_generation import TextGeneration
 from gradio.oauth import OAuthToken
 from huggingface_hub import upload_file
+from huggingface_hub.hf_api import HfApi
+from src.distilabel_dataset_generator.pipelines.embeddings import (
+    get_embeddings,
+    get_sentence_embedding_dimensions,
+)
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_BATCH_SIZE,
     DEFAULT_DATASET_DESCRIPTIONS,
     get_response_generator,
 )
 from src.distilabel_dataset_generator.utils import (
+    get_argilla_client,
     get_login_button,
     get_org_dropdown,
     swap_visibilty,
 )
+def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
+    """Parse a stringified list of chat messages into Python objects.
+
+    A comma is inserted after every ``'user'}``, ``'system'}`` and
+    ``'assistant'}`` occurrence, and the patched text is then evaluated with
+    ``ast.literal_eval``.
+    """
+    # NOTE(review): presumably the input is a repr in which the message dicts
+    # are separated by whitespace only and each dict ends with its role value
+    # -- confirm against the callers. Message content that itself contains one
+    # of these substrings would also get a comma inserted.
+    return ast.literal_eval(
+        messages.replace("'user'}", "'user'},")
+        .replace("'system'}", "'system'},")
+        .replace("'assistant'}", "'assistant'},")
+    )
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating system prompt")
     if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
     num_rows: int = 5,
     is_sample: bool = False,
     progress=gr.Progress(),
+) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating instructions")
     magpie_generator = get_magpie_generator(
         num_turns, num_rows, system_prompt, is_sample
     repo_name: str = None,
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
+) -> pd.DataFrame:
+    original_dataframe = dataframe.copy(deep=True)
+    if "messages" in dataframe.columns:
+        dataframe["messages"] = dataframe["messages"].apply(
+            lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
+        )
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
     distiset = Distiset(
         create_pr=False,
     )
     progress(1.0, desc="Dataset pushed to hub")
+    return original_dataframe
+def push_to_argilla(
+    dataframe: pd.DataFrame,
+    dataset_name: str,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    original_dataframe = dataframe.copy(deep=True)
+    if "messages" in dataframe.columns:
+        dataframe["messages"] = dataframe["messages"].apply(
+            lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
+        )
+    try:
+        progress(0.1, desc="Setting up user and workspace")
+        client = get_argilla_client()
+        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        # Create user if it doesn't exist
+        rg_user = client.users(username=hf_user)
+        if rg_user is None:
+            rg_user = client.users.add(rg.User(username=hf_user, role="admin"))
+        # Create workspace if it doesn't exist
+        workspace = client.workspaces(name=rg_user.username)
+        if workspace is None:
+            workspace = client.workspaces.add(rg.Workspace(name=rg_user.username))
+            workspace.add_user(rg_user)
+        if "messages" in dataframe.columns:
+            settings = rg.Settings(
+                fields=[
+                    rg.ChatField(
+                        name="messages", description="The messages in the conversation"
+                    ),
+                ],
+                questions=[
+                    rg.TextQuestion(
+                        name="correct_response",
+                        description="The corrected response from the assistant",
+                    ),
+                ],
+                metadata=[
+                    rg.IntegerMetadataProperty(
+                        name="messages_length", title="Messages Length"
+                    ),
+                    rg.IntegerMetadataProperty(
+                        name="response_length", title="Response Length"
+                    ),
+                ],
+                vectors=[
+                    rg.VectorField(
+                        name="messages_embeddings",
+                        dimensions=get_sentence_embedding_dimensions(),
+                    )
+                ],
+                guidelines="Please review the conversation and provide a score for the assistant's response.",
+            )
+            import pdb
+            pdb.set_trace()
+            dataframe["messages_length"] = dataframe["messages"].apply(
+                lambda x: sum([len(y["content"]) for y in x])
+            )
+            dataframe["messages_embeddings"] = get_embeddings(
+                dataframe["messages"].apply(
+                    lambda x: " ".join([y["content"] for y in x])
+                )
+            )
+            dataframe["correct_response"] = dataframe["messages"].apply(
+                lambda x: x[-1]["content"]
+            )
+            dataframe["response_length"] = dataframe["correct_response"].apply(len)
+            dataframe["messages"] = dataframe["messages"].apply(lambda x: x[:-1])
+        else:
+            settings = rg.Settings(
+                fields=[
+                    rg.TextField(
+                        name="prompt",
+                        description="The prompt used for the conversation",
+                    ),
+                    rg.TextField(
+                        name="completion",
+                        description="The completion from the assistant",
+                    ),
+                ],
+                questions=[
+                    rg.TextQuestion(
+                        name="correct_prompt",
+                        description="The corrected prompt from the assistant",
+                    ),
+                    rg.TextQuestion(
+                        name="correct_completion",
+                        description="The corrected completion from the assistant",
+                    ),
+                ],
+                metadata=[
+                    rg.IntegerMetadataProperty(
+                        name="prompt_length", title="Prompt Length"
+                    ),
+                    rg.IntegerMetadataProperty(
+                        name="completion_length", title="Completion Length"
+                    ),
+                ],
+                vectors=[
+                    rg.VectorField(
+                        name="prompt_embeddings",
+                        dimensions=get_sentence_embedding_dimensions(),
+                    )
+                ],
+                guidelines="Please review the conversation and correct the prompt and completion where needed.",
+            )
+            dataframe["correct_prompt"] = dataframe["prompt"]
+            dataframe["correct_completion"] = dataframe["completion"]
+            dataframe["prompt_length"] = dataframe["prompt"].apply(len)
+            dataframe["completion_length"] = dataframe["completion"].apply(len)
+            dataframe["prompt_embeddings"] = get_embeddings(dataframe["prompt"])
+        progress(0.5, desc="Creating dataset")
+        if client.datasets(name=dataset_name, workspace=rg_user.username) is not None:
+            raise gr.Error(f"Dataset {dataset_name} already exists")
+        rg_dataset = rg.Dataset(
+            name=dataset_name,
+            workspace=rg_user.username,
+            settings=settings,
+            client=client,
+        )
+        rg_dataset = rg_dataset.create()
+        progress(0.7, desc="Pushing dataset to Argilla")
+        hf_dataset = Dataset.from_pandas(dataframe)
+        rg_dataset.records.log(records=hf_dataset)
+        progress(1.0, desc="Dataset pushed to Argilla")
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to Argilla: {e}")
+    return original_dataframe
 def upload_pipeline_code(
                     maximum=500,
                     info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
                 )
+            with gr.Tab("Hugging Face Hub"):
+                with gr.Row(variant="panel"):
+                    org_name = get_org_dropdown()
+                    repo_name = gr.Textbox(
+                        label="Repo name",
+                        placeholder="dataset_name",
+                        value="my-distiset",
+                    )
+                    private = gr.Checkbox(
+                        label="Private dataset",
+                        value=True,
+                        interactive=True,
+                        scale=0.5,
+                    )
+                with gr.Row(variant="panel"):
+                    btn_generate_full_dataset = gr.Button(
+                        value="Generate", variant="primary", scale=2
+                    )
+                    btn_generate_and_push_to_hub = gr.Button(
+                        value="Generate and Push to Hub", variant="primary", scale=2
+                    )
+                    btn_push_to_hub = gr.Button(
+                        value="Push to Hub", variant="primary", scale=2
+                    )
+            with gr.Tab(label="Argilla"):
+                with gr.Row(variant="panel"):
+                    dataset_name = gr.Textbox(
+                        label="Dataset name",
+                        placeholder="dataset_name",
+                        value="my-distiset",
+                    )
+                with gr.Row(variant="panel"):
+                    btn_generate_full_dataset_copy = gr.Button(
+                        value="Generate", variant="primary", scale=2
+                    )
+                    btn_generate_and_push_to_argilla = gr.Button(
+                        value="Generate and Push to Argilla", variant="primary", scale=2
+                    )
+                    btn_push_to_argilla = gr.Button(
+                        value="Push to Argilla", variant="primary", scale=2
+                    )
             with gr.Row():
                 final_dataset = gr.Dataframe(
                     value=DEFAULT_DATASETS[0],
             with gr.Row():
                 success_message = gr.Markdown(visible=False)
+    def show_success_message_argilla():
+        """Build the visible success banner shown after a push to Argilla.
+
+        The banner links to the ``api_url`` of the client returned by
+        ``get_argilla_client()`` and to three Argilla documentation pages.
+        """
+        client = get_argilla_client()
+        argilla_api_url = client.api_url
+        return gr.Markdown(
+            value=f"""
+            <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
+                <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
+                <p style="margin-top: 0.5em;">
+                    Your dataset is now available at:
+                    <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                        {argilla_api_url}
+                    </a>
+                    Here are some docs to help you:
+                    • <a href="https://docs.argilla.io/latest/getting_started/quickstart/#sign-in-into-the-argilla-ui" target="_blank">Login with OAuth</a>
+                    • <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">Curate your data</a>
+                    • <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">Export your data</a>
+                </p>
+            </div>
+            """,
+            visible=True,
+        )
+    def show_success_message_hub(org_name, repo_name):
         return gr.Markdown(
             value=f"""
             <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
                     </a>
                 </p>
             </div>
+            """,
             visible=True,
         )
         inputs=[sample_dataset],
         outputs=[final_dataset],
     )
+    gr.on(
+        triggers=[
+            btn_generate_full_dataset.click,
+            btn_generate_full_dataset_copy.click,
+        ],
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).then(
+        fn=generate_dataset,
+        inputs=[system_prompt, num_turns, num_rows],
+        outputs=[final_dataset],
+        show_progress=True,
+    )
+    btn_generate_and_push_to_argilla.click(
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[final_dataset],
         show_progress=True,
+    ).then(
+        fn=push_to_argilla,
+        inputs=[final_dataset, dataset_name],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_argilla,
+        inputs=[],
+        outputs=[success_message],
     )
     btn_generate_and_push_to_hub.click(
         outputs=[],
         show_progress=True,
     ).success(
+        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
         outputs=[],
         show_progress=True,
     ).success(
+        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
+    btn_push_to_argilla.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).then(
+        fn=push_to_argilla,
+        inputs=[final_dataset, dataset_name],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_argilla,
+        inputs=[],
+        outputs=[success_message],
+    )
     system_prompt.change(
         fn=generate_pipeline_code,
         inputs=[system_prompt, num_turns, num_rows],

src/distilabel_dataset_generator/pipelines/embeddings.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from typing import List
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.models import StaticEmbedding
+# Initialize a StaticEmbedding module
+static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
+model = SentenceTransformer(modules=[static_embedding])
+def get_embeddings(texts: List[str]) -> List[List[float]]:
+    """Encode ``texts`` with the module-level ``model``.
+
+    Each embedding returned by ``model.encode`` is converted with
+    ``.tolist()``, so the result is a plain Python list with one entry per
+    embedding.
+    """
+    return [embedding.tolist() for embedding in model.encode(texts)]
+def get_sentence_embedding_dimensions() -> int:
+    """Return the sentence embedding dimension reported by the module-level ``model``."""
+    return model.get_sentence_embedding_dimension()

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
@@ -81,3 +82,12 @@ def swap_visibilty(oauth_token: OAuthToken = None):
         return gr.update(elem_classes=["main_ui_logged_in"])
     else:
         return gr.update(elem_classes=["main_ui_logged_out"])

 import os
+import argilla as rg
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
         return gr.update(elem_classes=["main_ui_logged_in"])
     else:
         return gr.update(elem_classes=["main_ui_logged_out"])
+def get_argilla_client():
+    """Create an Argilla client configured from environment variables.
+
+    ``ARGILLA_API_URL_SDG_REVIEWER`` and ``ARGILLA_API_KEY_SDG_REVIEWER`` are
+    used when set to a non-empty value; otherwise the client falls back to
+    ``ARGILLA_API_URL`` and ``ARGILLA_API_KEY``.
+    """
+    return rg.Argilla(
+        api_url=os.getenv("ARGILLA_API_URL_SDG_REVIEWER")
+        or os.getenv("ARGILLA_API_URL"),
+        api_key=os.getenv("ARGILLA_API_KEY_SDG_REVIEWER")
+        or os.getenv("ARGILLA_API_KEY"),
+    )