sdiazlor HF staff committed on
Commit
07a8bbc
1 Parent(s): 3c2fc33

textcat-review (#12)

Browse files

- fix: apply feedback (3f2128047133bf339ddb5d4c4a6d6d41edb368da)
- fix: remove extra args (d27c1e6872d8be8b25d12d2b7e4baa5c075ed8c5)
- fix: add seed for more randomized samples (46f00bc57d59efb6274c287aa3b3ab0046d4d64e)
- fix: typo (a3f4be77171e6db5c804079dc82e0e30354bcec9)
- fix: correction label or labels (d59361703bc3414d4d5845cbe57ae760b52be7fc)
- fix: duplicated labels in labels and number of rows update listener in raw pipeline (b92482822c81a2a4330d54fd35640c6984ae8bda)

src/distilabel_dataset_generator/apps/base.py CHANGED
@@ -38,8 +38,8 @@ def get_main_ui(
38
  if task == TEXTCAT_TASK:
39
  result = fn_generate_dataset(
40
  system_prompt=system_prompt,
41
- difficulty="mixed",
42
- clarity="mixed",
43
  labels=[],
44
  num_labels=1,
45
  num_rows=1,
@@ -271,7 +271,11 @@ def get_iterate_on_sample_dataset_ui(
271
  with gr.Row():
272
  sample_dataset = gr.Dataframe(
273
  value=default_datasets[0],
274
- label="Sample dataset. Prompts and completions truncated to 256 tokens.",
 
 
 
 
275
  interactive=False,
276
  wrap=True,
277
  )
 
38
  if task == TEXTCAT_TASK:
39
  result = fn_generate_dataset(
40
  system_prompt=system_prompt,
41
+ difficulty="high school",
42
+ clarity="clear",
43
  labels=[],
44
  num_labels=1,
45
  num_rows=1,
 
271
  with gr.Row():
272
  sample_dataset = gr.Dataframe(
273
  value=default_datasets[0],
274
+ label=(
275
+ "Sample dataset. Text truncated to 256 tokens."
276
+ if task == TEXTCAT_TASK
277
+ else "Sample dataset. Prompts and completions truncated to 256 tokens."
278
+ ),
279
  interactive=False,
280
  wrap=True,
281
  )
src/distilabel_dataset_generator/apps/textcat.py CHANGED
@@ -215,7 +215,6 @@ def generate_dataset(
215
  system_prompt=system_prompt,
216
  labels=labels,
217
  num_labels=num_labels,
218
- is_sample=is_sample,
219
  )
220
  total_steps: int = num_rows * 2
221
  batch_size = DEFAULT_BATCH_SIZE
@@ -280,11 +279,13 @@ def generate_dataset(
280
  else:
281
  dataframe["labels"] = dataframe["labels"].apply(
282
  lambda x: (
283
- [
284
- label.lower().strip()
285
- for label in x
286
- if label.lower().strip() in labels
287
- ]
 
 
288
  if isinstance(x, list)
289
  else None
290
  )
@@ -309,6 +310,9 @@ def validate_input_labels(labels):
309
  )
310
  return labels
311
 
 
 
 
312
 
313
  (
314
  app,
@@ -354,7 +358,7 @@ with app:
354
  ],
355
  value="mixed",
356
  label="Difficulty",
357
- info="The difficulty of the text to be generated.",
358
  )
359
  clarity = gr.Dropdown(
360
  choices=[
@@ -368,7 +372,7 @@ with app:
368
  ],
369
  value="mixed",
370
  label="Clarity",
371
- info="The clarity of the text to be generated.",
372
  )
373
  with gr.Column():
374
  labels = gr.Dropdown(
@@ -385,18 +389,18 @@ with app:
385
  size="sm",
386
  )
387
  num_labels = gr.Number(
388
- label="Number of labels",
389
  value=1,
390
  minimum=1,
391
  maximum=10,
392
- info="The number of labels to classify the text.",
393
  )
394
  num_rows = gr.Number(
395
  label="Number of rows",
396
  value=10,
397
  minimum=1,
398
  maximum=500,
399
- info="More rows will take longer to generate.",
400
  )
401
 
402
  pipeline_code = get_pipeline_code_ui(
@@ -415,6 +419,10 @@ with app:
415
  fn=update_suggested_labels,
416
  inputs=[system_prompt],
417
  outputs=labels,
 
 
 
 
418
  )
419
 
420
  gr.on(
@@ -540,9 +548,18 @@ with app:
540
  fn=generate_pipeline_code,
541
  inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
542
  outputs=[pipeline_code],
 
 
 
 
543
  )
544
  num_labels.change(
545
  fn=generate_pipeline_code,
546
  inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
547
  outputs=[pipeline_code],
548
  )
 
 
 
 
 
 
215
  system_prompt=system_prompt,
216
  labels=labels,
217
  num_labels=num_labels,
 
218
  )
219
  total_steps: int = num_rows * 2
220
  batch_size = DEFAULT_BATCH_SIZE
 
279
  else:
280
  dataframe["labels"] = dataframe["labels"].apply(
281
  lambda x: (
282
+ list(
283
+ set(
284
+ label.lower().strip()
285
+ for label in x
286
+ if label.lower().strip() in labels
287
+ )
288
+ )
289
  if isinstance(x, list)
290
  else None
291
  )
 
310
  )
311
  return labels
312
 
313
+ def update_max_num_labels(labels):
314
+ return gr.update(maximum=len(labels) if labels else 1)
315
+
316
 
317
  (
318
  app,
 
358
  ],
359
  value="mixed",
360
  label="Difficulty",
361
+ info="Select the comprehension level for the text. Ensure it matches the task context.",
362
  )
363
  clarity = gr.Dropdown(
364
  choices=[
 
372
  ],
373
  value="mixed",
374
  label="Clarity",
375
+ info="Set how easily the correct label or labels can be identified.",
376
  )
377
  with gr.Column():
378
  labels = gr.Dropdown(
 
389
  size="sm",
390
  )
391
  num_labels = gr.Number(
392
+ label="Number of labels per text",
393
  value=1,
394
  minimum=1,
395
  maximum=10,
396
+ info="Select 1 for single-label and >1 for multi-label.",
397
  )
398
  num_rows = gr.Number(
399
  label="Number of rows",
400
  value=10,
401
  minimum=1,
402
  maximum=500,
403
+ info="Select the number of rows in the dataset. More rows will take more time.",
404
  )
405
 
406
  pipeline_code = get_pipeline_code_ui(
 
419
  fn=update_suggested_labels,
420
  inputs=[system_prompt],
421
  outputs=labels,
422
+ ).then(
423
+ fn=update_max_num_labels,
424
+ inputs=[labels],
425
+ outputs=[num_labels],
426
  )
427
 
428
  gr.on(
 
548
  fn=generate_pipeline_code,
549
  inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
550
  outputs=[pipeline_code],
551
+ ).then(
552
+ fn=update_max_num_labels,
553
+ inputs=[labels],
554
+ outputs=[num_labels],
555
  )
556
  num_labels.change(
557
  fn=generate_pipeline_code,
558
  inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
559
  outputs=[pipeline_code],
560
  )
561
+ num_rows.change(
562
+ fn=generate_pipeline_code,
563
+ inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
564
+ outputs=[pipeline_code],
565
+ )
src/distilabel_dataset_generator/pipelines/textcat.py CHANGED
@@ -1,6 +1,7 @@
1
  from typing import List
2
 
3
  import pandas as pd
 
4
  from distilabel.llms import InferenceEndpointsLLM
5
  from distilabel.steps.tasks import (
6
  GenerateTextClassificationData,
@@ -88,6 +89,7 @@ def generate_pipeline_code(
88
  base_code = f"""
89
  # Requirements: `pip install distilabel[hf-inference-endpoints]`
90
  import os
 
91
  from distilabel.llms import InferenceEndpointsLLM
92
  from distilabel.pipeline import Pipeline
93
  from distilabel.steps import LoadDataFromDicts, KeepColumns
@@ -111,6 +113,8 @@ with Pipeline(name="textcat") as pipeline:
111
  generation_kwargs={{
112
  "temperature": 0.8,
113
  "max_new_tokens": 2048,
 
 
114
  }},
115
  ),
116
  difficulty={None if difficulty == "mixed" else repr(difficulty)},
@@ -175,8 +179,10 @@ def get_textcat_generator(difficulty, clarity, is_sample):
175
  tokenizer_id=MODEL,
176
  api_key=_get_next_api_key(),
177
  generation_kwargs={
178
- "temperature": 0.8,
179
- "max_new_tokens": 256 if is_sample else 1024,
 
 
180
  },
181
  ),
182
  difficulty=None if difficulty == "mixed" else difficulty,
@@ -186,15 +192,15 @@ def get_textcat_generator(difficulty, clarity, is_sample):
186
  return textcat_generator
187
 
188
 
189
- def get_labeller_generator(system_prompt, labels, num_labels, is_sample):
190
  labeller_generator = TextClassification(
191
  llm=InferenceEndpointsLLM(
192
  model_id=MODEL,
193
  tokenizer_id=MODEL,
194
  api_key=_get_next_api_key(),
195
  generation_kwargs={
196
- "temperature": 0.8,
197
- "max_new_tokens": 256 if is_sample else 1024,
198
  },
199
  ),
200
  context=system_prompt,
 
1
  from typing import List
2
 
3
  import pandas as pd
4
+ import random
5
  from distilabel.llms import InferenceEndpointsLLM
6
  from distilabel.steps.tasks import (
7
  GenerateTextClassificationData,
 
89
  base_code = f"""
90
  # Requirements: `pip install distilabel[hf-inference-endpoints]`
91
  import os
92
+ import random
93
  from distilabel.llms import InferenceEndpointsLLM
94
  from distilabel.pipeline import Pipeline
95
  from distilabel.steps import LoadDataFromDicts, KeepColumns
 
113
  generation_kwargs={{
114
  "temperature": 0.8,
115
  "max_new_tokens": 2048,
116
+ "do_sample": True,
117
+ "seed": random.randint(0, 2**32 - 1),
118
  }},
119
  ),
120
  difficulty={None if difficulty == "mixed" else repr(difficulty)},
 
179
  tokenizer_id=MODEL,
180
  api_key=_get_next_api_key(),
181
  generation_kwargs={
182
+ "temperature": 0.9,
183
+ "max_new_tokens": 256 if is_sample else 2048,
184
+ "do_sample": True,
185
+ "seed": random.randint(0, 2**32 - 1),
186
  },
187
  ),
188
  difficulty=None if difficulty == "mixed" else difficulty,
 
192
  return textcat_generator
193
 
194
 
195
+ def get_labeller_generator(system_prompt, labels, num_labels):
196
  labeller_generator = TextClassification(
197
  llm=InferenceEndpointsLLM(
198
  model_id=MODEL,
199
  tokenizer_id=MODEL,
200
  api_key=_get_next_api_key(),
201
  generation_kwargs={
202
+ "temperature": 0.7,
203
+ "max_new_tokens": 2048,
204
  },
205
  ),
206
  context=system_prompt,