Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

sdiazlor committed on Dec 2, 2024

Commit

4e19310

1 Parent(s): 857f1ba

update textcat (separate prompt and labels) and use input parameters

Browse files

Files changed (2) hide show

src/distilabel_dataset_generator/apps/textcat.py +58 -81
src/distilabel_dataset_generator/pipelines/textcat.py +89 -70

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import re
 import uuid
 from typing import List, Union
@@ -24,7 +25,6 @@ from src.distilabel_dataset_generator.pipelines.embeddings import (
 )
 from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_DATASET_DESCRIPTIONS,
-    PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
     get_labeller_generator,
     get_prompt_generator,
@@ -44,36 +44,33 @@ def generate_system_prompt(dataset_description, temperature, progress=gr.Progres
     progress(0.3, desc="Initializing text generation")
     generate_description = get_prompt_generator(temperature)
     progress(0.7, desc="Generating text classification task")
-    system_prompt = next(
         generate_description.process(
             [
                 {
-                    "system_prompt": PROMPT_CREATION_PROMPT,
                     "instruction": dataset_description,
                 }
             ]
         )
     )[0]["generation"]
     progress(1.0, desc="Text classification task generated")
-    return system_prompt, pd.DataFrame()
-def generate_sample_dataset(system_prompt, progress=gr.Progress()):
-    df = generate_dataset(
         system_prompt=system_prompt,
-        difficulty="mixed",
-        clarity="mixed",
-        labels=[],
-        num_labels=1,
         num_rows=10,
         progress=progress,
         is_sample=True,
     )
-    if "label" in df.columns:
-        df = df[["label", "text"]]
-    elif "labels" in df.columns:
-        df = df[["labels", "text"]]
-    return df
 def generate_dataset(
@@ -86,17 +83,13 @@ def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
-    if is_sample:
-        multiplier = 1
-    else:
-        multiplier = 2
     progress(0.0, desc="(1/2) Generating text classification data")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
         difficulty=difficulty, clarity=clarity, is_sample=is_sample
     )
     labeller_generator = get_labeller_generator(
-        system_prompt=system_prompt,
         labels=labels,
         num_labels=num_labels,
     )
@@ -108,13 +101,15 @@ def generate_dataset(
     textcat_results = []
     while n_processed < num_rows:
         progress(
-            multiplier * 0.5 * n_processed / num_rows,
             total=total_steps,
             desc="(1/2) Generating text classification data",
         )
         remaining_rows = num_rows - n_processed
         batch_size = min(batch_size, remaining_rows)
-        inputs = [{"task": system_prompt} for _ in range(batch_size)]
         batch = list(textcat_generator.process(inputs=inputs))
         textcat_results.extend(batch[0])
         n_processed += batch_size
@@ -122,58 +117,41 @@ def generate_dataset(
         result["text"] = result["input_text"]
     # label text classification data
-    progress(multiplier * 0.5, desc="(1/2) Generating text classification data")
-    if not is_sample:
-        n_processed = 0
-        labeller_results = []
-        while n_processed < num_rows:
-            progress(
-                0.5 + 0.5 * n_processed / num_rows,
-                total=total_steps,
-                desc="(1/2) Labeling text classification data",
-            )
-            batch = textcat_results[n_processed : n_processed + batch_size]
-            labels_batch = list(labeller_generator.process(inputs=batch))
-            labeller_results.extend(labels_batch[0])
-            n_processed += batch_size
         progress(
-            1,
             total=total_steps,
-            desc="(2/2) Creating dataset",
         )
     # create final dataset
     distiset_results = []
-    source_results = textcat_results if is_sample else labeller_results
-    for result in source_results:
         record = {
             key: result[key]
-            for key in ["text", "label" if is_sample else "labels"]
             if key in result
         }
         distiset_results.append(record)
     dataframe = pd.DataFrame(distiset_results)
-    if not is_sample:
-        if num_labels == 1:
-            dataframe = dataframe.rename(columns={"labels": "label"})
-            dataframe["label"] = dataframe["label"].apply(
-                lambda x: x.lower().strip() if x.lower().strip() in labels else None
-            )
-        else:
-            dataframe["labels"] = dataframe["labels"].apply(
-                lambda x: (
-                    list(
-                        set(
-                            label.lower().strip()
-                            for label in x
-                            if label.lower().strip() in labels
-                        )
-                    )
-                    if isinstance(x, list)
-                    else None
-                )
-            )
     progress(1.0, desc="Dataset generation completed")
     return dataframe
@@ -281,7 +259,7 @@ def push_dataset_to_argilla(
         )
         dataframe["text_length"] = dataframe["text"].apply(len)
-        dataframe["text_embeddings"] = get_embeddings(dataframe["text"])
         progress(0.5, desc="Creating dataset")
         rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
@@ -330,15 +308,6 @@ def push_dataset_to_argilla(
     return ""
-def update_suggested_labels(system_prompt):
-    new_labels = re.findall(r"'(\b[\w-]+\b)'", system_prompt)
-    if not new_labels:
-        return gr.Warning(
-            "No labels found in the system prompt. Please add labels manually."
-        )
-    return gr.update(choices=new_labels, value=new_labels)
 def validate_input_labels(labels):
     if not labels or len(labels) < 2:
         raise gr.Error(
@@ -448,7 +417,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                 )
             with gr.Column(scale=3):
                 dataframe = gr.Dataframe(
-                    headers=["labels", "text"], wrap=True, height=300, column_widths=[1, 3]
                 )
         gr.HTML("<hr>")
@@ -496,27 +465,35 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                         label="Distilabel Pipeline Code",
                     )
-    gr.on(
-        triggers=[load_btn.click, btn_apply_to_sample_dataset.click],
         fn=generate_system_prompt,
         inputs=[dataset_description, temperature],
-        outputs=[system_prompt, dataframe],
         show_progress=True,
     ).then(
         fn=generate_sample_dataset,
-        inputs=[system_prompt],
         outputs=[dataframe],
         show_progress=True,
     ).then(
-        fn=update_suggested_labels,
-        inputs=[system_prompt],
-        outputs=labels,
-    ).then(
         fn=update_max_num_labels,
         inputs=[labels],
         outputs=[num_labels],
     )
     btn_push_to_hub.click(
         fn=validate_argilla_user_workspace_dataset,
         inputs=[repo_name],

+import json
 import re
 import uuid
 from typing import List, Union
 )
 from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_DATASET_DESCRIPTIONS,
     generate_pipeline_code,
     get_labeller_generator,
     get_prompt_generator,
     progress(0.3, desc="Initializing text generation")
     generate_description = get_prompt_generator(temperature)
     progress(0.7, desc="Generating text classification task")
+    result = next(
         generate_description.process(
             [
                 {
                     "instruction": dataset_description,
                 }
             ]
         )
     )[0]["generation"]
     progress(1.0, desc="Text classification task generated")
+    data = json.loads(result)
+    system_prompt = data["classification_task"]
+    labels = data["labels"]
+    return system_prompt, labels
+def generate_sample_dataset(system_prompt, difficulty, clarity, labels, num_labels, progress=gr.Progress()):
+    dataframe = generate_dataset(
         system_prompt=system_prompt,
+        difficulty=difficulty,
+        clarity=clarity,
+        labels=labels,
+        num_labels=num_labels,
         num_rows=10,
         progress=progress,
         is_sample=True,
     )
+    return dataframe
 def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating text classification data")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
         difficulty=difficulty, clarity=clarity, is_sample=is_sample
     )
     labeller_generator = get_labeller_generator(
+        system_prompt=f"{system_prompt} {', '.join(labels)}",
         labels=labels,
         num_labels=num_labels,
     )
     textcat_results = []
     while n_processed < num_rows:
         progress(
+            2 * 0.5 * n_processed / num_rows,
             total=total_steps,
             desc="(1/2) Generating text classification data",
         )
         remaining_rows = num_rows - n_processed
         batch_size = min(batch_size, remaining_rows)
+        inputs = [
+            {"task": f"{system_prompt} {', '.join(labels)}"} for _ in range(batch_size)
+        ]
         batch = list(textcat_generator.process(inputs=inputs))
         textcat_results.extend(batch[0])
         n_processed += batch_size
         result["text"] = result["input_text"]
     # label text classification data
+    progress(2 * 0.5, desc="(1/2) Generating text classification data")
+    n_processed = 0
+    labeller_results = []
+    while n_processed < num_rows:
         progress(
+            0.5 + 0.5 * n_processed / num_rows,
             total=total_steps,
+            desc="(1/2) Labeling text classification data",
         )
+        batch = textcat_results[n_processed : n_processed + batch_size]
+        labels_batch = list(labeller_generator.process(inputs=batch))
+        labeller_results.extend(labels_batch[0])
+        n_processed += batch_size
+    progress(
+        1,
+        total=total_steps,
+        desc="(2/2) Creating dataset",
+    )
     # create final dataset
     distiset_results = []
+    for result in labeller_results:
         record = {
             key: result[key]
+            for key in ["labels", "text"]
             if key in result
         }
         distiset_results.append(record)
     dataframe = pd.DataFrame(distiset_results)
+    if num_labels == 1:
+        dataframe = dataframe.rename(columns={"labels": "label"})
+        dataframe["label"] = dataframe["label"].apply(
+            lambda x: x.lower().strip() if x.lower().strip() in labels else None
+        )
     progress(1.0, desc="Dataset generation completed")
     return dataframe
         )
         dataframe["text_length"] = dataframe["text"].apply(len)
+        dataframe["text_embeddings"] = get_embeddings(dataframe["text"].to_list())
         progress(0.5, desc="Creating dataset")
         rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
     return ""
 def validate_input_labels(labels):
     if not labels or len(labels) < 2:
         raise gr.Error(
                 )
             with gr.Column(scale=3):
                 dataframe = gr.Dataframe(
+                    headers=["labels", "text"], wrap=True, height=500, interactive=False
                 )
         gr.HTML("<hr>")
                         label="Distilabel Pipeline Code",
                     )
+    load_btn.click(
         fn=generate_system_prompt,
         inputs=[dataset_description, temperature],
+        outputs=[system_prompt, labels],
         show_progress=True,
     ).then(
         fn=generate_sample_dataset,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels],
         outputs=[dataframe],
         show_progress=True,
     ).then(
         fn=update_max_num_labels,
         inputs=[labels],
         outputs=[num_labels],
     )
+    labels.input(
+        fn=update_max_num_labels,
+        inputs=[labels],
+        outputs=[num_labels],
+    )
+    btn_apply_to_sample_dataset.click(
+        fn=generate_sample_dataset,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels],
+        outputs=[dataframe],
+        show_progress=True,
+    )
     btn_push_to_hub.click(
         fn=validate_argilla_user_workspace_dataset,
         inputs=[repo_name],

src/distilabel_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import random
 from typing import List
 from distilabel.llms import InferenceEndpointsLLM
@@ -22,25 +23,27 @@ The prompt you write should follow the same style and structure as the following
 If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
-Classify the following customer review of a cinema as either 'positive' or 'negative'.
-Classify the following news article into one or more of the following categories: 'politics', 'sports', 'technology', 'entertainment', 'health', 'business', 'environment', 'education', 'science', 'international'.
-Determine the sentiment of the following social media post: 'ambiguous', 'sarcastic', 'informative', 'emotional'.
-Identify the issue category for the following technical support ticket: 'billing', 'technical', 'account', 'shipping', 'returns', 'installation', 'subscription'.
-Classify the following movie review into one of the following categories: 'critical', 'praise', 'disappointed', 'enthusiastic'.
-Determine the level of customer satisfaction from the following customer service transcript: 'satisfied', 'dissatisfied', 'highly-satisfied', 'somewhat-dissatisfied', 'indifferent'.
-Categorize the following product description into one of the following product types: 'smartphone', 'laptop', 'tablet', 'smartwatch', 'e-reader', 'headphones'.
-Classify the following tweet as expressing either 'support' or 'opposition' to the political event discussed.
-Classify the following restaurant review into one of the following categories: 'food-quality', 'service', 'ambiance', or 'price'.
-Classify the following blog post based on its primary fashion trend or style: 'casual', 'formal', 'streetwear', 'vintage' or 'sustainable-fashion'.
 User dataset description:
 """
@@ -51,6 +54,82 @@ DEFAULT_DATASET_DESCRIPTIONS = [
 ]
 def generate_pipeline_code(
     system_prompt: str,
     difficulty: str = None,
@@ -146,63 +225,3 @@ with Pipeline(name="textcat") as pipeline:
         distiset = pipeline.run()
     """
     )
-def get_prompt_generator(temperature):
-    prompt_generator = TextGeneration(
-        llm=InferenceEndpointsLLM(
-            api_key=_get_next_api_key(),
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            generation_kwargs={
-                "temperature": temperature,
-                "max_new_tokens": 2048,
-                "do_sample": True,
-            },
-        ),
-        use_system_prompt=True,
-    )
-    prompt_generator.load()
-    return prompt_generator
-def get_textcat_generator(difficulty, clarity, is_sample):
-    textcat_generator = GenerateTextClassificationData(
-        llm=InferenceEndpointsLLM(
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            api_key=_get_next_api_key(),
-            generation_kwargs={
-                "temperature": 0.9,
-                "max_new_tokens": 256 if is_sample else 2048,
-                "do_sample": True,
-                "top_k": 50,
-                "top_p": 0.95,
-            },
-        ),
-        difficulty=None if difficulty == "mixed" else difficulty,
-        clarity=None if clarity == "mixed" else clarity,
-        seed=random.randint(0, 2**32 - 1),
-    )
-    textcat_generator.load()
-    return textcat_generator
-def get_labeller_generator(system_prompt, labels, num_labels):
-    labeller_generator = TextClassification(
-        llm=InferenceEndpointsLLM(
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            api_key=_get_next_api_key(),
-            generation_kwargs={
-                "temperature": 0.7,
-                "max_new_tokens": 2048,
-            },
-        ),
-        context=system_prompt,
-        available_labels=labels,
-        n=num_labels,
-        default_label="unknown",
-    )
-    labeller_generator.load()
-    return labeller_generator

 import random
+from pydantic import BaseModel, Field
 from typing import List
 from distilabel.llms import InferenceEndpointsLLM
 If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
+{"classification_task": "Classify the following customer review of a cinema as", "labels": ["positive", "negative"]}
+{"classification_task": "Categorize the following news article into one or more of the following categories:", "labels": ["politics", "sports", "technology", "entertainment", "health", "business", "environment", "education", "science", "international"]}
+{"classification_task": "Classify the following news article into one or more of the following categories:", "labels": ['politics', 'sports', 'technology', 'entertainment', 'health', 'business', 'environment', 'education', 'science', 'international']}
+{"classification_task": "Determine the sentiment of the following social media post:", "labels": ['ambiguous', 'sarcastic', 'informative', 'emotional']}
+{"classification_task": "Identify the issue category for the following technical support ticket:", "labels": ['billing', 'technical', 'account', 'shipping', 'returns', 'installation', 'subscription']}
+{"classification_task": "Classify the following movie review into one of the following categories:", "labels": ['critical', 'praise', 'disappointed', 'enthusiastic']}
+{"classification_task": "Categorize the following customer service transcript into one of the following categories:", "labels": ['satisfied', 'dissatisfied', 'highly-satisfied', 'somewhat-dissatisfied', 'indifferent']}
+{"classification_task": "Classify the following product description into one of the following product types:", "labels": ['smartphone', 'laptop', 'tablet', 'smartwatch', 'e-reader', 'headphones']}
+{"classification_task": "Categorize the following tweet expressing the political event discussed as", "labels": ['support', 'opposition']}
+{"classification_task": "Classify the following restaurant review into one of the following categories:", "labels": ['food-quality', 'service', 'ambiance', 'price']}
+{"classification_task": "Categorize the following blog post based on its primary fashion trend or style:", "labels": ['casual', 'formal', 'streetwear', 'vintage', 'sustainable-fashion']}
 User dataset description:
 """
 ]
+class TextClassificationTask(BaseModel):
+    classification_task: str = Field(
+        ...,
+        title="classification_task",
+        description="The classification task to be performed.",
+    )
+    labels: list[str] = Field(
+        ...,
+        title="Labels",
+        description="The possible labels for the classification task.",
+    )
+def get_prompt_generator(temperature):
+    prompt_generator = TextGeneration(
+        llm=InferenceEndpointsLLM(
+            api_key=_get_next_api_key(),
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            structured_output={"format": "json", "schema": TextClassificationTask},
+            generation_kwargs={
+                "temperature": temperature,
+                "max_new_tokens": 2048,
+                "do_sample": True,
+            },
+        ),
+        system_prompt=PROMPT_CREATION_PROMPT,
+        use_system_prompt=True,
+    )
+    prompt_generator.load()
+    return prompt_generator
+def get_textcat_generator(difficulty, clarity, is_sample):
+    textcat_generator = GenerateTextClassificationData(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            generation_kwargs={
+                "temperature": 0.9,
+                "max_new_tokens": 256 if is_sample else 2048,
+                "do_sample": True,
+                "top_k": 50,
+                "top_p": 0.95,
+            },
+        ),
+        difficulty=None if difficulty == "mixed" else difficulty,
+        clarity=None if clarity == "mixed" else clarity,
+        seed=random.randint(0, 2**32 - 1),
+    )
+    textcat_generator.load()
+    return textcat_generator
+def get_labeller_generator(system_prompt, labels, num_labels):
+    labeller_generator = TextClassification(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            generation_kwargs={
+                "temperature": 0.7,
+                "max_new_tokens": 2048,
+            },
+        ),
+        context=system_prompt,
+        available_labels=labels,
+        n=num_labels,
+        default_label="unknown",
+    )
+    labeller_generator.load()
+    return labeller_generator
 def generate_pipeline_code(
     system_prompt: str,
     difficulty: str = None,
         distiset = pipeline.run()
     """
     )