davidberenstein1957 (HF staff) committed
Commit 5532825
1 Parent(s): 3b90025

fix setting constants upon launch

README.md CHANGED
@@ -67,9 +67,9 @@ pip install synthetic-dataset-generator
 ### Quickstart
 
 ```python
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
-demo.launch()
+launch()
 ```
 
 ### Environment Variables
@@ -87,7 +87,8 @@ Optionally, you can use different models and APIs.
 - `BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api-inference.huggingface.co/v1/`, `https://api.openai.com/v1/`.
 - `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`.
 - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the provided `HF_TOKEN` environment variable.
-- `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie generation to either `llama3`, `qwen2`. Note that this is only used if the model is a Qwen or Llama model. If you want to use other model families for chat data generation, feel free to [implement your own pre-query template](https://github.com/argilla-io/distilabel/pull/778/files).
+- `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie. Llama3 and Qwen2 are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"` respectively. For other models, you can pass a custom pre-query template string.
+
 
 Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:
 
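For model families other than Llama3 and Qwen2, the updated `MAGPIE_PRE_QUERY_TEMPLATE` description implies a raw template string can be supplied instead of a keyword. A minimal sketch of that configuration; the model id and template string below are illustrative placeholders, not values shipped with the package:

```python
# Hypothetical configuration for a non-Llama3/Qwen2 model: the pre-query
# template string must match the chat template of the chosen model.
import os

from synthetic_dataset_generator import launch

os.environ["MODEL"] = "my-org/my-chat-model"  # illustrative model id
os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "<|user|>\n"  # illustrative template string

launch()
```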
app.py CHANGED
@@ -1,4 +1,8 @@
-from synthetic_dataset_generator.app import demo
+import os
 
-if __name__ == "__main__":
-    demo.launch()
+from synthetic_dataset_generator import launch
+
+os.environ["BASE_URL"] = "http://localhost:11434"
+os.environ["MODEL"] = "llama3.1"
+
+launch()
examples/argilla_deployment.py CHANGED
@@ -1,10 +1,10 @@
 # pip install synthetic-dataset-generator
 import os
 
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
 # Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL
 os.environ["ARGILLA_API_URL"] = "https://[your-owner-name]-[your_space_name].hf.space"
 os.environ["ARGILLA_API_KEY"] = "my_api_key"
 
-demo.launch()
+launch()
examples/enforce_mapgie_template copy.py CHANGED
@@ -1,9 +1,9 @@
 # pip install synthetic-dataset-generator
 import os
 
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
 os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3"
 os.environ["MODEL"] = "my_custom_model_trained_on_llama3"
 
-demo.launch()
+launch()
examples/ollama_local.py CHANGED
@@ -1,10 +1,10 @@
 # pip install synthetic-dataset-generator
 import os
 
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
 assert os.getenv("HF_TOKEN")
 os.environ["BASE_URL"] = "http://127.0.0.1:11434/v1/"
 os.environ["MODEL"] = "llama3.1"
 
-demo.launch()
+launch()
examples/openai_local.py CHANGED
@@ -1,11 +1,11 @@
 # pip install synthetic-dataset-generator
 import os
 
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
 assert os.getenv("HF_TOKEN")
 os.environ["BASE_URL"] = "https://api.openai.com/v1/"
 os.environ["API_KEY"] = os.getenv("OPENAI_API_KEY")
 os.environ["MODEL"] = "gpt-4o"
 
-demo.launch()
+launch()
src/synthetic_dataset_generator/__init__.py CHANGED
@@ -1,166 +1,24 @@
-import warnings
-from typing import Optional
-
-import distilabel
-import distilabel.distiset
-from distilabel.llms import InferenceEndpointsLLM
-from distilabel.utils.card.dataset_card import (
-    DistilabelDatasetCard,
-    size_categories_parser,
-)
-from huggingface_hub import DatasetCardData, HfApi
-from pydantic import (
-    ValidationError,
-    model_validator,
-)
-
-
-class CustomInferenceEndpointsLLM(InferenceEndpointsLLM):
-    @model_validator(mode="after")  # type: ignore
-    def only_one_of_model_id_endpoint_name_or_base_url_provided(
-        self,
-    ) -> "InferenceEndpointsLLM":
-        """Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also
-        provided, a warning will be shown informing the user that the provided `base_url` will be ignored in
-        favour of the dynamically calculated one.."""
-
-        if self.base_url and (self.model_id or self.endpoint_name):
-            warnings.warn(  # type: ignore
-                f"Since the `base_url={self.base_url}` is available and either one of `model_id`"
-                " or `endpoint_name` is also provided, the `base_url` will either be ignored"
-                " or overwritten with the one generated from either of those args, for serverless"
-                " or dedicated inference endpoints, respectively."
-            )
-
-        if self.use_magpie_template and self.tokenizer_id is None:
-            raise ValueError(
-                "`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,"
-                " set a `tokenizer_id` and try again."
-            )
-
-        if (
-            self.model_id
-            and self.tokenizer_id is None
-            and self.structured_output is not None
-        ):
-            self.tokenizer_id = self.model_id
-
-        if self.base_url and not (self.model_id or self.endpoint_name):
-            return self
-
-        if self.model_id and not self.endpoint_name:
-            return self
-
-        if self.endpoint_name and not self.model_id:
-            return self
-
-        raise ValidationError(
-            f"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is"
-            f" provided too, it will be overwritten instead. Found `model_id`={self.model_id},"
-            f" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}."
-        )
-
-
-class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
-    def _generate_card(
-        self,
-        repo_id: str,
-        token: str,
-        include_script: bool = False,
-        filename_py: Optional[str] = None,
-    ) -> None:
-        """Generates a dataset card and pushes it to the Hugging Face Hub, and
-        if the `pipeline.yaml` path is available in the `Distiset`, uploads that
-        to the same repository.
-
-        Args:
-            repo_id: The ID of the repository to push to, from the `push_to_hub` method.
-            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
-            include_script: Whether to upload the script to the hugging face repository.
-            filename_py: The name of the script. If `include_script` is True, the script will
-                be uploaded to the repository using this name, otherwise it won't be used.
-        """
-        card = self._get_card(
-            repo_id=repo_id,
-            token=token,
-            include_script=include_script,
-            filename_py=filename_py,
-        )
-
-        card.push_to_hub(
-            repo_id,
-            repo_type="dataset",
-            token=token,
-        )
-        if self.pipeline_path:
-            # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.
-            HfApi().upload_file(
-                path_or_fileobj=self.pipeline_path,
-                path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME,
-                repo_id=repo_id,
-                repo_type="dataset",
-                token=token,
-            )
-
-    def _get_card(
-        self,
-        repo_id: str,
-        token: Optional[str] = None,
-        include_script: bool = False,
-        filename_py: Optional[str] = None,
-    ) -> DistilabelDatasetCard:
-        """Generates the dataset card for the `Distiset`.
-
-        Note:
-            If `repo_id` and `token` are provided, it will extract the metadata from the README.md file
-            on the hub.
-
-        Args:
-            repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.
-            token: The token to authenticate with the Hugging Face Hub.
-                We assume that if it's provided, the dataset will be in the Hugging Face Hub,
-                so the README metadata will be extracted from there.
-            include_script: Whether to upload the script to the hugging face repository.
-            filename_py: The name of the script. If `include_script` is True, the script will
-                be uploaded to the repository using this name, otherwise it won't be used.
-
-        Returns:
-            The dataset card for the `Distiset`.
-        """
-        sample_records = {}
-        for name, dataset in self.items():
-            sample_records[name] = (
-                dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
-            )
-
-        readme_metadata = {}
-        if repo_id and token:
-            readme_metadata = self._extract_readme_metadata(repo_id, token)
-
-        metadata = {
-            **readme_metadata,
-            "size_categories": size_categories_parser(
-                max(len(dataset) for dataset in self.values())
-            ),
-            "tags": [
-                "synthetic",
-                "distilabel",
-                "rlaif",
-                "datacraft",
-            ],
-        }
-
-        card = DistilabelDatasetCard.from_template(
-            card_data=DatasetCardData(**metadata),
-            repo_id=repo_id,
-            sample_records=sample_records,
-            include_script=include_script,
-            filename_py=filename_py,
-        )
-
-        return card
-
-
-distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag
-distilabel.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM
+import inspect
+
+from gradio import TabbedInterface
+
+from synthetic_dataset_generator import (  # noqa
+    _distiset,
+    _inference_client,
+    _inference_endpoints,
+)
+
+
+def launch(*args, **kwargs):
+    """Launch the synthetic dataset generator.
+    Based on the `TabbedInterface` from Gradio.
+    Parameters: https://www.gradio.app/docs/gradio/tabbedinterface
+    """
+    from synthetic_dataset_generator.app import demo
+
+    return demo.launch(*args, **kwargs)
+
+
+launch.__doc__ = TabbedInterface.launch.__doc__
+launch.__signature__ = inspect.signature(TabbedInterface.launch)
+launch.__annotations__ = TabbedInterface.launch.__annotations__
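Because the new `launch` wrapper forwards `*args`/`**kwargs` to `demo.launch` and copies the signature, docstring, and annotations from `TabbedInterface.launch`, standard Gradio launch parameters should pass straight through. A minimal usage sketch, assuming default Gradio behaviour:

```python
from synthetic_dataset_generator import launch

# Any keyword accepted by Gradio's `launch` is forwarded unchanged, e.g.:
launch(server_name="0.0.0.0", server_port=7860, share=False)
```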
src/synthetic_dataset_generator/_distiset.py ADDED
@@ -0,0 +1,113 @@
+from typing import Optional
+
+import distilabel
+import distilabel.distiset
+from distilabel.utils.card.dataset_card import (
+    DistilabelDatasetCard,
+    size_categories_parser,
+)
+from huggingface_hub import DatasetCardData, HfApi
+
+
+class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
+    def _generate_card(
+        self,
+        repo_id: str,
+        token: str,
+        include_script: bool = False,
+        filename_py: Optional[str] = None,
+    ) -> None:
+        """Generates a dataset card and pushes it to the Hugging Face Hub, and
+        if the `pipeline.yaml` path is available in the `Distiset`, uploads that
+        to the same repository.
+
+        Args:
+            repo_id: The ID of the repository to push to, from the `push_to_hub` method.
+            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
+            include_script: Whether to upload the script to the hugging face repository.
+            filename_py: The name of the script. If `include_script` is True, the script will
+                be uploaded to the repository using this name, otherwise it won't be used.
+        """
+        card = self._get_card(
+            repo_id=repo_id,
+            token=token,
+            include_script=include_script,
+            filename_py=filename_py,
+        )
+
+        card.push_to_hub(
+            repo_id,
+            repo_type="dataset",
+            token=token,
+        )
+        if self.pipeline_path:
+            # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.
+            HfApi().upload_file(
+                path_or_fileobj=self.pipeline_path,
+                path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME,
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=token,
+            )
+
+    def _get_card(
+        self,
+        repo_id: str,
+        token: Optional[str] = None,
+        include_script: bool = False,
+        filename_py: Optional[str] = None,
+    ) -> DistilabelDatasetCard:
+        """Generates the dataset card for the `Distiset`.
+
+        Note:
+            If `repo_id` and `token` are provided, it will extract the metadata from the README.md file
+            on the hub.
+
+        Args:
+            repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.
+            token: The token to authenticate with the Hugging Face Hub.
+                We assume that if it's provided, the dataset will be in the Hugging Face Hub,
+                so the README metadata will be extracted from there.
+            include_script: Whether to upload the script to the hugging face repository.
+            filename_py: The name of the script. If `include_script` is True, the script will
+                be uploaded to the repository using this name, otherwise it won't be used.
+
+        Returns:
+            The dataset card for the `Distiset`.
+        """
+        sample_records = {}
+        for name, dataset in self.items():
+            sample_records[name] = (
+                dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
+            )
+
+        readme_metadata = {}
+        if repo_id and token:
+            readme_metadata = self._extract_readme_metadata(repo_id, token)
+
+        metadata = {
+            **readme_metadata,
+            "size_categories": size_categories_parser(
+                max(len(dataset) for dataset in self.values())
+            ),
+            "tags": [
+                "synthetic",
+                "distilabel",
+                "rlaif",
+                "datacraft",
+            ],
+        }
+
+        card = DistilabelDatasetCard.from_template(
+            card_data=DatasetCardData(**metadata),
+            repo_id=repo_id,
+            sample_records=sample_records,
+            include_script=include_script,
+            filename_py=filename_py,
+        )
+
+        return card
+
+
+distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag
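Importing this module replaces `distilabel.distiset.Distiset` with the tagged subclass, so dataset cards generated on `push_to_hub` also carry the `datacraft` tag. A minimal sketch of the intended effect; the pipeline, repo id, and token are placeholders, and the package's import-time configuration (e.g. `HF_TOKEN`) is assumed to be in place:

```python
# Import for its side effect: distilabel.distiset.Distiset is monkey-patched.
import synthetic_dataset_generator._distiset  # noqa: F401

# Any Distiset returned by a distilabel pipeline after this point will build
# its card with the "synthetic", "distilabel", "rlaif" and "datacraft" tags:
# distiset = pipeline.run(...)                               # hypothetical pipeline
# distiset.push_to_hub("my-org/my-dataset", token="hf_...")  # placeholder repo/token
```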
src/synthetic_dataset_generator/_inference_endpoints.py ADDED
@@ -0,0 +1,58 @@
+import warnings
+
+import distilabel
+import distilabel.distiset
+from distilabel.llms import InferenceEndpointsLLM
+from pydantic import (
+    ValidationError,
+    model_validator,
+)
+
+
+class CustomInferenceEndpointsLLM(InferenceEndpointsLLM):
+    @model_validator(mode="after")  # type: ignore
+    def only_one_of_model_id_endpoint_name_or_base_url_provided(
+        self,
+    ) -> "InferenceEndpointsLLM":
+        """Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also
+        provided, a warning will be shown informing the user that the provided `base_url` will be ignored in
+        favour of the dynamically calculated one.."""
+
+        if self.base_url and (self.model_id or self.endpoint_name):
+            warnings.warn(  # type: ignore
+                f"Since the `base_url={self.base_url}` is available and either one of `model_id`"
+                " or `endpoint_name` is also provided, the `base_url` will either be ignored"
+                " or overwritten with the one generated from either of those args, for serverless"
+                " or dedicated inference endpoints, respectively."
+            )
+
+        if self.use_magpie_template and self.tokenizer_id is None:
+            raise ValueError(
+                "`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,"
+                " set a `tokenizer_id` and try again."
+            )
+
+        if (
+            self.model_id
+            and self.tokenizer_id is None
+            and self.structured_output is not None
+        ):
+            self.tokenizer_id = self.model_id
+
+        if self.base_url and not (self.model_id or self.endpoint_name):
+            return self
+
+        if self.model_id and not self.endpoint_name:
+            return self
+
+        if self.endpoint_name and not self.model_id:
+            return self
+
+        raise ValidationError(
+            f"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is"
+            f" provided too, it will be overwritten instead. Found `model_id`={self.model_id},"
+            f" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}."
+        )
+
+
+distilabel.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM
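With the patched class, `InferenceEndpointsLLM` accepts exactly one of `model_id`, `endpoint_name`, or `base_url`, and requires a `tokenizer_id` when `use_magpie_template` is enabled. A minimal sketch of a configuration that passes the validator; the model id is illustrative and the package's import-time requirements (e.g. `HF_TOKEN`) are assumed to be satisfied:

```python
# Import for its side effect: distilabel.llms.InferenceEndpointsLLM is monkey-patched.
import synthetic_dataset_generator._inference_endpoints  # noqa: F401

from distilabel.llms import InferenceEndpointsLLM

# Only `model_id` is set; `tokenizer_id` is required because the Magpie
# template is enabled.
llm = InferenceEndpointsLLM(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tokenizer_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    use_magpie_template=True,
    magpie_pre_query_template="llama3",
)
```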
src/synthetic_dataset_generator/app.py CHANGED
@@ -28,7 +28,3 @@ demo = TabbedInterface(
     title=image,
     theme=theme,
 )
-
-
-if __name__ == "__main__":
-    demo.launch()