davidberenstein1957 (HF staff) committed
Commit 5532825
1 Parent(s): 3b90025

fix setting constants upon launch

README.md CHANGED
@@ -67,9 +67,9 @@ pip install synthetic-dataset-generator
 ### Quickstart
 
 ```python
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
-demo.launch()
+launch()
 ```
 
 ### Environment Variables
@@ -87,7 +87,8 @@ Optionally, you can use different models and APIs.
 - `BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api-inference.huggingface.co/v1/`, `https://api.openai.com/v1/`.
 - `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`.
 - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the provided `HF_TOKEN` environment variable.
-- `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie generation to either `llama3`, `qwen2`. Note that this is only used if the model is a Qwen or Llama model. If you want to use other model families for chat data generation, feel free to [implement your own pre-query template](https://github.com/argilla-io/distilabel/pull/778/files).
+- `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie. Llama3 and Qwen2 are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"` respectively. For other models, you can pass a custom pre-query template string.
+
 
 Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:
 
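For model families other than Llama3 and Qwen2, the updated `MAGPIE_PRE_QUERY_TEMPLATE` description implies a raw template string can be supplied instead of a keyword. A minimal sketch of that configuration; the model id and template string below are illustrative placeholders, not values shipped with the package:

```python
# Hypothetical configuration for a non-Llama3/Qwen2 model: the pre-query
# template string must match the chat template of the chosen model.
import os

from synthetic_dataset_generator import launch

os.environ["MODEL"] = "my-org/my-chat-model"  # illustrative model id
os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "<|user|>\n"  # illustrative template string

launch()
```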
app.py CHANGED
@@ -1,4 +1,8 @@
-from synthetic_dataset_generator.app import demo
+import os
 
-if __name__ == "__main__":
-    demo.launch()
+from synthetic_dataset_generator import launch
+
+os.environ["BASE_URL"] = "http://localhost:11434"
+os.environ["MODEL"] = "llama3.1"
+
+launch()
examples/argilla_deployment.py CHANGED
@@ -1,10 +1,10 @@
 # pip install synthetic-dataset-generator
 import os
 
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
 # Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL
 os.environ["ARGILLA_API_URL"] = "https://[your-owner-name]-[your_space_name].hf.space"
 os.environ["ARGILLA_API_KEY"] = "my_api_key"
 
-demo.launch()
+launch()
examples/enforce_mapgie_template copy.py CHANGED
@@ -1,9 +1,9 @@
 # pip install synthetic-dataset-generator
 import os
 
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
 os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3"
 os.environ["MODEL"] = "my_custom_model_trained_on_llama3"
 
-demo.launch()
+launch()
examples/ollama_local.py CHANGED
@@ -1,10 +1,10 @@
 # pip install synthetic-dataset-generator
 import os
 
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
 assert os.getenv("HF_TOKEN")
 os.environ["BASE_URL"] = "http://127.0.0.1:11434/v1/"
 os.environ["MODEL"] = "llama3.1"
 
-demo.launch()
+launch()
examples/openai_local.py CHANGED
@@ -1,11 +1,11 @@
 # pip install synthetic-dataset-generator
 import os
 
-from synthetic_dataset_generator.app import demo
+from synthetic_dataset_generator import launch
 
 assert os.getenv("HF_TOKEN")
 os.environ["BASE_URL"] = "https://api.openai.com/v1/"
 os.environ["API_KEY"] = os.getenv("OPENAI_API_KEY")
 os.environ["MODEL"] = "gpt-4o"
 
-demo.launch()
+launch()
src/synthetic_dataset_generator/__init__.py CHANGED
@@ -1,166 +1,24 @@
-import warnings
-from typing import Optional
-
-import distilabel
-import distilabel.distiset
-from distilabel.llms import InferenceEndpointsLLM
-from distilabel.utils.card.dataset_card import (
-    DistilabelDatasetCard,
-    size_categories_parser,
-)
-from huggingface_hub import DatasetCardData, HfApi
-from pydantic import (
-    ValidationError,
-    model_validator,
-)
-
-
-class CustomInferenceEndpointsLLM(InferenceEndpointsLLM):
-    @model_validator(mode="after")  # type: ignore
-    def only_one_of_model_id_endpoint_name_or_base_url_provided(
-        self,
-    ) -> "InferenceEndpointsLLM":
-        """Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also
-        provided, a warning will be shown informing the user that the provided `base_url` will be ignored in
-        favour of the dynamically calculated one.."""
-
-        if self.base_url and (self.model_id or self.endpoint_name):
-            warnings.warn(  # type: ignore
-                f"Since the `base_url={self.base_url}` is available and either one of `model_id`"
-                " or `endpoint_name` is also provided, the `base_url` will either be ignored"
-                " or overwritten with the one generated from either of those args, for serverless"
-                " or dedicated inference endpoints, respectively."
-            )
-
-        if self.use_magpie_template and self.tokenizer_id is None:
-            raise ValueError(
-                "`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,"
-                " set a `tokenizer_id` and try again."
-            )
-
-        if (
-            self.model_id
-            and self.tokenizer_id is None
-            and self.structured_output is not None
-        ):
-            self.tokenizer_id = self.model_id
-
-        if self.base_url and not (self.model_id or self.endpoint_name):
-            return self
-
-        if self.model_id and not self.endpoint_name:
-            return self
-
-        if self.endpoint_name and not self.model_id:
-            return self
-
-        raise ValidationError(
-            f"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is"
-            f" provided too, it will be overwritten instead. Found `model_id`={self.model_id},"
-            f" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}."
-        )
-
-
-class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
-    def _generate_card(
-        self,
-        repo_id: str,
-        token: str,
-        include_script: bool = False,
-        filename_py: Optional[str] = None,
-    ) -> None:
-        """Generates a dataset card and pushes it to the Hugging Face Hub, and
-        if the `pipeline.yaml` path is available in the `Distiset`, uploads that
-        to the same repository.
-
-        Args:
-            repo_id: The ID of the repository to push to, from the `push_to_hub` method.
-            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
-            include_script: Whether to upload the script to the hugging face repository.
-            filename_py: The name of the script. If `include_script` is True, the script will
-                be uploaded to the repository using this name, otherwise it won't be used.
-        """
-        card = self._get_card(
-            repo_id=repo_id,
-            token=token,
-            include_script=include_script,
-            filename_py=filename_py,
-        )
-
-        card.push_to_hub(
-            repo_id,
-            repo_type="dataset",
-            token=token,
-        )
-        if self.pipeline_path:
-            # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.
-            HfApi().upload_file(
-                path_or_fileobj=self.pipeline_path,
-                path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME,
-                repo_id=repo_id,
-                repo_type="dataset",
-                token=token,
-            )
-
-    def _get_card(
-        self,
-        repo_id: str,
-        token: Optional[str] = None,
-        include_script: bool = False,
-        filename_py: Optional[str] = None,
-    ) -> DistilabelDatasetCard:
-        """Generates the dataset card for the `Distiset`.
-
-        Note:
-            If `repo_id` and `token` are provided, it will extract the metadata from the README.md file
-            on the hub.
-
-        Args:
-            repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.
-            token: The token to authenticate with the Hugging Face Hub.
-                We assume that if it's provided, the dataset will be in the Hugging Face Hub,
-                so the README metadata will be extracted from there.
-            include_script: Whether to upload the script to the hugging face repository.
-            filename_py: The name of the script. If `include_script` is True, the script will
-                be uploaded to the repository using this name, otherwise it won't be used.
-
-        Returns:
-            The dataset card for the `Distiset`.
-        """
-        sample_records = {}
-        for name, dataset in self.items():
-            sample_records[name] = (
-                dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
-            )
-
-        readme_metadata = {}
-        if repo_id and token:
-            readme_metadata = self._extract_readme_metadata(repo_id, token)
-
-        metadata = {
-            **readme_metadata,
-            "size_categories": size_categories_parser(
-                max(len(dataset) for dataset in self.values())
-            ),
-            "tags": [
-                "synthetic",
-                "distilabel",
-                "rlaif",
-                "datacraft",
-            ],
-        }
-
-        card = DistilabelDatasetCard.from_template(
-            card_data=DatasetCardData(**metadata),
-            repo_id=repo_id,
-            sample_records=sample_records,
-            include_script=include_script,
-            filename_py=filename_py,
-        )
-
-        return card
-
-
-distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag
-distilabel.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM
+import inspect
+
+from gradio import TabbedInterface
+
+from synthetic_dataset_generator import (  # noqa
+    _distiset,
+    _inference_client,
+    _inference_endpoints,
+)
+
+
+def launch(*args, **kwargs):
+    """Launch the synthetic dataset generator.
+    Based on the `TabbedInterface` from Gradio.
+    Parameters: https://www.gradio.app/docs/gradio/tabbedinterface
+    """
+    from synthetic_dataset_generator.app import demo
+
+    return demo.launch(*args, **kwargs)
+
+
+launch.__doc__ = TabbedInterface.launch.__doc__
+launch.__signature__ = inspect.signature(TabbedInterface.launch)
+launch.__annotations__ = TabbedInterface.launch.__annotations__
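Because the new `launch` wrapper forwards `*args`/`**kwargs` to `demo.launch` and copies the signature, docstring, and annotations from `TabbedInterface.launch`, standard Gradio launch parameters should pass straight through. A minimal usage sketch, assuming default Gradio behaviour:

```python
from synthetic_dataset_generator import launch

# Any keyword accepted by Gradio's `launch` is forwarded unchanged, e.g.:
launch(server_name="0.0.0.0", server_port=7860, share=False)
```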
src/synthetic_dataset_generator/_distiset.py ADDED
@@ -0,0 +1,113 @@
+from typing import Optional
+
+import distilabel
+import distilabel.distiset
+from distilabel.utils.card.dataset_card import (
+    DistilabelDatasetCard,
+    size_categories_parser,
+)
+from huggingface_hub import DatasetCardData, HfApi
+
+
+class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
+    def _generate_card(
+        self,
+        repo_id: str,
+        token: str,
+        include_script: bool = False,
+        filename_py: Optional[str] = None,
+    ) -> None:
+        """Generates a dataset card and pushes it to the Hugging Face Hub, and
+        if the `pipeline.yaml` path is available in the `Distiset`, uploads that
+        to the same repository.
+
+        Args:
+            repo_id: The ID of the repository to push to, from the `push_to_hub` method.
+            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
+            include_script: Whether to upload the script to the hugging face repository.
+            filename_py: The name of the script. If `include_script` is True, the script will
+                be uploaded to the repository using this name, otherwise it won't be used.
+        """
+        card = self._get_card(
+            repo_id=repo_id,
+            token=token,
+            include_script=include_script,
+            filename_py=filename_py,
+        )
+
+        card.push_to_hub(
+            repo_id,
+            repo_type="dataset",
+            token=token,
+        )
+        if self.pipeline_path:
+            # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.
+            HfApi().upload_file(
+                path_or_fileobj=self.pipeline_path,
+                path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME,
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=token,
+            )
+
+    def _get_card(
+        self,
+        repo_id: str,
+        token: Optional[str] = None,
+        include_script: bool = False,
+        filename_py: Optional[str] = None,
+    ) -> DistilabelDatasetCard:
+        """Generates the dataset card for the `Distiset`.
+
+        Note:
+            If `repo_id` and `token` are provided, it will extract the metadata from the README.md file
+            on the hub.
+
+        Args:
+            repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.
+            token: The token to authenticate with the Hugging Face Hub.
+                We assume that if it's provided, the dataset will be in the Hugging Face Hub,
+                so the README metadata will be extracted from there.
+            include_script: Whether to upload the script to the hugging face repository.
+            filename_py: The name of the script. If `include_script` is True, the script will
+                be uploaded to the repository using this name, otherwise it won't be used.
+
+        Returns:
+            The dataset card for the `Distiset`.
+        """
+        sample_records = {}
+        for name, dataset in self.items():
+            sample_records[name] = (
+                dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
+            )
+
+        readme_metadata = {}
+        if repo_id and token:
+            readme_metadata = self._extract_readme_metadata(repo_id, token)
+
+        metadata = {
+            **readme_metadata,
+            "size_categories": size_categories_parser(
+                max(len(dataset) for dataset in self.values())
+            ),
+            "tags": [
+                "synthetic",
+                "distilabel",
+                "rlaif",
+                "datacraft",
+            ],
+        }
+
+        card = DistilabelDatasetCard.from_template(
+            card_data=DatasetCardData(**metadata),
+            repo_id=repo_id,
+            sample_records=sample_records,
+            include_script=include_script,
+            filename_py=filename_py,
+        )
+
+        return card
+
+
+distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag
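Importing this module replaces `distilabel.distiset.Distiset` with the tagged subclass, so dataset cards generated on `push_to_hub` also carry the `datacraft` tag. A minimal sketch of the intended effect; the pipeline, repo id, and token are placeholders, and the package's import-time configuration (e.g. `HF_TOKEN`) is assumed to be in place:

```python
# Import for its side effect: distilabel.distiset.Distiset is monkey-patched.
import synthetic_dataset_generator._distiset  # noqa: F401

# Any Distiset returned by a distilabel pipeline after this point will build
# its card with the "synthetic", "distilabel", "rlaif" and "datacraft" tags:
# distiset = pipeline.run(...)                               # hypothetical pipeline
# distiset.push_to_hub("my-org/my-dataset", token="hf_...")  # placeholder repo/token
```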
src/synthetic_dataset_generator/_inference_endpoints.py ADDED
@@ -0,0 +1,58 @@
+import warnings
+
+import distilabel
+import distilabel.distiset
+from distilabel.llms import InferenceEndpointsLLM
+from pydantic import (
+    ValidationError,
+    model_validator,
+)
+
+
+class CustomInferenceEndpointsLLM(InferenceEndpointsLLM):
+    @model_validator(mode="after")  # type: ignore
+    def only_one_of_model_id_endpoint_name_or_base_url_provided(
+        self,
+    ) -> "InferenceEndpointsLLM":
+        """Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also
+        provided, a warning will be shown informing the user that the provided `base_url` will be ignored in
+        favour of the dynamically calculated one.."""
+
+        if self.base_url and (self.model_id or self.endpoint_name):
+            warnings.warn(  # type: ignore
+                f"Since the `base_url={self.base_url}` is available and either one of `model_id`"
+                " or `endpoint_name` is also provided, the `base_url` will either be ignored"
+                " or overwritten with the one generated from either of those args, for serverless"
+                " or dedicated inference endpoints, respectively."
+            )
+
+        if self.use_magpie_template and self.tokenizer_id is None:
+            raise ValueError(
+                "`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,"
+                " set a `tokenizer_id` and try again."
+            )
+
+        if (
+            self.model_id
+            and self.tokenizer_id is None
+            and self.structured_output is not None
+        ):
+            self.tokenizer_id = self.model_id
+
+        if self.base_url and not (self.model_id or self.endpoint_name):
+            return self
+
+        if self.model_id and not self.endpoint_name:
+            return self
+
+        if self.endpoint_name and not self.model_id:
+            return self
+
+        raise ValidationError(
+            f"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is"
+            f" provided too, it will be overwritten instead. Found `model_id`={self.model_id},"
+            f" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}."
+        )
+
+
+distilabel.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM
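With the patched class, `InferenceEndpointsLLM` accepts exactly one of `model_id`, `endpoint_name`, or `base_url`, and requires a `tokenizer_id` when `use_magpie_template` is enabled. A minimal sketch of a configuration that passes the validator; the model id is illustrative and the package's import-time requirements (e.g. `HF_TOKEN`) are assumed to be satisfied:

```python
# Import for its side effect: distilabel.llms.InferenceEndpointsLLM is monkey-patched.
import synthetic_dataset_generator._inference_endpoints  # noqa: F401

from distilabel.llms import InferenceEndpointsLLM

# Only `model_id` is set; `tokenizer_id` is required because the Magpie
# template is enabled.
llm = InferenceEndpointsLLM(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tokenizer_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    use_magpie_template=True,
    magpie_pre_query_template="llama3",
)
```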
src/synthetic_dataset_generator/app.py CHANGED
@@ -28,7 +28,3 @@ demo = TabbedInterface(
     title=image,
     theme=theme,
 )
-
-
-if __name__ == "__main__":
-    demo.launch()