davidberenstein1957 (HF staff) committed
Commit 90e8636
1 Parent(s): bc3b4e5

feat: sft generation pipeline

.gitignore ADDED
@@ -0,0 +1,162 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm-project.org/#use-with-ide
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
app.py ADDED
@@ -0,0 +1,13 @@
+ import gradio as gr
+
+ from distilabel_dataset_generator.sft import demo
+
+ demo = gr.TabbedInterface(
+     [demo],
+     ["Supervised Fine-Tuning"],
+     title="⚗️ Distilabel Dataset Generator",
+     head="⚗️ Distilabel Dataset Generator",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
pdm.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,23 @@
+ [project]
+ name = "distilabel-dataset-generator"
+ version = "0.1.0"
+ description = "Gradio app for generating SFT datasets with distilabel"
+ authors = [
+     {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
+ ]
+ dependencies = [
+     "distilabel[hf-inference-endpoints]>=1.3.2",
+     "gradio",
+     "transformers>=4.44.2",
+ ]
+ requires-python = ">=3.10"
+ readme = "README.md"
+ license = {text = "Apache-2.0"}
+
+ [build-system]
+ requires = ["pdm-backend"]
+ build-backend = "pdm.backend"
+
+
+ [tool.pdm]
+ distribution = true
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ distilabel[hf-inference-endpoints]>=1.3.2
+ gradio
src/distilabel_dataset_generator/__init__.py ADDED
File without changes
src/distilabel_dataset_generator/sft.py ADDED
@@ -0,0 +1,209 @@
+ import gradio as gr
+ from distilabel.llms import InferenceEndpointsLLM
+ from distilabel.pipeline import Pipeline
+ from distilabel.steps.tasks import MagpieGenerator, TextGeneration
+
+ INFORMATION_SEEKING_PROMPT = (
+     "You are an AI assistant designed to provide accurate and concise information on a wide"
+     " range of topics. Your purpose is to assist users in finding specific facts,"
+     " explanations, or details about various subjects. Provide clear, factual responses and,"
+     " when appropriate, offer additional context or related information that might be useful"
+     " to the user."
+ )
+
+ REASONING_PROMPT = (
+     "You are an AI assistant specialized in logical thinking and problem-solving. Your"
+     " purpose is to help users work through complex ideas, analyze situations, and draw"
+     " conclusions based on given information. Approach each query with structured thinking,"
+     " break down problems into manageable parts, and guide users through the reasoning"
+     " process step-by-step."
+ )
+
+ PLANNING_PROMPT = (
+     "You are an AI assistant focused on helping users create effective plans and strategies."
+     " Your purpose is to assist in organizing thoughts, setting goals, and developing"
+     " actionable steps for various projects or activities. Offer structured approaches,"
+     " consider potential challenges, and provide tips for efficient execution of plans."
+ )
+
+ EDITING_PROMPT = (
+     "You are an AI assistant specialized in editing and improving written content. Your"
+     " purpose is to help users refine their writing by offering suggestions for grammar,"
+     " style, clarity, and overall structure. Provide constructive feedback, explain your"
+     " edits, and offer alternative phrasings when appropriate."
+ )
+
+ CODING_DEBUGGING_PROMPT = (
+     "You are an AI assistant designed to help with programming tasks. Your purpose is to"
+     " assist users in writing, reviewing, and debugging code across various programming"
+     " languages. Provide clear explanations, offer best practices, and help troubleshoot"
+     " issues. When appropriate, suggest optimizations or alternative approaches to coding"
+     " problems."
+ )
+
+ MATH_SYSTEM_PROMPT = (
+     "You are an AI assistant designed to provide helpful, step-by-step guidance on solving"
+     " math problems. The user will ask you a wide range of complex mathematical questions."
+     " Your purpose is to assist users in understanding mathematical concepts, working through"
+     " equations, and arriving at the correct solutions."
+ )
+
+ ROLE_PLAYING_PROMPT = (
+     "You are an AI assistant capable of engaging in various role-playing scenarios. Your"
+     " purpose is to adopt different personas or characters as requested by the user. Maintain"
+     " consistency with the chosen role, respond in character, and help create immersive and"
+     " interactive experiences for the user."
+ )
+
+ DATA_ANALYSIS_PROMPT = (
+     "You are an AI assistant specialized in data analysis and interpretation. Your purpose is"
+     " to help users understand and derive insights from data sets, statistics, and analytical"
+     " tasks. Offer clear explanations of data trends, assist with statistical calculations,"
+     " and provide guidance on data visualization and interpretation techniques."
+ )
+
+ CREATIVE_WRITING_PROMPT = (
+     "You are an AI assistant designed to support creative writing endeavors. Your purpose is"
+     " to help users craft engaging stories, poems, and other creative texts. Offer"
+     " suggestions for plot development, character creation, dialogue writing, and other"
+     " aspects of creative composition. Provide constructive feedback and inspire creativity."
+ )
+
+ ADVICE_SEEKING_PROMPT = (
+     "You are an AI assistant focused on providing thoughtful advice and guidance. Your"
+     " purpose is to help users navigate various personal or professional issues by offering"
+     " balanced perspectives, considering potential outcomes, and suggesting practical"
+     " solutions. Encourage users to think critically about their situations while providing"
+     " supportive and constructive advice."
+ )
+
+ BRAINSTORMING_PROMPT = (
+     "You are an AI assistant specialized in generating ideas and facilitating creative"
+     " thinking. Your purpose is to help users explore possibilities, think outside the box,"
+     " and develop innovative concepts. Encourage free-flowing thoughts, offer diverse"
+     " perspectives, and help users build upon and refine their ideas."
+ )
+
+ PROMPT_CREATION_PROMPT = f"""You are an AI assistant specialized in generating very precise prompts for dataset creation.
+ Your task is to write a prompt following the instruction of the user. Respond with the prompt and nothing else.
+ The prompt you write should follow the same style and structure as the following example prompts:
+
+ {INFORMATION_SEEKING_PROMPT}
+
+ {REASONING_PROMPT}
+
+ {PLANNING_PROMPT}
+
+ {CODING_DEBUGGING_PROMPT}
+
+ {EDITING_PROMPT}
+
+ {ROLE_PLAYING_PROMPT}
+
+ {DATA_ANALYSIS_PROMPT}
+
+ {CREATIVE_WRITING_PROMPT}
+
+ {ADVICE_SEEKING_PROMPT}
+
+ {BRAINSTORMING_PROMPT}
+
+ User dataset description:
+ """
+
+ MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
+ generate_description = TextGeneration(
+     llm=InferenceEndpointsLLM(
+         model_id=MODEL,
+         tokenizer_id=MODEL,
+         generation_kwargs={"temperature": 0.8, "max_new_tokens": 2048},
+     ),
+     use_system_prompt=True,
+ )
+ generate_description.load()
+
+
+ def _generate_system_prompt(_dataset_description):
+     return next(
+         generate_description.process(
+             [
+                 {
+                     "system_prompt": PROMPT_CREATION_PROMPT,
+                     "instruction": _dataset_description,
+                 }
+             ]
+         )
+     )[0]["generation"]
+
+
+ def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=1):
+     with Pipeline(name="sft") as pipeline:
+         magpie_step = MagpieGenerator(
+             llm=InferenceEndpointsLLM(
+                 model_id=MODEL,
+                 tokenizer_id=MODEL,
+                 magpie_pre_query_template="llama3",
+                 generation_kwargs={
+                     "temperature": 0.8,  # works well for Llama 3.1 Instruct models
+                 },
+             ),
+             n_turns=_num_turns,
+             num_rows=_num_rows,
+             system_prompt=_system_prompt,
+         )
+     distiset = pipeline.run()
+     print(distiset)
+     return distiset
+
+
+ with gr.Blocks(
+     title="⚗️ Distilabel Dataset Generator", head="⚗️ Distilabel Dataset Generator"
+ ) as demo:
+     dataset_description = gr.Textbox(
+         label="Provide a description of the dataset", value="I am a dataset"
+     )
+
+     btn_generate_system_prompt = gr.Button(
+         value="🧪 Generate System Prompt",
+     )
+
+     system_prompt = gr.Textbox(label="Provide or correct the system prompt")
+
+     btn_generate_system_prompt.click(
+         fn=_generate_system_prompt,
+         inputs=[dataset_description],
+         outputs=[system_prompt],
+     )
+
+     btn_generate_sample_dataset = gr.Button(
+         value="🧪 Generate Sample Dataset of 10 rows and a single turn"
+     )
+
+     table = gr.Dataframe(label="Generated Dataset")
+
+     btn_generate_sample_dataset.click(
+         fn=_generate_dataset,
+         inputs=[system_prompt],
+         outputs=[table],
+     )
+
+     with gr.Row(variant="panel"):
+         with gr.Column():
+             num_turns = gr.Number(value=1, label="Number of turns in the conversation")
+         with gr.Column():
+             num_rows = gr.Number(value=1, label="Number of rows in the dataset")
+
+     dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
+
+     btn_generate_full_dataset = gr.Button(
+         value="⚗️ Generate Full Dataset", variant="primary"
+     )
+
+     btn_generate_full_dataset.click(
+         fn=_generate_dataset,
+         inputs=[system_prompt, num_turns, num_rows],
+         outputs=[table],
+     )
+
+ demo
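
For reference, a minimal usage sketch of the two helpers added above, run outside the Gradio UI. This is illustrative and not part of the commit: it assumes the package is installed (e.g. via pdm) together with the dependencies from requirements.txt, and that a Hugging Face token with access to the serverless Inference API is available in the environment; the description string and row count below are arbitrary examples.

# Illustrative only (not part of this commit). Assumes HF_TOKEN is set and the
# distilabel-dataset-generator package plus its dependencies are installed.
from distilabel_dataset_generator.sft import _generate_dataset, _generate_system_prompt

# 1. Turn a free-form dataset description into a system prompt (TextGeneration task).
system_prompt = _generate_system_prompt(
    "A dataset of customer support questions about an online bookstore."  # hypothetical example
)

# 2. Run the Magpie pipeline for a small sample; this returns a distilabel Distiset.
distiset = _generate_dataset(system_prompt, _num_turns=1, _num_rows=5)

# 3. Inspect the result; the subset/split layout depends on the pipeline's leaf step,
#    so check the printed structure before indexing into it.
print(distiset)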
tests/__init__.py ADDED
File without changes