data-generator

Running

App Files Files

davidberenstein1957 HF staff committed on Sep 9

Commit

90e8636

•

1 Parent(s): bc3b4e5

feat: sft generation pipeline

Browse files

Files changed (8) hide show

.gitignore +162 -0
app.py +13 -0
pdm.lock +0 -0
pyproject.toml +23 -0
requirements.txt +2 -0
src/distilabel_dataset_generator/__init__.py +0 -0
src/distilabel_dataset_generator/sft.py +209 -0
tests/__init__.py +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,162 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm-project.org/#use-with-ide
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

app.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import gradio as gr
+from distilabel_dataset_generator.sft import demo
+demo = gr.TabbedInterface(
+    [demo],
+    ["Supervised Fine-Tuning"],
+    title="⚗️ Distilabel Dataset Generator",
+    head="⚗️ Distilabel Dataset Generator",
+)
+if __name__ == "__main__":
+    demo.launch()

pdm.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,23 @@

+[project]
+name = "distilabel-dataset-generator"
+version = "0.1.0"
+description = "Default template for PDM package"
+authors = [
+    {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
+]
+dependencies = [
+    "distilabel[hf-inference-endpoints]>=1.3.2",
+    "gradio",
+    "transformers>=4.44.2",
+]
+requires-python = ">=3.10"
+readme = "README.md"
+license = {text = "apache 2"}
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+[tool.pdm]
+distribution = true

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ distilabel[hf-inference-endpoints]>=1.3.2
2	+ gradio

src/distilabel_dataset_generator/__init__.py ADDED Viewed

File without changes

src/distilabel_dataset_generator/sft.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import gradio as gr
+from distilabel.llms import InferenceEndpointsLLM
+from distilabel.pipeline import Pipeline
+from distilabel.steps.tasks import MagpieGenerator, TextGeneration
+# Example system prompts, one per task category. Most of them are interpolated
+# into PROMPT_CREATION_PROMPT below as style/structure examples for the LLM.
+INFORMATION_SEEKING_PROMPT = (
+    "You are an AI assistant designed to provide accurate and concise information on a wide"
+    " range of topics. Your purpose is to assist users in finding specific facts,"
+    " explanations, or details about various subjects. Provide clear, factual responses and,"
+    " when appropriate, offer additional context or related information that might be useful"
+    " to the user."
+)
+REASONING_PROMPT = (
+    "You are an AI assistant specialized in logical thinking and problem-solving. Your"
+    " purpose is to help users work through complex ideas, analyze situations, and draw"
+    " conclusions based on given information. Approach each query with structured thinking,"
+    " break down problems into manageable parts, and guide users through the reasoning"
+    " process step-by-step."
+)
+PLANNING_PROMPT = (
+    "You are an AI assistant focused on helping users create effective plans and strategies."
+    " Your purpose is to assist in organizing thoughts, setting goals, and developing"
+    " actionable steps for various projects or activities. Offer structured approaches,"
+    " consider potential challenges, and provide tips for efficient execution of plans."
+)
+EDITING_PROMPT = (
+    "You are an AI assistant specialized in editing and improving written content. Your"
+    " purpose is to help users refine their writing by offering suggestions for grammar,"
+    " style, clarity, and overall structure. Provide constructive feedback, explain your"
+    " edits, and offer alternative phrasings when appropriate."
+)
+CODING_DEBUGGING_PROMPT = (
+    "You are an AI assistant designed to help with programming tasks. Your purpose is to"
+    " assist users in writing, reviewing, and debugging code across various programming"
+    " languages. Provide clear explanations, offer best practices, and help troubleshoot"
+    " issues. When appropriate, suggest optimizations or alternative approaches to coding"
+    " problems."
+)
+# NOTE(review): MATH_SYSTEM_PROMPT is defined here but is not interpolated into
+# PROMPT_CREATION_PROMPT below - confirm whether the omission is intentional.
+MATH_SYSTEM_PROMPT = (
+    "You are an AI assistant designed to provide helpful, step-by-step guidance on solving"
+    " math problems. The user will ask you a wide range of complex mathematical questions."
+    " Your purpose is to assist users in understanding mathematical concepts, working through"
+    " equations, and arriving at the correct solutions."
+)
+ROLE_PLAYING_PROMPT = (
+    "You are an AI assistant capable of engaging in various role-playing scenarios. Your"
+    " purpose is to adopt different personas or characters as requested by the user. Maintain"
+    " consistency with the chosen role, respond in character, and help create immersive and"
+    " interactive experiences for the user."
+)
+DATA_ANALYSIS_PROMPT = (
+    "You are an AI assistant specialized in data analysis and interpretation. Your purpose is"
+    " to help users understand and derive insights from data sets, statistics, and analytical"
+    " tasks. Offer clear explanations of data trends, assist with statistical calculations,"
+    " and provide guidance on data visualization and interpretation techniques."
+)
+CREATIVE_WRITING_PROMPT = (
+    "You are an AI assistant designed to support creative writing endeavors. Your purpose is"
+    " to help users craft engaging stories, poems, and other creative texts. Offer"
+    " suggestions for plot development, character creation, dialogue writing, and other"
+    " aspects of creative composition. Provide constructive feedback and inspire creativity."
+)
+ADVICE_SEEKING_PROMPT = (
+    "You are an AI assistant focused on providing thoughtful advice and guidance. Your"
+    " purpose is to help users navigate various personal or professional issues by offering"
+    " balanced perspectives, considering potential outcomes, and suggesting practical"
+    " solutions. Encourage users to think critically about their situations while providing"
+    " supportive and constructive advice."
+)
+BRAINSTORMING_PROMPT = (
+    "You are an AI assistant specialized in generating ideas and facilitating creative"
+    " thinking. Your purpose is to help users explore possibilities, think outside the box,"
+    " and develop innovative concepts. Encourage free-flowing thoughts, offer diverse"
+    " perspectives, and help users build upon and refine their ideas."
+)
+# Meta-prompt used as the system prompt when asking the LLM to write a new system
+# prompt. It ends with "User dataset description:"; the description itself is
+# supplied separately as the instruction by _generate_system_prompt.
+PROMPT_CREATION_PROMPT = f"""You are an AI assistant specialized in generating very precise prompts for dataset creation.
+Your task is to write a prompt following the instruction of the user. Respond with the prompt and nothing else.
+The prompt you write should follow the same style and structure as the following example prompts:
+{INFORMATION_SEEKING_PROMPT}
+{REASONING_PROMPT}
+{PLANNING_PROMPT}
+{CODING_DEBUGGING_PROMPT}
+{EDITING_PROMPT}
+{ROLE_PLAYING_PROMPT}
+{DATA_ANALYSIS_PROMPT}
+{CREATIVE_WRITING_PROMPT}
+{ADVICE_SEEKING_PROMPT}
+{BRAINSTORMING_PROMPT}
+User dataset description:
+"""
+MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+generate_description = TextGeneration(
+    llm=InferenceEndpointsLLM(
+        model_id=MODEL,
+        tokenizer_id=MODEL,
+        generation_kwargs={"temperature": 0.8, "max_new_tokens": 2048},
+    ),
+    use_system_prompt=True,
+)
+generate_description.load()
+def _generate_system_prompt(_dataset_description):
+    return next(
+        generate_description.process(
+            [
+                {
+                    "system_prompt": PROMPT_CREATION_PROMPT,
+                    "instruction": _dataset_description,
+                }
+            ]
+        )
+    )[0]["generation"]
+def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=1):
+    with Pipeline(name="sft") as pipeline:
+        magpie_step = MagpieGenerator(
+            llm=InferenceEndpointsLLM(
+                model_id=MODEL,
+                tokenizer_id=MODEL,
+                magpie_pre_query_template="llama3",
+                generation_kwargs={
+                    "temperature": 0.8,  # it's the best value for Llama 3.1 70B Instruct
+                },
+            ),
+            n_turns=_num_turns,
+            num_rows=_num_rows,
+            system_prompt=_system_prompt,
+        )
+    distiset = pipeline.run()
+    print(distiset)
+    return distiset
+with gr.Blocks(
+    title="⚗️ Distilabel Dataset Generator", head="⚗️ Distilabel Dataset Generator"
+) as demo:
+    dataset_description = gr.Textbox(
+        label="Provide a description of the dataset", value="I am a dataset"
+    )
+    btn_generate_system_prompt = gr.Button(
+        value="🧪 Generate Sytem Prompt",
+    )
+    system_prompt = gr.Textbox(label="Provide or correct the system prompt")
+    btn_generate_system_prompt.click(
+        fn=_generate_system_prompt,
+        inputs=[dataset_description],
+        outputs=[system_prompt],
+    )
+    btn_generate_sample_dataset = gr.Button(
+        value="🧪 Generate Sample Dataset of 10 rows and a single turn"
+    )
+    table = gr.Dataframe(label="Generated Dataset")
+    btn_generate_sample_dataset.click(
+        fn=_generate_dataset,
+        inputs=[system_prompt],
+        outputs=[table],
+    )
+    with gr.Row(variant="panel"):
+        with gr.Column():
+            num_turns = gr.Number(value=1, label="Number of turns in the conversation")
+        with gr.Column():
+            num_rows = gr.Number(value=1, label="Number of rows in the dataset")
+    dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
+    btn_generate_full_dataset = gr.Button(
+        value="⚗️ Generate Full Dataset", variant="primary"
+    )
+    btn_generate_full_dataset.click(
+        fn=_generate_dataset,
+        inputs=[system_prompt, num_turns, num_rows],
+        outputs=[table],
+    )
+demo

tests/__init__.py ADDED Viewed

File without changes