Guilherme34 committed on
Commit e77427d · verified · 1 Parent(s): b9a08dd

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .dockerignore +57 -0
  2. .env.local.template +54 -0
  3. .gitattributes +5 -0
  4. .gitignore +173 -0
  5. LICENSE +201 -0
  6. README.md +168 -6
  7. app.py +4 -0
  8. assets/argilla.png +3 -0
  9. assets/flow.png +3 -0
  10. assets/logo.png +0 -0
  11. assets/logo.svg +1 -0
  12. assets/ui-full.png +3 -0
  13. assets/ui.png +3 -0
  14. docker-compose.yml +17 -0
  15. docker/.env.docker.template +43 -0
  16. docker/Dockerfile +45 -0
  17. docker/README.md +80 -0
  18. docker/argilla/compose.yml +118 -0
  19. docker/ollama/compose.yml +48 -0
  20. docker/ollama/entrypoint.sh +35 -0
  21. examples/argilla-deployment.py +18 -0
  22. examples/blog_private_synthetic_data_generation.md +222 -0
  23. examples/fine-tune-deepseek-reasoning-sft.ipynb +0 -0
  24. examples/fine-tune-modernbert-classifier.ipynb +538 -0
  25. examples/fine-tune-modernbert-rag.ipynb +980 -0
  26. examples/fine-tune-smollm2-on-synthetic-data.ipynb +310 -0
  27. examples/hf-dedicated-or-tgi-deployment.py +19 -0
  28. examples/hf-serverless-deployment-deepseek.py +16 -0
  29. examples/hf-serverless-deployment.py +15 -0
  30. examples/hf-serverless-different-model-for-completion.py +16 -0
  31. examples/ollama-deployment.py +22 -0
  32. examples/ollama-different-model-for-completion.py +26 -0
  33. examples/openai-deployment.py +18 -0
  34. examples/vllm-deployment.py +21 -0
  35. packages.txt +2 -0
  36. pdm.lock +0 -0
  37. pyproject.toml +40 -0
  38. requirements.txt +1 -0
  39. src/synthetic_dataset_generator/__init__.py +20 -0
  40. src/synthetic_dataset_generator/__main__.py +4 -0
  41. src/synthetic_dataset_generator/_distiset.py +148 -0
  42. src/synthetic_dataset_generator/_inference_endpoints.py +58 -0
  43. src/synthetic_dataset_generator/_tabbedinterface.py +69 -0
  44. src/synthetic_dataset_generator/app.py +35 -0
  45. src/synthetic_dataset_generator/apps/__init__.py +0 -0
  46. src/synthetic_dataset_generator/apps/about.py +15 -0
  47. src/synthetic_dataset_generator/apps/base.py +270 -0
  48. src/synthetic_dataset_generator/apps/chat.py +1142 -0
  49. src/synthetic_dataset_generator/apps/eval.py +894 -0
  50. src/synthetic_dataset_generator/apps/rag.py +972 -0
.dockerignore ADDED
@@ -0,0 +1,57 @@
1
+ # Version control
2
+ .git
3
+ .gitignore
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+
28
+ # Virtual environments
29
+ .env*
30
+ !.env.example
31
+ .venv
32
+ env/
33
+ venv/
34
+ ENV/
35
+
36
+ # IDE
37
+ .idea/
38
+ .vscode/
39
+ *.swp
40
+ *.swo
41
+
42
+ # Testing
43
+ .tox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+
53
+ # Project specific
54
+ nltk_data/
55
+ .pdm-python
56
+ .pdm.toml
57
+ __pypackages__/
.env.local.template ADDED
@@ -0,0 +1,54 @@
1
+ # =============================================================================
2
+ # LOCAL/API CONFIGURATION
3
+ # =============================================================================
4
+
5
+ # -----------------------------------------------------------------------------
6
+ # REQUIRED CONFIGURATION
7
+ # -----------------------------------------------------------------------------
8
+ # Hugging Face token (required for all setups)
9
+ HF_TOKEN=hf_...
10
+
11
+ # Generation Settings
12
+ MAX_NUM_TOKENS=2048
13
+ MAX_NUM_ROWS=1000
14
+ DEFAULT_BATCH_SIZE=5
15
+
16
+ # Required for chat data generation with Llama or Qwen models
17
+ # Options: "llama3", "qwen2", or custom template string
18
+ MAGPIE_PRE_QUERY_TEMPLATE=llama3
19
+
20
+ # -----------------------------------------------------------------------------
21
+ # A. CLOUD API SERVICES
22
+ # -----------------------------------------------------------------------------
23
+
24
+ # 1. HUGGING FACE INFERENCE API (Default, Recommended)
25
+ MODEL=meta-llama/Llama-3.1-8B-Instruct
26
+ # MODEL=Qwen/Qwen2.5-1.5B-Instruct
27
+
28
+ # 2. OPENAI API
29
+ # OPENAI_BASE_URL=https://api.openai.com/v1/
30
+ # MODEL=gpt-4
31
+ # API_KEY=sk-...
32
+
33
+ # 3. HUGGING FACE SPACE FOR ARGILLA (optional)
34
+ # ARGILLA_API_URL=https://your-space.hf.space/
35
+ # ARGILLA_API_KEY=your_key
36
+
37
+ # -----------------------------------------------------------------------------
38
+ # B. LOCAL SERVICES (Requires Installation)
39
+ # -----------------------------------------------------------------------------
40
+
41
+ # 1. LOCAL OLLAMA
42
+ # OLLAMA_BASE_URL=http://127.0.0.1:11434/
43
+ # MODEL=llama3.2:1b
44
+ # TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct
45
+
46
+ # 2. LOCAL VLLM
47
+ # VLLM_BASE_URL=http://127.0.0.1:8000/
48
+ # MODEL=Qwen/Qwen2.5-1.5B-Instruct
49
+ # TOKENIZER_ID=Qwen/Qwen2.5-1.5B-Instruct
50
+
51
+ # 3. LOCAL TGI
52
+ # HUGGINGFACE_BASE_URL=http://127.0.0.1:3000/
53
+ # MODEL=meta-llama/Llama-3.1-8B-Instruct
54
+ # TOKENIZER_ID=meta-llama/Llama-3.1-8B-Instruct
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/flow.png filter=lfs diff=lfs merge=lfs -text
37
+ *.sh text eol=lf
38
+ assets/argilla.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/ui-full.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/ui.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,173 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm-project.org/#use-with-ide
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+ .python-version
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+
152
+ # pytype static type analyzer
153
+ .pytype/
154
+
155
+ # Cython debug symbols
156
+ cython_debug/
157
+
158
+ # PyCharm
159
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
162
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
+ #.idea/
164
+ .DS_Store
165
+
166
+ # nltk
167
+ nltk_data/
168
+
169
+ # examples
170
+ models/
171
+
172
+ # Elasticsearch data
173
+ elasticsearch_data/
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,10 +1,172 @@
1
  ---
2
  title: Synthetic Data Generator
3
- emoji: 🐠
4
- colorFrom: blue
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: Synthetic Data Generator
3
+ short_description: Build datasets using natural language
4
+ emoji: 🧬
5
+ colorFrom: yellow
6
+ colorTo: pink
7
+ sdk: gradio
8
+ sdk_version: 5.8.0
9
+ app_file: app.py
10
+ pinned: true
11
+ license: apache-2.0
12
+ hf_oauth: true
13
+ #header: mini
14
+ hf_oauth_scopes:
15
+ - read-repos
16
+ - write-repos
17
+ - manage-repos
18
+ - inference-api
19
  ---
20
 
21
+ > [!IMPORTANT]
22
+ > The original authors have moved on to other projects. While the code might still be functional for its original purpose, please be aware that the original team does not plan to develop new features, bug fixes, or updates. If you'd like to become a maintainer, please open an issue to discuss.
23
+ >
24
+ >
25
+ <br>
26
+
27
+ <h2 align="center">
28
+ <a href=""><img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" width="80%"></a>
29
+ </h2>
30
+ <h3 align="center">Build datasets using natural language</h3>
31
+
32
+ ![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui-full.png)
33
+
34
+ ## Introduction
35
+
36
+ Synthetic Data Generator is a tool that allows you to create high-quality datasets for training and fine-tuning language models. It leverages the power of distilabel and LLMs to generate synthetic data tailored to your specific needs. [The announcement blog](https://huggingface.co/blog/synthetic-data-generator) goes over a practical example of how to use it but you can also watch the [video](https://www.youtube.com/watch?v=nXjVtnGeEss) to see it in action.
37
+
38
+ Supported Tasks:
39
+
40
+ - Text Classification
41
+ - Chat Data for Supervised Fine-Tuning
42
+ - Retrieval Augmented Generation
43
+
44
+ This tool simplifies the process of creating custom datasets, enabling you to:
45
+
46
+ - Describe the characteristics of your desired application
47
+ - Iterate on sample datasets
48
+ - Produce full-scale datasets
49
+ - Push your datasets to the [Hugging Face Hub](https://huggingface.co/datasets?other=datacraft) and/or [Argilla](https://docs.argilla.io/)
50
+
51
+ By using the Synthetic Data Generator, you can rapidly prototype and create datasets for your use case, accelerating your AI development process.
52
+
53
+ <p align="center">
54
+ <a href="https://twitter.com/argilla_io">
55
+ <img src="https://img.shields.io/badge/twitter-black?logo=x"/>
56
+ </a>
57
+ <a href="https://www.linkedin.com/company/argilla-io">
58
+ <img src="https://img.shields.io/badge/linkedin-blue?logo=linkedin"/>
59
+ </a>
60
+ <a href="http://hf.co/join/discord">
61
+ <img src="https://img.shields.io/badge/Discord-7289DA?&logo=discord&logoColor=white"/>
62
+ </a>
63
+ </p>
64
+
65
+ ## Installation
66
+
67
+ You can simply install the package with:
68
+
69
+ ```bash
70
+ pip install synthetic-dataset-generator
71
+ ```
72
+
73
+ ### Quickstart
74
+
75
+ ```python
76
+ from synthetic_dataset_generator import launch
77
+
78
+ launch()
79
+ ```
80
+
81
+ ### Environment Variables
82
+
83
+ - `HF_TOKEN`: Your [Hugging Face token](https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&tokenType=fineGrained) to push your datasets to the Hugging Face Hub and generate free completions from Hugging Face Inference Endpoints. You can find some configuration examples in the [examples](examples/) folder.
84
+
85
+ You can set the following environment variables to customize the generation process.
86
+
87
+ - `MAX_NUM_TOKENS`: The maximum number of tokens to generate, defaults to `2048`.
88
+ - `MAX_NUM_ROWS`: The maximum number of rows to generate, defaults to `1000`.
89
+ - `DEFAULT_BATCH_SIZE`: The default batch size to use for generating the dataset, defaults to `5`.
90
+
91
+ Optionally, you can use different API providers and models.
92
+
93
+ - `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`, `llama3.1`.
94
+ - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the `HF_TOKEN` environment variable.
95
+ - `OPENAI_BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api.openai.com/v1/`.
96
+ - `OLLAMA_BASE_URL`: The base URL for any Ollama compatible API, e.g. `http://127.0.0.1:11434/`.
97
+ - `HUGGINGFACE_BASE_URL`: The base URL for any Hugging Face compatible API, e.g. TGI server or Dedicated Inference Endpoints. If you want to use serverless inference, only set the `MODEL`.
98
+ - `VLLM_BASE_URL`: The base URL for any VLLM compatible API, e.g. `http://localhost:8000/`.
99
+
100
+ To use a specific model exclusively for generating completions, set the corresponding environment variables by appending `_COMPLETION` to the ones mentioned earlier. For example, you can use `MODEL_COMPLETION` and `OPENAI_BASE_URL_COMPLETION`.
101
+
102
+ SFT and Chat Data generation is not supported with OpenAI Endpoints. Additionally, it must be configured per model family, based on the model's prompt template, using the right `TOKENIZER_ID` and `MAGPIE_PRE_QUERY_TEMPLATE` environment variables.
103
+
104
+ - `TOKENIZER_ID`: The tokenizer ID to use for the magpie pipeline, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`.
105
+ - `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie, which is only supported with Hugging Face Inference Endpoints. `llama3` and `qwen2` are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"`, respectively. For other models, you can pass a custom pre-query template string.
106
+
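For example, the same variables can be set programmatically before calling `launch()`, in the spirit of the scripts in the [examples](examples/) folder. A minimal sketch for a local Ollama setup (the token, base URL, and model values below are placeholders taken from `.env.local.template`; replace them with your own):

```python
import os

from synthetic_dataset_generator import launch

os.environ["HF_TOKEN"] = "hf_..."  # placeholder: your Hugging Face token

# Placeholder values for a local Ollama server (see .env.local.template)
os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/"
os.environ["MODEL"] = "llama3.2:1b"
os.environ["TOKENIZER_ID"] = "meta-llama/Llama-3.2-1B-Instruct"
os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3"

launch()
```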
107
+ Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:
108
+
109
+ - `ARGILLA_API_KEY`: Your Argilla API key to push your datasets to Argilla.
110
+ - `ARGILLA_API_URL`: Your Argilla API URL to push your datasets to Argilla.
111
+
112
+ To save the generated datasets to a local directory instead of pushing them to the Hugging Face Hub, set the following environment variable:
113
+
114
+ - `SAVE_LOCAL_DIR`: The local directory to save the generated datasets to.
115
+
116
+ You can use our environment template as a starting point:
117
+
118
+ ```bash
119
+ cp .env.local.template .env
120
+ ```
121
+
122
+ ### Argilla integration
123
+
124
+ Argilla is an open source tool for data curation. It allows you to annotate and review datasets, and push curated datasets to the Hugging Face Hub. You can easily get started with Argilla by following the [quickstart guide](https://docs.argilla.io/latest/getting_started/quickstart/).
125
+
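As a quick sanity check that your Argilla instance is reachable, here is a minimal, untested sketch assuming the Argilla 2.x Python SDK is installed; it reuses the same `ARGILLA_API_URL` and `ARGILLA_API_KEY` values the generator reads:

```python
import os

import argilla as rg  # assumes the argilla 2.x SDK

# Connect with the same credentials the generator uses
client = rg.Argilla(
    api_url=os.environ["ARGILLA_API_URL"],
    api_key=os.environ["ARGILLA_API_KEY"],
)
print(client)  # client for the running Argilla instance

# Datasets pushed by the generator can then be reviewed in the Argilla UI
# or queried through the SDK (see the Argilla docs for details).
```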
126
+ ![Argilla integration](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/argilla.png)
127
+
128
+ ## Custom synthetic data generation?
129
+
130
+ Each pipeline is based on distilabel, so you can easily change the LLM or the pipeline steps.
131
+
132
+ Check out the [distilabel library](https://github.com/argilla-io/distilabel) for more information.
133
+
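For a rough idea of what such a pipeline looks like, here is a minimal sketch assuming distilabel 1.x. The pipelines shipped with this tool are more elaborate; this only illustrates the pattern of swapping the LLM or the steps:

```python
from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration

with Pipeline(name="toy-generation") as pipeline:
    # Seed the pipeline with a couple of instructions
    load = LoadDataFromDicts(
        data=[
            {"instruction": "Summarize the benefits of synthetic data."},
            {"instruction": "Write a short FAQ entry about data privacy."},
        ]
    )
    # Swap this LLM (or the step itself) to customize generation
    generate = TextGeneration(
        llm=InferenceEndpointsLLM(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct")
    )
    load >> generate

if __name__ == "__main__":
    distiset = pipeline.run(use_cache=False)
    print(distiset)
```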
134
+ ## Development
135
+
136
+ Install the dependencies:
137
+
138
+ ```bash
139
+ # Create a virtual environment
140
+ python -m venv .venv
141
+ source .venv/bin/activate
142
+
143
+ # Install the dependencies
144
+ pip install -e . # pdm install
145
+ ```
146
+
147
+ Run the app:
148
+
149
+ ```bash
150
+ python app.py
151
+ ```
152
+
153
+ ## 🐳 Docker Setup
154
+
155
+ The containerized tool uses Ollama for local LLM inference and Argilla for data curation. Here's the architecture:
156
+
157
+ ![Container Structure](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png)
158
+
159
+ Quick setup with all services (App + Ollama + Argilla):
160
+
161
+ ```bash
162
+ # Copy environment template
163
+ cp docker/.env.docker.template .env # Add your HF_TOKEN in .env
164
+
165
+ # Build all services (this may take a few minutes)
166
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
167
+
168
+ # Start all services
169
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
170
+ ```
171
+
172
+ > For more detailed Docker configurations and setups, check [docker/README.md](docker/README.md)
app.py ADDED
@@ -0,0 +1,4 @@
1
+ from synthetic_dataset_generator import launch
2
+
3
+ if __name__ == "__main__":
4
+ launch()
assets/argilla.png ADDED

Git LFS Details

  • SHA256: 1892b7867842f7f5154c3923278c42d21ec7b6c4bacd159951b8d32d9e64524b
  • Pointer size: 131 Bytes
  • Size of remote file: 475 kB
assets/flow.png ADDED

Git LFS Details

  • SHA256: b0465f5f3ed2a87b14cc609a1f25a1e7b0bfeb1cc8cab534a6ec79a9a8651996
  • Pointer size: 132 Bytes
  • Size of remote file: 1.81 MB
assets/logo.png ADDED
assets/logo.svg ADDED
assets/ui-full.png ADDED

Git LFS Details

  • SHA256: a38e10e98dd3ed4c93bfd0a5ec7ebc2584cd4ed54c120aad5da9809b8422dc75
  • Pointer size: 131 Bytes
  • Size of remote file: 968 kB
assets/ui.png ADDED

Git LFS Details

  • SHA256: fdd5805b833fca7b064a67f220489e88bee139348b094bf50a907adb733aad5b
  • Pointer size: 131 Bytes
  • Size of remote file: 652 kB
docker-compose.yml ADDED
@@ -0,0 +1,17 @@
1
+ services:
2
+ app:
3
+ build:
4
+ context: .
5
+ dockerfile: docker/Dockerfile
6
+ image: synthetic-data-generator:app
7
+ ports:
8
+ - "7860:7860"
9
+ env_file:
10
+ - .env
11
+ networks:
12
+ - app-network
13
+
14
+ networks:
15
+ app-network:
16
+ name: synthetic-data-network
17
+ driver: bridge
docker/.env.docker.template ADDED
@@ -0,0 +1,43 @@
1
+ # =============================================================================
2
+ # DOCKER CONFIGURATION ONLY - FULL SETUP (APP + OLLAMA + ARGILLA)
3
+ # =============================================================================
4
+
5
+ # Note: Before building:
6
+ # 1. Copy this template to the root directory: cp docker/.env.docker.template .env
7
+ # 2. Comment/uncomment the sections you want to use (OLLAMA and/or ARGILLA)
8
+ # 3. Then build and run with the appropriate docker compose command
9
+
10
+ # Hugging Face token with read/write permissions
11
+ HF_TOKEN=your_token_here
12
+
13
+ # -----------------------------------------------------------------------------
14
+ # GENERATION SETTINGS
15
+ # -----------------------------------------------------------------------------
16
+ MAX_NUM_TOKENS=2048
17
+ MAX_NUM_ROWS=1000
18
+ DEFAULT_BATCH_SIZE=5
19
+
20
+ # -----------------------------------------------------------------------------
21
+ # OLLAMA DOCKER CONFIGURATION
22
+ # -----------------------------------------------------------------------------
23
+ OLLAMA_BASE_URL=http://ollama:11434
24
+ OLLAMA_HARDWARE=latest # latest (for CPU/NVIDIA), rocm (for AMD)
25
+
26
+ # LLAMA 3.2
27
+ MODEL=llama3.2:1b
28
+ TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct
29
+ MAGPIE_PRE_QUERY_TEMPLATE=llama3
30
+
31
+ # DEEPSEEK R1
32
+ #MODEL=deepseek-r1:1.5b # must match ollama tags https://ollama.com/library/deepseek-r1:1.5b
33
+ #TOKENIZER_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
34
+ #MAGPIE_PRE_QUERY_TEMPLATE= "<|begin▁of▁sentence|>User: "
35
+
36
+ # -----------------------------------------------------------------------------
37
+ # ARGILLA DOCKER CONFIGURATION (persistent data)
38
+ # -----------------------------------------------------------------------------
39
+ ARGILLA_API_URL=http://argilla:6900
40
+ ARGILLA_USERNAME=admin
41
+ ARGILLA_PASSWORD=admin1234
42
+ ARGILLA_API_KEY=admin.1234
43
+ ARGILLA_REINDEX_DATASET=1
docker/Dockerfile ADDED
@@ -0,0 +1,45 @@
1
+ # Use Python slim image as base
2
+ FROM python:3.10-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1 \
6
+ PYTHONDONTWRITEBYTECODE=1 \
7
+ PIP_NO_CACHE_DIR=1
8
+
9
+ # Create and set working directory
10
+ WORKDIR /app
11
+
12
+ # Create non-root user first
13
+ RUN useradd -m -u 1000 appuser
14
+
15
+ # Install system dependencies including build tools
16
+ RUN apt-get update && apt-get install -y --no-install-recommends \
17
+ curl \
18
+ build-essential \
19
+ cmake \
20
+ libgl1-mesa-glx \
21
+ libglib2.0-0 \
22
+ libsm6 \
23
+ libxext6 \
24
+ libxrender-dev \
25
+ && rm -rf /var/lib/apt/lists/*
26
+
27
+ # Install pdm
28
+ RUN pip install --no-cache-dir pdm
29
+
30
+ # Copy project files and set permissions
31
+ COPY . .
32
+ RUN chown -R appuser:appuser /app && \
33
+ chmod -R 755 /app
34
+
35
+ # Switch to non-root user
36
+ USER appuser
37
+
38
+ # Install dependencies in a virtual environment
39
+ RUN pdm install --prod --frozen-lockfile
40
+
41
+ # Expose Gradio port
42
+ EXPOSE 7860
43
+
44
+ # Start command using pdm run to use the virtual environment
45
+ CMD ["pdm", "run", "python", "-m", "synthetic_dataset_generator"]
docker/README.md ADDED
@@ -0,0 +1,80 @@
1
+ # Docker Configuration Guide
2
+
3
+ Each service runs in its own container, communicating through internal networks. The core app connects to Ollama for model inference and Argilla for data review:
4
+
5
+ ![Container Structure](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png)
6
+
7
+ The application can be run with different configurations using Docker Compose:
8
+
9
+ - `docker-compose.yml`: Core application
10
+ - `docker/ollama/compose.yml`: Ollama service for local LLM inference
11
+ - `docker/argilla/compose.yml`: Argilla service for data curation
12
+
13
+ ## Ollama Integration
14
+
15
+ The `MODEL` variable in your `.env` file determines which model Ollama will download and use. For example:
16
+ ```env
17
+ MODEL=llama3.2:1b
18
+ ```
19
+
20
+ ## Setup Options
21
+
22
+ ### Full Setup (App + Ollama + Argilla)
23
+ ```bash
24
+ # Keep all sections uncommented in .env
25
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
26
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
27
+ ```
28
+
29
+ ### App + Ollama
30
+ ```bash
31
+ # Comment out ARGILLA section in .env
32
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml build
33
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d
34
+ ```
35
+
36
+ ### App + Argilla
37
+ ```bash
38
+ # Comment out OLLAMA section in .env
39
+ docker compose -f docker-compose.yml -f docker/argilla/compose.yml build
40
+ docker compose -f docker-compose.yml -f docker/argilla/compose.yml up -d
41
+ ```
42
+
43
+ ### App Only
44
+ ```bash
45
+ # Comment out both OLLAMA and ARGILLA sections in .env
46
+ docker compose -f docker-compose.yml build
47
+ docker compose -f docker-compose.yml up -d
48
+ ```
49
+
50
+ ## Managing Services
51
+
52
+ Services are built separately but are linked together. If you already have some services built and want to add another:
53
+
54
+ 1. You don't need to rebuild existing services
55
+ 2. Just build the new service
56
+ 3. Stop everything with `down` and start again with `up`
57
+
58
+ For example, if you have App + Ollama and want to add Argilla:
59
+ ```bash
60
+ docker compose -f docker/argilla/compose.yml build # only build Argilla
61
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml down
62
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
63
+ ```
64
+
65
+ Similarly, if you have built all services but want to run only some of them:
66
+ > **Important**: When running specific services, remember to comment out unused services in `.env` first
67
+
68
+ ```bash
69
+ # No need to build again, just start the services you need
70
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d # start only App + Ollama
71
+ ```
72
+
73
+ ## Service URLs
74
+
75
+ Once running, access the services at:
76
+ - App: http://localhost:7860
77
+ - Argilla: http://localhost:6900 (if enabled)
78
+ - Ollama: http://localhost:11434 (if enabled)
79
+
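A small sketch to verify from the host that the containers are responding (assuming the default ports above and Python with `requests` installed; the `/api/ready` path matches the healthcheck used in `docker/argilla/compose.yml`):

```python
import requests

# Endpoints exposed by the default compose files
checks = {
    "app": "http://localhost:7860",
    "argilla": "http://localhost:6900/api/ready",
    "ollama": "http://localhost:11434",
}

for name, url in checks.items():
    try:
        status = requests.get(url, timeout=5).status_code
        print(f"{name}: HTTP {status}")
    except requests.RequestException as exc:
        print(f"{name}: not reachable ({exc})")
```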
80
+ > Note: Services will be available after a few seconds while they initialize. Ollama models and Argilla datasets are persisted and available after restarts
docker/argilla/compose.yml ADDED
@@ -0,0 +1,118 @@
1
+ services:
2
+ app:
3
+ extends:
4
+ file: docker-compose.yml
5
+ service: app
6
+ depends_on:
7
+ argilla:
8
+ condition: service_healthy
9
+ required: false
10
+ environment:
11
+ - ARGILLA_API_URL=http://argilla:6900
12
+
13
+ elasticsearch:
14
+ image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0
15
+ environment:
16
+ - ES_JAVA_OPTS=-Xms512m -Xmx512m
17
+ - node.name=elasticsearch
18
+ - cluster.name=es-argilla-local
19
+ - discovery.type=single-node
20
+ - cluster.routing.allocation.disk.threshold_enabled=false
21
+ - xpack.security.enabled=false
22
+ volumes:
23
+ - es_data:/usr/share/elasticsearch/data
24
+ networks:
25
+ - app-network
26
+ ports:
27
+ - "9200:9200"
28
+ - "9300:9300"
29
+ ulimits:
30
+ memlock:
31
+ soft: -1
32
+ hard: -1
33
+ nofile:
34
+ soft: 65536
35
+ hard: 65536
36
+ healthcheck:
37
+ test: ["CMD", "curl", "-f", "http://localhost:9200"]
38
+ interval: 30s
39
+ timeout: 10s
40
+ retries: 3
41
+
42
+ postgres:
43
+ image: postgres:14
44
+ environment:
45
+ POSTGRES_USER: postgres
46
+ POSTGRES_PASSWORD: postgres
47
+ POSTGRES_DB: argilla
48
+ networks:
49
+ - app-network
50
+ volumes:
51
+ - postgres_data:/var/lib/postgresql/data
52
+
53
+ redis:
54
+ image: redis
55
+ networks:
56
+ - app-network
57
+
58
+ argilla:
59
+ image: argilla/argilla-server:latest
60
+ ports:
61
+ - "6900:6900"
62
+ healthcheck:
63
+ test: ["CMD", "curl", "-f", "http://localhost:6900/api/ready"]
64
+ interval: 30s
65
+ timeout: 10s
66
+ retries: 3
67
+ env_file:
68
+ - .env
69
+ environment:
70
+ - ARGILLA_HOME_PATH=/var/lib/argilla
71
+ - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200
72
+ - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla
73
+ - ARGILLA_REDIS_URL=redis://redis:6379/0
74
+ - USERNAME=${ARGILLA_USERNAME}
75
+ - PASSWORD=${ARGILLA_PASSWORD}
76
+ - API_KEY=${ARGILLA_API_KEY}
77
+ - WORKSPACE=default
78
+ volumes:
79
+ - argilla_data:/argilla
80
+ networks:
81
+ - app-network
82
+ depends_on:
83
+ elasticsearch:
84
+ condition: service_healthy
85
+ postgres:
86
+ condition: service_started
87
+ redis:
88
+ condition: service_started
89
+
90
+ worker:
91
+ image: argilla/argilla-server:latest
92
+ env_file:
93
+ - .env
94
+ environment:
95
+ - ARGILLA_HOME_PATH=/var/lib/argilla
96
+ - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200
97
+ - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla
98
+ - ARGILLA_REDIS_URL=redis://redis:6379/0
99
+ - BACKGROUND_NUM_WORKERS=2
100
+ - USERNAME=${ARGILLA_USERNAME}
101
+ - PASSWORD=${ARGILLA_PASSWORD}
102
+ - API_KEY=${ARGILLA_API_KEY}
103
+ - WORKSPACE=default
104
+ networks:
105
+ - app-network
106
+ depends_on:
107
+ - postgres
108
+ - elasticsearch
109
+ - redis
110
+ command: sh -c 'python -m argilla_server worker --num-workers $${BACKGROUND_NUM_WORKERS}'
111
+
112
+ volumes:
113
+ es_data:
114
+ name: synthetic-data-es
115
+ argilla_data:
116
+ name: synthetic-data-argilla
117
+ postgres_data:
118
+ name: synthetic-data-postgres
docker/ollama/compose.yml ADDED
@@ -0,0 +1,48 @@
1
+ services:
2
+ app:
3
+ extends:
4
+ file: docker-compose.yml
5
+ service: app
6
+ depends_on:
7
+ ollama:
8
+ condition: service_healthy
9
+ required: true
10
+ environment:
11
+ - OLLAMA_BASE_URL=http://ollama:11434
12
+
13
+ ollama:
14
+ image: ollama/ollama:${OLLAMA_HARDWARE:-latest}
15
+ ports:
16
+ - "11434:11434"
17
+ env_file:
18
+ - .env
19
+ environment:
20
+ - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-}
21
+ volumes:
22
+ - ollama_data:/root/.ollama
23
+ - ./docker/ollama/entrypoint.sh:/entrypoint.sh
24
+ networks:
25
+ - app-network
26
+ deploy:
27
+ resources:
28
+ reservations:
29
+ devices:
30
+ - driver: nvidia
31
+ count: all
32
+ capabilities: [gpu]
33
+ tty: true
34
+ entrypoint: ["/usr/bin/bash", "/entrypoint.sh"]
35
+ healthcheck:
36
+ test:
37
+ - "CMD-SHELL"
38
+ - |
39
+ test -f /tmp/ollama_ready && \
40
+ bash -c '</dev/tcp/localhost/11434'
41
+ interval: 10s
42
+ timeout: 10s
43
+ retries: 100
44
+ start_period: 10s
45
+
46
+ volumes:
47
+ ollama_data:
48
+ name: synthetic-data-ollama
docker/ollama/entrypoint.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+
3
+ # Start Ollama in the background
4
+ /bin/ollama serve &
5
+ # Record Process ID
6
+ pid=$!
7
+
8
+ # Pause for Ollama to start
9
+ sleep 5
10
+
11
+ # Extract model name from MODEL variable (removing quotes if present)
12
+ MODEL_NAME=$(echo $MODEL | tr -d '"')
13
+
14
+ # Verify that MODEL_NAME has a value
15
+ if [ -z "$MODEL_NAME" ]; then
16
+ echo "❌ No model specified in MODEL environment variable"
17
+ else
18
+ # Check if model exists
19
+ if ollama list | grep -q "$MODEL_NAME"; then
20
+ echo "🟢 Model ($MODEL_NAME) already installed"
21
+ touch /tmp/ollama_ready
22
+ else
23
+ echo "🔴 Retrieving model ($MODEL_NAME)..."
24
+ # Try to pull the model; only create the ready flag once the download is confirmed
25
+ if ollama pull "$MODEL_NAME" 2>/dev/null && ollama list | grep -q "$MODEL_NAME"; then
26
+ echo "🟢 Model download complete!"
27
+ touch /tmp/ollama_ready
28
+ else
29
+ echo "❌ Error downloading model ($MODEL_NAME)"
30
+ fi
31
+ fi
32
+ fi
33
+
34
+ # Wait for Ollama process to finish
35
+ wait $pid
examples/argilla-deployment.py ADDED
@@ -0,0 +1,18 @@
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+ import os
8
+
9
+ from synthetic_dataset_generator import launch
10
+
11
+ # Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL
12
+ os.environ["HF_TOKEN"] = "hf_..."
13
+ os.environ["ARGILLA_API_URL"] = (
14
+ "https://[your-owner-name]-[your_space_name].hf.space" # argilla base url
15
+ )
16
+ os.environ["ARGILLA_API_KEY"] = "my_api_key" # argilla api key
17
+
18
+ launch()
examples/blog_private_synthetic_data_generation.md ADDED
@@ -0,0 +1,222 @@
1
+ # Private Synthetic Data Generation Made Easy: Out-of-the-Box with Docker, Argilla & Ollama
2
+
3
+ > "Empowering organizations with a turnkey solution for synthetic dataset creation in private environments."
4
+
5
+ The increasing adoption of AI solutions across industries has created an unprecedented demand for high-quality training data. As organizations scale their AI initiatives, they face the dual challenge of generating substantial, domain-specific datasets while ensuring data privacy and security. Traditional approaches often involve compromises: either using public datasets that may not fully align with specific needs, or investing heavily in custom data generation infrastructure.
6
+
7
+ The complexity of this challenge is amplified by regulatory requirements, resource constraints, and the need for specialized expertise. Organizations must navigate GDPR, CCPA, and industry-specific regulations while maintaining efficient data generation pipelines. This has created a pressing need for solutions that can operate entirely within private infrastructure while maintaining enterprise-grade capabilities.
8
+
9
+ ## The Challenge
10
+
11
+ The development of AI models requires extensive training data, yet organizations face significant obstacles in data generation and management. Privacy regulations and security requirements often prevent the use of public datasets or cloud-based generation services. Additionally, existing solutions typically demand complex infrastructure setups and significant technical expertise, increasing both implementation time and costs.
12
+
13
+ Modern enterprises require a solution that addresses several critical aspects:
14
+ 1. Data Privacy: Complete control over data generation and storage
15
+ 2. Infrastructure Flexibility: Deployment options that fit existing systems
16
+ 3. Quality Assurance: Tools for data validation and curation
17
+ 4. Scalability: Ability to grow with increasing data needs
18
+ 5. Cost Efficiency: Reduction in infrastructure and maintenance costs
19
+
20
+ ## The Solution
21
+
22
+ This out-of-the-box Synthetic Dataset Generator approach leverages the power of three technologies to create a seamless, private data generation pipeline. At its core is the [Synthetic Dataset Generator](https://github.com/argilla-io/synthetic-data-generator), a tool designed for dataset creation. [Ollama](https://ollama.ai/) ensures secure local LLM inference with [Distilabel](https://github.com/argilla-io/distilabel) integration, while [Argilla's](https://argilla.io/) data curation capabilities complete the workflow, all operating within your secure infrastructure.
23
+
24
+ This architecture delivers key technical advantages:
25
+ - Full data sovereignty with containerized local deployment
26
+ - End-to-end pipeline from generation to validation
27
+ - Modular design for system integration
28
+
29
+ Here's how it all fits together:
30
+
31
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png)
32
+
33
+ Let's explore how these components work together in a practical workflow.
34
+
35
+ ## 1. Installation & Setup
36
+
37
+
38
+
39
+ ### 1.1 Clone Repository
40
+ ```bash
41
+ git clone https://github.com/argilla-io/synthetic-data-generator
42
+ cd synthetic-data-generator
43
+ ```
44
+
45
+ ### 1.2 Environment Setup
46
+ ```bash
47
+ # Copy environment template
48
+ cp docker/.env.docker.template .env
49
+
50
+ # Model configuration in .env (if using Ollama)
51
+ MODEL="deepseek-r1:1.5b" # Must match Ollama model name
52
+ ```
53
+
54
+ ### 1.3 Build & Deploy Services
55
+ > Pro tip: Even if you're planning to use just one component initially, we recommend building all services to enable future functionality without rebuilding. For detailed deployment options, check the [Docker documentation](https://github.com/argilla-io/synthetic-data-generator/blob/main/docker/README.md).
56
+
57
+ > Note: Ollama runs on CPU/GPU for Linux/Windows in Docker. For macOS, only CPU is supported in Docker - for GPU support, install Ollama separately ([details](https://ollama.com/blog/ollama-is-now-available-as-an-official-docker-image)).
58
+
59
+ ```bash
60
+ # Build all services
61
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
62
+ # Start all services
63
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
64
+ ```
65
+
66
+ To view logs, either:
67
+ - Use Docker Desktop's interface
68
+ - Remove the `-d` flag when running the above command
69
+ - Or execute the following for specific service logs:
70
+ ```bash
71
+ # Core App logs
72
+ docker compose logs -f app
73
+ # Ollama logs
74
+ docker compose -f docker-compose.yml -f docker/ollama/compose.yml logs -f ollama
75
+ # Argilla logs
76
+ docker compose -f docker-compose.yml -f docker/argilla/compose.yml logs -f argilla
77
+ ```
78
+
79
+ ## 2. Dataset Generation
80
+
81
+ The tool currently supports **Text Classification**, **Chat**, and **RAG** datasets. The task determines the shape of the dataset you will generate: classification requires categories, chat data requires a conversation format, and RAG requires question-answer pairs with relevant context. For RAG, you can additionally generate retrieval and reranking data to strengthen different parts of an information retrieval system.
82
+
83
+ For a detailed overview of the generation process, check out the [introduction to the Synthetic Data Generator](https://huggingface.co/blog/synthetic-data-generator).
84
+
85
+
86
+ ### 2.1. **Dataset Description**
87
+
88
+ Let's walk through creating a **RAG dataset**.
89
+ ```text
90
+ A dataset to retrieve information from information security policies
91
+ ```
92
+
93
+ System initializes and processes the prompt:
94
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/sxH8JChF-HnGMOilymYpA.png)
95
+
96
+
97
+ ### 2.2. **Task Configuration & Sample Generation**
98
+ The system analyzes the description and automatically generates the system prompt and optimal parameters. Samples are then generated for validation (modify the system prompt or parameters manually if needed, then click save to regenerate the sample data):
99
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/mYVlGNnz6YNrPJutxmBtR.png)
100
+
101
+
102
+ ### 2.3. **Full Dataset Generation**
103
+ After validating the sample data quality, proceed with full dataset generation. Configure the following parameters:
104
+
105
+ - **Repository Owner**: Your Hugging Face username for dataset hosting
106
+ - **Dataset Name**: A descriptive name following standard naming conventions
107
+ - **Number of Examples**: Define dataset size (recommended: 100-1000 for initial deployments)
108
+ - **Temperature**: Controls generation creativity (default 0.7 balances coherence and diversity)
109
+ - **Privacy Settings**: Optional dataset privacy configuration for Hugging Face Hub
110
+
111
+ The temperature parameter significantly impacts output quality:
112
+ - 0.5-0.7: Optimal for technical documentation and factual content
113
+ - 0.7-0.8: Balanced for general purpose datasets
114
+ - 0.8-1.0: Increased creativity, suitable for conversational data
115
+
116
+
117
+ The system initiates the generation pipeline, leveraging Distilabel for structured output:
118
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/PWNT_bLHwFjeoFX7AhA-z.png)
119
+
120
+
121
+ Upon completion, the dataset is pushed to Hugging Face Hub:
122
+ ![Generation Complete](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/ohd4S-RyNI406uLPf4bnZ.png)
123
+
124
+ Access your generated dataset through the Hugging Face Hub interface:
125
+
126
+ <iframe
127
+ src="https://huggingface.co/datasets/daqc/info-security-policies-rag-distiset/embed/viewer/default/train"
128
+ frameborder="0"
129
+ width="100%"
130
+ height="560px"
131
+ ></iframe>
132
+
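To consume the pushed dataset programmatically, a minimal sketch with the 🤗 `datasets` library, using the repository id shown in the viewer above:

```python
from datasets import load_dataset

# Repository id from the embedded viewer above
dataset = load_dataset("daqc/info-security-policies-rag-distiset", split="train")

print(dataset)     # dataset size and column names
print(dataset[0])  # first generated example
```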
133
+
134
+
135
+ ## 3. Data Curation with Argilla
136
+
137
+ The integration with Argilla provides enterprise-grade dataset curation capabilities through a comprehensive review system. This phase is crucial for ensuring data quality and maintaining high standards in your training datasets.
138
+
139
+ ### Environment Configuration
140
+ Before accessing Argilla's features, ensure proper configuration in your `.env` file.
141
+
142
+
143
+ ### Curation Workflow
144
+
145
+ 1. **Dataset Integration**
146
+ Upon generation completion, the dataset is automatically ingested into Argilla. The system maintains data integrity and version control throughout the process. All datasets and progress persist across Docker restarts unless you explicitly remove the Argilla services and volumes.
147
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/0gF6iLywhKafEo3z94cd-.png)
148
+
149
+
150
+ 2. **Quality Assurance Process**
151
+ Argilla's interface provides comprehensive tools for dataset validation:
152
+ - Semantic analysis of generated content
153
+ - Consistency checking across entries
154
+ - Metadata validation and enrichment
155
+ - Collaborative review capabilities
156
+
157
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/h9kJ-4lA0LcFC8g6g_vwF.png)
158
+
159
+
160
+
161
+ 3. **Dataset Publication**
162
+ After thorough review, export your curated dataset to Hugging Face Hub:
163
+
164
+ > Note: Consider using a new repository name to preserve both raw and curated datasets separately.
165
+
166
+ - Configure repository settings
167
+ - Set visibility and access controls
168
+ - Add dataset cards and documentation
169
+
170
+ ![Export Configuration](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/CPwtVr_Jw6mndNCOU2a5T.png)
171
+
172
+
173
+ The curated dataset maintains full provenance tracking and quality metrics:
174
+ <iframe
175
+ src="https://huggingface.co/datasets/daqc/info-security-policies-rag-distiset-argilla/embed/viewer/default/train"
176
+ frameborder="0"
177
+ width="100%"
178
+ height="560px"
179
+ ></iframe>
180
+
181
+ # 🎉 You're Done!
182
+ Congratulations! You've successfully completed the end-to-end dataset generation and curation process. Your curated dataset is now ready for model training.
183
+
184
+ ## Experience the Solution
185
+
186
+ For a hands-on preview of the Synthetic Dataset Generator's capabilities, explore the hosted space. This allows you to evaluate the interface and functionality before deploying your own instance:
187
+
188
+ <iframe
189
+ src="https://argilla-synthetic-data-generator.hf.space"
190
+ frameborder="0"
191
+ width="850"
192
+ height="450"
193
+ referrerpolicy="same-origin"
194
+ sandbox="allow-scripts"
195
+ ></iframe>
196
+
197
+ Create your own deployment by <a href="https://huggingface.co/spaces/argilla/synthetic-data-generator?duplicate=true">duplicating this Space</a>.
198
+
199
+ ## What's Next?
200
+
201
+ After successfully generating your first dataset, several advanced implementation paths are available:
202
+
203
+ Extend your dataset generation capabilities:
204
+ - [Fine-tune models on synthetic data](https://huggingface.co/blog/davidberenstein1957/fine-tune-a-smollm-on-synthetic-data-of-llm) for domain-specific tasks
205
+ - [Create specialized reasoning datasets](https://huggingface.co/blog/sdiazlor/fine-tune-deepseek-with-a-synthetic-reasoning-data) for advanced model training
206
+
207
+ ## Conclusion
208
+
209
+ The Synthetic Dataset Generator represents a significant advancement in private data generation technology, addressing the growing need for high-quality training data while maintaining security and control. By leveraging containerized architecture and local LLM inference, organizations can now generate custom datasets without compromising on data privacy or quality.
210
+
211
+ The solution's modular design enables seamless integration with existing ML pipelines while providing enterprise-grade features like persistent storage, comprehensive monitoring, and scalable infrastructure. Through collaborative validation workflows and structured quality control processes, teams can efficiently create and curate datasets tailored to their specific needs.
212
+
213
+ This combination of security, efficiency, and flexibility makes the Synthetic Dataset Generator an essential tool for organizations looking to accelerate their AI development while maintaining complete control over their data generation pipeline.
214
+
215
+ ## References & Documentation
216
+
217
+
218
+ - [Synthetic Dataset Generator](https://github.com/argilla-io/synthetic-data-generator): Open-source tool for dataset generation using natural language
219
+ - [Distilabel Framework](https://github.com/argilla-io/distilabel): Advanced dataset generation framework
220
+ - [Docker Best Practices](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/): Container optimization guidelines
221
+ - [Argilla Documentation](https://docs.argilla.io): Data curation platform documentation
222
+ - [Ollama Integration](https://github.com/jmorganca/ollama): Local LLM deployment guide
examples/fine-tune-deepseek-reasoning-sft.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
examples/fine-tune-modernbert-classifier.ipynb ADDED
@@ -0,0 +1,538 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Fine-tune ModernBERT for text classification using synthetic data\n",
8
+ "\n",
9
+ "LLMs are great general purpose models, but they are not always the best choice for a specific task. Therefore, smaller and more specialized models are important for sustainable, efficient, and cheaper AI.\n",
10
+ "A lack of domain-specific datasets is a common problem for smaller and more specialized models. This is because it is difficult to find a dataset that is both representative and diverse enough for a specific task. We solve this problem by generating a synthetic dataset from an LLM using the `synthetic-data-generator`, which is available as a [Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) or on [GitHub](https://github.com/argilla-io/synthetic-data-generator).\n",
11
+ "\n",
12
+ "In this example, we will fine-tune a ModernBERT model on a synthetic dataset generated from the synthetic-data-generator. This demonstrates the effectiveness of synthetic data and the novel ModernBERT model, which is a new and improved version of BERT models, with an 8192 token context length, significantly better downstream performance, and much faster processing speeds.\n",
13
+ "\n",
14
+ "## Install the dependencies"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "# Install Pytorch & other libraries\n",
24
+ "%pip install \"torch==2.5.0\" \"torchvision==0.20.0\" \n",
25
+ "%pip install \"setuptools<71.0.0\" scikit-learn \n",
26
+ " \n",
27
+ "# Install Hugging Face libraries\n",
28
+ "%pip install --upgrade \\\n",
29
+ " \"datasets==3.1.0\" \\\n",
30
+ " \"accelerate==1.2.1\" \\\n",
31
+ " \"hf-transfer==0.1.8\"\n",
32
+ " \n",
33
+ "# ModernBERT is not yet available in an official release, so we need to install it from github\n",
34
+ "%pip install \"git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1\" --upgrade"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "markdown",
39
+ "metadata": {},
40
+ "source": [
41
+ "## The problem\n",
42
+ "\n",
43
+ "The [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier) is a model that can classify the domain of a text, which can help with curating data. This model is cool but is based on DeBERTa V3 Base, which is an outdated architecture that requires custom code to run, has a context length of 512 tokens, and is not as fast as the ModernBERT model. The labels for the model are:\n",
44
+ "\n",
45
+ "```\n",
46
+ "'Adult', 'Arts_and_Entertainment', 'Autos_and_Vehicles', 'Beauty_and_Fitness', 'Books_and_Literature', 'Business_and_Industrial', 'Computers_and_Electronics', 'Finance', 'Food_and_Drink', 'Games', 'Health', 'Hobbies_and_Leisure', 'Home_and_Garden', 'Internet_and_Telecom', 'Jobs_and_Education', 'Law_and_Government', 'News', 'Online_Communities', 'People_and_Society', 'Pets_and_Animals', 'Real_Estate', 'Science', 'Sensitive_Subjects', 'Shopping', 'Sports', 'Travel_and_Transportation'\n",
47
+ "```\n",
48
+ "\n",
49
+ "The data on which the model was trained is not available, so we cannot use it for our purposes. We can however generate a synthetic data to solve this problem."
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "markdown",
54
+ "metadata": {
55
+ "vscode": {
56
+ "languageId": "plaintext"
57
+ }
58
+ },
59
+ "source": [
60
+ "## Let's generate some data\n",
61
+ "\n",
62
+ "Let's go to the [hosted Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) to generate the data. This is done in three steps 1) we come up with a dataset description, 2) iterate on the task configuration, and 3) generate and push the data to Hugging Face. A more detailed flow can be found in [this blogpost](https://huggingface.co/blog/synthetic-data-generator). \n",
63
+ "\n",
64
+ "<iframe\n",
65
+ "\tsrc=\"https://argilla-synthetic-data-generator.hf.space\"\n",
66
+ "\tframeborder=\"0\"\n",
67
+ "\twidth=\"850\"\n",
68
+ "\theight=\"450\"\n",
69
+ "></iframe>\n",
70
+ "\n",
71
+ "For this example, we will generate 1000 examples with a temperature of 1. After some iteration, we come up with the following system prompt:\n",
72
+ "\n",
73
+ "```\n",
74
+ "Long texts (at least 2000 words) from various media sources like Wikipedia, Reddit, Common Crawl, websites, commercials, online forums, books, newspapers and folders that cover multiple topics. Classify the text based on its main subject matter into one of the following categories\n",
75
+ "```\n",
76
+ "\n",
77
+ "We press the \"Push to Hub\" button and wait for the data to be generated. This takes a few minutes and we end up with a dataset with 1000 examples. The labels are nicely distributed across the categories, varied in length, and the texts look diverse and interesting.\n",
78
+ "\n",
79
+ "<iframe\n",
80
+ " src=\"https://huggingface.co/datasets/argilla/synthetic-domain-text-classification/embed/viewer/default/train\"\n",
81
+ " frameborder=\"0\"\n",
82
+ " width=\"100%\"\n",
83
+ " height=\"560px\"\n",
84
+ "></iframe>\n",
85
+ "\n",
86
+ "The data is also pushed to Argilla, so we recommend inspecting and validating the labels before fine-tuning the model."
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "markdown",
91
+ "metadata": {},
92
+ "source": [
93
+ "## Finetuning the ModernBERT model\n",
94
+ "\n",
95
+ "We mostly rely on the blog from [Philipp Schmid](https://www.philschmid.de/fine-tune-modern-bert-in-2025). I will use basic consumer hardware, my Apple M1 Max with 32GB of shared memory. We will use the `datasets` library to load the data and the `transformers` library to fine-tune the model."
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 1,
101
+ "metadata": {},
102
+ "outputs": [
103
+ {
104
+ "name": "stderr",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "/Users/davidberenstein/Documents/programming/argilla/synthetic-data-generator/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
108
+ " from .autonotebook import tqdm as notebook_tqdm\n"
109
+ ]
110
+ },
111
+ {
112
+ "data": {
113
+ "text/plain": [
114
+ "{'text': 'Recently, there has been an increase in property values within the suburban areas of several cities due to improvements in infrastructure and lifestyle amenities such as parks, retail stores, and educational institutions nearby. Additionally, new housing developments are emerging, catering to different family needs with varying sizes and price ranges. These changes have influenced investment decisions for many looking to buy or sell properties.',\n",
115
+ " 'label': 14}"
116
+ ]
117
+ },
118
+ "execution_count": 1,
119
+ "metadata": {},
120
+ "output_type": "execute_result"
121
+ }
122
+ ],
123
+ "source": [
124
+ "from datasets import load_dataset\n",
125
+ "from datasets.arrow_dataset import Dataset\n",
126
+ "from datasets.dataset_dict import DatasetDict, IterableDatasetDict\n",
127
+ "from datasets.iterable_dataset import IterableDataset\n",
128
+ " \n",
129
+ "# Dataset id from huggingface.co/dataset\n",
130
+ "dataset_id = \"argilla/synthetic-domain-text-classification\"\n",
131
+ " \n",
132
+ "# Load raw dataset\n",
133
+ "train_dataset = load_dataset(dataset_id, split='train')\n",
134
+ "\n",
135
+ "split_dataset = train_dataset.train_test_split(test_size=0.1)\n",
136
+ "split_dataset['train'][0]"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "markdown",
141
+ "metadata": {},
142
+ "source": [
143
+ "First, we need to tokenize the data. We will use the `AutoTokenizer` class from the `transformers` library to load the tokenizer."
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 2,
149
+ "metadata": {},
150
+ "outputs": [
151
+ {
152
+ "name": "stderr",
153
+ "output_type": "stream",
154
+ "text": [
155
+ "Map: 100%|██████████| 900/900 [00:00<00:00, 4787.61 examples/s]\n",
156
+ "Map: 100%|██████████| 100/100 [00:00<00:00, 4163.70 examples/s]\n"
157
+ ]
158
+ },
159
+ {
160
+ "data": {
161
+ "text/plain": [
162
+ "dict_keys(['labels', 'input_ids', 'attention_mask'])"
163
+ ]
164
+ },
165
+ "execution_count": 2,
166
+ "metadata": {},
167
+ "output_type": "execute_result"
168
+ }
169
+ ],
170
+ "source": [
171
+ "from transformers import AutoTokenizer\n",
172
+ " \n",
173
+ "# Model id to load the tokenizer\n",
174
+ "model_id = \"answerdotai/ModernBERT-base\"\n",
175
+ "\n",
176
+ "# Load Tokenizer\n",
177
+ "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
178
+ " \n",
179
+ "# Tokenize helper function\n",
180
+ "def tokenize(batch):\n",
181
+ " return tokenizer(batch['text'], padding=True, truncation=True, return_tensors=\"pt\")\n",
182
+ " \n",
183
+ "# Tokenize dataset\n",
184
+ "if \"label\" in split_dataset[\"train\"].features.keys():\n",
185
+ " split_dataset = split_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n",
186
+ "tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n",
187
+ " \n",
188
+ "tokenized_dataset[\"train\"].features.keys()"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "markdown",
193
+ "metadata": {},
194
+ "source": [
195
+ "Now, we need to prepare the model. We will use the `AutoModelForSequenceClassification` class from the `transformers` library to load the model."
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 3,
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "name": "stderr",
205
+ "output_type": "stream",
206
+ "text": [
207
+ "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
208
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
209
+ ]
210
+ }
211
+ ],
212
+ "source": [
213
+ "from transformers import AutoModelForSequenceClassification\n",
214
+ " \n",
215
+ "# Model id to load the tokenizer\n",
216
+ "model_id = \"answerdotai/ModernBERT-base\"\n",
217
+ " \n",
218
+ "# Prepare model labels - useful for inference\n",
219
+ "labels = tokenized_dataset[\"train\"].features[\"labels\"].names\n",
220
+ "num_labels = len(labels)\n",
221
+ "label2id, id2label = dict(), dict()\n",
222
+ "for i, label in enumerate(labels):\n",
223
+ " label2id[label] = str(i)\n",
224
+ " id2label[str(i)] = label\n",
225
+ " \n",
226
+ "# Download the model from huggingface.co/models\n",
227
+ "model = AutoModelForSequenceClassification.from_pretrained(\n",
228
+ " model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,\n",
229
+ ")"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "markdown",
234
+ "metadata": {},
235
+ "source": [
236
+ "We will use a simple F1 score as the evaluation metric."
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 4,
242
+ "metadata": {},
243
+ "outputs": [],
244
+ "source": [
245
+ "import numpy as np\n",
246
+ "from sklearn.metrics import f1_score\n",
247
+ " \n",
248
+ "# Metric helper method\n",
249
+ "def compute_metrics(eval_pred):\n",
250
+ " predictions, labels = eval_pred\n",
251
+ " predictions = np.argmax(predictions, axis=1)\n",
252
+ " score = f1_score(\n",
253
+ " labels, predictions, labels=labels, pos_label=1, average=\"weighted\"\n",
254
+ " )\n",
255
+ " return {\"f1\": float(score) if score == 1 else score}"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "markdown",
260
+ "metadata": {},
261
+ "source": [
262
+ "Finally, we need to define the training arguments. We will use the `TrainingArguments` class from the `transformers` library to define the training arguments."
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 6,
268
+ "metadata": {},
269
+ "outputs": [
270
+ {
271
+ "name": "stderr",
272
+ "output_type": "stream",
273
+ "text": [
274
+ "/Users/davidberenstein/Documents/programming/argilla/synthetic-data-generator/.venv/lib/python3.11/site-packages/transformers/training_args.py:2241: UserWarning: `use_mps_device` is deprecated and will be removed in version 5.0 of 🤗 Transformers. `mps` device will be used by default if available similar to the way `cuda` device is used.Therefore, no action from user is required. \n",
275
+ " warnings.warn(\n"
276
+ ]
277
+ }
278
+ ],
279
+ "source": [
280
+ "from huggingface_hub import HfFolder\n",
281
+ "from transformers import Trainer, TrainingArguments\n",
282
+ " \n",
283
+ "# Define training args\n",
284
+ "training_args = TrainingArguments(\n",
285
+ " output_dir= \"ModernBERT-domain-classifier\",\n",
286
+ " per_device_train_batch_size=32,\n",
287
+ " per_device_eval_batch_size=16,\n",
288
+ " learning_rate=5e-5,\n",
289
+ "\t\tnum_train_epochs=5,\n",
290
+ " bf16=True, # bfloat16 training \n",
291
+ " optim=\"adamw_torch_fused\", # improved optimizer \n",
292
+ " # logging & evaluation strategies\n",
293
+ " logging_strategy=\"steps\",\n",
294
+ " logging_steps=100,\n",
295
+ " eval_strategy=\"epoch\",\n",
296
+ " save_strategy=\"epoch\",\n",
297
+ " save_total_limit=2,\n",
298
+ " load_best_model_at_end=True,\n",
299
+ " use_mps_device=True,\n",
300
+ " metric_for_best_model=\"f1\",\n",
301
+ " # push to hub parameters\n",
302
+ " push_to_hub=True,\n",
303
+ " hub_strategy=\"every_save\",\n",
304
+ " hub_token=HfFolder.get_token(),\n",
305
+ ")\n",
306
+ " \n",
307
+ "# Create a Trainer instance\n",
308
+ "trainer = Trainer(\n",
309
+ " model=model,\n",
310
+ " args=training_args,\n",
311
+ " train_dataset=tokenized_dataset[\"train\"],\n",
312
+ " eval_dataset=tokenized_dataset[\"test\"],\n",
313
+ " compute_metrics=compute_metrics,\n",
314
+ ")"
315
+ ]
316
+ },
317
+ {
318
+ "cell_type": "code",
319
+ "execution_count": 7,
320
+ "metadata": {},
321
+ "outputs": [
322
+ {
323
+ "name": "stderr",
324
+ "output_type": "stream",
325
+ "text": [
326
+ " \n",
327
+ " 20%|██ | 29/145 [11:32<33:16, 17.21s/it]"
328
+ ]
329
+ },
330
+ {
331
+ "name": "stdout",
332
+ "output_type": "stream",
333
+ "text": [
334
+ "{'eval_loss': 0.729780912399292, 'eval_f1': 0.7743598318036522, 'eval_runtime': 3.5337, 'eval_samples_per_second': 28.299, 'eval_steps_per_second': 1.981, 'epoch': 1.0}\n"
335
+ ]
336
+ },
337
+ {
338
+ "name": "stderr",
339
+ "output_type": "stream",
340
+ "text": [
341
+ " \n",
342
+ " 40%|████ | 58/145 [22:57<25:56, 17.89s/it]"
343
+ ]
344
+ },
345
+ {
346
+ "name": "stdout",
347
+ "output_type": "stream",
348
+ "text": [
349
+ "{'eval_loss': 0.4369044005870819, 'eval_f1': 0.8310764765820946, 'eval_runtime': 3.3266, 'eval_samples_per_second': 30.061, 'eval_steps_per_second': 2.104, 'epoch': 2.0}\n"
350
+ ]
351
+ },
352
+ {
353
+ "name": "stderr",
354
+ "output_type": "stream",
355
+ "text": [
356
+ " \n",
357
+ " 60%|██████ | 87/145 [35:16<17:06, 17.70s/it]"
358
+ ]
359
+ },
360
+ {
361
+ "name": "stdout",
362
+ "output_type": "stream",
363
+ "text": [
364
+ "{'eval_loss': 0.6091340184211731, 'eval_f1': 0.8399274488570763, 'eval_runtime': 3.2772, 'eval_samples_per_second': 30.514, 'eval_steps_per_second': 2.136, 'epoch': 3.0}\n"
365
+ ]
366
+ },
367
+ {
368
+ "name": "stderr",
369
+ "output_type": "stream",
370
+ "text": [
371
+ " 69%|██████▉ | 100/145 [41:03<18:02, 24.06s/it]"
372
+ ]
373
+ },
374
+ {
375
+ "name": "stdout",
376
+ "output_type": "stream",
377
+ "text": [
378
+ "{'loss': 0.7663, 'grad_norm': 7.232136249542236, 'learning_rate': 1.5517241379310346e-05, 'epoch': 3.45}\n"
379
+ ]
380
+ },
381
+ {
382
+ "name": "stderr",
383
+ "output_type": "stream",
384
+ "text": [
385
+ " \n",
386
+ " 80%|████████ | 116/145 [47:23<08:50, 18.30s/it]"
387
+ ]
388
+ },
389
+ {
390
+ "name": "stdout",
391
+ "output_type": "stream",
392
+ "text": [
393
+ "{'eval_loss': 0.43516409397125244, 'eval_f1': 0.8797674004703547, 'eval_runtime': 3.2975, 'eval_samples_per_second': 30.326, 'eval_steps_per_second': 2.123, 'epoch': 4.0}\n"
394
+ ]
395
+ },
396
+ {
397
+ "name": "stderr",
398
+ "output_type": "stream",
399
+ "text": [
400
+ " \n",
401
+ "100%|██████████| 145/145 [1:00:40<00:00, 19.18s/it]"
402
+ ]
403
+ },
404
+ {
405
+ "name": "stdout",
406
+ "output_type": "stream",
407
+ "text": [
408
+ "{'eval_loss': 0.39272159337997437, 'eval_f1': 0.8914389523348718, 'eval_runtime': 3.5564, 'eval_samples_per_second': 28.118, 'eval_steps_per_second': 1.968, 'epoch': 5.0}\n"
409
+ ]
410
+ },
411
+ {
412
+ "name": "stderr",
413
+ "output_type": "stream",
414
+ "text": [
415
+ "100%|██████████| 145/145 [1:00:42<00:00, 25.12s/it]\n"
416
+ ]
417
+ },
418
+ {
419
+ "name": "stdout",
420
+ "output_type": "stream",
421
+ "text": [
422
+ "{'train_runtime': 3642.7783, 'train_samples_per_second': 1.235, 'train_steps_per_second': 0.04, 'train_loss': 0.535627057634551, 'epoch': 5.0}\n"
423
+ ]
424
+ },
425
+ {
426
+ "name": "stderr",
427
+ "output_type": "stream",
428
+ "text": [
429
+ "events.out.tfevents.1735555878.Davids-MacBook-Pro.local.23438.0: 100%|██████████| 9.32k/9.32k [00:00<00:00, 55.0kB/s]\n"
430
+ ]
431
+ },
432
+ {
433
+ "data": {
434
+ "text/plain": [
435
+ "CommitInfo(commit_url='https://huggingface.co/davidberenstein1957/domain-classifier/commit/915f4b03c230cc8f376f13729728f14347400041', commit_message='End of training', commit_description='', oid='915f4b03c230cc8f376f13729728f14347400041', pr_url=None, repo_url=RepoUrl('https://huggingface.co/davidberenstein1957/domain-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='davidberenstein1957/domain-classifier'), pr_revision=None, pr_num=None)"
436
+ ]
437
+ },
438
+ "execution_count": 7,
439
+ "metadata": {},
440
+ "output_type": "execute_result"
441
+ }
442
+ ],
443
+ "source": [
444
+ "trainer.train()\n",
445
+ "# Save processor and create model card\n",
446
+ "tokenizer.save_pretrained(\"ModernBERT-domain-classifier\")\n",
447
+ "trainer.create_model_card()\n",
448
+ "trainer.push_to_hub()"
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "markdown",
453
+ "metadata": {},
454
+ "source": [
455
+ "We get an F1 score of 0.89 on the test set, which is pretty good for the small dataset and time spent."
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "markdown",
460
+ "metadata": {},
461
+ "source": [
462
+ "## Run inference\n",
463
+ "\n",
464
+ "We can now load the model and run inference."
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": 11,
470
+ "metadata": {},
471
+ "outputs": [
472
+ {
473
+ "name": "stderr",
474
+ "output_type": "stream",
475
+ "text": [
476
+ "Device set to use mps:0\n"
477
+ ]
478
+ },
479
+ {
480
+ "data": {
481
+ "text/plain": [
482
+ "[{'label': 'health', 'score': 0.6779336333274841}]"
483
+ ]
484
+ },
485
+ "execution_count": 11,
486
+ "metadata": {},
487
+ "output_type": "execute_result"
488
+ }
489
+ ],
490
+ "source": [
491
+ "from transformers import pipeline\n",
492
+ " \n",
493
+ "# load model from huggingface.co/models using our repository id\n",
494
+ "classifier = pipeline(\n",
495
+ " task=\"text-classification\", \n",
496
+ " model=\"argilla/ModernBERT-domain-classifier\", \n",
497
+ " device=0,\n",
498
+ ")\n",
499
+ " \n",
500
+ "sample = \"Smoking is bad for your health.\"\n",
501
+ " \n",
502
+ "classifier(sample)"
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "markdown",
507
+ "metadata": {},
508
+ "source": [
509
+ "## Conclusion\n",
510
+ "\n",
511
+ "We have shown that we can generate a synthetic dataset from an LLM and fine-tune a ModernBERT model on it. This demonstrates the effectiveness of synthetic data and the novel ModernBERT model, which is a new and improved version of BERT models, with an 8192 token context length, significantly better downstream performance, and much faster processing speeds. \n",
512
+ "\n",
513
+ "Pretty cool for 20 minutes of generating data, and an hour of fine-tuning on consumer hardware."
514
+ ]
515
+ }
516
+ ],
517
+ "metadata": {
518
+ "kernelspec": {
519
+ "display_name": ".venv",
520
+ "language": "python",
521
+ "name": "python3"
522
+ },
523
+ "language_info": {
524
+ "codemirror_mode": {
525
+ "name": "ipython",
526
+ "version": 3
527
+ },
528
+ "file_extension": ".py",
529
+ "mimetype": "text/x-python",
530
+ "name": "python",
531
+ "nbconvert_exporter": "python",
532
+ "pygments_lexer": "ipython3",
533
+ "version": "3.11.11"
534
+ }
535
+ },
536
+ "nbformat": 4,
537
+ "nbformat_minor": 2
538
+ }
examples/fine-tune-modernbert-rag.ipynb ADDED
@@ -0,0 +1,980 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Fine-tune ModernBERT with Synthetic Data for RAG\n",
8
+ "\n",
9
+ "This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
10
+ "\n",
11
+ "It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT for RAG with Synthetic Data](https://huggingface.co/blog/fine-tune-modernbert-for-rag-with-synthetic-data)."
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "markdown",
16
+ "metadata": {},
17
+ "source": [
18
+ "## Getting Started"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {},
24
+ "source": [
25
+ "### Install the Dependencies"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "!pip install torch\n",
35
+ "!pip install datasets\n",
36
+ "!pip install sentence-transformers\n",
37
+ "!pip install haystack-ai\n",
38
+ "!pip install git+https://github.com/huggingface/transformers.git # for the latest version of transformers"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "markdown",
43
+ "metadata": {},
44
+ "source": [
45
+ "### Import the Required Libraries"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 1,
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "import torch\n",
55
+ "from torch.utils.data import DataLoader\n",
56
+ "\n",
57
+ "from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict\n",
58
+ "\n",
59
+ "\n",
60
+ "from sentence_transformers import (\n",
61
+ " SentenceTransformer,\n",
62
+ " SentenceTransformerModelCardData,\n",
63
+ " CrossEncoder,\n",
64
+ " InputExample,\n",
65
+ " SentenceTransformerTrainer,\n",
66
+ ")\n",
67
+ "from sentence_transformers.losses import TripletLoss\n",
68
+ "from sentence_transformers.training_args import (\n",
69
+ " SentenceTransformerTrainingArguments,\n",
70
+ " BatchSamplers,\n",
71
+ ")\n",
72
+ "from sentence_transformers.evaluation import TripletEvaluator\n",
73
+ "from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator\n",
74
+ "\n",
75
+ "\n",
76
+ "from haystack import Document, Pipeline\n",
77
+ "from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
78
+ "from haystack.components.embedders import (\n",
79
+ " SentenceTransformersDocumentEmbedder,\n",
80
+ " SentenceTransformersTextEmbedder,\n",
81
+ ")\n",
82
+ "from haystack.components.rankers import SentenceTransformersDiversityRanker\n",
83
+ "from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\n",
84
+ "from haystack.components.builders import ChatPromptBuilder\n",
85
+ "from haystack.components.generators.chat import HuggingFaceAPIChatGenerator\n",
86
+ "from haystack.dataclasses import ChatMessage\n",
87
+ "from haystack.utils import Secret\n",
88
+ "from haystack.utils.hf import HFGenerationAPIType"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "markdown",
93
+ "metadata": {},
94
+ "source": [
95
+ "### Configure the Environment"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 2,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "MODEL = \"nomic-ai/modernbert-embed-base\"\n",
105
+ "REPO_NAME = \"sdiazlor\" # your HF username here\n",
106
+ "MODEL_NAME_BIENCODER = \"modernbert-embed-base-biencoder-human-rights\"\n",
107
+ "MODEL_NAME_CROSSENCODER = \"modernbert-embed-base-crossencoder-human-rights\""
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "name": "stdout",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "Using device: mps\n"
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "if torch.backends.mps.is_available():\n",
125
+ " device = \"mps\"\n",
126
+ "elif torch.cuda.is_available():\n",
127
+ " device = \"cuda\"\n",
128
+ "else:\n",
129
+ " device = \"cpu\"\n",
130
+ "\n",
131
+ "print(f\"Using device: {device}\")"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "markdown",
136
+ "metadata": {},
137
+ "source": [
138
+ "## Pre-process the Synthetic Data"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 3,
144
+ "metadata": {},
145
+ "outputs": [
146
+ {
147
+ "data": {
148
+ "text/plain": [
149
+ "Dataset({\n",
150
+ " features: ['context', 'question', 'response', 'positive_retrieval', 'negative_retrieval', 'positive_reranking', 'negative_reranking'],\n",
151
+ " num_rows: 1000\n",
152
+ "})"
153
+ ]
154
+ },
155
+ "execution_count": 3,
156
+ "metadata": {},
157
+ "output_type": "execute_result"
158
+ }
159
+ ],
160
+ "source": [
161
+ "# Combine the generated datasets from files and prompts\n",
162
+ "\n",
163
+ "dataset_rag_from_file = load_dataset(f\"{REPO_NAME}/rag-human-rights-from-files\", split=\"train\")\n",
164
+ "dataset_rag_from_prompt = load_dataset(f\"{REPO_NAME}/rag-human-rights-from-prompt\", split=\"train\")\n",
165
+ "\n",
166
+ "combined_rag_dataset = concatenate_datasets(\n",
167
+ " [dataset_rag_from_file, dataset_rag_from_prompt]\n",
168
+ ")\n",
169
+ "\n",
170
+ "combined_rag_dataset"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": null,
176
+ "metadata": {},
177
+ "outputs": [
178
+ {
179
+ "data": {
180
+ "text/plain": [
181
+ "Dataset({\n",
182
+ " features: ['context', 'question', 'response', 'positive_retrieval', 'negative_retrieval', 'positive_reranking', 'negative_reranking'],\n",
183
+ " num_rows: 828\n",
184
+ "})"
185
+ ]
186
+ },
187
+ "execution_count": 6,
188
+ "metadata": {},
189
+ "output_type": "execute_result"
190
+ }
191
+ ],
192
+ "source": [
193
+ "# Filter out examples with empty or NaN values\n",
194
+ "\n",
195
+ "def filter_empty_or_nan(example):\n",
196
+ " return all(\n",
197
+ " value is not None and str(value).strip() != \"\" for value in example.values()\n",
198
+ " )\n",
199
+ "\n",
200
+ "filtered_rag_dataset = combined_rag_dataset.filter(filter_empty_or_nan).shuffle(seed=42)\n",
201
+ "filtered_rag_dataset"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": null,
207
+ "metadata": {},
208
+ "outputs": [
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "Dataset({\n",
214
+ " features: ['anchor', 'positive', 'negative'],\n",
215
+ " num_rows: 828\n",
216
+ "})\n",
217
+ "Dataset({\n",
218
+ " features: ['anchor', 'positive'],\n",
219
+ " num_rows: 828\n",
220
+ "})\n"
221
+ ]
222
+ }
223
+ ],
224
+ "source": [
225
+ "# Rename, select and reorder columns according to the expected format for the SentenceTransformer and CrossEncoder models\n",
226
+ "\n",
227
+ "def rename_and_reorder_columns(dataset, rename_map, selected_columns):\n",
228
+ " for old_name, new_name in rename_map.items():\n",
229
+ " if old_name in dataset.column_names:\n",
230
+ " dataset = dataset.rename_column(old_name, new_name)\n",
231
+ " dataset = dataset.select_columns(selected_columns)\n",
232
+ " return dataset\n",
233
+ "\n",
234
+ "clean_rag_dataset_biencoder = rename_and_reorder_columns(\n",
235
+ " filtered_rag_dataset,\n",
236
+ " rename_map={\"context\": \"anchor\", \"positive_retrieval\": \"positive\", \"negative_retrieval\": \"negative\"},\n",
237
+ " selected_columns=[\"anchor\", \"positive\", \"negative\"],\n",
238
+ ")\n",
239
+ "\n",
240
+ "clean_rag_dataset_crossencoder = rename_and_reorder_columns(\n",
241
+ " filtered_rag_dataset,\n",
242
+ " rename_map={\"context\": \"anchor\", \"positive_retrieval\": \"positive\"}, #TODO\n",
243
+ " selected_columns=[\"anchor\", \"positive\"],\n",
244
+ ")\n",
245
+ "\n",
246
+ "print(clean_rag_dataset_biencoder)\n",
247
+ "print(clean_rag_dataset_crossencoder)"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": null,
253
+ "metadata": {},
254
+ "outputs": [
255
+ {
256
+ "name": "stderr",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m-v1.5 and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']\n",
260
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
261
+ ]
262
+ },
263
+ {
264
+ "data": {
265
+ "application/vnd.jupyter.widget-view+json": {
266
+ "model_id": "406c4d22f43f41d592d3b94da2955444",
267
+ "version_major": 2,
268
+ "version_minor": 0
269
+ },
270
+ "text/plain": [
271
+ "Map: 0%| | 0/828 [00:00<?, ? examples/s]"
272
+ ]
273
+ },
274
+ "metadata": {},
275
+ "output_type": "display_data"
276
+ },
277
+ {
278
+ "data": {
279
+ "text/plain": [
280
+ "Dataset({\n",
281
+ " features: ['anchor', 'positive', 'score'],\n",
282
+ " num_rows: 828\n",
283
+ "})"
284
+ ]
285
+ },
286
+ "execution_count": 8,
287
+ "metadata": {},
288
+ "output_type": "execute_result"
289
+ }
290
+ ],
291
+ "source": [
292
+ "# Add scores to train the CrossEncoder model, which requires sentence pairs with a score indicating how related they are.\n",
293
+ "# Check the available models: https://huggingface.co/spaces/mteb/leaderboard\n",
294
+ "\n",
295
+ "model_reranking = CrossEncoder(\n",
296
+ " model_name=\"Snowflake/snowflake-arctic-embed-m-v1.5\", device=device\n",
297
+ ")\n",
298
+ "\n",
299
+ "def add_reranking_scores(batch):\n",
300
+ " pairs = list(zip(batch[\"anchor\"], batch[\"positive\"]))\n",
301
+ " batch[\"score\"] = model_reranking.predict(pairs)\n",
302
+ " return batch\n",
303
+ "\n",
304
+ "clean_rag_dataset_crossencoder = clean_rag_dataset_crossencoder.map(\n",
305
+ " add_reranking_scores, batched=True, batch_size=250\n",
306
+ ")\n",
307
+ "clean_rag_dataset_crossencoder"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": null,
313
+ "metadata": {},
314
+ "outputs": [
315
+ {
316
+ "name": "stdout",
317
+ "output_type": "stream",
318
+ "text": [
319
+ "DatasetDict({\n",
320
+ " train: Dataset({\n",
321
+ " features: ['anchor', 'positive', 'negative'],\n",
322
+ " num_rows: 662\n",
323
+ " })\n",
324
+ " eval: Dataset({\n",
325
+ " features: ['anchor', 'positive', 'negative'],\n",
326
+ " num_rows: 166\n",
327
+ " })\n",
328
+ "})\n",
329
+ "DatasetDict({\n",
330
+ " train: Dataset({\n",
331
+ " features: ['anchor', 'positive', 'score'],\n",
332
+ " num_rows: 662\n",
333
+ " })\n",
334
+ " eval: Dataset({\n",
335
+ " features: ['anchor', 'positive', 'score'],\n",
336
+ " num_rows: 166\n",
337
+ " })\n",
338
+ "})\n"
339
+ ]
340
+ }
341
+ ],
342
+ "source": [
343
+ "# Split the datasets into training and evaluation sets\n",
344
+ "def split_dataset(dataset, train_size=0.8, seed=42):\n",
345
+ " train_eval_split = dataset.train_test_split(test_size=1 - train_size, seed=seed)\n",
346
+ "\n",
347
+ " dataset_dict = DatasetDict(\n",
348
+ " {\"train\": train_eval_split[\"train\"], \"eval\": train_eval_split[\"test\"]}\n",
349
+ " )\n",
350
+ "\n",
351
+ " return dataset_dict\n",
352
+ "\n",
353
+ "dataset_rag_biencoder = split_dataset(clean_rag_dataset_biencoder)\n",
354
+ "dataset_rag_crossencoder = split_dataset(clean_rag_dataset_crossencoder)\n",
355
+ "\n",
356
+ "print(dataset_rag_biencoder)\n",
357
+ "print(dataset_rag_crossencoder)"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "markdown",
362
+ "metadata": {},
363
+ "source": [
364
+ "## Train the Bi-Encoder model for Retrieval"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "code",
369
+ "execution_count": null,
370
+ "metadata": {},
371
+ "outputs": [],
372
+ "source": [
373
+ "# Load the base model and create the SentenceTransformer model\n",
374
+ "model_biencoder = SentenceTransformer(\n",
375
+ " MODEL,\n",
376
+ " model_card_data=SentenceTransformerModelCardData(\n",
377
+ " language=\"en\",\n",
378
+ " license=\"apache-2.0\",\n",
379
+ " model_name=MODEL_NAME_BIENCODER,\n",
380
+ " ),\n",
381
+ ")\n",
382
+ "model_biencoder.gradient_checkpointing_enable() # Enable gradient checkpointing to save memory"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": null,
388
+ "metadata": {},
389
+ "outputs": [],
390
+ "source": [
391
+ "# Select the TripleLoss loss function which requires sentence triplets (anchor, positive, negative)\n",
392
+ "# Check the available losses: https://sbert.net/docs/sentence_transformer/loss_overview.html\n",
393
+ "\n",
394
+ "loss_biencoder = TripletLoss"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "metadata": {},
401
+ "outputs": [
402
+ {
403
+ "name": "stderr",
404
+ "output_type": "stream",
405
+ "text": [
406
+ "/Users/sdiazlor/.pyenv/versions/3.11.4/envs/distilabel-tutorials/lib/python3.11/site-packages/transformers/training_args.py:2243: UserWarning: `use_mps_device` is deprecated and will be removed in version 5.0 of 🤗 Transformers. `mps` device will be used by default if available similar to the way `cuda` device is used.Therefore, no action from user is required. \n",
407
+ " warnings.warn(\n"
408
+ ]
409
+ }
410
+ ],
411
+ "source": [
412
+ "# Define the training arguments for the SentenceTransformer model\n",
413
+ "# Customize them as needed for your requirements\n",
414
+ "\n",
415
+ "training_args = SentenceTransformerTrainingArguments(\n",
416
+ " output_dir=f\"models/{MODEL_NAME_BIENCODER}\",\n",
417
+ " num_train_epochs=3,\n",
418
+ " per_device_train_batch_size=4,\n",
419
+ " gradient_accumulation_steps=4,\n",
420
+ " per_device_eval_batch_size=4,\n",
421
+ " warmup_ratio=0.1,\n",
422
+ " learning_rate=2e-5,\n",
423
+ " lr_scheduler_type=\"cosine\",\n",
424
+ " fp16=False, # or True if stable on your MPS device\n",
425
+ " bf16=False,\n",
426
+ " batch_sampler=BatchSamplers.NO_DUPLICATES,\n",
427
+ " eval_strategy=\"epoch\",\n",
428
+ " save_strategy=\"epoch\",\n",
429
+ " save_total_limit=2,\n",
430
+ " logging_steps=100,\n",
431
+ " load_best_model_at_end=True,\n",
432
+ " use_mps_device=(device == \"mps\"),\n",
433
+ ")"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": null,
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "# Define the evaluator to assess the performance of the model\n",
443
+ "triplet_evaluator = TripletEvaluator(\n",
444
+ " anchors=dataset_rag_biencoder[\"eval\"][\"anchor\"],\n",
445
+ " positives=dataset_rag_biencoder[\"eval\"][\"positive\"],\n",
446
+ " negatives=dataset_rag_biencoder[\"eval\"][\"negative\"],\n",
447
+ ")"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": null,
453
+ "metadata": {},
454
+ "outputs": [
455
+ {
456
+ "name": "stderr",
457
+ "output_type": "stream",
458
+ "text": [
459
+ "/Users/sdiazlor/.pyenv/versions/3.11.4/envs/distilabel-tutorials/lib/python3.11/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
460
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n"
461
+ ]
462
+ },
463
+ {
464
+ "data": {
465
+ "text/html": [
466
+ "\n",
467
+ " <div>\n",
468
+ " \n",
469
+ " <progress value='123' max='123' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
470
+ " [123/123 25:34, Epoch 2/3]\n",
471
+ " </div>\n",
472
+ " <table border=\"1\" class=\"dataframe\">\n",
473
+ " <thead>\n",
474
+ " <tr style=\"text-align: left;\">\n",
475
+ " <th>Epoch</th>\n",
476
+ " <th>Training Loss</th>\n",
477
+ " <th>Validation Loss</th>\n",
478
+ " <th>Cosine Accuracy</th>\n",
479
+ " </tr>\n",
480
+ " </thead>\n",
481
+ " <tbody>\n",
482
+ " <tr>\n",
483
+ " <td>1</td>\n",
484
+ " <td>No log</td>\n",
485
+ " <td>3.655929</td>\n",
486
+ " <td>0.969880</td>\n",
487
+ " </tr>\n",
488
+ " <tr>\n",
489
+ " <td>2</td>\n",
490
+ " <td>14.374000</td>\n",
491
+ " <td>3.498395</td>\n",
492
+ " <td>0.981928</td>\n",
493
+ " </tr>\n",
494
+ " </tbody>\n",
495
+ "</table><p>"
496
+ ],
497
+ "text/plain": [
498
+ "<IPython.core.display.HTML object>"
499
+ ]
500
+ },
501
+ "metadata": {},
502
+ "output_type": "display_data"
503
+ },
504
+ {
505
+ "data": {
506
+ "application/vnd.jupyter.widget-view+json": {
507
+ "model_id": "faad6e9752f34babadff7a966ae55d87",
508
+ "version_major": 2,
509
+ "version_minor": 0
510
+ },
511
+ "text/plain": [
512
+ "Computing widget examples: 0%| | 0/1 [00:00<?, ?example/s]"
513
+ ]
514
+ },
515
+ "metadata": {},
516
+ "output_type": "display_data"
517
+ },
518
+ {
519
+ "name": "stderr",
520
+ "output_type": "stream",
521
+ "text": [
522
+ "/Users/sdiazlor/.pyenv/versions/3.11.4/envs/distilabel-tutorials/lib/python3.11/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
523
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n",
524
+ "/Users/sdiazlor/.pyenv/versions/3.11.4/envs/distilabel-tutorials/lib/python3.11/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
525
+ " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n"
526
+ ]
527
+ }
528
+ ],
529
+ "source": [
530
+ "# Train the model. This will take some time depending on the size of the dataset and the model\n",
531
+ "# Remember to adjust the training arguments according to your requirements\n",
532
+ "\n",
533
+ "trainer = SentenceTransformerTrainer(\n",
534
+ " model=model_biencoder,\n",
535
+ " args=training_args,\n",
536
+ " train_dataset=dataset_rag_biencoder[\"train\"],\n",
537
+ " eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
538
+ " loss=loss_biencoder,\n",
539
+ " evaluator=triplet_evaluator,\n",
540
+ ")\n",
541
+ "trainer.train()"
542
+ ]
543
+ },
544
+ {
545
+ "cell_type": "code",
546
+ "execution_count": null,
547
+ "metadata": {},
548
+ "outputs": [],
549
+ "source": [
550
+ "# Save the model to the local directory and push it to the Hub\n",
551
+ "model_biencoder.save_pretrained(f\"models/{MODEL_NAME_BIENCODER}\")\n",
552
+ "model_biencoder.push_to_hub(f\"{REPO_NAME}/{MODEL_NAME_BIENCODER}\")"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "markdown",
557
+ "metadata": {},
558
+ "source": [
559
+ "## Train the Cross-Encoder model for Ranking"
560
+ ]
561
+ },
562
+ {
563
+ "cell_type": "code",
564
+ "execution_count": null,
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": [
568
+ "# Prepare the training and evaluation samples for the CrossEncoder model\n",
569
+ "\n",
570
+ "train_samples = []\n",
571
+ "for row in dataset_rag_crossencoder[\"train\"]:\n",
572
+ " # Suppose 'score' is a float or an integer that you want to predict\n",
573
+ " train_samples.append(\n",
574
+ " InputExample(texts=[row[\"anchor\"], row[\"positive\"]], label=float(row[\"score\"]))\n",
575
+ " )\n",
576
+ "\n",
577
+ "eval_samples = []\n",
578
+ "for row in dataset_rag_crossencoder[\"eval\"]:\n",
579
+ " eval_samples.append(\n",
580
+ " InputExample(texts=[row[\"anchor\"], row[\"positive\"]], label=float(row[\"score\"]))\n",
581
+ " )\n",
582
+ "\n",
583
+ "# Initialize the DataLoader for the training samples\n",
584
+ "train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=4)"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": null,
590
+ "metadata": {},
591
+ "outputs": [],
592
+ "source": [
593
+ "# Initialize the CrossEncoder model. Set the number of labels to 1 for regression tasks\n",
594
+ "model_crossencoder = CrossEncoder(model_name=MODEL, num_labels=1)"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": null,
600
+ "metadata": {},
601
+ "outputs": [],
602
+ "source": [
603
+ "# Define the evaluator\n",
604
+ "evaluator = CECorrelationEvaluator.from_input_examples(eval_samples)"
605
+ ]
606
+ },
607
+ {
608
+ "cell_type": "code",
609
+ "execution_count": null,
610
+ "metadata": {},
611
+ "outputs": [
612
+ {
613
+ "data": {
614
+ "application/vnd.jupyter.widget-view+json": {
615
+ "model_id": "9517a852f3d34cff86808c4b10cf8973",
616
+ "version_major": 2,
617
+ "version_minor": 0
618
+ },
619
+ "text/plain": [
620
+ "Epoch: 0%| | 0/3 [00:00<?, ?it/s]"
621
+ ]
622
+ },
623
+ "metadata": {},
624
+ "output_type": "display_data"
625
+ },
626
+ {
627
+ "data": {
628
+ "application/vnd.jupyter.widget-view+json": {
629
+ "model_id": "6e942043c5a24e77bd6172cb5492d2a7",
630
+ "version_major": 2,
631
+ "version_minor": 0
632
+ },
633
+ "text/plain": [
634
+ "Iteration: 0%| | 0/166 [00:00<?, ?it/s]"
635
+ ]
636
+ },
637
+ "metadata": {},
638
+ "output_type": "display_data"
639
+ },
640
+ {
641
+ "data": {
642
+ "application/vnd.jupyter.widget-view+json": {
643
+ "model_id": "d039d5acf3ed424e9ff6d0b30b51aceb",
644
+ "version_major": 2,
645
+ "version_minor": 0
646
+ },
647
+ "text/plain": [
648
+ "Iteration: 0%| | 0/166 [00:00<?, ?it/s]"
649
+ ]
650
+ },
651
+ "metadata": {},
652
+ "output_type": "display_data"
653
+ },
654
+ {
655
+ "data": {
656
+ "application/vnd.jupyter.widget-view+json": {
657
+ "model_id": "5fd5d0442b76448e8cab18b652e29ad8",
658
+ "version_major": 2,
659
+ "version_minor": 0
660
+ },
661
+ "text/plain": [
662
+ "Iteration: 0%| | 0/166 [00:00<?, ?it/s]"
663
+ ]
664
+ },
665
+ "metadata": {},
666
+ "output_type": "display_data"
667
+ }
668
+ ],
669
+ "source": [
670
+ "# Train the CrossEncoder model\n",
671
+ "\n",
672
+ "model_crossencoder.fit(\n",
673
+ " train_dataloader=train_dataloader,\n",
674
+ " evaluator=evaluator,\n",
675
+ " epochs=3,\n",
676
+ " warmup_steps=500,\n",
677
+ " output_path=f\"models/{MODEL_NAME_CROSSENCODER}\",\n",
678
+ " save_best_model=True,\n",
679
+ ")"
680
+ ]
681
+ },
682
+ {
683
+ "cell_type": "code",
684
+ "execution_count": null,
685
+ "metadata": {},
686
+ "outputs": [],
687
+ "source": [
688
+ "# Save the model to the local directory and push it to the Hub\n",
689
+ "model_crossencoder.save_pretrained(f\"models/{MODEL_NAME_CROSSENCODER}\")\n",
690
+ "model_crossencoder.push_to_hub(f\"{REPO_NAME}/{MODEL_NAME_CROSSENCODER}\")"
691
+ ]
692
+ },
693
+ {
694
+ "cell_type": "markdown",
695
+ "metadata": {},
696
+ "source": [
697
+ "## Build the RAG Pipeline\n",
698
+ "\n",
699
+ "The following section is inspired by the Haystack tutorial, check it for further details: [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline)"
700
+ ]
701
+ },
702
+ {
703
+ "cell_type": "code",
704
+ "execution_count": 4,
705
+ "metadata": {},
706
+ "outputs": [],
707
+ "source": [
708
+ "# Add the documents to the DocumentStore\n",
709
+ "# Use the already chunked documents from original datasets\n",
710
+ "\n",
711
+ "df = combined_rag_dataset.to_pandas()\n",
712
+ "df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
713
+ "df = df.sample(n=10, random_state=42) # optional: sample a subset of the dataset\n",
714
+ "dataset = Dataset.from_pandas(df)\n",
715
+ "\n",
716
+ "docs = [Document(content=doc[\"context\"]) for doc in dataset]"
717
+ ]
718
+ },
719
+ {
720
+ "cell_type": "code",
721
+ "execution_count": null,
722
+ "metadata": {},
723
+ "outputs": [],
724
+ "source": [
725
+ "# Initialize the document store and store the documents with the embeddings using our bi-encoder model\n",
726
+ "\n",
727
+ "document_store = InMemoryDocumentStore()\n",
728
+ "doc_embedder = SentenceTransformersDocumentEmbedder(\n",
729
+ " model=f\"{REPO_NAME}/{MODEL_NAME_BIENCODER}\",\n",
730
+ ")\n",
731
+ "doc_embedder.warm_up()\n",
732
+ "\n",
733
+ "docs_with_embeddings = doc_embedder.run(docs)\n",
734
+ "document_store.write_documents(docs_with_embeddings[\"documents\"])\n",
735
+ "\n",
736
+ "text_embedder = SentenceTransformersTextEmbedder(\n",
737
+ " model=f\"{REPO_NAME}/{MODEL_NAME_BIENCODER}\",\n",
738
+ ")"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": null,
744
+ "metadata": {},
745
+ "outputs": [],
746
+ "source": [
747
+ "# Initialize the retriever (our bi-encoder model) and the ranker (our cross-encoder model)\n",
748
+ "\n",
749
+ "retriever = InMemoryEmbeddingRetriever(document_store)\n",
750
+ "ranker = SentenceTransformersDiversityRanker(\n",
751
+ " model=f\"{REPO_NAME}/{MODEL_NAME_CROSSENCODER}\"\n",
752
+ ")"
753
+ ]
754
+ },
755
+ {
756
+ "cell_type": "code",
757
+ "execution_count": null,
758
+ "metadata": {},
759
+ "outputs": [],
760
+ "source": [
761
+ "# Define the prompt builder and the chat generator to interact with the models using the HF Serverless Inference API\n",
762
+ "\n",
763
+ "template = [\n",
764
+ " ChatMessage.from_user(\n",
765
+ " \"\"\"\n",
766
+ "Given the following information, answer the question.\n",
767
+ "\n",
768
+ "Context:\n",
769
+ "{% for document in documents %}\n",
770
+ " {{ document.content }}\n",
771
+ "{% endfor %}\n",
772
+ "\n",
773
+ "Question: {{question}}\n",
774
+ "Answer:\n",
775
+ "\"\"\"\n",
776
+ " )\n",
777
+ "]\n",
778
+ "\n",
779
+ "prompt_builder = ChatPromptBuilder(template=template)\n",
780
+ "\n",
781
+ "chat_generator = HuggingFaceAPIChatGenerator(\n",
782
+ " api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,\n",
783
+ " api_params={\"model\": \"meta-llama/Llama-3.1-8B-Instruct\"},\n",
784
+ " token=Secret.from_env_var(\"HF_TOKEN\"),\n",
785
+ ")"
786
+ ]
787
+ },
788
+ {
789
+ "cell_type": "code",
790
+ "execution_count": null,
791
+ "metadata": {},
792
+ "outputs": [],
793
+ "source": [
794
+ "# Initialize the pipeline with the components\n",
795
+ "\n",
796
+ "rag_pipeline = Pipeline()\n",
797
+ "rag_pipeline.add_component(\"text_embedder\", text_embedder)\n",
798
+ "rag_pipeline.add_component(\"retriever\", retriever)\n",
799
+ "rag_pipeline.add_component(\"ranker\", ranker)\n",
800
+ "rag_pipeline.add_component(\"prompt_builder\", prompt_builder)\n",
801
+ "rag_pipeline.add_component(\"llm\", chat_generator)"
802
+ ]
803
+ },
804
+ {
805
+ "cell_type": "code",
806
+ "execution_count": null,
807
+ "metadata": {},
808
+ "outputs": [
809
+ {
810
+ "data": {
811
+ "text/plain": [
812
+ "<haystack.core.pipeline.pipeline.Pipeline object at 0x32e75b4d0>\n",
813
+ "🚅 Components\n",
814
+ " - text_embedder: SentenceTransformersTextEmbedder\n",
815
+ " - retriever: InMemoryEmbeddingRetriever\n",
816
+ " - ranker: SentenceTransformersDiversityRanker\n",
817
+ " - prompt_builder: ChatPromptBuilder\n",
818
+ " - llm: HuggingFaceAPIChatGenerator\n",
819
+ "🛤️ Connections\n",
820
+ " - text_embedder.embedding -> retriever.query_embedding (List[float])\n",
821
+ " - retriever.documents -> ranker.documents (List[Document])\n",
822
+ " - ranker.documents -> prompt_builder.documents (List[Document])\n",
823
+ " - prompt_builder.prompt -> llm.messages (List[ChatMessage])"
824
+ ]
825
+ },
826
+ "execution_count": 12,
827
+ "metadata": {},
828
+ "output_type": "execute_result"
829
+ }
830
+ ],
831
+ "source": [
832
+ "# Connect the components to each other\n",
833
+ "\n",
834
+ "rag_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n",
835
+ "rag_pipeline.connect(\"retriever.documents\", \"ranker.documents\")\n",
836
+ "rag_pipeline.connect(\"ranker\", \"prompt_builder\")\n",
837
+ "rag_pipeline.connect(\"prompt_builder.prompt\", \"llm.messages\")"
838
+ ]
839
+ },
840
+ {
841
+ "cell_type": "code",
842
+ "execution_count": null,
843
+ "metadata": {},
844
+ "outputs": [
845
+ {
846
+ "data": {
847
+ "application/vnd.jupyter.widget-view+json": {
848
+ "model_id": "80c813c847524f1493067f6dbe65c725",
849
+ "version_major": 2,
850
+ "version_minor": 0
851
+ },
852
+ "text/plain": [
853
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
854
+ ]
855
+ },
856
+ "metadata": {},
857
+ "output_type": "display_data"
858
+ },
859
+ {
860
+ "name": "stdout",
861
+ "output_type": "stream",
862
+ "text": [
863
+ "It seems that there is not enough information given in the human rights protocols provided to accurately answer the question. However, we can inform you that there are several types of human rights documents that this could be referring too. Event the most widely respected declared world document on human rights for Example - Exernal and some Individual (Part 1 Art.) and some other attempted Separation apart include: The convention lists several key rights such as \n",
864
+ "\n",
865
+ "1. Right to Life \n",
866
+ "2. Right to Liberty and Security \n",
867
+ "3. Freedom from Torture \n",
868
+ "4. Freedom from Slavery \n",
869
+ "5. Right to a Fair Trial \n",
870
+ "6. No Punishment without Law \n",
871
+ "7. Respect for Family Life \n",
872
+ "... (and throughout given information 44 protocals - are actually chapter and not... How is the answer \n",
873
+ " \n",
874
+ "\n",
875
+ "Not possible to answer your question due to lack of information, however we can tell you Event the most widely respected declared world document on human rights.\n"
876
+ ]
877
+ }
878
+ ],
879
+ "source": [
880
+ "# Make a query to the pipeline without references included in your documentation\n",
881
+ "question = \"How many human rights there are?\"\n",
882
+ "\n",
883
+ "response = rag_pipeline.run(\n",
884
+ " {\n",
885
+ " \"text_embedder\": {\"text\": question},\n",
886
+ " \"prompt_builder\": {\"question\": question},\n",
887
+ " \"ranker\": {\"query\": question},\n",
888
+ " }\n",
889
+ ")\n",
890
+ "\n",
891
+ "print(response[\"llm\"][\"replies\"][0].text)"
892
+ ]
893
+ },
894
+ {
895
+ "cell_type": "code",
896
+ "execution_count": null,
897
+ "metadata": {},
898
+ "outputs": [
899
+ {
900
+ "data": {
901
+ "application/vnd.jupyter.widget-view+json": {
902
+ "model_id": "2995f14154d148589129a3f449adc5d5",
903
+ "version_major": 2,
904
+ "version_minor": 0
905
+ },
906
+ "text/plain": [
907
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
908
+ ]
909
+ },
910
+ "metadata": {},
911
+ "output_type": "display_data"
912
+ },
913
+ {
914
+ "name": "stdout",
915
+ "output_type": "stream",
916
+ "text": [
917
+ "The information you provided does not directly list the \"Right of Fair Trial\" but looking under articles of the Convention for the Protection of Human Rights and Fundamental Freedoms, Article 6, also known as the Right to a Fair Trial, gives a clear idea.\n",
918
+ "\n",
919
+ " Article 6. Right to a fair Trial\n",
920
+ " \n",
921
+ "\n",
922
+ "1. Everyone is entitled to a fair and public hearing within a reasonable time by an independent and impartial tribunal established by law.\n",
923
+ " \n",
924
+ "2, everybody shall be presumed innocent until proven guilty by a final decision of a competent court.\n",
925
+ " \n",
926
+ "3. Everyone charged with a criminal offence has the following minimum rights:\n",
927
+ "\n",
928
+ " a to be informed promptly, in a language which he understands and in detail, of the charges, if any, against him.\n",
929
+ " b to have adequate time and facilities for the preparation of his defence.\n",
930
+ " c to defend himself in person or through legal assistance of his own choosing or, if he has not sufficient means to pay for legal assistance, to be given it free when the interests of justice so require.\n",
931
+ " d to be tried in his presence, and to defend himself in person or through legal assistance of his own choosing; to be informed, if he does not have legal assistance chosen or appointed under Article 5 Part 3 of this Convention, to communicate with the defence he has chosen\n",
932
+ " e to have the free assistance of an interpreter if he cannot understand or speak the language used in court.\n",
933
+ " \n",
934
+ " \n",
935
+ "4. Everyone sentenced has the right to, review by a higher tribunal according to law\n",
936
+ "\n",
937
+ "5. Everyone sentenced has the right to, take up or pursue his occupation.\n",
938
+ "\n",
939
+ "6. Sentences may, also include restoration of rights or removal of disabilities\n"
940
+ ]
941
+ }
942
+ ],
943
+ "source": [
944
+ "# Make a query to the pipeline with references included in your documentation\n",
945
+ "question = \"What's the Right of Fair Trial?\"\n",
946
+ "\n",
947
+ "response = rag_pipeline.run(\n",
948
+ " {\n",
949
+ " \"text_embedder\": {\"text\": question},\n",
950
+ " \"prompt_builder\": {\"question\": question},\n",
951
+ " \"ranker\": {\"query\": question},\n",
952
+ " }\n",
953
+ ")\n",
954
+ "\n",
955
+ "print(response[\"llm\"][\"replies\"][0].text)"
956
+ ]
957
+ }
958
+ ],
959
+ "metadata": {
960
+ "kernelspec": {
961
+ "display_name": "distilabel-tutorials",
962
+ "language": "python",
963
+ "name": "python3"
964
+ },
965
+ "language_info": {
966
+ "codemirror_mode": {
967
+ "name": "ipython",
968
+ "version": 3
969
+ },
970
+ "file_extension": ".py",
971
+ "mimetype": "text/x-python",
972
+ "name": "python",
973
+ "nbconvert_exporter": "python",
974
+ "pygments_lexer": "ipython3",
975
+ "version": "3.11.4"
976
+ }
977
+ },
978
+ "nbformat": 4,
979
+ "nbformat_minor": 2
980
+ }
examples/fine-tune-smollm2-on-synthetic-data.ipynb ADDED
@@ -0,0 +1,310 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Fine-tune a SmolLM on domain-specific synthetic data from a LLM\n",
8
+ "\n",
9
+ "Yes, smoll models can beat GPT4-like models on domain-specific tasks but don't expect miracles. When comparing smoll vs large, consider all costs and gains like difference performance and the value of using private and local models and data that you own.\n",
10
+ "\n",
11
+ "The [Hugging Face SmolLM models](https://github.com/huggingface/smollm) are blazingly fast and remarkably powerful. With its 135M, 360M and 1.7B parameter models, it is a great choice for a small and fast model. The great thing about SmolLM is that it is a general-purpose model that can be fine-tuned on domain-specific data.\n",
12
+ "\n",
13
+ "A lack of domain-specific datasets is a common problem for smaller and more specialized models. This is because it is difficult to find a dataset that is both representative and diverse enough for a specific task. We solve this problem by generating a synthetic dataset from an LLM using the `synthetic-data-generator`, which is available as a [Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) or on [GitHub](https://github.com/argilla-io/synthetic-data-generator).\n",
14
+ "\n",
15
+ "In this example, we will fine-tune a SmolLM2 model on a synthetic dataset generated from `meta-llama/Meta-Llama-3.1-8B-Instruct` with the `synthetic-data-generator`.\n",
16
+ "\n",
17
+ "## Install the dependencies\n",
18
+ "\n",
19
+ "We will install some basic dependencies for the fine-tuning with `trl` but we will use the Synthetic Data Generator UI to generate the synthetic dataset."
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "!pip install transformers datasets trl torch"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "metadata": {},
34
+ "source": [
35
+ "## The problem\n",
36
+ "\n",
37
+ "Reasoning data has proven to be a fundamental change in the performance of generative models. Reasoning is amazing but it also means the model generates more \"chatty\" during the token generation process, causing the model to become slower and more expensive. For this reason, we want to create a model that can reason without being too chatty. Therefore, we will generate a concise reasoning dataset and fine-tune a SmolLM2 model on it.\n",
38
+ "\n",
39
+ "## Let's generate some data\n",
40
+ "\n",
41
+ "Let's go to the [hosted Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) to generate the data. This is done in three steps 1) we come up with a dataset description, 2) iterate on the task configuration, and 3) generate and push the data to Hugging Face. A more detailed flow can be found in [this blog post](https://huggingface.co/blog/synthetic-data-generator). \n",
42
+ "\n",
43
+ "<iframe\n",
44
+ "\tsrc=\"https://argilla-synthetic-data-generator.hf.space\"\n",
45
+ "\tframeborder=\"0\"\n",
46
+ "\twidth=\"850\"\n",
47
+ "\theight=\"450\"\n",
48
+ "></iframe>\n",
49
+ "\n",
50
+ "For this example, we will generate 5000 chat data examples for a single turn in the conversation. All examples have been generated with a temperature of 1. After some iteration, we come up with the following system prompt:\n",
51
+ "\n",
52
+ "```\n",
53
+ "You are an AI assistant who provides brief and to-the-point responses with logical step-by-step reasoning. Your purpose is to offer straightforward explanations and answers so that you can get to the heart of the issue. Respond with extremely concise, direct justifications and evidence-based conclusions. User questions are direct and concise.\n",
54
+ "```\n",
55
+ "\n",
56
+ "We press the \"Push to Hub\" button and wait for the data to be generated. This takes a few hours and we end up with a dataset with 5000 examples, which is the maximum number of examples we can generate in a single run. You can scale this by deploying a private instance of the Synthetic Data Generator. \n",
57
+ "\n",
58
+ "<iframe\n",
59
+ " src=\"https://huggingface.co/datasets/argilla/synthetic-concise-reasoning-sft-filtered/embed/viewer/default/train\"\n",
60
+ " frameborder=\"0\"\n",
61
+ " width=\"100%\"\n",
62
+ " height=\"560px\"\n",
63
+ "></iframe>\n",
64
+ "\n",
65
+ "The data is pushed to Argilla too so we recommend inspecting and validating the the data before finetuning the actual model. We applied some basic filters and transformations to the data to make it more suitable for fine-tuning.\n",
66
+ "\n",
67
+ "## Fine-tune the model\n",
68
+ "\n",
69
+ "We will use TRL to fine-tune the model. It is part of the Hugging Face ecosystem and works seamlessly on top of datasets generated by the synthetic data generator without needing to do any data transformations.\n",
70
+ "\n",
71
+ "### Load the model\n",
72
+ "\n",
73
+ "We will first load the model and tokenizer and set up the chat format."
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 5,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "# Import necessary libraries\n",
83
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
84
+ "from datasets import load_dataset\n",
85
+ "from trl import SFTConfig, SFTTrainer, setup_chat_format\n",
86
+ "import torch\n",
87
+ "import os\n",
88
+ "\n",
89
+ "device = (\n",
90
+ " \"cuda\"\n",
91
+ " if torch.cuda.is_available()\n",
92
+ " else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
93
+ ")\n",
94
+ "\n",
95
+ "# Load the model and tokenizer\n",
96
+ "model_name = \"HuggingFaceTB/SmolLM2-360M\"\n",
97
+ "model = AutoModelForCausalLM.from_pretrained(\n",
98
+ " pretrained_model_name_or_path=model_name\n",
99
+ ")\n",
100
+ "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)\n",
101
+ "\n",
102
+ "# Set up the chat format\n",
103
+ "model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "markdown",
108
+ "metadata": {},
109
+ "source": [
110
+ "### Test the base model\n",
111
+ "\n",
112
+ "We will first test the base model to see how it performs on the task. During this step we will also generate a prompt for the model to respond to, to see how it performs on the task."
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 2,
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stderr",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "Device set to use mps:0\n"
125
+ ]
126
+ },
127
+ {
128
+ "data": {
129
+ "text/plain": [
130
+ "[{'generated_text': 'What is the primary function of mitochondria within a cell?\\n\\nMitochondria are the powerhouses of the cell. They are responsible for the production of ATP (adenosine triphosphate) and the energy required for cellular processes.\\n\\nWhat is the function of the mitochondria in the cell?\\n\\nThe mitochondria are the powerhouses of the cell. They are responsible for the production of ATP (adenosine triphosphate) and the energy required for cellular processes.\\n\\nWhat is the function of the mitochondria in the cell?\\n\\nThe'}]"
131
+ ]
132
+ },
133
+ "execution_count": 2,
134
+ "metadata": {},
135
+ "output_type": "execute_result"
136
+ }
137
+ ],
138
+ "source": [
139
+ "from transformers import pipeline\n",
140
+ "\n",
141
+ "prompt = \"What is the primary function of mitochondria within a cell?\"\n",
142
+ "\n",
143
+ "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=device)\n",
144
+ "pipe(prompt, max_new_tokens=100)"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "markdown",
149
+ "metadata": {},
150
+ "source": [
151
+ "### Load the dataset\n",
152
+ "\n",
153
+ "For fine-tuning, we need to load the dataset and tokenize it. We will use the `synthetic-concise-reasoning-sft-filtered` dataset that we generated in the previous step."
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 2,
159
+ "metadata": {},
160
+ "outputs": [
161
+ {
162
+ "name": "stderr",
163
+ "output_type": "stream",
164
+ "text": [
165
+ "Map: 100%|██████████| 4133/4133 [00:00<00:00, 18478.53 examples/s]\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "from datasets import load_dataset\n",
171
+ "\n",
172
+ "ds = load_dataset(\"argilla/synthetic-concise-reasoning-sft-filtered\")\n",
173
+ "def tokenize_function(examples):\n",
174
+ " examples[\"text\"] = tokenizer.apply_chat_template([{\"role\": \"user\", \"content\": examples[\"prompt\"].strip()}, {\"role\": \"assistant\", \"content\": examples[\"completion\"].strip()}], tokenize=False)\n",
175
+ " return examples\n",
176
+ "ds = ds.map(tokenize_function)\n",
177
+ "ds = ds.shuffle()"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "markdown",
182
+ "metadata": {},
183
+ "source": [
184
+ "### Fine-tune the model\n",
185
+ "\n",
186
+ "We will now fine-tune the model. We will use the `SFTTrainer` from the `trl` library to fine-tune the model. We will use a batch size of 4 and a learning rate of 5e-5. We will also use the `use_mps_device` flag to use the MPS device if available."
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "os.environ[\"PYTORCH_MPS_HIGH_WATERMARK_RATIO\"] = \"0.0\"\n",
196
+ "\n",
197
+ "# Configure the SFTTrainer\n",
198
+ "sft_config = SFTConfig(\n",
199
+ " output_dir=\"./sft_output\",\n",
200
+ " num_train_epochs=1,\n",
201
+ " per_device_train_batch_size=4, # Set according to your GPU memory capacity\n",
202
+ " learning_rate=5e-5, # Common starting point for fine-tuning\n",
203
+ " logging_steps=100, # Frequency of logging training metrics\n",
204
+ " use_mps_device= True if device == \"mps\" else False,\n",
205
+ " hub_model_id=\"argilla/SmolLM2-360M-synthetic-concise-reasoning\", # Set a unique name for your model\n",
206
+ " push_to_hub=True,\n",
207
+ ")\n",
208
+ "\n",
209
+ "# Initialize the SFTTrainer\n",
210
+ "trainer = SFTTrainer(\n",
211
+ " model=model,\n",
212
+ " args=sft_config,\n",
213
+ " train_dataset=ds[\"train\"],\n",
214
+ " tokenizer=tokenizer,\n",
215
+ ")\n",
216
+ "trainer.train()"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "markdown",
221
+ "metadata": {},
222
+ "source": [
223
+ "```\n",
224
+ "# {'loss': 1.4498, 'grad_norm': 2.3919131755828857, 'learning_rate': 4e-05, 'epoch': 0.1}\n",
225
+ "# {'loss': 1.362, 'grad_norm': 1.6650595664978027, 'learning_rate': 3e-05, 'epoch': 0.19}\n",
226
+ "# {'loss': 1.3778, 'grad_norm': 1.4778285026550293, 'learning_rate': 2e-05, 'epoch': 0.29}\n",
227
+ "# {'loss': 1.3735, 'grad_norm': 2.1424977779388428, 'learning_rate': 1e-05, 'epoch': 0.39}\n",
228
+ "# {'loss': 1.3512, 'grad_norm': 2.3498542308807373, 'learning_rate': 0.0, 'epoch': 0.48}\n",
229
+ "# {'train_runtime': 1911.514, 'train_samples_per_second': 1.046, 'train_steps_per_second': 0.262, 'train_loss': 1.3828572998046875, 'epoch': 0.48}\n",
230
+ "```\n",
231
+ "\n",
232
+ "For the example, we did not use a specific validation set but we can see the loss is decreasing, so we assume the model is generalsing well to the training data. To get a better understanding of the model's performance, let's test it again with the same prompt.\n",
233
+ "\n",
234
+ "### Run inference\n",
235
+ "\n",
236
+ "We can now run inference with [the fine-tuned model](https://huggingface.co/argilla/SmolLM2-360M-synthetic-concise-reasoning/blob/main/README.md)."
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 12,
242
+ "metadata": {},
243
+ "outputs": [
244
+ {
245
+ "name": "stderr",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "Device set to use mps\n"
249
+ ]
250
+ },
251
+ {
252
+ "data": {
253
+ "text/plain": [
254
+ "'The primary function of mitochondria is to generate energy for the cell. They are organelles found in eukaryotic cells that convert nutrients into ATP (adenosine triphosphate), which is the primary source of energy for cellular processes.\\nMitochondria are responsible for:\\n\\nEnergy production: Mitochondria produce ATP through a process called oxidative phosphorylation, which involves the transfer of electrons from food molecules to oxygen.\\nEnergy storage: Mitochondria store energy in the form of adenosine triphosphate (ATP), which is used by the cell for various cellular processes.\\nCellular respiration: Mitochondria also participate in cellular respiration, a'"
255
+ ]
256
+ },
257
+ "execution_count": 12,
258
+ "metadata": {},
259
+ "output_type": "execute_result"
260
+ }
261
+ ],
262
+ "source": [
263
+ "prompt = \"What is the primary function of mitochondria within a cell?\"\n",
264
+ "\n",
265
+ "generator = pipeline(\n",
266
+ " \"text-generation\",\n",
267
+ " model=\"argilla/SmolLM2-360M-synthetic-concise-reasoning\",\n",
268
+ " device=\"mps\",\n",
269
+ ")\n",
270
+ "generator(\n",
271
+ " [{\"role\": \"user\", \"content\": prompt}], max_new_tokens=128, return_full_text=False\n",
272
+ ")[0][\"generated_text\"]"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "markdown",
277
+ "metadata": {},
278
+ "source": [
279
+ "## Conclusion\n",
280
+ "\n",
281
+ "We have fine-tuned a SmolLM2 model on a synthetic dataset generated from a large language model. We have seen that the model performs well on the task and that the synthetic data is a great way to generate diverse and representative data for supervised fine-tuning. \n",
282
+ "\n",
283
+ "In practice, you would likely want to spend more time on the data quality and fine-tuning the model but the flow shows the Synthetic Data Generator is a great tool to generate synthetic data for any task.\n",
284
+ "\n",
285
+ "Overall, I think it is pretty cool for a couple of hours of generation and fine-tuning on consumer hardware.\n"
286
+ ]
287
+ }
288
+ ],
289
+ "metadata": {
290
+ "kernelspec": {
291
+ "display_name": ".venv",
292
+ "language": "python",
293
+ "name": "python3"
294
+ },
295
+ "language_info": {
296
+ "codemirror_mode": {
297
+ "name": "ipython",
298
+ "version": 3
299
+ },
300
+ "file_extension": ".py",
301
+ "mimetype": "text/x-python",
302
+ "name": "python",
303
+ "nbconvert_exporter": "python",
304
+ "pygments_lexer": "ipython3",
305
+ "version": "3.11.9"
306
+ }
307
+ },
308
+ "nbformat": 4,
309
+ "nbformat_minor": 2
310
+ }
examples/hf-dedicated-or-tgi-deployment.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+ import os
8
+
9
+ from synthetic_dataset_generator import launch
10
+
11
+ os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12
+ os.environ["HUGGINGFACE_BASE_URL"] = "http://127.0.0.1:3000/" # dedicated endpoint/TGI
13
+ os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template
14
+ os.environ["TOKENIZER_ID"] = (
15
+ "meta-llama/Llama-3.1-8B-Instruct" # tokenizer for model hosted on endpoint
16
+ )
17
+ os.environ["MODEL"] = None # model is linked to endpoint
18
+
19
+ launch()
examples/hf-serverless-deployment-deepseek.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+ import os
8
+
9
+ from synthetic_dataset_generator import launch
10
+
11
+ os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12
+ os.environ["MODEL"] = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # use model for instructions
13
+ os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "<|begin▁of▁sentence|>User: " # use the custom template for the model
14
+
15
+
16
+ launch()
examples/hf-serverless-deployment.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+ import os
8
+
9
+ from synthetic_dataset_generator import launch
10
+
11
+ os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12
+ os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use model for generation
13
+ os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model
14
+
15
+ launch()
examples/hf-serverless-different-model-for-completion.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+ import os
8
+
9
+ from synthetic_dataset_generator import launch
10
+
11
+ os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12
+ os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use model for instruction generation
13
+ os.environ["MODEL_COMPLETION"] = "meta-llama/Llama-3.1-70B-Instruct" # use model for completion generation
14
+ os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model
15
+
16
+ launch()
examples/ollama-deployment.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+ # ollama serve
8
+ # ollama run qwen2.5:32b-instruct-q5_K_S
9
+ import os
10
+
11
+ from synthetic_dataset_generator import launch
12
+
13
+ os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
14
+ os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/" # ollama base url
15
+ os.environ["MODEL"] = "qwen2.5:32b-instruct-q5_K_S" # model id
16
+ os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-32B-Instruct" # tokenizer id
17
+ os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2"
18
+ os.environ["MAX_NUM_ROWS"] = "10000"
19
+ os.environ["DEFAULT_BATCH_SIZE"] = "2"
20
+ os.environ["MAX_NUM_TOKENS"] = "1024"
21
+
22
+ launch()
examples/ollama-different-model-for-completion.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+ # ollama serve
8
+ # ollama run llama3.2
9
+ # ollama run llama3.2:1b
10
+ import os
11
+
12
+ from synthetic_dataset_generator import launch
13
+
14
+ os.environ["OLLAMA_BASE_URL"] = (
15
+ "http://127.0.0.1:11434/" # in this case, the same base url for both models
16
+ )
17
+
18
+ os.environ["MODEL"] = "llama3.2" # model for instruction generation
19
+ os.environ["MODEL_COMPLETION"] = "llama3.2:1b" # model for completion generation
20
+
21
+ os.environ["TOKENIZER_ID"] = "meta-llama/Llama-3.2-3B-Instruct" # tokenizer for instruction generation
22
+ os.environ["TOKENIZER_ID_COMPLETION"] = "meta-llama/Llama-3.2-1B-Instruct" # tokenizer for completion generation
23
+
24
+ os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template required for instruction generation
25
+
26
+ launch()
examples/openai-deployment.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+
8
+ import os
9
+
10
+ from synthetic_dataset_generator import launch
11
+
12
+ os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
13
+ os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1/" # openai base url
14
+ os.environ["API_KEY"] = os.getenv("OPENAI_API_KEY") # openai api key
15
+ os.environ["MODEL"] = "gpt-4o" # model id
16
+ os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = None # chat data not supported with OpenAI
17
+
18
+ launch()
examples/vllm-deployment.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.11,<3.12"
3
+ # dependencies = [
4
+ # "synthetic-dataset-generator",
5
+ # ]
6
+ # ///
7
+ # vllm serve Qwen/Qwen2.5-1.5B-Instruct
8
+ import os
9
+
10
+ from synthetic_dataset_generator import launch
11
+
12
+ os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
13
+ os.environ["VLLM_BASE_URL"] = "http://127.0.0.1:8000/" # vllm base url
14
+ os.environ["MODEL"] = "Qwen/Qwen2.5-1.5B-Instruct" # model id
15
+ os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-1.5B-Instruct" # tokenizer id
16
+ os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2"
17
+ os.environ["MAX_NUM_ROWS"] = "10000"
18
+ os.environ["DEFAULT_BATCH_SIZE"] = "2"
19
+ os.environ["MAX_NUM_TOKENS"] = "1024"
20
+
21
+ launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ poppler-utils
2
+ tesseract-ocr
pdm.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,40 @@
1
+ [project]
2
+ name = "synthetic-dataset-generator"
3
+ version = "0.2.0"
4
+ description = "Build datasets using natural language"
5
+ authors = [
6
+ {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
7
+ ]
8
+ keywords = [
9
+ "gradio",
10
+ "synthetic-data",
11
+ "huggingface",
12
+ "argilla",
13
+ "generative-ai",
14
+ "ai",
15
+ ]
16
+ requires-python = "<3.13,>=3.10"
17
+ readme = "README.md"
18
+ license = {text = "Apache 2"}
19
+
20
+ dependencies = [
21
+ "argilla>=2.4.0,<3.0.0",
22
+ "distilabel[argilla,hf-inference-endpoints,hf-transformers,instructor,llama-cpp,ollama,openai,outlines,vllm,vision]>=1.5.0,<2.00",
23
+ "gradio[oauth]>=5.4.0,<6.0.0",
24
+ "gradio-huggingfacehub-search>=0.0.12,<1.0.0",
25
+ "huggingface-hub>=0.26.0,<0.28.0",
26
+ "model2vec>=0.2.4,<1.0.0",
27
+ "nltk>=3.9.1,<4.0.0",
28
+ "pydantic>=2.10.5,<3.0.0",
29
+ "sentence-transformers>=3.2.0,<4.0.0",
30
+ "transformers>=4.44.2,<5.0.0",
31
+ "unstructured[md,pdf,docx]>=0.16.3,<1.0.0",
32
+ "setuptools",
33
+ ]
34
+
35
+ [build-system]
36
+ requires = ["pdm-backend"]
37
+ build-backend = "pdm.backend"
38
+
39
+ [tool.pdm]
40
+ distribution = true
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
src/synthetic_dataset_generator/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ from gradio import TabbedInterface
3
+
4
+ from synthetic_dataset_generator import ( # noqa
5
+ _distiset,
6
+ _inference_endpoints,
7
+ )
8
+
9
+ def launch(*args, **kwargs):
10
+ """Launch the synthetic dataset generator.
11
+ Based on the `TabbedInterface` from Gradio.
12
+ Parameters: https://www.gradio.app/docs/gradio/tabbedinterface
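+ 
+ Example (a rough sketch; keyword arguments are forwarded to Gradio's `launch`, so any standard Gradio launch parameter such as `share` can be passed):
+ 
+ >>> from synthetic_dataset_generator import launch
+ >>> launch(share=False)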
13
+ """
14
+ from synthetic_dataset_generator.app import demo
15
+ return demo.launch(*args, server_name="0.0.0.0", **kwargs)
16
+
17
+
18
+ launch.__doc__ = TabbedInterface.launch.__doc__
19
+ launch.__signature__ = inspect.signature(TabbedInterface.launch)
20
+ launch.__annotations__ = TabbedInterface.launch.__annotations__
src/synthetic_dataset_generator/__main__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ if __name__ == "__main__":
2
+ from synthetic_dataset_generator import launch
3
+
4
+ launch()
src/synthetic_dataset_generator/_distiset.py ADDED
@@ -0,0 +1,148 @@
1
+ from typing import Optional
2
+
3
+ import distilabel
4
+ import distilabel.distiset
5
+ import gradio as gr
6
+ from distilabel.utils.card.dataset_card import (
7
+ DistilabelDatasetCard,
8
+ size_categories_parser,
9
+ )
10
+ from huggingface_hub import DatasetCardData, HfApi
11
+
12
+
13
+ class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
14
+ def _generate_card(
15
+ self,
16
+ repo_id: str,
17
+ token: str,
18
+ include_script: bool = False,
19
+ filename_py: Optional[str] = None,
20
+ ) -> None:
21
+ """Generates a dataset card and pushes it to the Hugging Face Hub, and
22
+ if the `pipeline.yaml` path is available in the `Distiset`, uploads that
23
+ to the same repository.
24
+
25
+ Args:
26
+ repo_id: The ID of the repository to push to, from the `push_to_hub` method.
27
+ token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
28
+ include_script: Whether to upload the script to the hugging face repository.
29
+ filename_py: The name of the script. If `include_script` is True, the script will
30
+ be uploaded to the repository using this name, otherwise it won't be used.
31
+ """
32
+ card = self._get_card(
33
+ repo_id=repo_id,
34
+ token=token,
35
+ include_script=include_script,
36
+ filename_py=filename_py,
37
+ )
38
+
39
+ card.push_to_hub(
40
+ repo_id,
41
+ repo_type="dataset",
42
+ token=token,
43
+ )
44
+ if self.pipeline_path:
45
+ # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.
46
+ HfApi().upload_file(
47
+ path_or_fileobj=self.pipeline_path,
48
+ path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME,
49
+ repo_id=repo_id,
50
+ repo_type="dataset",
51
+ token=token,
52
+ )
53
+
54
+ def _get_card(
55
+ self,
56
+ repo_id: str,
57
+ token: Optional[str] = None,
58
+ include_script: bool = False,
59
+ filename_py: Optional[str] = None,
60
+ ) -> DistilabelDatasetCard:
61
+ """Generates the dataset card for the `Distiset`.
62
+
63
+ Note:
64
+ If `repo_id` and `token` are provided, it will extract the metadata from the README.md file
65
+ on the hub.
66
+
67
+ Args:
68
+ repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.
69
+ token: The token to authenticate with the Hugging Face Hub.
70
+ We assume that if it's provided, the dataset will be in the Hugging Face Hub,
71
+ so the README metadata will be extracted from there.
72
+ include_script: Whether to upload the script to the hugging face repository.
73
+ filename_py: The name of the script. If `include_script` is True, the script will
74
+ be uploaded to the repository using this name, otherwise it won't be used.
75
+
76
+ Returns:
77
+ The dataset card for the `Distiset`.
78
+ """
79
+ sample_records = {}
80
+ for name, dataset in self.items():
81
+ sample_records[name] = (
82
+ dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
83
+ )
84
+
85
+ columns = self["default"].column_names
87
+
88
+ if ("label" in columns and "text" in columns) or (
89
+ "labels" in columns and "text" in columns
90
+ ):
91
+ task_categories = ["text-classification"]
92
+ elif ("prompt" in columns and "completion" in columns) or (
93
+ "messages" in columns
94
+ ):
95
+ task_categories: list[str] = [
96
+ "text-generation",
97
+ "text2text-generation",
98
+ "question-answering",
99
+ ]
100
+ elif "context" in columns and "question" in columns and "response" in columns:
101
+ task_categories: list[str] = [
102
+ "text-generation",
103
+ "text2text-generation",
104
+ "text-retrieval",
105
+ "question-answering"
106
+ ]
107
+ if (
108
+ "positive_retrieval" in columns and "negative_retrieval" in columns
109
+ ) or ("positive_reranking" in columns and "negative_reranking" in columns):
110
+ task_categories.append("sentence-similarity")
111
+ else:
112
+ task_categories: list[str] = []
113
+ gr.Info(
114
+ f"No task categories found for dataset with columns: {columns}. "
115
+ "Please notify the distilabel team if you think this is an error."
116
+ )
117
+
118
+ readme_metadata = {}
119
+ if repo_id and token:
120
+ readme_metadata = self._extract_readme_metadata(repo_id, token)
121
+
122
+ metadata = {
123
+ **readme_metadata,
124
+ "size_categories": size_categories_parser(
125
+ max(len(dataset) for dataset in self.values())
126
+ ),
127
+ "task_categories": task_categories,
128
+ "tags": [
129
+ "synthetic",
130
+ "distilabel",
131
+ "rlaif",
132
+ "datacraft",
133
+ ],
134
+ }
135
+
136
+ card = DistilabelDatasetCard.from_template(
137
+ card_data=DatasetCardData(**metadata),
138
+ repo_id=repo_id,
139
+ sample_records=sample_records,
140
+ include_script=include_script,
141
+ filename_py=filename_py,
142
+ references=self.citations,
143
+ )
144
+
145
+ return card
146
+
147
+
148
+ distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag
src/synthetic_dataset_generator/_inference_endpoints.py ADDED
@@ -0,0 +1,58 @@
1
+ import warnings
2
+
3
+ import distilabel
4
+ import distilabel.distiset
5
+ from distilabel.models import InferenceEndpointsLLM
6
+ from pydantic import (
7
+ ValidationError,
8
+ model_validator,
9
+ )
10
+
11
+
12
+ class CustomInferenceEndpointsLLM(InferenceEndpointsLLM):
13
+ @model_validator(mode="after") # type: ignore
14
+ def only_one_of_model_id_endpoint_name_or_base_url_provided(
15
+ self,
16
+ ) -> "InferenceEndpointsLLM":
17
+ """Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also
18
+ provided, a warning will be shown informing the user that the provided `base_url` will be ignored in
19
+ favour of the dynamically calculated one."""
20
+
21
+ if self.base_url and (self.model_id or self.endpoint_name):
22
+ warnings.warn( # type: ignore
23
+ f"Since the `base_url={self.base_url}` is available and either one of `model_id`"
24
+ " or `endpoint_name` is also provided, the `base_url` will either be ignored"
25
+ " or overwritten with the one generated from either of those args, for serverless"
26
+ " or dedicated inference endpoints, respectively."
27
+ )
28
+
29
+ if self.use_magpie_template and self.tokenizer_id is None:
30
+ raise ValueError(
31
+ "`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,"
32
+ " set a `tokenizer_id` and try again."
33
+ )
34
+
35
+ if (
36
+ self.model_id
37
+ and self.tokenizer_id is None
38
+ and self.structured_output is not None
39
+ ):
40
+ self.tokenizer_id = self.model_id
41
+
42
+ if self.base_url and not (self.model_id or self.endpoint_name):
43
+ return self
44
+
45
+ if self.model_id and not self.endpoint_name:
46
+ return self
47
+
48
+ if self.endpoint_name and not self.model_id:
49
+ return self
50
+
51
+ raise ValidationError(
52
+ f"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is"
53
+ f" provided too, it will be overwritten instead. Found `model_id`={self.model_id},"
54
+ f" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}."
55
+ )
56
+
57
+
58
+ distilabel.models.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM
src/synthetic_dataset_generator/_tabbedinterface.py ADDED
@@ -0,0 +1,69 @@
1
+ """
2
+ This file defines two useful high-level abstractions to build Gradio apps: Interface and TabbedInterface.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from collections.abc import Sequence
8
+
9
+ import gradio as gr
10
+ from gradio.blocks import Blocks
11
+ from gradio.layouts import Tab, Tabs
12
+ from gradio.themes import ThemeClass as Theme
13
+ from gradio_client.documentation import document
14
+
15
+
16
+ @document()
17
+ class TabbedInterface(Blocks):
18
+ """
19
+ A TabbedInterface is created by providing a list of Interfaces or Blocks, each of which gets
20
+ rendered in a separate tab. Only the components from the Interface/Blocks will be rendered in the tab.
21
+ Certain high-level attributes of the Blocks (e.g. custom `css`, `js`, and `head` attributes) will not be loaded.
22
+
23
+ Demos: tabbed_interface_lite
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ interface_list: Sequence[Blocks],
29
+ tab_names: list[str] | None = None,
30
+ title: str | None = None,
31
+ theme: Theme | str | None = None,
32
+ analytics_enabled: bool | None = None,
33
+ css: str | None = None,
34
+ js: str | None = None,
35
+ head: str | None = None,
36
+ ):
37
+ """
38
+ Parameters:
39
+ interface_list: A list of Interfaces (or Blocks) to be rendered in the tabs.
40
+ tab_names: A list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc.
41
+ title: The tab title to display when this demo is opened in a browser window.
42
+ theme: A Theme object or a string representing a theme. If a string, will look for a built-in theme with that name (e.g. "soft" or "default"), or will attempt to load a theme from the Hugging Face Hub (e.g. "gradio/monochrome"). If None, will use the Default theme.
43
+ analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
44
+ css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
45
+ js: Custom js as a string or path to a js file. The custom js should in the form of a single js function. This function will automatically be executed when the page loads. For more flexibility, use the head parameter to insert js inside <script> tags.
46
+ head: Custom html to insert into the head of the demo webpage. This can be used to add custom meta tags, multiple scripts, stylesheets, etc. to the page.
47
+ Returns:
48
+ a Gradio Tabbed Interface for the given interfaces
49
+ """
50
+ super().__init__(
51
+ title="Synthetic Data Generator",
52
+ theme=theme,
53
+ analytics_enabled=analytics_enabled,
54
+ mode="tabbed_interface",
55
+ css=css,
56
+ js=js,
57
+ head=head,
58
+ )
59
+ if tab_names is None:
60
+ tab_names = [f"Tab {i}" for i in range(len(interface_list))]
61
+ with self:
62
+ h3 = "<div style='text-align: center;'><h2>Build datasets using natural language</h2></div>"
63
+ if title:
64
+ gr.HTML(value=title + h3)
65
+ gr.LoginButton(value="Sign in", variant="primary", elem_id="sign_in_button")
66
+ with Tabs():
67
+ for interface, tab_name in zip(interface_list, tab_names, strict=False):
68
+ with Tab(label=tab_name):
69
+ interface.render()
src/synthetic_dataset_generator/app.py ADDED
@@ -0,0 +1,35 @@
1
+ from synthetic_dataset_generator._tabbedinterface import TabbedInterface
2
+
3
+ # from synthetic_dataset_generator.apps.eval import app as eval_app
4
+ from synthetic_dataset_generator.apps.rag import app as rag_app
5
+ from synthetic_dataset_generator.apps.about import app as about_app
6
+ from synthetic_dataset_generator.apps.chat import app as chat_app
7
+ from synthetic_dataset_generator.apps.textcat import app as textcat_app
8
+
9
+ theme = "argilla/argilla-theme"
10
+
11
+ css = """
12
+ .main_ui_logged_out{opacity: 0.3; pointer-events: none}
13
+ button[role="tab"][aria-selected="true"] { border: 0; background: var(--button-primary-background-fill); color: white; border-top-right-radius: var(--radius-md); border-top-left-radius: var(--radius-md)}
14
+ button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-primary-background-fill); background: var(var(--button-primary-background-fill-hover))}
15
+ .tabitem {border: 0; padding-inline: 0}
16
+ .gallery-item {background: var(--background-fill-secondary); text-align: left}
17
+ .table-wrap .tbody td {vertical-align: top}
18
+ #system_prompt_examples {color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
19
+ .container {padding-inline: 0 !important}
20
+ .gradio-container { width: 100% !important; }
21
+ .gradio-row { display: flex !important; flex-direction: row !important; }
22
+ .gradio-column { flex: 1 !important; min-width: 0 !important; }
23
+ #sign_in_button {flex-grow: 0; width: auto !important; display: flex; align-items: center; justify-content: center; margin: 0 auto;}
24
+ .datasets {height: 70px;}
25
+ """
26
+
27
+ image = """<br><img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: clamp(50%, 400px, 100%)"/>"""
28
+
29
+ demo = TabbedInterface(
30
+ [textcat_app, chat_app, rag_app, about_app],
31
+ ["Text Classification", "Chat Data", "RAG", "About"],
32
+ css=css,
33
+ title=image,
34
+ theme=theme,
35
+ )
src/synthetic_dataset_generator/apps/__init__.py ADDED
File without changes
src/synthetic_dataset_generator/apps/about.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ with gr.Blocks() as app:
4
+ gr.Markdown(
5
+ """
6
+ Synthetic data is artificially generated information that mimics real-world data. It allows overcoming data limitations by expanding or enhancing datasets.
7
+
8
+ Introducing the Synthetic Data Generator, a user-friendly application that takes a no-code approach to creating custom datasets with Large Language Models (LLMs). The best part: A simple step-by-step process, making dataset creation a non-technical breeze, allowing anyone to create datasets and models in minutes and without any code.
9
+
10
+ The synthetic data generator takes your custom prompt and returns a dataset for your use case, using a synthetic data pipeline. In the background, this is powered by [distilabel](https://distilabel.argilla.io/latest/) and the [free Hugging Face text-generation API](https://huggingface.co/docs/api-inference/en/index), but we don't need to worry about these complexities and can focus on using the UI.
11
+
12
+ - Read more in [our announcement blog post](https://huggingface.co/blog/synthetic-data-generator)
13
+ - Find the library on [GitHub](https://github.com/argilla-io/synthetic-data-generator)
14
+ """
15
+ )
src/synthetic_dataset_generator/apps/base.py ADDED
@@ -0,0 +1,270 @@
1
+ import io
2
+ import uuid
3
+ from tqdm import tqdm
4
+ from typing import Union
5
+
6
+ import argilla as rg
7
+ import gradio as gr
8
+ import pandas as pd
9
+ from datasets import Dataset, concatenate_datasets, get_dataset_config_names, get_dataset_split_names, load_dataset
10
+ from gradio import OAuthToken
11
+ from huggingface_hub import HfApi, upload_file, repo_exists
12
+ from unstructured.chunking.title import chunk_by_title
13
+ from unstructured.partition.auto import partition
14
+
15
+ from synthetic_dataset_generator.constants import MAX_NUM_ROWS, SAVE_LOCAL_DIR
16
+ from synthetic_dataset_generator.utils import get_argilla_client
17
+
18
+ if SAVE_LOCAL_DIR is not None:
19
+ import os
20
+ os.makedirs(SAVE_LOCAL_DIR, exist_ok=True)
21
+
22
+
23
+ def validate_argilla_user_workspace_dataset(
24
+ dataset_name: str,
25
+ add_to_existing_dataset: bool = True,
26
+ oauth_token: Union[OAuthToken, None] = None,
27
+ progress=gr.Progress(),
28
+ ) -> str:
29
+ progress(0.1, desc="Validating dataset configuration")
30
+ hf_user = HfApi().whoami(token=oauth_token.token)["name"]
31
+ client = get_argilla_client()
32
+ if dataset_name is None or dataset_name == "":
33
+ raise gr.Error("Dataset name is required")
34
+ # Create user if it doesn't exist
35
+ rg_user = client.users(username=hf_user)
36
+ if rg_user is None:
37
+ rg_user = client.users.add(
38
+ rg.User(username=hf_user, role="admin", password=str(uuid.uuid4()))
39
+ )
40
+ # Create workspace if it doesn't exist
41
+ workspace = client.workspaces(name=hf_user)
42
+ if workspace is None:
43
+ workspace = client.workspaces.add(rg.Workspace(name=hf_user))
44
+ workspace.add_user(hf_user)
45
+ # Check if dataset exists
46
+ dataset = client.datasets(name=dataset_name, workspace=hf_user)
47
+ if dataset and not add_to_existing_dataset:
48
+ raise gr.Error(f"Dataset {dataset_name} already exists")
49
+ progress(1.0, desc="Dataset configuration validated")
50
+ return ""
51
+
52
+
53
+ def push_pipeline_code_to_hub(
54
+ pipeline_code: str,
55
+ org_name: str,
56
+ repo_name: str,
57
+ oauth_token: Union[OAuthToken, None] = None,
58
+ progress=gr.Progress(),
59
+ ):
60
+ repo_id: str | None = validate_push_to_hub(org_name, repo_name)
61
+ progress(0.1, desc="Uploading pipeline code")
62
+ with io.BytesIO(pipeline_code.encode("utf-8")) as f:
63
+ upload_file(
64
+ path_or_fileobj=f,
65
+ path_in_repo="pipeline.py",
66
+ repo_id=repo_id,
67
+ repo_type="dataset",
68
+ token=oauth_token.token,
69
+ commit_message="Include pipeline script",
70
+ create_pr=False,
71
+ )
72
+ progress(1.0, desc="Pipeline code uploaded")
73
+
74
+
75
+ def validate_push_to_hub(org_name: str, repo_name: str):
76
+ repo_id = (
77
+ f"{org_name}/{repo_name}"
78
+ if repo_name is not None and org_name is not None
79
+ else None
80
+ )
81
+ if repo_id is not None:
82
+ if not all([repo_id, org_name, repo_name]):
83
+ raise gr.Error(
84
+ "Please provide a `repo_name` and `org_name` to push the dataset to."
85
+ )
86
+ return repo_id
87
+
88
+
89
+ def combine_datasets(
90
+ repo_id: str, dataset: Dataset, oauth_token: Union[OAuthToken, None]
91
+ ) -> Dataset:
92
+ try:
93
+ new_dataset = load_dataset(
94
+ repo_id,
95
+ split="train",
96
+ download_mode="force_redownload",
97
+ token=oauth_token.token,
98
+ )
99
+ return concatenate_datasets([dataset, new_dataset])
100
+ except Exception:
101
+ return dataset
102
+
103
+
104
+ def show_success_message(org_name: str, repo_name: str) -> gr.Markdown:
105
+ client = get_argilla_client()
106
+ if client is None:
107
+ return gr.Markdown(
108
+ value=f"""
109
+ <div style="padding: 1em; background-color: var(--block-background-fill); border-color: var(--border-color-primary); border-width: 1px; border-radius: 5px;">
110
+ <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
111
+ <p style="margin-top: 0.5em;">
112
+ The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks.
113
+ <div style="display: flex; gap: 10px;">
114
+ <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" class="lg primary svelte-1137axg" style="color: white !important; margin-top: 0.5em; text-decoration: none;">
115
+ Open in Hugging Face
116
+ </a>
117
+ </div>
118
+ </p>
119
+ <p style="margin-top: 1em; color: var(--block-title-text-color)">
120
+ By configuring an `ARGILLA_API_URL` and `ARGILLA_API_KEY` you can curate the dataset in Argilla.
121
+ Unfamiliar with Argilla? Here are some docs to help you get started:
122
+ <br>• <a href="https://docs.argilla.io/latest/getting_started/quickstart/" target="_blank">How to get started with Argilla</a>
123
+ <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
124
+ <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
125
+ </p>
126
+ </div>
127
+ """,
128
+ visible=True,
129
+ height=None,
130
+ min_height=None,
131
+ max_height=None,
132
+ )
133
+ argilla_api_url = client.api_url
134
+ # Transform Docker internal URL to localhost if needed
135
+ if "argilla:" in argilla_api_url:
136
+ argilla_api_url = argilla_api_url.replace("argilla:", "127.0.0.1:")
137
+ return gr.Markdown(
138
+ value=f"""
139
+ <div style="padding: 1em; background-color: var(--block-background-fill); border-color: var(--border-color-primary); border-width: 1px; border-radius: 5px;">
140
+ <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
141
+ <p style="margin-top: 0.5em;">
142
+ The generated dataset is <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank">available in the Hub</a>. It is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks.
143
+ <div style="display: flex; gap: 10px;">
144
+ <a href="{argilla_api_url}" target="_blank" class="lg primary svelte-1137axg" style="color: white !important; margin-top: 0.5em; text-decoration: none;">
145
+ Open in Argilla
146
+ </a>
147
+ </div>
148
+ </p>
149
+ <p style="margin-top: 1em; color: var(--block-title-text-color)">
150
+ Unfamiliar with Argilla? Here are some docs to help you get started:
151
+ <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
152
+ <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
153
+ </p>
154
+ </div>
155
+ """,
156
+ visible=True,
157
+ height=None,
158
+ min_height=None,
159
+ max_height=None,
160
+ )
161
+
162
+
163
+ def hide_success_message() -> gr.Markdown:
164
+ return gr.Markdown(value="", visible=True, height=100)
165
+
166
+
167
+ def test_max_num_rows(num_rows: int) -> int:
168
+ if num_rows > MAX_NUM_ROWS:
169
+ num_rows = MAX_NUM_ROWS
170
+ gr.Info(
171
+ f"Number of rows is larger than the configured maximum. Setting number of rows to {MAX_NUM_ROWS}. Set environment variable `MAX_NUM_ROWS` to change this behavior."
172
+ )
173
+ return num_rows
174
+
175
+
176
+ def get_iframe(hub_repo_id: str) -> str:
177
+ if not hub_repo_id:
178
+ return ""
179
+
180
+ if not repo_exists(repo_id=hub_repo_id, repo_type="dataset"):
181
+ return ""
182
+
183
+ url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
184
+ iframe = f"""
185
+ <iframe
186
+ src="{url}"
187
+ frameborder="0"
188
+ width="100%"
189
+ height="600px"
190
+ ></iframe>
191
+ """
192
+ return iframe
193
+
194
+
195
+ def _get_valid_columns(dataframe: pd.DataFrame):
196
+ doc_valid_columns = []
197
+
198
+ for col in dataframe.columns:
199
+ sample_val = dataframe[col].iloc[0]
200
+ if isinstance(sample_val, str):
201
+ doc_valid_columns.append(col)
202
+
203
+ return doc_valid_columns
204
+
205
+
206
+ def load_dataset_from_hub(
207
+ repo_id: str,
208
+ num_rows: int = 10,
209
+ token: Union[OAuthToken, None] = None,
210
+ progress=gr.Progress(track_tqdm=True),
211
+ ):
212
+ if not repo_id:
213
+ raise gr.Error("Please provide a Hub repo ID")
214
+ subsets = get_dataset_config_names(repo_id, token=token)
215
+ splits = get_dataset_split_names(repo_id, subsets[0], token=token)
216
+ ds = load_dataset(repo_id, subsets[0], split=splits[0], token=token, streaming=True)
217
+ rows = []
218
+ for idx, row in enumerate(tqdm(ds, desc="Loading the dataset", total=num_rows)):
219
+ rows.append(row)
220
+ if len(rows) >= num_rows:
221
+ break
222
+ ds = Dataset.from_list(rows)
223
+ dataframe = ds.to_pandas()
224
+ doc_valid_columns = _get_valid_columns(dataframe)
225
+ col_doc = doc_valid_columns[0] if doc_valid_columns else ""
226
+ return (
227
+ dataframe,
228
+ gr.Dropdown(
229
+ choices=doc_valid_columns,
230
+ label="Documents column",
231
+ value=col_doc,
232
+ interactive=(False if col_doc == "" else True),
233
+ multiselect=False,
234
+ ),
235
+ )
236
+
237
+
238
+ def preprocess_input_data(
239
+ file_paths: list[str], num_rows: int, progress=gr.Progress(track_tqdm=True)
240
+ ):
241
+ if not file_paths:
242
+ raise gr.Error("Please provide an input file")
243
+
244
+ data = {}
245
+ total_chunks = 0
246
+
247
+ for file_path in tqdm(file_paths, desc="Processing files", total=len(file_paths)):
248
+ partitioned_file = partition(filename=file_path)
249
+ chunks = [str(chunk) for chunk in chunk_by_title(partitioned_file)]
250
+ data[file_path] = chunks
251
+ total_chunks += len(chunks)
252
+ if total_chunks >= num_rows:
253
+ break
254
+
255
+ dataframe = pd.DataFrame.from_records(
256
+ [(k, v) for k, values in data.items() for v in values],
257
+ columns=["filename", "chunks"],
258
+ )
259
+ col_doc = "chunks"
260
+
261
+ return (
262
+ dataframe,
263
+ gr.Dropdown(
264
+ choices=["chunks"],
265
+ label="Documents column",
266
+ value=col_doc,
267
+ interactive=(False if col_doc == "" else True),
268
+ multiselect=False,
269
+ ),
270
+ )
src/synthetic_dataset_generator/apps/chat.py ADDED
@@ -0,0 +1,1142 @@
1
+ import ast
2
+ import json
3
+ import os
4
+ import random
5
+ import uuid
6
+ from typing import Dict, List, Union
7
+
8
+ import argilla as rg
9
+ import gradio as gr
10
+ import pandas as pd
11
+ from datasets import Dataset
12
+ from distilabel.distiset import Distiset
13
+ from gradio.oauth import OAuthToken
14
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
15
+ from huggingface_hub import HfApi
16
+
17
+ from synthetic_dataset_generator.apps.base import (
18
+ combine_datasets,
19
+ hide_success_message,
20
+ load_dataset_from_hub,
21
+ preprocess_input_data,
22
+ push_pipeline_code_to_hub,
23
+ show_success_message,
24
+ test_max_num_rows,
25
+ validate_argilla_user_workspace_dataset,
26
+ validate_push_to_hub,
27
+ )
28
+ from synthetic_dataset_generator.constants import (
29
+ BASE_URL,
30
+ DEFAULT_BATCH_SIZE,
31
+ MODEL,
32
+ MODEL_COMPLETION,
33
+ SAVE_LOCAL_DIR,
34
+ SFT_AVAILABLE,
35
+ )
36
+ from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
37
+ from synthetic_dataset_generator.pipelines.chat import (
38
+ DEFAULT_DATASET_DESCRIPTIONS,
39
+ generate_pipeline_code,
40
+ get_follow_up_generator,
41
+ get_magpie_generator,
42
+ get_prompt_generator,
43
+ get_response_generator,
44
+ get_sentence_pair_generator,
45
+ )
46
+ from synthetic_dataset_generator.pipelines.embeddings import (
47
+ get_embeddings,
48
+ get_sentence_embedding_dimensions,
49
+ )
50
+ from synthetic_dataset_generator.utils import (
51
+ column_to_list,
52
+ get_argilla_client,
53
+ get_org_dropdown,
54
+ get_random_repo_name,
55
+ swap_visibility,
56
+ )
57
+
58
+
59
+ def _get_dataframe():
60
+ return gr.Dataframe(
61
+ headers=["prompt", "completion"],
62
+ wrap=True,
63
+ interactive=False,
64
+ )
65
+
66
+
67
+ def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
68
+ def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
69
+ return ast.literal_eval(
70
+ messages.replace("'user'}", "'user'},")
71
+ .replace("'system'}", "'system'},")
72
+ .replace("'assistant'}", "'assistant'},")
73
+ )
74
+
75
+ if "messages" in dataframe.columns:
76
+ dataframe["messages"] = dataframe["messages"].apply(
77
+ lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
78
+ )
79
+ return dataframe
80
+
81
+
82
+ def generate_system_prompt(dataset_description: str, progress=gr.Progress()):
83
+ progress(0.1, desc="Initializing")
84
+ generate_description = get_prompt_generator()
85
+ progress(0.5, desc="Generating")
86
+ result = next(
87
+ generate_description.process(
88
+ [
89
+ {
90
+ "instruction": dataset_description,
91
+ }
92
+ ]
93
+ )
94
+ )[0]["generation"]
95
+ progress(1.0, desc="Prompt generated")
96
+ return result
97
+
98
+
99
+ def load_dataset_file(
100
+ repo_id: str,
101
+ file_paths: list[str],
102
+ input_type: str,
103
+ num_rows: int = 10,
104
+ token: Union[OAuthToken, None] = None,
105
+ progress=gr.Progress(),
106
+ ):
107
+ progress(0.1, desc="Loading the source data")
108
+ if input_type == "dataset-input":
109
+ return load_dataset_from_hub(repo_id=repo_id, num_rows=num_rows, token=token)
110
+ else:
111
+ return preprocess_input_data(file_paths=file_paths, num_rows=num_rows)
112
+
113
+
114
+ def generate_sample_dataset(
115
+ repo_id: str,
116
+ file_paths: list[str],
117
+ input_type: str,
118
+ system_prompt: str,
119
+ document_column: str,
120
+ num_turns: int,
121
+ num_rows: int,
122
+ oauth_token: Union[OAuthToken, None],
123
+ progress=gr.Progress(),
124
+ ):
125
+ if input_type == "prompt-input":
126
+ dataframe = pd.DataFrame(columns=["prompt", "completion"])
127
+ else:
128
+ dataframe, _ = load_dataset_file(
129
+ repo_id=repo_id,
130
+ file_paths=file_paths,
131
+ input_type=input_type,
132
+ num_rows=num_rows,
133
+ token=oauth_token,
134
+ )
135
+ progress(0.5, desc="Generating sample dataset")
136
+ dataframe = generate_dataset(
137
+ input_type=input_type,
138
+ dataframe=dataframe,
139
+ system_prompt=system_prompt,
140
+ document_column=document_column,
141
+ num_turns=num_turns,
142
+ num_rows=num_rows,
143
+ is_sample=True,
144
+ )
145
+ progress(1.0, desc="Sample dataset generated")
146
+ return dataframe
147
+
148
+
149
+ def generate_dataset_from_prompt(
150
+ system_prompt: str,
151
+ num_turns: int = 1,
152
+ num_rows: int = 10,
153
+ temperature: float = 0.9,
154
+ temperature_completion: Union[float, None] = None,
155
+ is_sample: bool = False,
156
+ progress=gr.Progress(),
157
+ ) -> pd.DataFrame:
158
+ num_rows = test_max_num_rows(num_rows)
159
+ progress(0.0, desc="(1/2) Generating instructions")
160
+ magpie_generator = get_magpie_generator(num_turns, temperature, is_sample)
161
+ response_generator = get_response_generator(
162
+ system_prompt=system_prompt,
163
+ num_turns=num_turns,
164
+ temperature=temperature_completion or temperature,
165
+ is_sample=is_sample,
166
+ )
167
+ total_steps: int = num_rows * 2
168
+ batch_size = DEFAULT_BATCH_SIZE
169
+
170
+ # create prompt rewrites
171
+ prompt_rewrites = get_rewritten_prompts(system_prompt, num_rows)
172
+
173
+ # create instructions
174
+ n_processed = 0
175
+ magpie_results = []
176
+ while n_processed < num_rows:
177
+ progress(
178
+ 0.5 * n_processed / num_rows,
179
+ total=total_steps,
180
+ desc="(1/2) Generating instructions",
181
+ )
182
+ remaining_rows = num_rows - n_processed
183
+ batch_size = min(batch_size, remaining_rows)
184
+ rewritten_system_prompt = random.choice(prompt_rewrites)
185
+ inputs = [{"system_prompt": rewritten_system_prompt} for _ in range(batch_size)]
186
+ batch = list(magpie_generator.process(inputs=inputs))
187
+ magpie_results.extend(batch[0])
188
+ n_processed += batch_size
189
+ random.seed(a=random.randint(0, 2**32 - 1))
190
+ progress(0.5, desc="(1/2) Generating instructions")
191
+
192
+ # generate responses
193
+ n_processed = 0
194
+ response_results = []
195
+ if num_turns == 1:
196
+ while n_processed < num_rows:
197
+ progress(
198
+ 0.5 + 0.5 * n_processed / num_rows,
199
+ total=total_steps,
200
+ desc="(2/2) Generating responses",
201
+ )
202
+ batch = magpie_results[n_processed : n_processed + batch_size]
203
+ responses = list(response_generator.process(inputs=batch))
204
+ response_results.extend(responses[0])
205
+ n_processed += batch_size
206
+ random.seed(a=random.randint(0, 2**32 - 1))
207
+ for result in response_results:
208
+ result["prompt"] = result["instruction"]
209
+ result["completion"] = result["generation"]
210
+ result["system_prompt"] = system_prompt
211
+ else:
212
+ for result in magpie_results:
213
+ result["conversation"].insert(
214
+ 0, {"role": "system", "content": system_prompt}
215
+ )
216
+ result["messages"] = result["conversation"]
217
+ while n_processed < num_rows:
218
+ progress(
219
+ 0.5 + 0.5 * n_processed / num_rows,
220
+ total=total_steps,
221
+ desc="(2/2) Generating responses",
222
+ )
223
+ batch = magpie_results[n_processed : n_processed + batch_size]
224
+ responses = list(response_generator.process(inputs=batch))
225
+ response_results.extend(responses[0])
226
+ n_processed += batch_size
227
+ random.seed(a=random.randint(0, 2**32 - 1))
228
+ for result in response_results:
229
+ result["messages"].append(
230
+ {"role": "assistant", "content": result["generation"]}
231
+ )
232
+ progress(
233
+ 1,
234
+ total=total_steps,
235
+ desc="(2/2) Creating dataset",
236
+ )
237
+
238
+ # create distiset
239
+ distiset_results = []
240
+ for result in response_results:
241
+ record = {}
242
+ for relevant_keys in [
243
+ "messages",
244
+ "prompt",
245
+ "completion",
246
+ "model_name",
247
+ "system_prompt",
248
+ ]:
249
+ if relevant_keys in result:
250
+ record[relevant_keys] = result[relevant_keys]
251
+ distiset_results.append(record)
252
+
253
+ distiset = Distiset(
254
+ {
255
+ "default": Dataset.from_list(distiset_results),
256
+ }
257
+ )
258
+
259
+ # If not pushing to hub generate the dataset directly
260
+ distiset = distiset["default"]
261
+ if num_turns == 1:
262
+ outputs = distiset.to_pandas()[["prompt", "completion", "system_prompt"]]
263
+ else:
264
+ outputs = distiset.to_pandas()[["messages"]]
265
+ dataframe = pd.DataFrame(outputs)
266
+ progress(1.0, desc="Dataset generation completed")
267
+ return dataframe
268
+
269
+
270
+ def generate_dataset_from_seed(
271
+ dataframe: pd.DataFrame,
272
+ document_column: str,
273
+ num_turns: int = 1,
274
+ num_rows: int = 10,
275
+ temperature: float = 0.9,
276
+ temperature_completion: Union[float, None] = None,
277
+ is_sample: bool = False,
278
+ progress=gr.Progress(),
279
+ ) -> pd.DataFrame:
280
+ num_rows = test_max_num_rows(num_rows)
281
+ progress(0.0, desc="Initializing dataset generation")
282
+ document_data = column_to_list(dataframe, document_column)
283
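+ # when there are fewer documents than requested rows, sample extra documents with replacement to reach num_rows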
+ if len(document_data) < num_rows:
284
+ document_data += random.choices(document_data, k=num_rows - len(document_data))
285
+ instruction_generator = get_sentence_pair_generator(
286
+ temperature=temperature, is_sample=is_sample
287
+ )
288
+ response_generator = get_response_generator(
289
+ system_prompt=None,
290
+ num_turns=1,
291
+ temperature=temperature_completion or temperature,
292
+ is_sample=is_sample,
293
+ )
294
+ follow_up_generator_instruction = get_follow_up_generator(
295
+ type="instruction", temperature=temperature, is_sample=is_sample
296
+ )
297
+ follow_up_generator_response = get_follow_up_generator(
298
+ type="response",
299
+ temperature=temperature_completion or temperature,
300
+ is_sample=is_sample,
301
+ )
302
+ steps = 2 * num_turns
303
+ total_steps: int = num_rows * steps
304
+ step_progress = round(1 / steps, 2)
305
+ batch_size = DEFAULT_BATCH_SIZE
306
+
307
+ # create instructions
308
+ n_processed = 0
309
+ instruction_results = []
310
+ while n_processed < num_rows:
311
+ progress(
312
+ step_progress * n_processed / num_rows,
313
+ total=total_steps,
314
+ desc="Generating instructions",
315
+ )
316
+ remaining_rows = num_rows - n_processed
317
+ batch_size = min(batch_size, remaining_rows)
318
+ batch = [
319
+ {"anchor": document}
320
+ for document in document_data[n_processed : n_processed + batch_size]
321
+ ]
322
+ questions = list(instruction_generator.process(inputs=batch))
323
+ instruction_results.extend(questions[0])
324
+ n_processed += batch_size
325
+ for result in instruction_results:
326
+ result["instruction"] = result["positive"]
327
+ result["prompt"] = result.pop("positive")
328
+
329
+ progress(step_progress, desc="Generating instructions")
330
+
331
+ # generate responses
332
+ n_processed = 0
333
+ response_results = []
334
+ while n_processed < num_rows:
335
+ progress(
336
+ step_progress + step_progress * n_processed / num_rows,
337
+ total=total_steps,
338
+ desc="Generating responses",
339
+ )
340
+ batch = instruction_results[n_processed : n_processed + batch_size]
341
+ responses = list(response_generator.process(inputs=batch))
342
+ response_results.extend(responses[0])
343
+ n_processed += batch_size
344
+ for result in response_results:
345
+ result["completion"] = result.pop("generation")
346
+
347
+ # generate follow-ups
348
+ if num_turns > 1:
349
+ n_processed = 0
350
+ final_conversations = []
351
+
352
+ while n_processed < num_rows:
353
+ progress(
354
+ step_progress + step_progress * n_processed / num_rows,
355
+ total=total_steps,
356
+ desc="Generating follow-ups",
357
+ )
358
+ batch = response_results[n_processed : n_processed + batch_size]
359
+ conversations_batch = [
360
+ {
361
+ "messages": [
362
+ {"role": "user", "content": result["prompt"]},
363
+ {"role": "assistant", "content": result["completion"]},
364
+ ]
365
+ }
366
+ for result in batch
367
+ ]
368
+
369
+ for _ in range(num_turns - 1):
370
+ follow_up_instructions = list(
371
+ follow_up_generator_instruction.process(inputs=conversations_batch)
372
+ )
373
+ for conv, follow_up in zip(
374
+ conversations_batch, follow_up_instructions[0]
375
+ ):
376
+ conv["messages"].append(
377
+ {"role": "user", "content": follow_up["generation"]}
378
+ )
379
+
380
+ follow_up_responses = list(
381
+ follow_up_generator_response.process(inputs=conversations_batch)
382
+ )
383
+ for conv, follow_up in zip(conversations_batch, follow_up_responses[0]):
384
+ conv["messages"].append(
385
+ {"role": "assistant", "content": follow_up["generation"]}
386
+ )
387
+
388
+ final_conversations.extend(
389
+ [{"messages": conv["messages"]} for conv in conversations_batch]
390
+ )
391
+ n_processed += batch_size
392
+
393
+ # create distiset
394
+ distiset_results = []
395
+ if num_turns == 1:
396
+ for result in response_results:
397
+ record = {}
398
+ for relevant_keys in ["prompt", "completion"]:
399
+ if relevant_keys in result:
400
+ record[relevant_keys] = result[relevant_keys]
401
+ distiset_results.append(record)
402
+ dataframe = pd.DataFrame(distiset_results)
403
+ else:
404
+ distiset_results = final_conversations
405
+ dataframe = pd.DataFrame(distiset_results)
406
+ dataframe["messages"] = dataframe["messages"].apply(lambda x: json.dumps(x))
407
+
408
+ progress(1.0, desc="Dataset generation completed")
409
+ return dataframe
410
+
411
+
412
+ def generate_dataset(
413
+ input_type: str,
414
+ dataframe: pd.DataFrame,
415
+ system_prompt: str,
416
+ document_column: str,
417
+ num_turns: int = 1,
418
+ num_rows: int = 10,
419
+ temperature: float = 0.9,
420
+ temperature_completion: Union[float, None] = None,
421
+ is_sample: bool = False,
422
+ progress=gr.Progress(),
423
+ ) -> pd.DataFrame:
424
+ if input_type == "prompt-input":
425
+ dataframe = generate_dataset_from_prompt(
426
+ system_prompt=system_prompt,
427
+ num_turns=num_turns,
428
+ num_rows=num_rows,
429
+ temperature=temperature,
430
+ temperature_completion=temperature_completion,
431
+ is_sample=is_sample,
432
+ )
433
+ else:
434
+ dataframe = generate_dataset_from_seed(
435
+ dataframe=dataframe,
436
+ document_column=document_column,
437
+ num_turns=num_turns,
438
+ num_rows=num_rows,
439
+ temperature=temperature,
440
+ temperature_completion=temperature_completion,
441
+ is_sample=is_sample,
442
+ )
443
+ return dataframe
444
+
445
+
446
+ def push_dataset_to_hub(
447
+ dataframe: pd.DataFrame,
448
+ org_name: str,
449
+ repo_name: str,
450
+ oauth_token: Union[gr.OAuthToken, None],
451
+ private: bool,
452
+ pipeline_code: str,
453
+ progress=gr.Progress(),
454
+ ):
455
+ progress(0.0, desc="Validating")
456
+ repo_id = validate_push_to_hub(org_name, repo_name)
457
+ progress(0.3, desc="Converting")
458
+ original_dataframe = dataframe.copy(deep=True)
459
+ dataframe = convert_dataframe_messages(dataframe)
460
+ progress(0.7, desc="Creating dataset")
461
+ dataset = Dataset.from_pandas(dataframe)
462
+ dataset = combine_datasets(repo_id, dataset, oauth_token)
463
+ progress(0.9, desc="Pushing dataset")
464
+ distiset = Distiset({"default": dataset})
465
+ distiset.push_to_hub(
466
+ repo_id=repo_id,
467
+ private=private,
468
+ include_script=False,
469
+ token=oauth_token.token,
470
+ create_pr=False,
471
+ )
472
+ push_pipeline_code_to_hub(pipeline_code, org_name, repo_name, oauth_token)
473
+ progress(1.0, desc="Dataset pushed")
474
+ return original_dataframe
475
+
476
+
477
+ def push_dataset(
478
+ org_name: str,
479
+ repo_name: str,
480
+ private: bool,
481
+ original_repo_id: str,
482
+ file_paths: list[str],
483
+ input_type: str,
484
+ system_prompt: str,
485
+ document_column: str,
486
+ num_turns: int = 1,
487
+ num_rows: int = 10,
488
+ temperature: float = 0.9,
489
+ temperature_completion: Union[float, None] = None,
490
+ pipeline_code: str = "",
491
+ oauth_token: Union[gr.OAuthToken, None] = None,
492
+ progress=gr.Progress(),
493
+ ) -> pd.DataFrame:
494
+ if input_type == "prompt-input":
495
+ dataframe = _get_dataframe()
496
+ else:
497
+ dataframe, _ = load_dataset_file(
498
+ repo_id=original_repo_id,
499
+ file_paths=file_paths,
500
+ input_type=input_type,
501
+ num_rows=num_rows,
502
+ token=oauth_token,
503
+ )
504
+ progress(0.5, desc="Generating dataset")
505
+ dataframe = generate_dataset(
506
+ input_type=input_type,
507
+ dataframe=dataframe,
508
+ system_prompt=system_prompt,
509
+ document_column=document_column,
510
+ num_turns=num_turns,
511
+ num_rows=num_rows,
512
+ temperature=temperature,
513
+ temperature_completion=temperature_completion,
514
+ )
515
+ push_dataset_to_hub(
516
+ dataframe=dataframe,
517
+ org_name=org_name,
518
+ repo_name=repo_name,
519
+ oauth_token=oauth_token,
520
+ private=private,
521
+ pipeline_code=pipeline_code,
522
+ )
523
+ try:
524
+ progress(0.1, desc="Setting up user and workspace")
525
+ hf_user = HfApi().whoami(token=oauth_token.token)["name"]
526
+ client = get_argilla_client()
527
+ if client is None:
528
+ return ""
529
+ progress(0.5, desc="Creating dataset in Argilla")
530
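+ # multi-turn data is logged to Argilla as a single ChatField; single-turn data uses separate system_prompt/prompt/completion text fields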
+ if "messages" in dataframe.columns:
531
+ settings = rg.Settings(
532
+ fields=[
533
+ rg.ChatField(
534
+ name="messages",
535
+ description="The messages in the conversation",
536
+ title="Messages",
537
+ ),
538
+ ],
539
+ questions=[
540
+ rg.RatingQuestion(
541
+ name="rating",
542
+ title="Rating",
543
+ description="The rating of the conversation",
544
+ values=list(range(1, 6)),
545
+ ),
546
+ ],
547
+ metadata=[
548
+ rg.IntegerMetadataProperty(
549
+ name="user_message_length", title="User Message Length"
550
+ ),
551
+ rg.IntegerMetadataProperty(
552
+ name="assistant_message_length",
553
+ title="Assistant Message Length",
554
+ ),
555
+ ],
556
+ vectors=[
557
+ rg.VectorField(
558
+ name="messages_embeddings",
559
+ dimensions=get_sentence_embedding_dimensions(),
560
+ )
561
+ ],
562
+ guidelines="Please review the conversation and provide a score for the assistant's response.",
563
+ )
564
+
565
+ dataframe["user_message_length"] = dataframe["messages"].apply(
566
+ lambda x: sum([len(y["content"]) for y in x if y["role"] == "user"])
567
+ )
568
+ dataframe["assistant_message_length"] = dataframe["messages"].apply(
569
+ lambda x: sum(
570
+ [len(y["content"]) for y in x if y["role"] == "assistant"]
571
+ )
572
+ )
573
+ dataframe["messages_embeddings"] = get_embeddings(
574
+ dataframe["messages"].apply(
575
+ lambda x: " ".join([y["content"] for y in x])
576
+ )
577
+ )
578
+ else:
579
+ settings = rg.Settings(
580
+ fields=[
581
+ rg.TextField(
582
+ name="system_prompt",
583
+ title="System Prompt",
584
+ description="The system prompt used for the conversation",
585
+ required=False,
586
+ ),
587
+ rg.TextField(
588
+ name="prompt",
589
+ title="Prompt",
590
+ description="The prompt used for the conversation",
591
+ ),
592
+ rg.TextField(
593
+ name="completion",
594
+ title="Completion",
595
+ description="The completion from the assistant",
596
+ ),
597
+ ],
598
+ questions=[
599
+ rg.RatingQuestion(
600
+ name="rating",
601
+ title="Rating",
602
+ description="The rating of the conversation",
603
+ values=list(range(1, 6)),
604
+ ),
605
+ ],
606
+ metadata=[
607
+ rg.IntegerMetadataProperty(
608
+ name="prompt_length", title="Prompt Length"
609
+ ),
610
+ rg.IntegerMetadataProperty(
611
+ name="completion_length", title="Completion Length"
612
+ ),
613
+ ],
614
+ vectors=[
615
+ rg.VectorField(
616
+ name="prompt_embeddings",
617
+ dimensions=get_sentence_embedding_dimensions(),
618
+ )
619
+ ],
620
+ guidelines="Please review the conversation and correct the prompt and completion where needed.",
621
+ )
622
+ dataframe["prompt_length"] = dataframe["prompt"].apply(len)
623
+ dataframe["completion_length"] = dataframe["completion"].apply(len)
624
+ dataframe["prompt_embeddings"] = get_embeddings(dataframe["prompt"])
625
+
626
+ rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
627
+ if rg_dataset is None:
628
+ rg_dataset = rg.Dataset(
629
+ name=repo_name,
630
+ workspace=hf_user,
631
+ settings=settings,
632
+ client=client,
633
+ )
634
+ rg_dataset = rg_dataset.create()
635
+ progress(0.7, desc="Pushing dataset to Argilla")
636
+ hf_dataset = Dataset.from_pandas(dataframe)
637
+ rg_dataset.records.log(records=hf_dataset)
638
+ progress(1.0, desc="Dataset pushed to Argilla")
639
+ except Exception as e:
640
+ raise gr.Error(f"Error pushing dataset to Argilla: {e}")
641
+ return ""
642
+
643
+
644
+ def save_local(
645
+ repo_id: str,
646
+ file_paths: list[str],
647
+ input_type: str,
648
+ system_prompt: str,
649
+ document_column: str,
650
+ num_turns: int,
651
+ num_rows: int,
652
+ temperature: float,
653
+ repo_name: str,
654
+ temperature_completion: Union[float, None] = None,
655
+ ) -> pd.DataFrame:
656
+ if input_type == "prompt-input":
657
+ dataframe = _get_dataframe()
658
+ else:
659
+ dataframe, _ = load_dataset_file(
660
+ repo_id=repo_id,
661
+ file_paths=file_paths,
662
+ input_type=input_type,
663
+ num_rows=num_rows,
664
+ )
665
+ dataframe = generate_dataset(
666
+ input_type=input_type,
667
+ dataframe=dataframe,
668
+ system_prompt=system_prompt,
669
+ document_column=document_column,
670
+ num_turns=num_turns,
671
+ num_rows=num_rows,
672
+ temperature=temperature,
673
+ temperature_completion=temperature_completion,
674
+ )
675
+ local_dataset = Dataset.from_pandas(dataframe)
676
+ output_csv = os.path.join(SAVE_LOCAL_DIR, repo_name + ".csv")
677
+ output_json = os.path.join(SAVE_LOCAL_DIR, repo_name + ".json")
678
+ local_dataset.to_csv(output_csv, index=False)
679
+ local_dataset.to_json(output_json, index=False)
680
+ return output_csv, output_json
681
+
682
+
683
+ def show_system_prompt_visibility():
684
+ return {system_prompt: gr.Textbox(visible=True)}
685
+
686
+
687
+ def hide_system_prompt_visibility():
688
+ return {system_prompt: gr.Textbox(visible=False)}
689
+
690
+
691
+ def show_document_column_visibility():
692
+ return {document_column: gr.Dropdown(visible=True)}
693
+
694
+
695
+ def hide_document_column_visibility():
696
+ return {
697
+ document_column: gr.Dropdown(
698
+ choices=["Load your data first in step 1."],
699
+ value="Load your data first in step 1.",
700
+ visible=False,
701
+ )
702
+ }
703
+
704
+
705
+ def show_pipeline_code_visibility():
706
+ return {pipeline_code_ui: gr.Accordion(visible=True)}
707
+
708
+
709
+ def hide_pipeline_code_visibility():
710
+ return {pipeline_code_ui: gr.Accordion(visible=False)}
711
+
712
+
713
+ def show_temperature_completion():
714
+ if MODEL != MODEL_COMPLETION:
715
+ return {temperature_completion: gr.Slider(value=0.9, visible=True)}
716
+
717
+
718
+ def show_save_local_button():
719
+ return {btn_save_local: gr.Button(visible=True)}
720
+
721
+
722
+ def hide_save_local_button():
723
+ return {btn_save_local: gr.Button(visible=False)}
724
+
725
+
726
+ def show_save_local():
727
+ # resize the success message area via the returned component update (a sketch of the apparent intent)
728
+ return {
729
+ csv_file: gr.File(visible=True),
730
+ json_file: gr.File(visible=True),
731
+ success_message: gr.Markdown(min_height=0),
732
+ }
733
+
734
+ def hide_save_local():
735
+ # restore the success message height so progress output stays visible
736
+ return {
737
+ csv_file: gr.File(visible=False),
738
+ json_file: gr.File(visible=False),
739
+ success_message: gr.Markdown(min_height=100),
740
+ }
741
+
742
+
743
+ ######################
744
+ # Gradio UI
745
+ ######################
746
+
747
+
748
+ with gr.Blocks() as app:
749
+ with gr.Column() as main_ui:
750
+ if not SFT_AVAILABLE:
751
+ gr.Markdown(
752
+ value="\n".join(
753
+ [
754
+ "## Supervised Fine-Tuning not available",
755
+ "",
756
+ f"This tool relies on the [Magpie](https://arxiv.org/abs/2406.08464) pre-query template, which is not implemented for {MODEL} with {BASE_URL}.",
757
+ "Use Llama3 or Qwen2 models with Hugging Face Inference Endpoints instead.",
758
+ ]
759
+ )
760
+ )
761
+ else:
762
+ gr.Markdown("## 1. Select your input")
763
+ with gr.Row(equal_height=False):
764
+ with gr.Column(scale=2):
765
+ input_type = gr.Dropdown(
766
+ label="Input type",
767
+ choices=["prompt-input", "dataset-input", "file-input"],
768
+ value="prompt-input",
769
+ multiselect=False,
770
+ visible=False,
771
+ )
772
+ with gr.Tab("Generate from prompt") as tab_prompt_input:
773
+ with gr.Row(equal_height=False):
774
+ with gr.Column(scale=2):
775
+ dataset_description = gr.Textbox(
776
+ label="Dataset description",
777
+ placeholder="Give a precise description of your desired dataset.",
778
+ )
779
+ with gr.Row():
780
+ clear_prompt_btn_part = gr.Button(
781
+ "Clear", variant="secondary"
782
+ )
783
+ load_prompt_btn = gr.Button(
784
+ "Create", variant="primary"
785
+ )
786
+ with gr.Column(scale=3):
787
+ examples = gr.Examples(
788
+ examples=DEFAULT_DATASET_DESCRIPTIONS,
789
+ inputs=[dataset_description],
790
+ cache_examples=False,
791
+ label="Examples",
792
+ )
793
+ with gr.Tab("Load from Hub") as tab_dataset_input:
794
+ with gr.Row(equal_height=False):
795
+ with gr.Column(scale=2):
796
+ search_in = HuggingfaceHubSearch(
797
+ label="Search",
798
+ placeholder="Search for a dataset",
799
+ search_type="dataset",
800
+ sumbit_on_select=True,
801
+ )
802
+ with gr.Row():
803
+ clear_dataset_btn_part = gr.Button(
804
+ "Clear", variant="secondary"
805
+ )
806
+ load_dataset_btn = gr.Button(
807
+ "Load", variant="primary"
808
+ )
809
+ with gr.Column(scale=3):
810
+ examples = gr.Examples(
811
+ examples=[
812
+ "charris/wikipedia_sample",
813
+ "plaguss/argilla_sdk_docs_raw_unstructured",
814
+ "BeIR/hotpotqa-generated-queries",
815
+ ],
816
+ label="Example datasets",
817
+ fn=lambda x: x,
818
+ inputs=[search_in],
819
+ run_on_click=True,
820
+ )
821
+ search_out = gr.HTML(
822
+ label="Dataset preview", visible=False
823
+ )
824
+ with gr.Tab("Load your file") as tab_file_input:
825
+ with gr.Row(equal_height=False):
826
+ with gr.Column(scale=2):
827
+ file_in = gr.File(
828
+ label="Upload your file. Supported formats: .md, .txt, .docx, .pdf",
829
+ file_count="multiple",
830
+ file_types=[".md", ".txt", ".docx", ".pdf"],
831
+ )
832
+ with gr.Row():
833
+ clear_file_btn_part = gr.Button(
834
+ "Clear", variant="secondary"
835
+ )
836
+ load_file_btn = gr.Button("Load", variant="primary")
837
+ with gr.Column(scale=3):
838
+ file_out = gr.HTML(
839
+ label="Dataset preview", visible=False
840
+ )
841
+
842
+ gr.HTML(value="<hr>")
843
+ gr.Markdown(value="## 2. Configure your dataset")
844
+ with gr.Row(equal_height=False):
845
+ with gr.Column(scale=2):
846
+ system_prompt = gr.Textbox(
847
+ label="System prompt",
848
+ placeholder="You are a helpful assistant.",
849
+ )
850
+ document_column = gr.Dropdown(
851
+ label="Document Column",
852
+ info="Select the document column to generate the chat data",
853
+ choices=["Load your data first in step 1."],
854
+ value="Load your data first in step 1.",
855
+ interactive=False,
856
+ multiselect=False,
857
+ allow_custom_value=False,
858
+ visible=False,
859
+ )
860
+ num_turns = gr.Number(
861
+ value=1,
862
+ label="Number of turns in the conversation",
863
+ minimum=1,
864
+ maximum=4,
865
+ step=1,
866
+ interactive=True,
867
+ info="Choose between 1 (single turn with 'prompt' and 'completion' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
868
+ )
869
+ with gr.Row():
870
+ clear_btn_full = gr.Button(
871
+ "Clear",
872
+ variant="secondary",
873
+ )
874
+ btn_apply_to_sample_dataset = gr.Button(
875
+ "Save", variant="primary"
876
+ )
877
+ with gr.Column(scale=3):
878
+ dataframe = _get_dataframe()
879
+
880
+ gr.HTML(value="<hr>")
881
+ gr.Markdown(value="## 3. Generate your dataset")
882
+ with gr.Row(equal_height=False):
883
+ with gr.Column(scale=2):
884
+ org_name = get_org_dropdown()
885
+ repo_name = gr.Textbox(
886
+ label="Repo name",
887
+ placeholder="dataset_name",
888
+ value=f"my-distiset-{str(uuid.uuid4())[:8]}",
889
+ interactive=True,
890
+ )
891
+ num_rows = gr.Number(
892
+ label="Number of rows",
893
+ value=10,
894
+ interactive=True,
895
+ scale=1,
896
+ )
897
+ temperature = gr.Slider(
898
+ label="Temperature",
899
+ minimum=0.1,
900
+ maximum=1.5,
901
+ value=0.9,
902
+ step=0.1,
903
+ interactive=True,
904
+ )
905
+ temperature_completion = gr.Slider(
906
+ label="Temperature for completion",
907
+ minimum=0.1,
908
+ maximum=1.5,
909
+ value=None,
910
+ step=0.1,
911
+ interactive=True,
912
+ visible=False,
913
+ )
914
+ private = gr.Checkbox(
915
+ label="Private dataset",
916
+ value=False,
917
+ interactive=True,
918
+ scale=1,
919
+ )
920
+ btn_push_to_hub = gr.Button(
921
+ "Push to Hub", variant="primary", scale=2
922
+ )
923
+ btn_save_local = gr.Button(
924
+ "Save locally", variant="primary", scale=2, visible=False
925
+ )
926
+ with gr.Column(scale=3):
927
+ csv_file = gr.File(
928
+ label="CSV",
929
+ elem_classes="datasets",
930
+ visible=False,
931
+ )
932
+ json_file = gr.File(
933
+ label="JSON",
934
+ elem_classes="datasets",
935
+ visible=False,
936
+ )
937
+ success_message = gr.Markdown(
938
+ visible=False,
939
+ min_height=0 # don't remove this otherwise progress is not visible
940
+ )
941
+ with gr.Accordion(
942
+ "Customize your pipeline with distilabel",
943
+ open=False,
944
+ visible=False,
945
+ ) as pipeline_code_ui:
946
+ code = generate_pipeline_code(
947
+ repo_id=search_in.value,
948
+ input_type=input_type.value,
949
+ system_prompt=system_prompt.value,
950
+ document_column=document_column.value,
951
+ num_turns=num_turns.value,
952
+ num_rows=num_rows.value,
953
+ )
954
+ pipeline_code = gr.Code(
955
+ value=code,
956
+ language="python",
957
+ label="Distilabel Pipeline Code",
958
+ )
959
+
960
+ tab_prompt_input.select(
961
+ fn=lambda: "prompt-input",
962
+ inputs=[],
963
+ outputs=[input_type],
964
+ ).then(fn=show_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
965
+ fn=hide_document_column_visibility, inputs=[], outputs=[document_column]
966
+ )
967
+
968
+ tab_dataset_input.select(
969
+ fn=lambda: "dataset-input",
970
+ inputs=[],
971
+ outputs=[input_type],
972
+ ).then(fn=hide_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
973
+ fn=show_document_column_visibility, inputs=[], outputs=[document_column]
974
+ )
975
+
976
+ tab_file_input.select(
977
+ fn=lambda: "file-input",
978
+ inputs=[],
979
+ outputs=[input_type],
980
+ ).then(fn=hide_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
981
+ fn=show_document_column_visibility, inputs=[], outputs=[document_column]
982
+ )
983
+
984
+ search_in.submit(
985
+ fn=lambda df: pd.DataFrame(columns=df.columns),
986
+ inputs=[dataframe],
987
+ outputs=[dataframe],
988
+ )
989
+
990
+ load_prompt_btn.click(
991
+ fn=generate_system_prompt,
992
+ inputs=[dataset_description],
993
+ outputs=[system_prompt],
994
+ ).success(
995
+ fn=generate_sample_dataset,
996
+ inputs=[
997
+ search_in,
998
+ file_in,
999
+ input_type,
1000
+ system_prompt,
1001
+ document_column,
1002
+ num_turns,
1003
+ num_rows,
1004
+ ],
1005
+ outputs=dataframe,
1006
+ )
1007
+
1008
+ gr.on(
1009
+ triggers=[load_dataset_btn.click, load_file_btn.click],
1010
+ fn=load_dataset_file,
1011
+ inputs=[search_in, file_in, input_type],
1012
+ outputs=[dataframe, document_column],
1013
+ )
1014
+
1015
+ btn_apply_to_sample_dataset.click(
1016
+ fn=generate_sample_dataset,
1017
+ inputs=[
1018
+ search_in,
1019
+ file_in,
1020
+ input_type,
1021
+ system_prompt,
1022
+ document_column,
1023
+ num_turns,
1024
+ num_rows,
1025
+ ],
1026
+ outputs=dataframe,
1027
+ )
1028
+
1029
+ btn_push_to_hub.click(
1030
+ fn=validate_argilla_user_workspace_dataset,
1031
+ inputs=[repo_name],
1032
+ outputs=[success_message],
1033
+ ).then(
1034
+ fn=validate_push_to_hub,
1035
+ inputs=[org_name, repo_name],
1036
+ outputs=[success_message],
1037
+ ).success(
1038
+ fn=hide_save_local,
1039
+ outputs=[csv_file, json_file, success_message],
1040
+ ).success(
1041
+ fn=hide_success_message,
1042
+ outputs=[success_message],
1043
+ ).success(
1044
+ fn=hide_pipeline_code_visibility,
1045
+ inputs=[],
1046
+ outputs=[pipeline_code_ui],
1047
+ ).success(
1048
+ fn=push_dataset,
1049
+ inputs=[
1050
+ org_name,
1051
+ repo_name,
1052
+ private,
1053
+ search_in,
1054
+ file_in,
1055
+ input_type,
1056
+ system_prompt,
1057
+ document_column,
1058
+ num_turns,
1059
+ num_rows,
1060
+ temperature,
1061
+ temperature_completion,
1062
+ pipeline_code,
1063
+ ],
1064
+ outputs=[success_message],
1065
+ ).success(
1066
+ fn=show_success_message,
1067
+ inputs=[org_name, repo_name],
1068
+ outputs=[success_message],
1069
+ ).success(
1070
+ fn=generate_pipeline_code,
1071
+ inputs=[
1072
+ search_in,
1073
+ input_type,
1074
+ system_prompt,
1075
+ document_column,
1076
+ num_turns,
1077
+ num_rows,
1078
+ ],
1079
+ outputs=[pipeline_code],
1080
+ ).success(
1081
+ fn=show_pipeline_code_visibility,
1082
+ inputs=[],
1083
+ outputs=[pipeline_code_ui],
1084
+ )
1085
+
1086
+ btn_save_local.click(
1087
+ fn=hide_success_message,
1088
+ outputs=[success_message],
1089
+ ).success(
1090
+ fn=hide_pipeline_code_visibility,
1091
+ inputs=[],
1092
+ outputs=[pipeline_code_ui],
1093
+ ).success(
1094
+ fn=show_save_local,
1095
+ inputs=[],
1096
+ outputs=[csv_file, json_file, success_message],
1097
+ ).success(
1098
+ save_local,
1099
+ inputs=[
1100
+ search_in,
1101
+ file_in,
1102
+ input_type,
1103
+ system_prompt,
1104
+ document_column,
1105
+ num_turns,
1106
+ num_rows,
1107
+ temperature,
1108
+ repo_name,
1109
+ temperature_completion,
1110
+ ],
1111
+ outputs=[csv_file, json_file],
1112
+ ).success(
1113
+ fn=generate_pipeline_code,
1114
+ inputs=[
1115
+ search_in,
1116
+ input_type,
1117
+ system_prompt,
1118
+ document_column,
1119
+ num_turns,
1120
+ num_rows,
1121
+ ],
1122
+ outputs=[pipeline_code],
1123
+ ).success(
1124
+ fn=show_pipeline_code_visibility,
1125
+ inputs=[],
1126
+ outputs=[pipeline_code_ui],
1127
+ )
1128
+
1129
+ clear_dataset_btn_part.click(fn=lambda: "", inputs=[], outputs=[search_in])
1130
+ clear_file_btn_part.click(fn=lambda: None, inputs=[], outputs=[file_in])
1131
+ clear_prompt_btn_part.click(fn=lambda: "", inputs=[], outputs=[dataset_description])
1132
+ clear_btn_full.click(
1133
+ fn=lambda df: ("", "", 1, _get_dataframe()),
1134
+ inputs=[dataframe],
1135
+ outputs=[system_prompt, document_column, num_turns, dataframe],
1136
+ )
1137
+ app.load(fn=swap_visibility, outputs=main_ui)
1138
+ app.load(fn=get_org_dropdown, outputs=[org_name])
1139
+ app.load(fn=get_random_repo_name, outputs=[repo_name])
1140
+ app.load(fn=show_temperature_completion, outputs=[temperature_completion])
1141
+ if SAVE_LOCAL_DIR is not None:
1142
+ app.load(fn=show_save_local_button, outputs=btn_save_local)
src/synthetic_dataset_generator/apps/eval.py ADDED
@@ -0,0 +1,894 @@
1
+ import json
2
+ import uuid
3
+ from typing import Union
4
+
5
+ import argilla as rg
6
+ import gradio as gr
7
+ import numpy as np
8
+ import pandas as pd
9
+ from datasets import (
10
+ Dataset,
11
+ get_dataset_config_names,
12
+ get_dataset_split_names,
13
+ load_dataset,
14
+ )
15
+ from distilabel.distiset import Distiset
16
+ from gradio.oauth import OAuthToken
17
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
18
+ from huggingface_hub import HfApi
19
+
20
+ from synthetic_dataset_generator.apps.base import (
21
+ combine_datasets,
22
+ get_iframe,
23
+ hide_success_message,
24
+ push_pipeline_code_to_hub,
25
+ show_success_message,
26
+ test_max_num_rows,
27
+ validate_argilla_user_workspace_dataset,
28
+ validate_push_to_hub,
29
+ )
30
+ from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
31
+ from synthetic_dataset_generator.pipelines.embeddings import (
32
+ get_embeddings,
33
+ get_sentence_embedding_dimensions,
34
+ )
35
+ from synthetic_dataset_generator.pipelines.eval import (
36
+ generate_pipeline_code,
37
+ get_custom_evaluator,
38
+ get_ultrafeedback_evaluator,
39
+ )
40
+ from synthetic_dataset_generator.utils import (
41
+ column_to_list,
42
+ extract_column_names,
43
+ get_argilla_client,
44
+ get_org_dropdown,
45
+ get_random_repo_name,
46
+ pad_or_truncate_list,
47
+ process_columns,
48
+ swap_visibility,
49
+ )
50
+
51
+
52
+ def get_valid_columns(dataframe: pd.DataFrame):
53
+ instruction_valid_columns = []
54
+ response_valid_columns = []
55
+
56
+ for col in dataframe.columns:
57
+ sample_val = dataframe[col].iloc[0]
58
+ if isinstance(sample_val, str) or (
59
+ isinstance(sample_val, (list, np.ndarray))
60
+ and all(isinstance(item, dict) and "role" in item for item in sample_val)
61
+ ):
62
+ instruction_valid_columns.append(col)
63
+ response_valid_columns.append(col)
64
+ if isinstance(sample_val, (list, np.ndarray)) and all(
65
+ isinstance(item, str) for item in sample_val
66
+ ):
67
+ response_valid_columns.append(col)
68
+
69
+ return instruction_valid_columns, response_valid_columns
70
+
71
+
72
+ def load_dataset_from_hub(
73
+ repo_id: str, num_rows: int = 10, token: Union[OAuthToken, None] = None
74
+ ):
75
+ if not repo_id:
76
+ raise gr.Error("Hub repo id is required")
77
+ subsets = get_dataset_config_names(repo_id, token=token)
78
+ splits = get_dataset_split_names(repo_id, subsets[0], token=token)
79
+ ds = load_dataset(repo_id, subsets[0], split=splits[0], token=token, streaming=True)
80
+ rows = []
81
+ for idx, row in enumerate(ds):
82
+ rows.append(row)
83
+ if idx == num_rows - 1:
84
+ break
85
+ ds = Dataset.from_list(rows)
86
+ dataframe = ds.to_pandas()
87
+ instruction_valid_columns, response_valid_columns = get_valid_columns(dataframe)
88
+ col_instruction = instruction_valid_columns[0] if instruction_valid_columns else ""
89
+ col_response = "No valid response columns found."
90
+ for col in response_valid_columns:
91
+ if col != col_instruction:
92
+ col_response = col
93
+ break
94
+
95
+ prompt_template = gr.Code(
96
+ label="Prompt template",
97
+ value="\n".join(
98
+ [
99
+ "Evaluate the following text based on criteria.",
100
+ "Criteria: quality.",
101
+ "Score: between 1 and 10.",
102
+ "Text: {{" + col_response + "}}",
103
+ ]
104
+ ),
105
+ language="jinja2",
106
+ interactive=True,
107
+ )
108
+ structured_output = gr.Code(
109
+ label="Structured output",
110
+ value=json.dumps(
111
+ {
112
+ "type": "object",
113
+ "properties": {"quality": {"type": "integer"}},
114
+ "required": ["quality"],
115
+ },
116
+ indent=4,
117
+ ),
118
+ language="json",
119
+ interactive=True,
120
+ )
121
+ return (
122
+ dataframe,
123
+ gr.Dropdown(
124
+ choices=instruction_valid_columns,
125
+ label="Instruction column",
126
+ value=col_instruction,
127
+ interactive=True,
128
+ ),
129
+ gr.Dropdown(
130
+ choices=response_valid_columns,
131
+ label="Response column",
132
+ value=col_response,
133
+ interactive=(
134
+ False if col_response == "No valid response columns found." else True
135
+ ),
136
+ ),
137
+ prompt_template,
138
+ structured_output,
139
+ )
140
+
141
+
142
+ def define_evaluation_aspects(task_type: str):
143
+ if task_type == "chat-eval":
144
+ return gr.Dropdown(
145
+ value=["overall-rating"],
146
+ choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
147
+ label="Evaluation Aspects",
148
+ multiselect=True,
149
+ interactive=True,
150
+ )
151
+ else:
152
+ return gr.Dropdown(interactive=False, visible=False)
153
+
154
+
155
+ def evaluate_instruction_response(
156
+ dataframe: pd.DataFrame,
157
+ aspects: list[str],
158
+ instruction_column: str,
159
+ response_columns: str,
160
+ num_rows: int = 10,
161
+ is_sample: bool = False,
162
+ progress=gr.Progress(),
163
+ ):
164
+ progress(0.0, desc="Evaluating instructions and responses")
165
+ data = process_columns(dataframe, instruction_column, response_columns)
166
+ num_generations = len(data[0]["generations"])
167
+ evaluated_results = []
168
+ for entry in data:
169
+ result_row = {
170
+ "instruction": entry["instruction"],
171
+ "generations": entry["generations"],
172
+ }
173
+ for aspect in aspects:
174
+ result_row[f"ratings_{aspect}"] = None
175
+ result_row[f"rationale_for_ratings_{aspect}"] = None
176
+ if aspect in ["truthfulness", "helpfulness"]:
177
+ result_row[f"type_{aspect}"] = None
178
+ result_row[f"rationale_for_type_{aspect}"] = None
179
+ result_row["model_name"] = None
180
+ evaluated_results.append(result_row)
181
+
182
+ batch_size = DEFAULT_BATCH_SIZE
183
+ total_steps: int = len(aspects) * num_rows
184
+
185
+ # evaluate instructions and responses
186
+ for aspect in aspects:
187
+ ultrafeedback_evaluator = get_ultrafeedback_evaluator(aspect, is_sample)
188
+ n_processed = 0
189
+
190
+ while n_processed < num_rows:
191
+ progress(
192
+ (len(aspects) * n_processed) / total_steps,
193
+ total=total_steps,
194
+ desc=f"Evaluating aspect: {aspect}",
195
+ )
196
+
197
+ remaining_rows = num_rows - n_processed
198
+ batch_size = min(batch_size, remaining_rows)
199
+ inputs = data[n_processed : n_processed + batch_size]
200
+ batch_results = list(ultrafeedback_evaluator.process(inputs=inputs))
201
+ for j, result in enumerate(batch_results[0]):
202
+ idx = n_processed + j
203
+ evaluated_results[idx][f"ratings_{aspect}"] = pad_or_truncate_list(
204
+ result.get("ratings"), num_generations
205
+ )
206
+ evaluated_results[idx]["model_name"] = result.get("model_name")
207
+ if aspect in ["truthfulness", "helpfulness"]:
208
+ evaluated_results[idx][f"type_{aspect}"] = pad_or_truncate_list(
209
+ result.get("types"), num_generations
210
+ )
211
+ evaluated_results[idx][f"rationale_for_type_{aspect}"] = (
212
+ pad_or_truncate_list(result.get("rationales"), num_generations)
213
+ )
214
+ evaluated_results[idx][f"rationale_for_ratings_{aspect}"] = (
215
+ pad_or_truncate_list(
216
+ result.get("rationales-for-ratings"), num_generations
217
+ )
218
+ )
219
+ else:
220
+ evaluated_results[idx][f"rationale_for_ratings_{aspect}"] = (
221
+ pad_or_truncate_list(result.get("rationales"), num_generations)
222
+ )
223
+ n_processed += batch_size
224
+
225
+ # create final dataset
226
+ dataframe = pd.DataFrame(evaluated_results)
227
+ progress(1.0, desc="Dataset evaluation completed")
228
+ return dataframe
229
+
230
+
231
+ def evaluate_custom(
232
+ dataframe: pd.DataFrame,
233
+ prompt_template: str,
234
+ structured_output: dict,
235
+ num_rows: int = 10,
236
+ is_sample: bool = False,
237
+ progress=gr.Progress(),
238
+ ):
239
+ progress(0.0, desc="Evaluating dataset")
240
+ columns = extract_column_names(prompt_template)
241
+ input_columns = {column: column_to_list(dataframe, column) for column in columns}
242
+
243
+ custom_evaluator = get_custom_evaluator(
244
+ prompt_template, structured_output, columns, is_sample
245
+ )
246
+ batch_size = DEFAULT_BATCH_SIZE
247
+
248
+ # evaluate the data
249
+ n_processed = 0
250
+ evaluation_results = []
251
+ while n_processed < num_rows:
252
+ progress(
253
+ n_processed / num_rows,
254
+ desc="Evaluating dataset",
255
+ )
256
+ remaining_rows = num_rows - n_processed
257
+ batch_size = min(batch_size, remaining_rows)
258
+
259
+ inputs = []
260
+ for idx in range(n_processed, n_processed + batch_size):
261
+ input = {column: input_columns[column][idx] for column in input_columns}
262
+ inputs.append(input)
263
+
264
+ batch = list(custom_evaluator.process(inputs=inputs))
265
+ evaluation_results.extend(batch[0])
266
+ n_processed += batch_size
267
+
268
+ # create final dataset
269
+ distiset_results = []
270
+ for result in evaluation_results:
271
+ record = {key: result[key] for key in result if key != "distilabel_metadata"}
272
+ distiset_results.append(record)
273
+
274
+ dataframe = pd.DataFrame(distiset_results)
275
+ progress(1.0, desc="Dataset evaluation completed")
276
+ return dataframe
277
+
278
+
279
+ def _evaluate_dataset(
280
+ dataframe: pd.DataFrame,
281
+ eval_type: str,
282
+ aspects_instruction_response: list[str],
283
+ instruction_instruction_response: str,
284
+ response_instruction_response: str,
285
+ prompt_template: str,
286
+ structured_output: dict,
287
+ num_rows: int = 10,
288
+ is_sample: bool = False,
289
+ ):
290
+ num_rows = test_max_num_rows(num_rows)
291
+ if eval_type == "chat-eval":
292
+ dataframe = evaluate_instruction_response(
293
+ dataframe=dataframe,
294
+ aspects=aspects_instruction_response,
295
+ instruction_column=instruction_instruction_response,
296
+ response_columns=response_instruction_response,
297
+ num_rows=num_rows,
298
+ is_sample=is_sample,
299
+ )
300
+ else:
301
+ dataframe = evaluate_custom(
302
+ dataframe=dataframe,
303
+ prompt_template=prompt_template,
304
+ structured_output=structured_output,
305
+ num_rows=num_rows,
306
+ is_sample=is_sample,
307
+ )
308
+ return dataframe
309
+
310
+
311
+ def evaluate_sample_dataset(
312
+ repo_id: str,
313
+ eval_type: str,
314
+ aspects_instruction_response: list[str],
315
+ instruction_instruction_response: str,
316
+ response_instruction_response: str,
317
+ prompt_template: str,
318
+ structured_output: dict,
319
+ ):
320
+ dataframe, _, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
321
+ dataframe = _evaluate_dataset(
322
+ dataframe=dataframe,
323
+ eval_type=eval_type,
324
+ aspects_instruction_response=aspects_instruction_response,
325
+ instruction_instruction_response=instruction_instruction_response,
326
+ response_instruction_response=response_instruction_response,
327
+ prompt_template=prompt_template,
328
+ structured_output=structured_output,
329
+ num_rows=10,
330
+ is_sample=True,
331
+ )
332
+ return dataframe
333
+
334
+
335
+ def push_dataset_to_hub(
336
+ dataframe: pd.DataFrame,
337
+ org_name: str,
338
+ repo_name: str,
339
+ oauth_token: Union[gr.OAuthToken, None],
340
+ private: bool,
341
+ pipeline_code: str,
342
+ progress=gr.Progress(),
343
+ ):
344
+ progress(0.0, desc="Validating")
345
+ repo_id = validate_push_to_hub(org_name, repo_name)
346
+ progress(0.5, desc="Creating dataset")
347
+ dataset = Dataset.from_pandas(dataframe)
348
+ dataset = combine_datasets(repo_id, dataset, oauth_token)
349
+ distiset = Distiset({"default": dataset})
350
+ progress(0.9, desc="Pushing dataset")
351
+ distiset.push_to_hub(
352
+ repo_id=repo_id,
353
+ private=private,
354
+ include_script=False,
355
+ token=oauth_token.token,
356
+ create_pr=False,
357
+ )
358
+ push_pipeline_code_to_hub(pipeline_code, org_name, repo_name, oauth_token)
359
+ progress(1.0, desc="Dataset pushed")
360
+ return dataframe
361
+
362
+
363
+ def push_dataset(
364
+ org_name: str,
365
+ repo_name: str,
366
+ private: bool,
367
+ num_rows: int,
368
+ original_repo_id: str,
369
+ eval_type: str,
370
+ aspects_instruction_response: list[str],
371
+ instruction_instruction_response: str,
372
+ response_instruction_response: str,
373
+ prompt_template: str,
374
+ structured_output: dict,
375
+ pipeline_code: str,
376
+ oauth_token: Union[gr.OAuthToken, None] = None,
377
+ progress=gr.Progress(),
378
+ ) -> pd.DataFrame:
379
+ dataframe, _, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
380
+ dataframe = _evaluate_dataset(
381
+ dataframe=dataframe,
382
+ eval_type=eval_type,
383
+ aspects_instruction_response=aspects_instruction_response,
384
+ instruction_instruction_response=instruction_instruction_response,
385
+ response_instruction_response=response_instruction_response,
386
+ prompt_template=prompt_template,
387
+ structured_output=structured_output,
388
+ num_rows=num_rows,
389
+ )
390
+ push_dataset_to_hub(
391
+ dataframe, org_name, repo_name, oauth_token, private, pipeline_code
392
+ )
393
+ try:
394
+ progress(0.1, desc="Setting up user and workspace")
395
+ hf_user = HfApi().whoami(token=oauth_token.token)["name"]
396
+ client = get_argilla_client()
397
+ if client is None:
398
+ return ""
399
+ progress(0.5, desc="Creating dataset in Argilla")
400
+ if eval_type == "chat-eval":
401
+ num_generations = len((dataframe["generations"][0]))
402
+ fields = [
403
+ rg.ChatField(
404
+ name=f"chat_{i}",
405
+ title=f"Chat {i+1}",
406
+ description=f"User and assistant conversation for generation {i+1}",
407
+ )
408
+ for i in range(num_generations)
409
+ ]
410
+ questions = []
411
+ for i in range(num_generations):
412
+ for aspect in aspects_instruction_response:
413
+ questions.append(
414
+ rg.RatingQuestion(
415
+ name=f"ratings_{aspect}_{i}",
416
+ values=list(range(11)),
417
+ title=f"Ratings for {aspect} for response {i+1}",
418
+ required=True,
419
+ )
420
+ )
421
+ questions.append(
422
+ rg.TextQuestion(
423
+ name=f"rationale_for_ratings_{aspect}_{i}",
424
+ title=f"Rationale for ratings for {aspect} for response {i+1}",
425
+ required=False,
426
+ use_markdown=True,
427
+ )
428
+ )
429
+ if aspect in ["truthfulness", "helpfulness"]:
430
+ questions.append(
431
+ rg.RatingQuestion(
432
+ name=f"type_{aspect}_{i}",
433
+ values=list(range(1, 6)),
434
+ title=f"The type of the response {i+1} for {aspect}",
435
+ required=True,
436
+ )
437
+ )
438
+ questions.append(
439
+ rg.TextQuestion(
440
+ name=f"rationale_for_type_{aspect}_{i}",
441
+ title=f"Rationale for type of the response {i+1} for {aspect}",
442
+ required=False,
443
+ use_markdown=True,
444
+ )
445
+ )
446
+ metadata = [
447
+ rg.IntegerMetadataProperty(
448
+ name="instruction_length", title="Instruction length"
449
+ ),
450
+ ]
451
+ for i in range(num_generations):
452
+ metadata.append(
453
+ rg.IntegerMetadataProperty(
454
+ name=f"response_{i}_length", title=f"Response {i+1} length"
455
+ )
456
+ )
457
+ vectors = [
458
+ rg.VectorField(
459
+ name="instruction_embeddings",
460
+ dimensions=get_sentence_embedding_dimensions(),
461
+ )
462
+ ]
463
+ settings = rg.Settings(
464
+ fields=fields,
465
+ questions=questions,
466
+ metadata=metadata,
467
+ vectors=vectors,
468
+ guidelines="Please review the conversation and provide an evaluation.",
469
+ )
470
+
471
+ dataframe["instruction_length"] = dataframe["instruction"].apply(len)
472
+ for i in range(num_generations):
473
+ dataframe[f"response_{i}_length"] = dataframe["generations"].apply(
474
+ lambda gens: len(gens[i]) if i < len(gens) else 0
475
+ )
476
+ dataframe["instruction_embeddings"] = get_embeddings(
477
+ dataframe["instruction"].to_list()
478
+ )
479
+
480
+ rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
481
+ if rg_dataset is None:
482
+ rg_dataset = rg.Dataset(
483
+ name=repo_name,
484
+ workspace=hf_user,
485
+ settings=settings,
486
+ client=client,
487
+ )
488
+ rg_dataset = rg_dataset.create()
489
+
490
+ progress(0.7, desc="Pushing dataset to Argilla")
491
+ hf_dataset = Dataset.from_pandas(dataframe)
492
+ records = []
493
+ for sample in hf_dataset:
494
+ fields = {}
495
+ metadata = {"instruction_length": sample.get("instruction_length", 0)}
496
+ vectors = {
497
+ "instruction_embeddings": sample.get("instruction_embeddings", [])
498
+ }
499
+ suggestions = []
500
+ generations = sample.get("generations", [])
501
+ for i in range(num_generations):
502
+ fields[f"chat_{i}"] = [
503
+ {"role": "user", "content": sample.get("instruction", "")},
504
+ {"role": "assistant", "content": generations[i]},
505
+ ]
506
+ metadata[f"response_{i}_length"] = sample.get(
507
+ f"response_{i}_length", 0
508
+ )
509
+
510
+ for aspect in aspects_instruction_response:
511
+ ratings = sample.get(f"ratings_{aspect}", [])
512
+ rationales = sample.get(f"rationale_for_ratings_{aspect}", [])
513
+
514
+ rating_value = (
515
+ ratings[i]
516
+ if ratings and isinstance(ratings[i], int)
517
+ else None
518
+ )
519
+ rationale_value = (
520
+ rationales[i]
521
+ if rationales and isinstance(rationales[i], str)
522
+ else None
523
+ )
524
+
525
+ if rating_value is not None:
526
+ suggestions.append(
527
+ rg.Suggestion(
528
+ question_name=f"ratings_{aspect}_{i}",
529
+ value=rating_value,
530
+ )
531
+ )
532
+ if rationale_value is not None:
533
+ suggestions.append(
534
+ rg.Suggestion(
535
+ question_name=f"rationale_for_ratings_{aspect}_{i}",
536
+ value=rationale_value,
537
+ )
538
+ )
539
+
540
+ if aspect in ["truthfulness", "helpfulness"]:
541
+ types = sample.get(f"type_{aspect}", [])
542
+ rationale_types = sample.get(
543
+ f"rationale_for_type_{aspect}", []
544
+ )
545
+
546
+ type_value = (
547
+ types[i]
548
+ if types and isinstance(types[i], int)
549
+ else None
550
+ )
551
+ rationale_type_value = (
552
+ rationale_types[i]
553
+ if rationale_types
554
+ and isinstance(rationale_types[i], str)
555
+ else None
556
+ )
557
+ if type_value is not None:
558
+ suggestions.append(
559
+ rg.Suggestion(
560
+ question_name=f"type_{aspect}_{i}",
561
+ value=type_value,
562
+ )
563
+ )
564
+ if rationale_type_value is not None:
565
+ suggestions.append(
566
+ rg.Suggestion(
567
+ question_name=f"rationale_for_type_{aspect}_{i}",
568
+ value=rationale_type_value,
569
+ )
570
+ )
571
+ records.append(
572
+ rg.Record(
573
+ fields=fields,
574
+ metadata=metadata,
575
+ vectors=vectors,
576
+ suggestions=suggestions,
577
+ )
578
+ )
579
+ rg_dataset.records.log(records=records)
580
+ progress(1.0, desc="Dataset pushed to Argilla")
581
+ else:
582
+ columns = extract_column_names(prompt_template)
583
+ settings = rg.Settings(
584
+ fields=[
585
+ rg.TextField(
586
+ name=column,
587
+ title=column.capitalize(),
588
+ description="The column content",
589
+ )
590
+ for column in columns
591
+ ],
592
+ questions=[
593
+ rg.TextQuestion(
594
+ name="evaluation",
595
+ title="Evaluation",
596
+ description="The generated evaluation",
597
+ use_markdown=True,
598
+ ),
599
+ ],
600
+ metadata=[
601
+ rg.IntegerMetadataProperty(
602
+ name=f"{column}_length", title=f"{column.capitalize()} length"
603
+ )
604
+ for column in columns
605
+ ],
606
+ vectors=[
607
+ rg.VectorField(
608
+ name=f"{column}_embeddings",
609
+ dimensions=get_sentence_embedding_dimensions(),
610
+ )
611
+ for column in columns
612
+ ],
613
+ guidelines="Please review, correct and provide an accurate evaluation.",
614
+ )
615
+ for column in columns:
616
+ dataframe[f"{column}_length"] = dataframe[column].apply(len)
617
+ dataframe[f"{column}_embeddings"] = get_embeddings(dataframe[column])
618
+
619
+ rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
620
+ if rg_dataset is None:
621
+ rg_dataset = rg.Dataset(
622
+ name=repo_name,
623
+ workspace=hf_user,
624
+ settings=settings,
625
+ client=client,
626
+ )
627
+ rg_dataset = rg_dataset.create()
628
+ progress(0.7, desc="Pushing dataset to Argilla")
629
+ hf_dataset = Dataset.from_pandas(dataframe)
630
+ rg_dataset.records.log(
631
+ records=hf_dataset, mapping={"generation": "evaluation"}
632
+ )
633
+ progress(1.0, desc="Dataset pushed to Argilla")
634
+ except Exception as e:
635
+ raise gr.Error(f"Error pushing dataset to Argilla: {e}")
636
+ return ""
637
+
638
+
639
+ def show_pipeline_code_visibility():
640
+ return {pipeline_code_ui: gr.Accordion(visible=True)}
641
+
642
+
643
+ def hide_pipeline_code_visibility():
644
+ return {pipeline_code_ui: gr.Accordion(visible=False)}
645
+
646
+
647
+ ######################
648
+ # Gradio UI
649
+ ######################
650
+
651
+
652
+ with gr.Blocks() as app:
653
+ with gr.Column() as main_ui:
654
+ gr.Markdown("## 1. Select your input dataset")
655
+ with gr.Row(equal_height=False):
656
+ with gr.Column(scale=2):
657
+ search_in = HuggingfaceHubSearch(
658
+ label="Search",
659
+ placeholder="Search for a dataset",
660
+ search_type="dataset",
661
+ sumbit_on_select=True,
662
+ )
663
+ with gr.Row():
664
+ clear_btn_part = gr.Button("Clear", variant="secondary")
665
+ load_btn = gr.Button("Load", variant="primary")
666
+
667
+ with gr.Column(scale=3):
668
+ examples = gr.Examples(
669
+ examples=[
670
+ "argilla/distilabel-sft-easy",
671
+ "HuggingFaceFW/fineweb-edu",
672
+ "argilla/distilabel-intel-orca-dpo-pairs",
673
+ ],
674
+ label="Example datasets",
675
+ fn=lambda x: x,
676
+ inputs=[search_in],
677
+ run_on_click=True,
678
+ )
679
+ search_out = gr.HTML(label="Dataset preview", visible=False)
680
+
681
+ gr.HTML(value="<hr>")
682
+ gr.Markdown(value="## 2. Configure your task")
683
+ with gr.Row(equal_height=False):
684
+ with gr.Column(scale=2):
685
+ eval_type = gr.Dropdown(
686
+ label="Evaluation type",
687
+ choices=["chat-eval", "custom-eval"],
688
+ value="chat-eval",
689
+ multiselect=False,
690
+ visible=False,
691
+ )
692
+ with gr.Tab("Response Evaluation") as tab_instruction_response:
693
+ aspects_instruction_response = define_evaluation_aspects(
694
+ "chat-eval"
695
+ )
696
+ instruction_instruction_response = gr.Dropdown(
697
+ label="Instruction Column",
698
+ info="Select the instruction column to evaluate",
699
+ choices=["Load your data first in step 1."],
700
+ value="Load your data first in step 1.",
701
+ interactive=False,
702
+ multiselect=False,
703
+ allow_custom_value=False,
704
+ )
705
+ response_instruction_response = gr.Dropdown(
706
+ label="Response Column",
707
+ info="Select the response column(s) to evaluate",
708
+ choices=["Load your data first in step 1."],
709
+ value="Load your data first in step 1.",
710
+ interactive=False,
711
+ multiselect=False,
712
+ allow_custom_value=False,
713
+ )
714
+ tab_instruction_response.select(
715
+ fn=lambda: "chat-eval",
716
+ inputs=[],
717
+ outputs=[eval_type],
718
+ )
719
+ with gr.Tab("Custom Evaluation Prompt") as tab_custom:
720
+ aspects_custom = define_evaluation_aspects("custom-eval")
721
+ prompt_template = gr.Code(
722
+ label="Prompt template",
723
+ value="Load your data first in step 1.",
724
+ language="markdown",
725
+ interactive=False,
726
+ )
727
+ structured_output = gr.Code(
728
+ label="Structured output",
729
+ value="Load your data first in step 1.",
730
+ language="json",
731
+ interactive=False,
732
+ )
733
+ tab_custom.select(
734
+ fn=lambda: "custom-eval",
735
+ inputs=[],
736
+ outputs=[eval_type],
737
+ )
738
+ with gr.Row():
739
+ clear_btn_full = gr.Button("Clear", variant="secondary")
740
+ btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
741
+ with gr.Column(scale=3):
742
+ dataframe = gr.Dataframe(
743
+ headers=["prompt", "completion", "evaluation"],
744
+ wrap=True,
745
+ interactive=False,
746
+ )
747
+
748
+ gr.HTML(value="<hr>")
749
+ gr.Markdown(value="## 3. Evaluate your dataset")
750
+ with gr.Row(equal_height=False):
751
+ with gr.Column(scale=2):
752
+ org_name = get_org_dropdown()
753
+ repo_name = gr.Textbox(
754
+ label="Repo name",
755
+ placeholder="dataset_name",
756
+ value=f"my-distiset-{str(uuid.uuid4())[:8]}",
757
+ interactive=True,
758
+ )
759
+ num_rows = gr.Number(
760
+ label="Number of rows",
761
+ value=10,
762
+ interactive=True,
763
+ scale=1,
764
+ )
765
+ private = gr.Checkbox(
766
+ label="Private dataset",
767
+ value=False,
768
+ interactive=True,
769
+ scale=1,
770
+ )
771
+ btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
772
+ with gr.Column(scale=3):
773
+ success_message = gr.Markdown(
774
+ visible=True,
775
+ min_height=100, # don't remove this otherwise progress is not visible
776
+ )
777
+ with gr.Accordion(
778
+ "Customize your pipeline with distilabel",
779
+ open=False,
780
+ visible=False,
781
+ ) as pipeline_code_ui:
782
+ code = generate_pipeline_code(
783
+ repo_id=search_in.value,
784
+ aspects=aspects_instruction_response.value,
785
+ instruction_column=instruction_instruction_response.value,
786
+ response_columns=response_instruction_response.value,
787
+ prompt_template=prompt_template.value,
788
+ structured_output=structured_output.value,
789
+ num_rows=num_rows.value,
790
+ eval_type=eval_type.value,
791
+ )
792
+ pipeline_code = gr.Code(
793
+ value=code,
794
+ language="python",
795
+ label="Distilabel Pipeline Code",
796
+ )
797
+
798
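+ # Selecting a dataset renders its preview iframe and resets the sample dataframe to empty columns.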
+ search_in.submit(fn=get_iframe, inputs=search_in, outputs=search_out).then(
799
+ fn=lambda df: pd.DataFrame(columns=df.columns),
800
+ inputs=[dataframe],
801
+ outputs=[dataframe],
802
+ )
803
+
804
+ load_btn.click(
805
+ fn=load_dataset_from_hub,
806
+ inputs=[search_in],
807
+ outputs=[
808
+ dataframe,
809
+ instruction_instruction_response,
810
+ response_instruction_response,
811
+ prompt_template,
812
+ structured_output,
813
+ ],
814
+ )
815
+
816
+ btn_apply_to_sample_dataset.click(
817
+ fn=evaluate_sample_dataset,
818
+ inputs=[
819
+ search_in,
820
+ eval_type,
821
+ aspects_instruction_response,
822
+ instruction_instruction_response,
823
+ response_instruction_response,
824
+ prompt_template,
825
+ structured_output,
826
+ ],
827
+ outputs=dataframe,
828
+ )
829
+
830
+ btn_push_to_hub.click(
831
+ fn=validate_argilla_user_workspace_dataset,
832
+ inputs=[repo_name],
833
+ outputs=[success_message],
834
+ ).then(
835
+ fn=validate_push_to_hub,
836
+ inputs=[org_name, repo_name],
837
+ outputs=[success_message],
838
+ ).success(
839
+ fn=hide_success_message,
840
+ outputs=[success_message],
841
+ ).success(
842
+ fn=hide_pipeline_code_visibility,
843
+ inputs=[],
844
+ outputs=[pipeline_code_ui],
845
+ ).success(
846
+ fn=push_dataset,
847
+ inputs=[
848
+ org_name,
849
+ repo_name,
850
+ private,
851
+ num_rows,
852
+ search_in,
853
+ eval_type,
854
+ aspects_instruction_response,
855
+ instruction_instruction_response,
856
+ response_instruction_response,
857
+ prompt_template,
858
+ structured_output,
859
+ pipeline_code,
860
+ ],
861
+ outputs=[success_message],
862
+ ).success(
863
+ fn=show_success_message,
864
+ inputs=[org_name, repo_name],
865
+ outputs=[success_message],
866
+ ).success(
867
+ fn=generate_pipeline_code,
868
+ inputs=[
869
+ search_in,
870
+ prompt_template,
871
+ structured_output,
872
+ eval_type,
873
+ ],
874
+ outputs=[pipeline_code],
875
+ ).success(
876
+ fn=show_pipeline_code_visibility,
877
+ inputs=[],
878
+ outputs=[pipeline_code_ui],
879
+ )
880
+
881
+ clear_btn_part.click(fn=lambda: "", inputs=[], outputs=[search_in])
882
+ clear_btn_full.click(
883
+ fn=lambda df: ("", "", pd.DataFrame(columns=df.columns)),
884
+ inputs=[dataframe],
885
+ outputs=[
886
+ instruction_instruction_response,
887
+ response_instruction_response,
888
+ dataframe,
889
+ ],
890
+ )
891
+
892
+ app.load(fn=swap_visibility, outputs=main_ui)
893
+ app.load(fn=get_org_dropdown, outputs=[org_name])
894
+ app.load(fn=get_random_repo_name, outputs=[repo_name])
src/synthetic_dataset_generator/apps/rag.py ADDED
@@ -0,0 +1,972 @@
1
+ import os
2
+ import random
3
+ import uuid
4
+ from typing import Union
5
+
6
+ import argilla as rg
7
+ import gradio as gr
8
+ import nltk
9
+ import pandas as pd
10
+ from datasets import Dataset
11
+ from distilabel.distiset import Distiset
12
+ from gradio.oauth import OAuthToken
13
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
14
+ from huggingface_hub import HfApi
15
+
16
+ from synthetic_dataset_generator.apps.base import (
17
+ combine_datasets,
18
+ hide_success_message,
19
+ load_dataset_from_hub,
20
+ preprocess_input_data,
21
+ push_pipeline_code_to_hub,
22
+ show_success_message,
23
+ test_max_num_rows,
24
+ validate_argilla_user_workspace_dataset,
25
+ validate_push_to_hub,
26
+ )
27
+ from synthetic_dataset_generator.constants import (
28
+ DEFAULT_BATCH_SIZE,
29
+ MODEL,
30
+ MODEL_COMPLETION,
31
+ SAVE_LOCAL_DIR,
32
+ )
33
+ from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
34
+ from synthetic_dataset_generator.pipelines.embeddings import (
35
+ get_embeddings,
36
+ get_sentence_embedding_dimensions,
37
+ )
38
+ from synthetic_dataset_generator.pipelines.rag import (
39
+ DEFAULT_DATASET_DESCRIPTIONS,
40
+ generate_pipeline_code,
41
+ get_chunks_generator,
42
+ get_prompt_generator,
43
+ get_response_generator,
44
+ get_sentence_pair_generator,
45
+ )
46
+ from synthetic_dataset_generator.utils import (
47
+ column_to_list,
48
+ get_argilla_client,
49
+ get_org_dropdown,
50
+ get_random_repo_name,
51
+ swap_visibility,
52
+ )
53
+
54
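+ # Download NLTK tokenizer/tagger data into a local directory (used later when partitioning uploaded documents).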
+ os.makedirs("./nltk_data", exist_ok=True)
55
+ nltk.data.path.append("./nltk_data")
56
+ nltk.download("punkt_tab", download_dir="./nltk_data")
57
+ nltk.download("averaged_perceptron_tagger_eng", download_dir="./nltk_data")
58
+
59
+
60
+ def generate_system_prompt(dataset_description: str, progress=gr.Progress()):
61
+ progress(0.1, desc="Initializing")
62
+ generate_description = get_prompt_generator()
63
+ progress(0.5, desc="Generating")
64
+ result = next(
65
+ generate_description.process(
66
+ [
67
+ {
68
+ "instruction": dataset_description,
69
+ }
70
+ ]
71
+ )
72
+ )[0]["generation"]
73
+ progress(1.0, desc="Prompt generated")
74
+ return result
75
+
76
+
77
+ def load_dataset_file(
78
+ repo_id: str,
79
+ file_paths: list[str],
80
+ input_type: str,
81
+ num_rows: int = 10,
82
+ token: Union[OAuthToken, None] = None,
83
+ progress=gr.Progress(),
84
+ ):
85
+ progress(0.1, desc="Loading the source data")
86
+ if input_type == "dataset-input":
87
+ return load_dataset_from_hub(repo_id=repo_id, num_rows=num_rows, token=token)
88
+ else:
89
+ return preprocess_input_data(file_paths=file_paths, num_rows=num_rows)
90
+
91
+
92
+ def generate_sample_dataset(
93
+ repo_id: str,
94
+ file_paths: list[str],
95
+ input_type: str,
96
+ system_prompt: str,
97
+ document_column: str,
98
+ retrieval_reranking: list[str],
99
+ num_rows: str,
100
+ oauth_token: Union[OAuthToken, None],
101
+ progress=gr.Progress(),
102
+ ):
103
+ retrieval = "Retrieval" in retrieval_reranking
104
+ reranking = "Reranking" in retrieval_reranking
105
+
106
+ if input_type == "prompt-input":
107
+ dataframe = pd.DataFrame(columns=["context", "question", "response"])
108
+ else:
109
+ dataframe, _ = load_dataset_file(
110
+ repo_id=repo_id,
111
+ file_paths=file_paths,
112
+ input_type=input_type,
113
+ num_rows=num_rows,
114
+ token=oauth_token,
115
+ )
116
+ progress(0.5, desc="Generating dataset")
117
+ dataframe = generate_dataset(
118
+ input_type=input_type,
119
+ dataframe=dataframe,
120
+ system_prompt=system_prompt,
121
+ document_column=document_column,
122
+ retrieval=retrieval,
123
+ reranking=reranking,
124
+ num_rows=10,
125
+ is_sample=True,
126
+ )
127
+ progress(1.0, desc="Sample dataset generated")
128
+ return dataframe
129
+
130
+
131
+ def generate_dataset(
132
+ input_type: str,
133
+ dataframe: pd.DataFrame,
134
+ system_prompt: str,
135
+ document_column: str,
136
+ retrieval: bool = False,
137
+ reranking: bool = False,
138
+ num_rows: int = 10,
139
+ temperature: float = 0.7,
140
+ temperature_completion: Union[float, None] = None,
141
+ is_sample: bool = False,
142
+ progress=gr.Progress(),
143
+ ):
144
+ num_rows = test_max_num_rows(num_rows)
145
+ progress(0.0, desc="Initializing dataset generation")
146
+ if input_type == "prompt-input":
147
+ chunk_generator = get_chunks_generator(
148
+ temperature=temperature, is_sample=is_sample
149
+ )
150
+ else:
151
+ document_data = column_to_list(dataframe, document_column)
152
+ if len(document_data) < num_rows:
153
+ document_data += random.choices(
154
+ document_data, k=num_rows - len(document_data)
155
+ )
156
+
157
+ retrieval_generator = get_sentence_pair_generator(
158
+ action="query",
159
+ triplet=True if retrieval else False,
160
+ temperature=temperature,
161
+ is_sample=is_sample,
162
+ )
163
+ response_generator = get_response_generator(
164
+ temperature=temperature_completion or temperature, is_sample=is_sample
165
+ )
166
+ if reranking:
167
+ reranking_generator = get_sentence_pair_generator(
168
+ action="semantically-similar",
169
+ triplet=True,
170
+ temperature=temperature,
171
+ is_sample=is_sample,
172
+ )
173
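+ # Progress accounting: question and response generation always run; chunk generation (prompt input) and reranking add one step each.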
+ steps = 2 + sum([1 if reranking else 0, 1 if input_type == "prompt-input" else 0])
174
+ total_steps: int = num_rows * steps
175
+ step_progress = round(1 / steps, 2)
176
+ batch_size = DEFAULT_BATCH_SIZE
177
+
178
+ # generate chunks
179
+ if input_type == "prompt-input":
180
+ n_processed = 0
181
+ chunk_results = []
182
+ rewritten_system_prompts = get_rewritten_prompts(system_prompt, num_rows)
183
+ while n_processed < num_rows:
184
+ progress(
185
+ step_progress * n_processed / num_rows,
186
+ total=total_steps,
187
+ desc="Generating chunks",
188
+ )
189
+ remaining_rows = num_rows - n_processed
190
+ batch_size = min(batch_size, remaining_rows)
191
+ inputs = [
192
+ {"task": random.choice(rewritten_system_prompts)}
193
+ for _ in range(batch_size)
194
+ ]
195
+ chunks = list(chunk_generator.process(inputs=inputs))
196
+ chunk_results.extend(chunks[0])
197
+ n_processed += batch_size
198
+ random.seed(a=random.randint(0, 2**32 - 1))
199
+ document_data = [chunk["generation"] for chunk in chunk_results]
200
+ progress(step_progress, desc="Generating chunks")
201
+
202
+ # generate questions
203
+ n_processed = 0
204
+ retrieval_results = []
205
+ while n_processed < num_rows:
206
+ progress(
207
+ step_progress * n_processed / num_rows,
208
+ total=total_steps,
209
+ desc="Generating questions",
210
+ )
211
+ remaining_rows = num_rows - n_processed
212
+ batch_size = min(batch_size, remaining_rows)
213
+ inputs = [
214
+ {"anchor": document}
215
+ for document in document_data[n_processed : n_processed + batch_size]
216
+ ]
217
+ questions = list(retrieval_generator.process(inputs=inputs))
218
+ retrieval_results.extend(questions[0])
219
+ n_processed += batch_size
220
+ for result in retrieval_results:
221
+ result["context"] = result["anchor"]
222
+ if retrieval:
223
+ result["question"] = result["positive"]
224
+ result["positive_retrieval"] = result.pop("positive")
225
+ result["negative_retrieval"] = result.pop("negative")
226
+ else:
227
+ result["question"] = result.pop("positive")
228
+
229
+ progress(step_progress, desc="Generating questions")
230
+
231
+ # generate responses
232
+ n_processed = 0
233
+ response_results = []
234
+ while n_processed < num_rows:
235
+ progress(
236
+ step_progress + step_progress * n_processed / num_rows,
237
+ total=total_steps,
238
+ desc="Generating responses",
239
+ )
240
+ batch = retrieval_results[n_processed : n_processed + batch_size]
241
+ responses = list(response_generator.process(inputs=batch))
242
+ response_results.extend(responses[0])
243
+ n_processed += batch_size
244
+ for result in response_results:
245
+ result["response"] = result["generation"]
246
+ progress(step_progress, desc="Generating responses")
247
+
248
+ # generate reranking
249
+ if reranking:
250
+ n_processed = 0
251
+ reranking_results = []
252
+ while n_processed < num_rows:
253
+ progress(
254
+ step_progress * n_processed / num_rows,
255
+ total=total_steps,
256
+ desc="Generating reranking data",
257
+ )
258
+ batch = response_results[n_processed : n_processed + batch_size]
259
+ batch = list(reranking_generator.process(inputs=batch))
260
+ reranking_results.extend(batch[0])
261
+ n_processed += batch_size
262
+ for result in reranking_results:
263
+ result["positive_reranking"] = result.pop("positive")
264
+ result["negative_reranking"] = result.pop("negative")
265
+ progress(
266
+ 1,
267
+ total=total_steps,
268
+ desc="Creating dataset",
269
+ )
270
+
271
+ # create distiset
272
+ distiset_results = []
273
+ source_results = reranking_results if reranking else response_results
274
+ base_keys = ["context", "question", "response"]
275
+ retrieval_keys = ["positive_retrieval", "negative_retrieval"] if retrieval else []
276
+ reranking_keys = ["positive_reranking", "negative_reranking"] if reranking else []
277
+ relevant_keys = base_keys + retrieval_keys + reranking_keys
278
+
279
+ for result in source_results:
280
+ record = {key: result.get(key) for key in relevant_keys if key in result}
281
+ distiset_results.append(record)
282
+
283
+ dataframe = pd.DataFrame(distiset_results)
284
+
285
+ progress(1.0, desc="Dataset generation completed")
286
+ return dataframe
287
+
288
+
289
+ def push_dataset_to_hub(
290
+ dataframe: pd.DataFrame,
291
+ org_name: str,
292
+ repo_name: str,
293
+ oauth_token: Union[gr.OAuthToken, None],
294
+ private: bool,
295
+ pipeline_code: str,
296
+ progress=gr.Progress(),
297
+ ):
298
+ progress(0.0, desc="Validating")
299
+ repo_id = validate_push_to_hub(org_name, repo_name)
300
+ progress(0.5, desc="Creating dataset")
301
+ dataset = Dataset.from_pandas(dataframe)
302
+ dataset = combine_datasets(repo_id, dataset, oauth_token)
303
+ distiset = Distiset({"default": dataset})
304
+ progress(0.9, desc="Pushing dataset")
305
+ distiset.push_to_hub(
306
+ repo_id=repo_id,
307
+ private=private,
308
+ include_script=False,
309
+ token=oauth_token.token,
310
+ create_pr=False,
311
+ )
312
+ push_pipeline_code_to_hub(pipeline_code, org_name, repo_name, oauth_token)
313
+ progress(1.0, desc="Dataset pushed")
314
+ return dataframe
315
+
316
+
317
+ def push_dataset(
318
+ org_name: str,
319
+ repo_name: str,
320
+ private: bool,
321
+ original_repo_id: str,
322
+ file_paths: list[str],
323
+ input_type: str,
324
+ system_prompt: str,
325
+ document_column: str,
326
+ retrieval_reranking: list[str],
327
+ num_rows: int,
328
+ temperature: float,
329
+ temperature_completion: float,
330
+ pipeline_code: str,
331
+ oauth_token: Union[gr.OAuthToken, None] = None,
332
+ progress=gr.Progress(),
333
+ ) -> str:
334
+ retrieval = "Retrieval" in retrieval_reranking
335
+ reranking = "Reranking" in retrieval_reranking
336
+
337
+ if input_type == "prompt-input":
338
+ dataframe = pd.DataFrame(columns=["context", "question", "response"])
339
+ else:
340
+ dataframe, _ = load_dataset_file(
341
+ repo_id=original_repo_id,
342
+ file_paths=file_paths,
343
+ input_type=input_type,
344
+ num_rows=num_rows,
345
+ token=oauth_token,
346
+ )
347
+ progress(0.5, desc="Generating dataset")
348
+ dataframe = generate_dataset(
349
+ input_type=input_type,
350
+ dataframe=dataframe,
351
+ system_prompt=system_prompt,
352
+ document_column=document_column,
353
+ retrieval=retrieval,
354
+ reranking=reranking,
355
+ num_rows=num_rows,
356
+ temperature=temperature,
357
+ temperature_completion=temperature_completion,
358
+ is_sample=True,
359
+ )
360
+ push_dataset_to_hub(
361
+ dataframe, org_name, repo_name, oauth_token, private, pipeline_code
362
+ )
363
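+ # Drop rows with missing or empty values before building the Argilla records.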
+ dataframe = dataframe[
364
+ dataframe.applymap(lambda x: str(x).strip() if pd.notna(x) else x).apply(
365
+ lambda row: row.notna().all() and (row != "").all(), axis=1
366
+ )
367
+ ]
368
+ try:
369
+ progress(0.1, desc="Setting up user and workspace")
370
+ hf_user = HfApi().whoami(token=oauth_token.token)["name"]
371
+ client = get_argilla_client()
372
+ if client is None:
373
+ return ""
374
+
375
+ progress(0.5, desc="Creating dataset in Argilla")
376
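+ # Argilla schema: context and chat fields, plus optional retrieval/reranking text fields when those options are selected.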
+ fields = [
377
+ rg.TextField(
378
+ name="context",
379
+ title="Context",
380
+ description="Context for the generation",
381
+ ),
382
+ rg.ChatField(
383
+ name="chat",
384
+ title="Chat",
385
+ description="User and assistant conversation based on the context",
386
+ ),
387
+ ]
388
+ for item in ["positive", "negative"]:
389
+ if retrieval:
390
+ fields.append(
391
+ rg.TextField(
392
+ name=f"{item}_retrieval",
393
+ title=f"{item.capitalize()} retrieval",
394
+ description=f"The {item} query for retrieval",
395
+ )
396
+ )
397
+ if reranking:
398
+ fields.append(
399
+ rg.TextField(
400
+ name=f"{item}_reranking",
401
+ title=f"{item.capitalize()} reranking",
402
+ description=f"The {item} query for reranking",
403
+ )
404
+ )
405
+
406
+ questions = [
407
+ rg.LabelQuestion(
408
+ name="relevant",
409
+ title="Are the question and response relevant to the given context?",
410
+ labels=["yes", "no"],
411
+ ),
412
+ rg.LabelQuestion(
413
+ name="is_response_correct",
414
+ title="Is the response correct?",
415
+ labels=["yes", "no"],
416
+ ),
417
+ ]
418
+ for item in ["positive", "negative"]:
419
+ if retrieval:
420
+ questions.append(
421
+ rg.LabelQuestion(
422
+ name=f"is_{item}_retrieval_relevant",
423
+ title=f"Is the {item} retrieval relevant?",
424
+ labels=["yes", "no"],
425
+ required=False,
426
+ )
427
+ )
428
+ if reranking:
429
+ questions.append(
430
+ rg.LabelQuestion(
431
+ name=f"is_{item}_reranking_relevant",
432
+ title=f"Is the {item} reranking relevant?",
433
+ labels=["yes", "no"],
434
+ required=False,
435
+ )
436
+ )
437
+ metadata = [
438
+ rg.IntegerMetadataProperty(
439
+ name=f"{item}_length", title=f"{item.capitalize()} length"
440
+ )
441
+ for item in ["context", "question", "response"]
442
+ ]
443
+
444
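+ # Vector fields sized to the embedding model's output dimension, so records can be searched semantically in Argilla.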
+ vectors = [
445
+ rg.VectorField(
446
+ name=f"{item}_embeddings",
447
+ dimensions=get_sentence_embedding_dimensions(),
448
+ )
449
+ for item in ["context", "question", "response"]
450
+ ]
451
+ settings = rg.Settings(
452
+ fields=fields,
453
+ questions=questions,
454
+ metadata=metadata,
455
+ vectors=vectors,
456
+ guidelines="Please review the conversation and provide an evaluation.",
457
+ )
458
+
459
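+ # Render each question/response pair as a two-turn conversation for the Argilla ChatField.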
+ dataframe["chat"] = dataframe.apply(
460
+ lambda row: [
461
+ {"role": "user", "content": row["question"]},
462
+ {"role": "assistant", "content": row["response"]},
463
+ ],
464
+ axis=1,
465
+ )
466
+
467
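+ # Add length metadata and sentence embeddings for each text column before logging.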
+ for item in ["context", "question", "response"]:
468
+ dataframe[f"{item}_length"] = dataframe[item].apply(
469
+ lambda x: len(x) if x is not None else 0
470
+ )
471
+ dataframe[f"{item}_embeddings"] = get_embeddings(
472
+ dataframe[item].apply(lambda x: x if x is not None else "").to_list()
473
+ )
474
+
475
+ rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
476
+ if rg_dataset is None:
477
+ rg_dataset = rg.Dataset(
478
+ name=repo_name,
479
+ workspace=hf_user,
480
+ settings=settings,
481
+ client=client,
482
+ )
483
+ rg_dataset = rg_dataset.create()
484
+
485
+ progress(0.7, desc="Pushing dataset to Argilla")
486
+ hf_dataset = Dataset.from_pandas(dataframe)
487
+ rg_dataset.records.log(records=hf_dataset)
488
+ progress(1.0, desc="Dataset pushed to Argilla")
489
+ except Exception as e:
490
+ raise gr.Error(f"Error pushing dataset to Argilla: {e}")
491
+ return ""
492
+
493
+
494
+ def save_local(
495
+ repo_id: str,
496
+ file_paths: list[str],
497
+ input_type: str,
498
+ system_prompt: str,
499
+ document_column: str,
500
+ retrieval_reranking: list[str],
501
+ num_rows: int,
502
+ temperature: float,
503
+ repo_name: str,
504
+ temperature_completion: float,
505
+ ) -> tuple[str, str]:
506
+ retrieval = "Retrieval" in retrieval_reranking
507
+ reranking = "Reranking" in retrieval_reranking
508
+
509
+ if input_type == "prompt-input":
510
+ dataframe = pd.DataFrame(columns=["context", "question", "response"])
511
+ else:
512
+ dataframe, _ = load_dataset_file(
513
+ repo_id=repo_id,
514
+ file_paths=file_paths,
515
+ input_type=input_type,
516
+ num_rows=num_rows,
517
+ )
518
+ dataframe = generate_dataset(
519
+ input_type=input_type,
520
+ dataframe=dataframe,
521
+ system_prompt=system_prompt,
522
+ document_column=document_column,
523
+ retrieval=retrieval,
524
+ reranking=reranking,
525
+ num_rows=num_rows,
526
+ temperature=temperature,
527
+ temperature_completion=temperature_completion,
528
+ )
529
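+ # Export the generated dataset to CSV and JSON files under SAVE_LOCAL_DIR.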
+ local_dataset = Dataset.from_pandas(dataframe)
530
+ output_csv = os.path.join(SAVE_LOCAL_DIR, repo_name + ".csv")
531
+ output_json = os.path.join(SAVE_LOCAL_DIR, repo_name + ".json")
532
+ local_dataset.to_csv(output_csv, index=False)
533
+ local_dataset.to_json(output_json, index=False)
534
+ return output_csv, output_json
535
+
536
+
537
+ def show_system_prompt_visibility():
538
+ return {system_prompt: gr.Textbox(visible=True)}
539
+
540
+
541
+ def hide_system_prompt_visibility():
542
+ return {system_prompt: gr.Textbox(visible=False)}
543
+
544
+
545
+ def show_document_column_visibility():
546
+ return {document_column: gr.Dropdown(visible=True)}
547
+
548
+
549
+ def hide_document_column_visibility():
550
+ return {
551
+ document_column: gr.Dropdown(
552
+ choices=["Load your data first in step 1."],
553
+ value="Load your data first in step 1.",
554
+ visible=False,
555
+ )
556
+ }
557
+
558
+
559
+ def show_pipeline_code_visibility():
560
+ return {pipeline_code_ui: gr.Accordion(visible=True)}
561
+
562
+
563
+ def hide_pipeline_code_visibility():
564
+ return {pipeline_code_ui: gr.Accordion(visible=False)}
565
+
566
+
567
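+ # Only reveal the completion-temperature slider when a separate completion model is configured.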
+ def show_temperature_completion():
568
+ if MODEL != MODEL_COMPLETION:
569
+ return {temperature_completion: gr.Slider(value=0.9, visible=True)}
570
+
571
+
572
+ def show_save_local_button():
573
+ return {btn_save_local: gr.Button(visible=True)}
574
+
575
+
576
+ def hide_save_local_button():
577
+ return {btn_save_local: gr.Button(visible=False)}
578
+
579
+
580
+ def show_save_local():
581
+ gr.update(success_message, min_height=0)
582
+ return {
583
+ csv_file: gr.File(visible=True),
584
+ json_file: gr.File(visible=True),
585
+ success_message: success_message,
586
+ }
587
+
588
+
589
+ def hide_save_local():
590
+ gr.update(success_message, min_height=100)
591
+ return {
592
+ csv_file: gr.File(visible=False),
593
+ json_file: gr.File(visible=False),
594
+ success_message: success_message,
595
+ }
596
+
597
+
598
+ ######################
599
+ # Gradio UI
600
+ ######################
601
+
602
+
603
+ with gr.Blocks() as app:
604
+ with gr.Column() as main_ui:
605
+ gr.Markdown("## 1. Select your input")
606
+ with gr.Row(equal_height=False):
607
+ with gr.Column(scale=2):
608
+ input_type = gr.Dropdown(
609
+ label="Input type",
610
+ choices=["dataset-input", "file-input", "prompt-input"],
611
+ value="dataset-input",
612
+ multiselect=False,
613
+ visible=False,
614
+ )
615
+ with gr.Tab("Load from Hub") as tab_dataset_input:
616
+ with gr.Row(equal_height=False):
617
+ with gr.Column(scale=2):
618
+ search_in = HuggingfaceHubSearch(
619
+ label="Search",
620
+ placeholder="Search for a dataset",
621
+ search_type="dataset",
622
+ sumbit_on_select=True,
623
+ )
624
+ with gr.Row():
625
+ clear_dataset_btn_part = gr.Button(
626
+ "Clear", variant="secondary"
627
+ )
628
+ load_dataset_btn = gr.Button("Load", variant="primary")
629
+ with gr.Column(scale=3):
630
+ examples = gr.Examples(
631
+ examples=[
632
+ "charris/wikipedia_sample",
633
+ "plaguss/argilla_sdk_docs_raw_unstructured",
634
+ "BeIR/hotpotqa-generated-queries",
635
+ ],
636
+ label="Example datasets",
637
+ fn=lambda x: x,
638
+ inputs=[search_in],
639
+ run_on_click=True,
640
+ )
641
+ search_out = gr.HTML(label="Dataset preview", visible=False)
642
+ with gr.Tab("Load your file") as tab_file_input:
643
+ with gr.Row(equal_height=False):
644
+ with gr.Column(scale=2):
645
+ file_in = gr.File(
646
+ label="Upload your file. Supported formats: .md, .txt, .docx, .pdf",
647
+ file_count="multiple",
648
+ file_types=[".md", ".txt", ".docx", ".pdf"],
649
+ )
650
+ with gr.Row():
651
+ clear_file_btn_part = gr.Button(
652
+ "Clear", variant="secondary"
653
+ )
654
+ load_file_btn = gr.Button("Load", variant="primary")
655
+ with gr.Column(scale=3):
656
+ file_out = gr.HTML(label="Dataset preview", visible=False)
657
+ with gr.Tab("Generate from prompt") as tab_prompt_input:
658
+ with gr.Row(equal_height=False):
659
+ with gr.Column(scale=2):
660
+ dataset_description = gr.Textbox(
661
+ label="Dataset description",
662
+ placeholder="Give a precise description of your desired dataset.",
663
+ )
664
+ with gr.Row():
665
+ clear_prompt_btn_part = gr.Button(
666
+ "Clear", variant="secondary"
667
+ )
668
+ load_prompt_btn = gr.Button("Create", variant="primary")
669
+ with gr.Column(scale=3):
670
+ examples = gr.Examples(
671
+ examples=DEFAULT_DATASET_DESCRIPTIONS,
672
+ inputs=[dataset_description],
673
+ cache_examples=False,
674
+ label="Examples",
675
+ )
676
+
677
+ gr.HTML(value="<hr>")
678
+ gr.Markdown(value="## 2. Configure your task")
679
+ with gr.Row(equal_height=False):
680
+ with gr.Column(scale=2):
681
+ system_prompt = gr.Textbox(
682
+ label="System prompt",
683
+ placeholder="You are a helpful assistant.",
684
+ visible=False,
685
+ )
686
+ document_column = gr.Dropdown(
687
+ label="Document Column",
688
+ info="Select the document column to generate the RAG dataset",
689
+ choices=["Load your data first in step 1."],
690
+ value="Load your data first in step 1.",
691
+ interactive=False,
692
+ multiselect=False,
693
+ allow_custom_value=False,
694
+ )
695
+ retrieval_reranking = gr.CheckboxGroup(
696
+ choices=[("Retrieval", "Retrieval"), ("Reranking", "Reranking")],
697
+ type="value",
698
+ label="Data for RAG",
699
+ info="Indicate the additional data you want to generate for RAG.",
700
+ )
701
+ with gr.Row():
702
+ clear_btn_full = gr.Button("Clear", variant="secondary")
703
+ btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
704
+ with gr.Column(scale=3):
705
+ dataframe = gr.Dataframe(
706
+ headers=["context", "question", "response"],
707
+ wrap=True,
708
+ interactive=False,
709
+ )
710
+
711
+ gr.HTML(value="<hr>")
712
+ gr.Markdown(value="## 3. Generate your dataset")
713
+ with gr.Row(equal_height=False):
714
+ with gr.Column(scale=2):
715
+ org_name = get_org_dropdown()
716
+ repo_name = gr.Textbox(
717
+ label="Repo name",
718
+ placeholder="dataset_name",
719
+ value=f"my-distiset-{str(uuid.uuid4())[:8]}",
720
+ interactive=True,
721
+ )
722
+ num_rows = gr.Number(
723
+ label="Number of rows",
724
+ value=10,
725
+ interactive=True,
726
+ scale=1,
727
+ )
728
+ temperature = gr.Slider(
729
+ label="Temperature",
730
+ minimum=0.1,
731
+ maximum=1.5,
732
+ value=0.7,
733
+ step=0.1,
734
+ interactive=True,
735
+ )
736
+ temperature_completion = gr.Slider(
737
+ label="Temperature for completion",
738
+ minimum=0.1,
739
+ maximum=1.5,
740
+ value=None,
741
+ step=0.1,
742
+ interactive=True,
743
+ visible=False,
744
+ )
745
+ private = gr.Checkbox(
746
+ label="Private dataset",
747
+ value=False,
748
+ interactive=True,
749
+ scale=1,
750
+ )
751
+ btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
752
+ btn_save_local = gr.Button(
753
+ "Save locally", variant="primary", scale=2, visible=False
754
+ )
755
+ with gr.Column(scale=3):
756
+ csv_file = gr.File(
757
+ label="CSV",
758
+ elem_classes="datasets",
759
+ visible=False,
760
+ )
761
+ json_file = gr.File(
762
+ label="JSON",
763
+ elem_classes="datasets",
764
+ visible=False,
765
+ )
766
+ success_message = gr.Markdown(
767
+ visible=False,
768
+ min_height=0, # don't remove this otherwise progress is not visible
769
+ )
770
+ with gr.Accordion(
771
+ "Customize your pipeline with distilabel",
772
+ open=False,
773
+ visible=False,
774
+ ) as pipeline_code_ui:
775
+ code = generate_pipeline_code(
776
+ repo_id=search_in.value,
777
+ input_type=input_type.value,
778
+ system_prompt=system_prompt.value,
779
+ document_column=document_column.value,
780
+ retrieval_reranking=retrieval_reranking.value,
781
+ num_rows=num_rows.value,
782
+ )
783
+ pipeline_code = gr.Code(
784
+ value=code,
785
+ language="python",
786
+ label="Distilabel Pipeline Code",
787
+ )
788
+
789
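+ # Switching tabs updates input_type and toggles the system-prompt / document-column controls accordingly.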
+ tab_dataset_input.select(
790
+ fn=lambda: "dataset-input",
791
+ inputs=[],
792
+ outputs=[input_type],
793
+ ).then(fn=hide_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
794
+ fn=show_document_column_visibility, inputs=[], outputs=[document_column]
795
+ )
796
+
797
+ tab_file_input.select(
798
+ fn=lambda: "file-input",
799
+ inputs=[],
800
+ outputs=[input_type],
801
+ ).then(fn=hide_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
802
+ fn=show_document_column_visibility, inputs=[], outputs=[document_column]
803
+ )
804
+
805
+ tab_prompt_input.select(
806
+ fn=lambda: "prompt-input",
807
+ inputs=[],
808
+ outputs=[input_type],
809
+ ).then(fn=show_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
810
+ fn=hide_document_column_visibility, inputs=[], outputs=[document_column]
811
+ )
812
+
813
+ search_in.submit(
814
+ fn=lambda df: pd.DataFrame(columns=df.columns),
815
+ inputs=[dataframe],
816
+ outputs=[dataframe],
817
+ )
818
+
819
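+ # Both "Load" buttons (Hub dataset and uploaded files) share one handler; input_type selects the branch inside load_dataset_file.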
+ gr.on(
820
+ triggers=[load_dataset_btn.click, load_file_btn.click],
821
+ fn=load_dataset_file,
822
+ inputs=[search_in, file_in, input_type],
823
+ outputs=[dataframe, document_column],
824
+ )
825
+
826
+ load_prompt_btn.click(
827
+ fn=generate_system_prompt,
828
+ inputs=[dataset_description],
829
+ outputs=[system_prompt],
830
+ ).success(
831
+ fn=generate_sample_dataset,
832
+ inputs=[
833
+ search_in,
834
+ file_in,
835
+ input_type,
836
+ system_prompt,
837
+ document_column,
838
+ retrieval_reranking,
839
+ num_rows,
840
+ ],
841
+ outputs=dataframe,
842
+ )
843
+
844
+ btn_apply_to_sample_dataset.click(
845
+ fn=generate_sample_dataset,
846
+ inputs=[
847
+ search_in,
848
+ file_in,
849
+ input_type,
850
+ system_prompt,
851
+ document_column,
852
+ retrieval_reranking,
853
+ num_rows,
854
+ ],
855
+ outputs=dataframe,
856
+ )
857
+
858
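+ # Push flow: validate user and repo, hide previous messages, generate and push the dataset, then show the success message and pipeline code.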
+ btn_push_to_hub.click(
859
+ fn=validate_argilla_user_workspace_dataset,
860
+ inputs=[repo_name],
861
+ outputs=[success_message],
862
+ ).then(
863
+ fn=validate_push_to_hub,
864
+ inputs=[org_name, repo_name],
865
+ outputs=[success_message],
866
+ ).success(
867
+ fn=hide_save_local,
868
+ outputs=[csv_file, json_file, success_message],
869
+ ).success(
870
+ fn=hide_success_message,
871
+ outputs=[success_message],
872
+ ).success(
873
+ fn=hide_pipeline_code_visibility,
874
+ inputs=[],
875
+ outputs=[pipeline_code_ui],
876
+ ).success(
877
+ fn=push_dataset,
878
+ inputs=[
879
+ org_name,
880
+ repo_name,
881
+ private,
882
+ search_in,
883
+ file_in,
884
+ input_type,
885
+ system_prompt,
886
+ document_column,
887
+ retrieval_reranking,
888
+ num_rows,
889
+ temperature,
890
+ temperature_completion,
891
+ pipeline_code,
892
+ ],
893
+ outputs=[success_message],
894
+ ).success(
895
+ fn=show_success_message,
896
+ inputs=[org_name, repo_name],
897
+ outputs=[success_message],
898
+ ).success(
899
+ fn=generate_pipeline_code,
900
+ inputs=[
901
+ search_in,
902
+ input_type,
903
+ system_prompt,
904
+ document_column,
905
+ retrieval_reranking,
906
+ num_rows,
907
+ ],
908
+ outputs=[pipeline_code],
909
+ ).success(
910
+ fn=show_pipeline_code_visibility,
911
+ inputs=[],
912
+ outputs=[pipeline_code_ui],
913
+ )
914
+
915
+ btn_save_local.click(
916
+ fn=hide_success_message,
917
+ outputs=[success_message],
918
+ ).success(
919
+ fn=hide_pipeline_code_visibility,
920
+ inputs=[],
921
+ outputs=[pipeline_code_ui],
922
+ ).success(
923
+ fn=show_save_local,
924
+ inputs=[],
925
+ outputs=[csv_file, json_file, success_message],
926
+ ).success(
927
+ save_local,
928
+ inputs=[
929
+ search_in,
930
+ file_in,
931
+ input_type,
932
+ system_prompt,
933
+ document_column,
934
+ retrieval_reranking,
935
+ num_rows,
936
+ temperature,
937
+ repo_name,
938
+ temperature_completion,
939
+ ],
940
+ outputs=[csv_file, json_file],
941
+ ).success(
942
+ fn=generate_pipeline_code,
943
+ inputs=[
944
+ search_in,
945
+ input_type,
946
+ system_prompt,
947
+ document_column,
948
+ retrieval_reranking,
949
+ num_rows,
950
+ ],
951
+ outputs=[pipeline_code],
952
+ ).success(
953
+ fn=show_pipeline_code_visibility,
954
+ inputs=[],
955
+ outputs=[pipeline_code_ui],
956
+ )
957
+
958
+ clear_dataset_btn_part.click(fn=lambda: "", inputs=[], outputs=[search_in])
959
+ clear_file_btn_part.click(fn=lambda: None, inputs=[], outputs=[file_in])
960
+ clear_prompt_btn_part.click(fn=lambda: "", inputs=[], outputs=[dataset_description])
961
+ clear_btn_full.click(
962
+ fn=lambda df: ("", [], pd.DataFrame(columns=df.columns)),
963
+ inputs=[dataframe],
964
+ outputs=[document_column, retrieval_reranking, dataframe],
965
+ )
966
+
967
+ app.load(fn=swap_visibility, outputs=main_ui)
968
+ app.load(fn=get_org_dropdown, outputs=[org_name])
969
+ app.load(fn=get_random_repo_name, outputs=[repo_name])
970
+ app.load(fn=show_temperature_completion, outputs=[temperature_completion])
971
+ if SAVE_LOCAL_DIR is not None:
972
+ app.load(fn=show_save_local_button, outputs=btn_save_local)