burtenshaw (HF staff) committed on
Commit f92d1a9 • 1 Parent(s): 8c543d4

Upload 12 files

README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: Domain Specific Welcome
-emoji: 🐒
-colorFrom: gray
-colorTo: purple
+title: Domain Specific Seed
+emoji: πŸ’»
+colorFrom: purple
+colorTo: red
 sdk: streamlit
 sdk_version: 1.33.0
 app_file: app.py
app.py CHANGED
@@ -1,118 +1,94 @@
-import time
+import streamlit as st

-from hub import (
-    setup_dataset_on_hub,
-    duplicate_space_on_hub,
-    add_project_config_to_space_repo,
+from defaults import (
+    PROJECT_NAME,
+    ARGILLA_SPACE_REPO_ID,
+    DATASET_REPO_ID,
+    ARGILLA_URL,
+    PROJECT_SPACE_REPO_ID,
+    DIBT_PARENT_APP_URL,
 )
+from utils import project_sidebar

-import streamlit as st
-
-
-# Constants
-# Written here to avoid defaults.py
-DEFAULT_DOMAIN = "farming"
-
-st.set_page_config(
-    "Domain Data Grower", page_icon="πŸ§‘β€πŸŒΎ", initial_sidebar_state="collapsed"
-)
+st.set_page_config("Domain Data Grower", page_icon="πŸ§‘β€πŸŒΎ")
+
+project_sidebar()
+
+if PROJECT_NAME == "DEFAULT_DOMAIN":
+    st.warning(
+        "Please set up the project configuration in the parent app before proceeding."
+    )
+    st.stop()

 st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
 st.divider()

-st.sidebar.link_button(
-    "πŸ€— Get your Hub Token", "https://huggingface.co/settings/tokens"
-)
-
-################################################################################
-# APP MARKDOWN
-################################################################################
-
-st.header("🌱 Create a domain specific dataset")
-
 st.markdown(
-    """This space will set up your domain specific dataset project. It will
-create the resources that you need to build a dataset. Those resources include:
-
-- A dataset repository on the Hub
-- Another space to define expert domain and run generation pipelines
-
-For a complete overview of the project. Check out the README
+    """
+    ## 🌱 Create a dataset seed for aligning models to a specific domain
+
+    This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
+    Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
     """
 )
-
-st.page_link(
-    "pages/πŸ§‘β€πŸŒΎ Domain Data Grower.py",
-    label="Domain Data Grower",
-    icon="πŸ§‘β€πŸŒΎ",
-)
-
-################################################################################
-# CONFIGURATION
-################################################################################
-
-st.subheader("🌾 Project Configuration")
-
-project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
-hub_username = st.text_input("Hub Username", "argilla")
-hub_token = st.text_input("Hub Token", type="password")
-private_selector = st.checkbox("Private Space", value=False)
-
-if st.button("πŸ€— Setup Project Resources"):
-    repo_id = f"{hub_username}/{project_name}"
-
-    setup_dataset_on_hub(
-        repo_id=repo_id,
-        hub_token=hub_token,
-    )
-
-    st.success(
-        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
-    )
-
-    space_name = f"{project_name}_config_space"
-
-    duplicate_space_on_hub(
-        source_repo="argilla/domain-specific-datasets-template",
-        target_repo=space_name,
-        hub_token=hub_token,
-        private=private_selector,
-    )
-
-    st.success(
-        f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
-    )
-
-    argilla_name = f"{project_name}_argilla_space"
-
-    duplicate_space_on_hub(
-        source_repo="argilla/argilla-template-space",
-        target_repo=argilla_name,
-        hub_token=hub_token,
-        private=private_selector,
-    )
-
-    st.success(
-        f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
-    )
-
-    seconds = 5
-
-    with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
-        time.sleep(seconds)
-        add_project_config_to_space_repo(
-            dataset_repo_id=repo_id,
-            hub_token=hub_token,
-            project_name=project_name,
-            argilla_space_repo_id=f"{hub_username}/{argilla_name}",
-            project_space_repo_id=f"{hub_username}/{space_name}",
-        )
-
-st.subheader("πŸ‘’ Next Steps")
-
-st.write("Go to you project specific space!")
-
-st.link_button(
-    "πŸ§‘β€πŸŒΎ Open Configuration Space",
-    f"https://huggingface.co/spaces/{hub_username}/{space_name}",
-)
+st.markdown(
+    """
+    ## 🚜 How it works
+
+    You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
+    The dataset seed is then used to generate synthetic data for training a language model.
+    """
+)
+st.markdown(
+    """
+    ## πŸ—ΊοΈ The process
+
+    ### Step 1: ~~Setup the project~~
+
+    ~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~
+    """
+)
+st.link_button("πŸš€ ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)
+
+st.markdown(
+    """
+    ### Step 2: Describe the Domain
+
+    Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
+    You can collaborate with domain experts to define the domain expertise and perspectives.
+    """
+)
+
+st.page_link(
+    "pages/2_πŸ‘©πŸΌβ€πŸ”¬ Describe Domain.py",
+    label="Describe Domain",
+    icon="πŸ‘©πŸΌβ€πŸ”¬",
+)
+
+st.markdown(
+    """
+    ### Step 3: Generate Synthetic Data
+
+    Use distilabel to generate synthetic data for your domain-specific dataset.
+    You can run the pipeline locally or in this space to generate synthetic data.
+    """
+)
+
+st.page_link(
+    "pages/3_🌱 Generate Dataset.py",
+    label="Generate Dataset",
+    icon="🌱",
+)
+
+st.markdown(
+    """
+    ### Step 4: Review the Dataset
+
+    Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
+    """
+)
+st.link_button("πŸ” Review the dataset in Argilla", ARGILLA_URL)
 
defaults.py CHANGED
@@ -1,7 +1,49 @@
+import os
 import json

 SEED_DATA_PATH = "seed_data.json"
+PIPELINE_PATH = "pipeline.yaml"
+REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py", "requirements.txt"]
+DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
+N_PERSPECTIVES = 5
+N_TOPICS = 5
+N_EXAMPLES = 5
+# environment variables are strings, so parse the flag explicitly (defaults to enabled)
+CODELESS_DISTILABEL = os.environ.get("CODELESS_DISTILABEL", "true").lower() == "true"
+
+################################################
+# DEFAULTS ON FARMING
+################################################

 with open(SEED_DATA_PATH) as f:
     DEFAULT_DATA = json.load(f)
+
 DEFAULT_DOMAIN = DEFAULT_DATA["domain"]
+DEFAULT_PERSPECTIVES = DEFAULT_DATA["perspectives"]
+DEFAULT_TOPICS = DEFAULT_DATA["topics"]
+DEFAULT_EXAMPLES = DEFAULT_DATA["examples"]
+DEFAULT_SYSTEM_PROMPT = DEFAULT_DATA["domain_expert_prompt"]
+
+################################################
+# PROJECT CONFIG FROM PARENT APP
+################################################
+
+try:
+    with open("project_config.json") as f:
+        PROJECT_CONFIG = json.load(f)
+
+    PROJECT_NAME = PROJECT_CONFIG["project_name"]
+    ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
+    DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
+    ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
+    ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
+    PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
+    DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
+    HUB_USERNAME = DATASET_REPO_ID.split("/")[0]
+except FileNotFoundError:
+    PROJECT_NAME = "DEFAULT_DOMAIN"
+    ARGILLA_SPACE_REPO_ID = ""
+    DATASET_REPO_ID = ""
+    ARGILLA_URL = ""
+    PROJECT_SPACE_REPO_ID = ""
+    DATASET_URL = ""
+    HUB_USERNAME = ""
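Note on the derived URL above: a Space `owner/name` is served from a subdomain in which `/` and `_` are normalized to `-`. A minimal sketch, assuming the parent app named the Argilla Space `argilla/farming_argilla_space`:

    argilla_space_repo_id = "argilla/farming_argilla_space"  # assumed example
    space_name = argilla_space_repo_id.replace("/", "-").replace("_", "-")
    print(f"https://{space_name}.hf.space")
    # -> https://argilla-farming-argilla-space.hf.space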
domain.py ADDED
@@ -0,0 +1,89 @@
+import json
+from typing import Any, Dict, List
+
+from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.text_generation import TextGeneration
+from distilabel.steps import StepInput, StepOutput, Step
+
+from dotenv import load_dotenv
+
+from defaults import (
+    DEFAULT_DOMAIN,
+    DEFAULT_PERSPECTIVES,
+    DEFAULT_TOPICS,
+    DEFAULT_EXAMPLES,
+    DEFAULT_SYSTEM_PROMPT,
+    N_PERSPECTIVES,
+    N_TOPICS,
+    N_EXAMPLES,
+)
+
+load_dotenv()
+
+# Application description used for SelfInstruct
+APPLICATION_DESCRIPTION = f"""You are an AI assistant that generates queries around the domain of {DEFAULT_DOMAIN}.
+You should expect not basic but profound questions from your users.
+The queries should reflect a diversity of visions and economic and political positions.
+The queries may draw on different methods of {DEFAULT_DOMAIN}.
+The queries can be positioned politically, economically, socially, or practically.
+Also take into account the impact of diverse causes on diverse domains."""
+
+
+TOPICS = DEFAULT_TOPICS[:N_TOPICS]
+PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
+EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]
+
+
+def create_examples_template(examples: List[Dict[str, str]]) -> str:
+    questions = """ Examples of high quality questions:"""
+    answers = """ Examples of high quality answers:"""
+    for example in examples:
+        questions += f"""\n- Question: {example["question"]}\n"""
+        answers += f"""\n- Answer: {example["answer"]}\n"""
+
+    _template: str = (
+        """{instruction}\nThis is the instruction.\n Examples: """
+        + questions
+        + answers
+    )
+    return _template
+
+
+def create_topics(topics: List[str], positions: List[str]) -> List[str]:
+    return [
+        f"{topic} from a {position} perspective"
+        for topic in topics
+        for position in positions
+    ]
+
+
+class DomainExpert(TextGeneration):
+    """A customized task to generate text as a domain expert in the domain of farming and agriculture."""
+
+    _system_prompt: str = DEFAULT_SYSTEM_PROMPT
+    _template: str = """{instruction}\nThis is the instruction.\n Examples: """
+
+    def format_input(self, input: Dict[str, Any]) -> "ChatType":
+        return [
+            {
+                "role": "system",
+                "content": self._system_prompt,
+            },
+            {
+                "role": "user",
+                "content": self._template.format(**input),
+            },
+        ]
+
+
+class CleanNumberedList(Step):
+    """A step to clean the numbered list of questions."""
+
+    def process(self, inputs: StepInput) -> StepOutput:
+        import re
+
+        # strip a leading "1. "-style prefix from each generated question
+        pattern = r"^\d+\.\s"
+
+        for input in inputs:
+            input["question"] = re.sub(pattern, "", input["question"])
+        yield inputs
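For context, `create_topics` takes the cross product of topics and perspectives, so the pipeline gets one seed term per (topic, perspective) pair. A small illustration with assumed values (the real ones come from seed_data.json):

    topics = ["soil health", "crop rotation"]              # assumed
    perspectives = ["commercial farmer", "family farmer"]  # assumed
    print(create_topics(topics, perspectives))
    # ['soil health from a commercial farmer perspective',
    #  'soil health from a family farmer perspective',
    #  'crop rotation from a commercial farmer perspective',
    #  'crop rotation from a family farmer perspective']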
hub.py CHANGED
@@ -1,10 +1,43 @@
 import json
+from tempfile import mktemp

-from huggingface_hub import duplicate_space, HfApi
+import argilla as rg
+from huggingface_hub import HfApi
+
+from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH


 hf_api = HfApi()

+with open("DATASET_README_BASE.md") as f:
+    DATASET_README_BASE = f.read()
+
+
+def create_readme(domain_seed_data, project_name, domain):
+    # create a readme for the project that shows the domain and project name
+    readme = DATASET_README_BASE
+    readme += f"# {project_name}\n\n## Domain: {domain}"
+    perspectives = domain_seed_data.get("perspectives")
+    topics = domain_seed_data.get("topics")
+    examples = domain_seed_data.get("examples")
+    if perspectives:
+        readme += "\n\n## Perspectives\n\n"
+        for p in perspectives:
+            readme += f"- {p}\n"
+    if topics:
+        readme += "\n\n## Topics\n\n"
+        for t in topics:
+            readme += f"- {t}\n"
+    if examples:
+        readme += "\n\n## Examples\n\n"
+        for example in examples:
+            readme += f"### {example['question']}\n\n{example['answer']}\n\n"
+    temp_file = mktemp()
+
+    with open(temp_file, "w") as f:
+        f.write(readme)
+    return temp_file
+

 def setup_dataset_on_hub(repo_id, hub_token):
     # create an empty dataset repo on the hub
@@ -12,52 +45,85 @@ def setup_dataset_on_hub(repo_id, hub_token):
         repo_id=repo_id,
         token=hub_token,
         repo_type="dataset",
+        exist_ok=True,
     )

-    # upload the seed data
+
+def push_dataset_to_hub(
+    domain_seed_data_path,
+    project_name,
+    domain,
+    pipeline_path,
+    hub_username,
+    hub_token: str,
+):
+    repo_id = f"{hub_username}/{project_name}"
+
+    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
+
+    # upload the seed data and readme to the hub
     hf_api.upload_file(
-        path_or_fileobj="seed_data.json",
+        path_or_fileobj=domain_seed_data_path,
         path_in_repo="seed_data.json",
+        token=hub_token,
         repo_id=repo_id,
         repo_type="dataset",
-        token=hub_token,
     )

-
-def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
-    duplicate_space(
-        from_id=source_repo,
-        to_id=target_repo,
+    # upload the readme to the hub
+    domain_seed_data = json.load(open(domain_seed_data_path))
+    hf_api.upload_file(
+        path_or_fileobj=create_readme(
+            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
+        ),
+        path_in_repo="README.md",
         token=hub_token,
-        private=private,
-        exist_ok=True,
+        repo_id=repo_id,
+        repo_type="dataset",
    )


-def add_project_config_to_space_repo(
-    dataset_repo_id,
-    hub_token,
+def push_pipeline_to_hub(
+    pipeline_path,
+    hub_username,
+    hub_token: str,
     project_name,
-    argilla_space_repo_id,
-    project_space_repo_id,
 ):
-    # upload the seed data and readme to the hub
-
-    with open("project_config.json", "w") as f:
-        json.dump(
-            {
-                "project_name": project_name,
-                "argilla_space_repo_id": argilla_space_repo_id,
-                "project_space_repo_id": project_space_repo_id,
-                "dataset_repo_id": dataset_repo_id,
-            },
-            f,
-        )
+    repo_id = f"{hub_username}/{project_name}"

+    # upload the pipeline to the hub
     hf_api.upload_file(
-        path_or_fileobj="project_config.json",
-        path_in_repo="project_config.json",
+        path_or_fileobj=pipeline_path,
+        path_in_repo="pipeline.yaml",
         token=hub_token,
-        repo_id=project_space_repo_id,
-        repo_type="space",
+        repo_id=repo_id,
+        repo_type="dataset",
+    )
+
+    for code_path in REMOTE_CODE_PATHS:
+        hf_api.upload_file(
+            path_or_fileobj=code_path,
+            path_in_repo=code_path,
+            token=hub_token,
+            repo_id=repo_id,
+            repo_type="dataset",
+        )
+
+    print(f"Dataset uploaded to {repo_id}")
+
+
+def pull_seed_data_from_repo(repo_id, hub_token):
+    # pull the seed data file from the hub; hf_hub_download returns the local path
+    seed_data_path = hf_api.hf_hub_download(
+        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
+    return json.load(open(seed_data_path))
+
+
+def push_argilla_dataset_to_hub(
+    name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
+):
+    rg.init(api_url=url, api_key=api_key)
+    feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
+    local_dataset = feedback_dataset.pull()
+    local_dataset.push_to_huggingface(repo_id=repo_id)
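Taken together, a push through these helpers leaves the dataset repo holding seed_data.json, a generated README.md, and, after `push_pipeline_to_hub`, pipeline.yaml plus the files in REMOTE_CODE_PATHS. A minimal sketch with assumed values:

    push_dataset_to_hub(
        domain_seed_data_path="seed_data.json",
        project_name="farming",      # assumed
        domain="farming",            # assumed
        pipeline_path="pipeline.yaml",
        hub_username="argilla",      # assumed
        hub_token="hf_...",          # placeholder token
    )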
infer.py ADDED
@@ -0,0 +1,18 @@
+import os
+import requests
+
+API_URL = (
+    "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
+)
+
+
+def query(question, hub_token: str):
+    payload = {
+        "inputs": question,
+    }
+    headers = {"Authorization": f"Bearer {hub_token}"}
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()[0]["generated_text"]
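A quick usage sketch (the token is a placeholder): the serverless Inference API returns a list of generations for text-generation models, which is why the helper indexes `[0]["generated_text"]`; error responses (e.g. while the model is loading) come back as a dict instead and are not handled here.

    answer = query("What is crop rotation?", hub_token="hf_...")  # placeholder token
    print(answer)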
pages/2_πŸ‘©πŸΌβ€πŸ”¬ Describe Domain.py ADDED
@@ -0,0 +1,281 @@
+import json
+
+import streamlit as st
+
+from hub import push_dataset_to_hub
+from infer import query
+from defaults import (
+    DEFAULT_DOMAIN,
+    DEFAULT_PERSPECTIVES,
+    DEFAULT_TOPICS,
+    DEFAULT_EXAMPLES,
+    DEFAULT_SYSTEM_PROMPT,
+    N_PERSPECTIVES,
+    N_TOPICS,
+    SEED_DATA_PATH,
+    PIPELINE_PATH,
+    DATASET_REPO_ID,
+)
+from utils import project_sidebar
+
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="πŸ§‘β€πŸŒΎ",
+)
+project_sidebar()
+
+################################################################################
+# HEADER
+################################################################################
+
+st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
+st.divider()
+st.subheader(
+    "Step 2. Define the specific domain that you want to generate synthetic data for.",
+)
+st.write(
+    "Define the project details, including the project name, domain, and API credentials"
+)
+
+################################################################################
+# Domain Expert Section
+################################################################################
+
+(
+    tab_domain_expert,
+    tab_domain_perspectives,
+    tab_domain_topics,
+    tab_examples,
+    tab_raw_seed,
+) = st.tabs(
+    tabs=[
+        "πŸ‘©πŸΌβ€πŸ”¬ Domain Expert",
+        "πŸ” Domain Perspectives",
+        "πŸ•ΈοΈ Domain Topics",
+        "πŸ“š Examples",
+        "🌱 Raw Seed Data",
+    ]
+)
+
+with tab_domain_expert:
+    st.text("Define the domain expertise that you want to train a language model")
+    st.info(
+        "A domain expert is a person who is an expert in a particular field or area. For example, a domain expert in farming would be someone who has extensive knowledge and experience in farming and agriculture."
+    )
+
+    domain = st.text_input("Domain Name", DEFAULT_DOMAIN)
+
+    domain_expert_prompt = st.text_area(
+        label="Domain Expert Definition",
+        value=DEFAULT_SYSTEM_PROMPT,
+        height=200,
+    )
+
+################################################################################
+# Domain Perspectives
+################################################################################
+
+with tab_domain_perspectives:
+    st.text("Define the different perspectives from which the domain can be viewed")
+    st.info(
+        """
+        Perspectives are different viewpoints or angles from which a domain can be viewed.
+        For example, the domain of farming can be viewed from the perspective of a commercial
+        farmer or an independent family farmer."""
+    )
+
+    perspectives = st.session_state.get(
+        "perspectives",
+        [DEFAULT_PERSPECTIVES[0]],
+    )
+    perspectives_container = st.container()
+
+    perspectives = [
+        perspectives_container.text_input(
+            f"Domain Perspective {i + 1}", value=perspective
+        )
+        for i, perspective in enumerate(perspectives)
+    ]
+
+    if st.button("Add Perspective", key="add_perspective"):
+        n = len(perspectives)
+        value = DEFAULT_PERSPECTIVES[n] if n < N_PERSPECTIVES else ""
+        perspectives.append(
+            perspectives_container.text_input(f"Domain Perspective {n + 1}", value=value)
+        )
+
+    st.session_state["perspectives"] = perspectives
+
+
+################################################################################
+# Domain Topics
+################################################################################
+
+with tab_domain_topics:
+    st.text("Define the main themes or subjects that are relevant to the domain")
+    st.info(
+        """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
+    )
+    topics = st.session_state.get(
+        "topics",
+        [DEFAULT_TOPICS[0]],
+    )
+    topics_container = st.container()
+    topics = [
+        topics_container.text_input(f"Domain Topic {i + 1}", value=topic)
+        for i, topic in enumerate(topics)
+    ]
+
+    if st.button("Add Topic", key="add_topic"):
+        n = len(topics)
+        value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
+        topics.append(topics_container.text_input(f"Domain Topic {n + 1}", value=value))
+
+    st.session_state["topics"] = topics
+
+
+################################################################################
+# Examples Section
+################################################################################
+
+with tab_examples:
+    st.text(
+        "Add high-quality questions and answers that can be used to generate synthetic data"
+    )
+    st.info(
+        """
+        Examples are high-quality questions and answers that can be used to generate
+        synthetic data for the domain. These examples will be used to train the language model
+        to generate questions and answers.
+        """
+    )
+
+    examples = st.session_state.get(
+        "examples",
+        [
+            {
+                "question": "",
+                "answer": "",
+            }
+        ],
+    )
+
+    for n, example in enumerate(examples, 1):
+        question = example["question"]
+        answer = example["answer"]
+        examples_container = st.container()
+        question_column, answer_column = examples_container.columns(2)
+
+        if st.button(f"Generate Answer {n}"):
+            if st.session_state["hub_token"] is None:
+                st.error("Please provide a Hub token to generate answers")
+            else:
+                answer = query(question, st.session_state["hub_token"])
+        with question_column:
+            question = st.text_area(f"Question {n}", value=question)
+
+        with answer_column:
+            answer = st.text_area(f"Answer {n}", value=answer)
+        examples[n - 1] = {"question": question, "answer": answer}
+        st.session_state["examples"] = examples
+        st.divider()
+
+    if st.button("Add Example"):
+        examples.append({"question": "", "answer": ""})
+        st.session_state["examples"] = examples
+        st.rerun()
+
+################################################################################
+# Save Domain Data
+################################################################################
+
+perspectives = list(filter(None, perspectives))
+topics = list(filter(None, topics))
+
+domain_data = {
+    "domain": domain,
+    "perspectives": perspectives,
+    "topics": topics,
+    "examples": examples,
+    "domain_expert_prompt": domain_expert_prompt,
+}
+
+with open(SEED_DATA_PATH, "w") as f:
+    json.dump(domain_data, f, indent=2)
+
+with tab_raw_seed:
+    st.code(json.dumps(domain_data, indent=2), language="json", line_numbers=True)
+
+################################################################################
+# Setup Dataset on the Hub
+################################################################################
+
+st.divider()
+
+hub_username = DATASET_REPO_ID.split("/")[0]
+project_name = DATASET_REPO_ID.split("/")[1]
+st.write("Define the dataset repo details on the Hub")
+st.session_state["project_name"] = st.text_input("Project Name", project_name)
+st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
+st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)
+
+if all(
+    (
+        st.session_state.get("project_name"),
+        st.session_state.get("hub_username"),
+        st.session_state.get("hub_token"),
+    )
+):
+    st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
+
+
+if st.button("πŸ€— Push Dataset Seed") and all(
+    (
+        domain,
+        domain_expert_prompt,
+        perspectives,
+        topics,
+        examples,
+    )
+):
+    if all(
+        (
+            st.session_state.get("project_name"),
+            st.session_state.get("hub_username"),
+            st.session_state.get("hub_token"),
+        )
+    ):
+        project_name = st.session_state["project_name"]
+        hub_username = st.session_state["hub_username"]
+        hub_token = st.session_state["hub_token"]
+    else:
+        st.error(
+            "Please create a dataset repo on the Hub before pushing the dataset seed"
+        )
+        st.stop()
+
+    push_dataset_to_hub(
+        domain_seed_data_path=SEED_DATA_PATH,
+        project_name=project_name,
+        domain=domain,
+        hub_username=hub_username,
+        hub_token=hub_token,
+        pipeline_path=PIPELINE_PATH,
+    )
+
+    st.success(
+        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
+    )
+
+    st.write("You can now move on to running your distilabel pipeline.")
+
+    st.page_link(
+        page="pages/3_🌱 Generate Dataset.py",
+        label="Generate Dataset",
+        icon="🌱",
+    )
+
+else:
+    st.info(
+        "Please fill in all the required domain fields to push the dataset seed to the Hub"
+    )
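The resulting seed_data.json mirrors the `domain_data` dict above; its shape looks like this (values here are illustrative):

    {
      "domain": "farming",
      "perspectives": ["commercial farmer", "family farmer"],
      "topics": ["soil health", "crop rotation"],
      "examples": [{"question": "What is crop rotation?", "answer": "..."}],
      "domain_expert_prompt": "You are an expert in farming and agriculture..."
    }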
pages/3_🌱 Generate Dataset.py ADDED
@@ -0,0 +1,257 @@
+import streamlit as st
+
+from hub import pull_seed_data_from_repo, push_pipeline_to_hub
+from defaults import (
+    DEFAULT_SYSTEM_PROMPT,
+    PIPELINE_PATH,
+    PROJECT_NAME,
+    ARGILLA_URL,
+    HUB_USERNAME,
+    CODELESS_DISTILABEL,
+)
+from utils import project_sidebar
+
+from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
+
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="πŸ§‘β€πŸŒΎ",
+)
+
+project_sidebar()
+
+################################################################################
+# HEADER
+################################################################################
+
+st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
+st.divider()
+st.subheader("Step 3. Run the pipeline to generate synthetic data")
+st.write("Define the project repos and models that the pipeline will use.")
+
+st.divider()
+###############################################################
+# CONFIGURATION
+###############################################################
+
+st.markdown("## Pipeline Configuration")
+
+st.markdown("#### πŸ€— Hub details to pull the seed data")
+hub_username = st.text_input("Hub Username", HUB_USERNAME)
+project_name = st.text_input("Project Name", PROJECT_NAME)
+repo_id = f"{hub_username}/{project_name}"
+hub_token = st.text_input("Hub Token", type="password")
+
+st.divider()
+
+st.markdown("#### πŸ€– Inference configuration")
+
+st.write(
+    "Add the URL of the Hugging Face Inference API or endpoint that your pipeline should use. You can find compatible models here:"
+)
+
+with st.expander("πŸ€— Recommended Models"):
+    st.write("All Inference Endpoint-compatible models can be found via the link below")
+    st.link_button(
+        "πŸ€— Inference-compatible models on the Hub",
+        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
+    )
+    st.write("πŸ”‹ Projects with sufficient resources could take advantage of Llama 3 70B")
+    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")
+
+    st.write("πŸͺ« Projects with fewer resources could take advantage of Llama 3 8B")
+    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")
+
+    st.write("πŸƒ Projects with even fewer resources could take advantage of Phi-2")
+    st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")
+
+    st.write("Note: Hugging Face Pro gives access to more compute resources")
+    st.link_button(
+        "πŸ€— Hugging Face Pro",
+        "https://huggingface.co/pricing",
+    )
+
+
+base_url = st.text_input(
+    label="Base URL for the Inference API",
+    value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
+)
+st.divider()
+st.markdown("#### πŸ”¬ Argilla API details to push the generated dataset")
+argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
+argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
+argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
+st.divider()
+
+###############################################################
+# LOCAL
+###############################################################
+
+st.markdown("## Run the pipeline")
+
+st.write(
+    "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
+)
+
+if CODELESS_DISTILABEL:
+    st.write(
+        """We recommend running the pipeline locally if you're planning on generating a large dataset. \
+        But running the pipeline on this space is a handy way to get started quickly. Your synthetic
+        samples will be pushed to Argilla and available for review.
+        """
+    )
+    st.write(
+        """If you're planning on running the pipeline on the space, be aware that it \
+        will take some time to complete and you will need to maintain a \
+        connection to the space."""
+    )
+
+
+if st.button("πŸ’» Run pipeline locally", key="run_pipeline_local"):
+    if all(
+        [
+            argilla_api_key,
+            argilla_url,
+            base_url,
+            hub_username,
+            project_name,
+            hub_token,
+            argilla_dataset_name,
+        ]
+    ):
+        with st.spinner("Pulling seed data from the Hub..."):
+            try:
+                seed_data = pull_seed_data_from_repo(
+                    repo_id=f"{hub_username}/{project_name}",
+                    hub_token=hub_token,
+                )
+            except Exception:
+                st.error(
+                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
+                )
+                st.stop()
+
+        domain = seed_data["domain"]
+        perspectives = seed_data["perspectives"]
+        topics = seed_data["topics"]
+        examples = seed_data["examples"]
+        domain_expert_prompt = seed_data["domain_expert_prompt"]
+
+        with st.spinner("Serializing the pipeline configuration..."):
+            serialize_pipeline(
+                argilla_api_key=argilla_api_key,
+                argilla_dataset_name=argilla_dataset_name,
+                argilla_api_url=argilla_url,
+                topics=topics,
+                perspectives=perspectives,
+                pipeline_config_path=PIPELINE_PATH,
+                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
+                hub_token=hub_token,
+                endpoint_base_url=base_url,
+                examples=examples,
+            )
+            push_pipeline_to_hub(
+                pipeline_path=PIPELINE_PATH,
+                hub_token=hub_token,
+                hub_username=hub_username,
+                project_name=project_name,
+            )
+
+        st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")
+
+        st.info(
+            "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
+        )
+        st.text(
+            "Execute the following command to generate a synthetic dataset from the seed data:"
+        )
+        command_to_run = create_pipelines_run_command(
+            hub_token=hub_token,
+            pipeline_config_path=PIPELINE_PATH,
+            argilla_dataset_name=argilla_dataset_name,
+            argilla_api_key=argilla_api_key,
+            argilla_api_url=argilla_url,
+        )
+        st.code(
+            f"""
+            pip install git+https://github.com/argilla-io/distilabel.git
+            git clone https://huggingface.co/datasets/{hub_username}/{project_name}
+            cd {project_name}
+            pip install -r requirements.txt
+            {' '.join(["python"] + command_to_run[1:])}
+            """,
+            language="bash",
+        )
+        st.subheader(
+            "πŸ‘©β€πŸš€ If you want to access the pipeline code and manipulate it locally, you can do:"
+        )
+        st.code(
+            """
+            git clone https://github.com/huggingface/data-is-better-together
+            cd domain-specific-datasets
+            """
+        )
+    else:
+        st.error("Please fill all the required fields.")
+
+###############################################################
+# SPACE
+###############################################################
+if CODELESS_DISTILABEL:
+    if st.button("πŸ”₯ Run pipeline right here, right now!"):
+        if all(
+            [
+                argilla_api_key,
+                argilla_url,
+                base_url,
+                hub_username,
+                project_name,
+                hub_token,
+                argilla_dataset_name,
+            ]
+        ):
+            with st.spinner("Pulling seed data from the Hub..."):
+                try:
+                    seed_data = pull_seed_data_from_repo(
+                        repo_id=f"{hub_username}/{project_name}",
+                        hub_token=hub_token,
+                    )
+                except Exception:
+                    st.error(
+                        "Seed data not found. Please make sure you pushed the data seed in Step 2."
+                    )
+                    st.stop()
+
+            domain = seed_data["domain"]
+            perspectives = seed_data["perspectives"]
+            topics = seed_data["topics"]
+            examples = seed_data["examples"]
+            domain_expert_prompt = seed_data["domain_expert_prompt"]
+
+            serialize_pipeline(
+                argilla_api_key=argilla_api_key,
+                argilla_dataset_name=argilla_dataset_name,
+                argilla_api_url=argilla_url,
+                topics=topics,
+                perspectives=perspectives,
+                pipeline_config_path=PIPELINE_PATH,
+                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
+                hub_token=hub_token,
+                endpoint_base_url=base_url,
+                examples=examples,
+            )
+
+            with st.spinner("Starting the pipeline..."):
+                logs = run_pipeline(
+                    pipeline_config_path=PIPELINE_PATH,
+                    argilla_api_key=argilla_api_key,
+                    argilla_api_url=argilla_url,
+                    hub_token=hub_token,
+                    argilla_dataset_name=argilla_dataset_name,
+                )
+
+            st.success("Pipeline started successfully! πŸš€")
+
+            with st.expander(label="View Logs", expanded=True):
+                for out in logs:
+                    st.text(out)
+        else:
+            st.error("Please fill all the required fields.")
pages/4_πŸ” Review Generated Data.py ADDED
@@ -0,0 +1,48 @@
+import streamlit as st
+
+from defaults import PROJECT_NAME, ARGILLA_URL, DATASET_REPO_ID
+from utils import project_sidebar
+from hub import push_argilla_dataset_to_hub
+
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="πŸ§‘β€πŸŒΎ",
+)
+
+project_sidebar()
+
+################################################################################
+# HEADER
+################################################################################
+
+st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
+st.divider()
+
+st.write(
+    """Once you have reviewed the synthetic data in Argilla, you can publish the
+    generated dataset to the Hub."""
+)
+
+
+################################################################################
+# Configuration
+################################################################################
+
+st.divider()
+st.write("πŸ”¬ Argilla API details to push the generated dataset")
+argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
+argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
+argilla_dataset_name = st.text_input("Argilla Dataset Name", PROJECT_NAME)
+dataset_repo_id = st.text_input("Dataset Repo ID", DATASET_REPO_ID)
+st.divider()
+
+if st.button("πŸš€ Publish the generated dataset"):
+    with st.spinner("Publishing the generated dataset..."):
+        push_argilla_dataset_to_hub(
+            name=argilla_dataset_name,
+            repo_id=dataset_repo_id,
+            url=argilla_url,
+            api_key=argilla_api_key,
+            workspace="admin",
+        )
+    st.success("The generated dataset has been published to the Hub.")
pipeline.py ADDED
@@ -0,0 +1,208 @@
+import os
+import subprocess
+import sys
+import time
+from typing import List
+
+from distilabel.steps.generators.data import LoadDataFromDicts
+from distilabel.steps.expand import ExpandColumns
+from distilabel.steps.keep import KeepColumns
+from distilabel.steps.tasks.self_instruct import SelfInstruct
+from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
+from distilabel.llms.huggingface import InferenceEndpointsLLM
+from distilabel.pipeline import Pipeline
+from distilabel.steps import TextGenerationToArgilla
+from dotenv import load_dotenv
+
+from domain import (
+    DomainExpert,
+    CleanNumberedList,
+    create_topics,
+    create_examples_template,
+    APPLICATION_DESCRIPTION,
+)
+
+load_dotenv()
+
+
+def define_pipeline(
+    argilla_api_key: str,
+    argilla_api_url: str,
+    argilla_dataset_name: str,
+    topics: List[str],
+    perspectives: List[str],
+    domain_expert_prompt: str,
+    examples: List[dict],
+    hub_token: str,
+    endpoint_base_url: str,
+):
+    """Define the pipeline for the specific domain."""
+
+    terms = create_topics(topics, perspectives)
+    template = create_examples_template(examples)
+    with Pipeline("farming") as pipeline:
+        load_data = LoadDataFromDicts(
+            name="load_data",
+            data=[{"input": term} for term in terms],
+            batch_size=64,
+        )
+        llm = InferenceEndpointsLLM(
+            base_url=endpoint_base_url,
+            api_key=hub_token,
+        )
+        self_instruct = SelfInstruct(
+            name="self-instruct",
+            application_description=APPLICATION_DESCRIPTION,
+            num_instructions=5,
+            input_batch_size=8,
+            llm=llm,
+        )
+
+        evol_instruction_complexity = EvolInstruct(
+            name="evol_instruction_complexity",
+            llm=llm,
+            num_evolutions=2,
+            store_evolutions=True,
+            input_batch_size=8,
+            include_original_instruction=True,
+            input_mappings={"instruction": "question"},
+        )
+
+        expand_instructions = ExpandColumns(
+            name="expand_columns", columns={"instructions": "question"}
+        )
+        cleaner = CleanNumberedList(name="clean_numbered_list")
+        expand_evolutions = ExpandColumns(
+            name="expand_columns_evolved",
+            columns={"evolved_instructions": "evolved_questions"},
+        )
+
+        domain_expert = DomainExpert(
+            name="domain_expert",
+            llm=llm,
+            input_batch_size=8,
+            input_mappings={"instruction": "evolved_questions"},
+            output_mappings={"generation": "domain_expert_answer"},
+        )
+
+        domain_expert._system_prompt = domain_expert_prompt
+        domain_expert._template = template
+
+        keep_columns = KeepColumns(
+            name="keep_columns",
+            columns=["model_name", "evolved_questions", "domain_expert_answer"],
+        )
+
+        to_argilla = TextGenerationToArgilla(
+            name="text_generation_to_argilla",
+            dataset_name=argilla_dataset_name,
+            dataset_workspace="admin",
+            api_url=argilla_api_url,
+            api_key=argilla_api_key,
+            input_mappings={
+                "instruction": "evolved_questions",
+                "generation": "domain_expert_answer",
+            },
+        )
+
+        load_data.connect(self_instruct)
+        self_instruct.connect(expand_instructions)
+        expand_instructions.connect(cleaner)
+        cleaner.connect(evol_instruction_complexity)
+        evol_instruction_complexity.connect(expand_evolutions)
+        expand_evolutions.connect(domain_expert)
+        domain_expert.connect(keep_columns)
+        keep_columns.connect(to_argilla)
+    return pipeline
+
+
+def serialize_pipeline(
+    argilla_api_key: str,
+    argilla_api_url: str,
+    argilla_dataset_name: str,
+    topics: List[str],
+    perspectives: List[str],
+    domain_expert_prompt: str,
+    hub_token: str,
+    endpoint_base_url: str,
+    pipeline_config_path: str = "pipeline.yaml",
+    examples: List[dict] = [],
+):
+    """Serialize the pipeline to a yaml file."""
+    pipeline = define_pipeline(
+        argilla_api_key=argilla_api_key,
+        argilla_api_url=argilla_api_url,
+        argilla_dataset_name=argilla_dataset_name,
+        topics=topics,
+        perspectives=perspectives,
+        domain_expert_prompt=domain_expert_prompt,
+        hub_token=hub_token,
+        endpoint_base_url=endpoint_base_url,
+        examples=examples,
+    )
+    pipeline.save(path=pipeline_config_path, overwrite=True, format="yaml")
+
+
+def create_pipelines_run_command(
+    hub_token: str,
+    argilla_api_key: str,
+    argilla_api_url: str,
+    pipeline_config_path: str = "pipeline.yaml",
+    argilla_dataset_name: str = "domain_specific_datasets",
+):
+    """Create the command to run the pipeline."""
+    command_to_run = [
+        sys.executable,
+        "-m",
+        "distilabel",
+        "pipeline",
+        "run",
+        "--config",
+        pipeline_config_path,
+        "--param",
+        f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
+        "--param",
+        f"text_generation_to_argilla.api_key={argilla_api_key}",
+        "--param",
+        f"text_generation_to_argilla.api_url={argilla_api_url}",
+        "--param",
+        f"self-instruct.llm.api_key={hub_token}",
+        "--param",
+        f"evol_instruction_complexity.llm.api_key={hub_token}",
+        "--param",
+        f"domain_expert.llm.api_key={hub_token}",
+        "--ignore-cache",
+    ]
+    return command_to_run
+
+
+def run_pipeline(
+    hub_token: str,
+    argilla_api_key: str,
+    argilla_api_url: str,
+    pipeline_config_path: str = "pipeline.yaml",
+    argilla_dataset_name: str = "domain_specific_datasets",
+):
+    """Run the pipeline and yield the output as a generator of logs."""
+
+    command_to_run = create_pipelines_run_command(
+        hub_token=hub_token,
+        pipeline_config_path=pipeline_config_path,
+        argilla_dataset_name=argilla_dataset_name,
+        argilla_api_key=argilla_api_key,
+        argilla_api_url=argilla_api_url,
+    )
+
+    # Run the script file, preserving the parent environment and adding the token
+    process = subprocess.Popen(
+        args=command_to_run,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env={**os.environ, "HF_TOKEN": hub_token},
+    )
+
+    while process.stdout and process.stdout.readable():
+        time.sleep(0.2)
+        line = process.stdout.readline()
+        if not line:
+            break
+        yield line.decode("utf-8")
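With the default arguments, `create_pipelines_run_command` renders to a command of this shape (keys, token, and the Argilla URL are placeholders):

    python -m distilabel pipeline run \
        --config pipeline.yaml \
        --param text_generation_to_argilla.dataset_name=domain_specific_datasets \
        --param text_generation_to_argilla.api_key=owner.apikey \
        --param text_generation_to_argilla.api_url=https://argilla-farming-argilla-space.hf.space \
        --param self-instruct.llm.api_key=hf_... \
        --param evol_instruction_complexity.llm.api_key=hf_... \
        --param domain_expert.llm.api_key=hf_... \
        --ignore-cache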
requirements.txt CHANGED
@@ -1 +1,8 @@
-huggingface_hub
+datasets
+python_dotenv
+sentence_transformers
+streamlit
+huggingface_hub
+mistralai
+argilla
+git+https://github.com/argilla-io/distilabel.git
utils.py ADDED
@@ -0,0 +1,33 @@
+import streamlit as st
+
+from defaults import (
+    ARGILLA_SPACE_REPO_ID,
+    PROJECT_NAME,
+    ARGILLA_URL,
+    DIBT_PARENT_APP_URL,
+    DATASET_URL,
+    DATASET_REPO_ID,
+)
+
+
+def project_sidebar():
+    if PROJECT_NAME == "DEFAULT_DOMAIN":
+        st.warning(
+            "Please set up the project configuration in the parent app before proceeding."
+        )
+        st.stop()
+
+    st.sidebar.subheader(f"A Data Growing Project in the domain of {PROJECT_NAME}")
+    st.sidebar.markdown(
+        """
+        This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
+        """
+    )
+    st.sidebar.link_button("πŸ“š Dataset Repo", DATASET_URL)
+    st.sidebar.link_button("πŸ€– Argilla Space", ARGILLA_URL)
+    st.sidebar.divider()
+    st.sidebar.link_button("πŸ§‘β€πŸŒΎ New Project", DIBT_PARENT_APP_URL)
+    st.sidebar.link_button(
+        "πŸ€— Get your Hub Token", "https://huggingface.co/settings/tokens"
+    )