domain-specific-datasets-welcome

Runtime error

App Files Files Community

Ben Burtenshaw committed on Apr 26

Commit

0ac0929

•

1 Parent(s): 01af24e

fix expose pages on parent app

Browse files

Files changed (7) hide show

domain.py +0 -89
infer.py +0 -18
pages/2_👩🏼‍🔬 Describe Domain.py +0 -281
pages/3_🌱 Generate Dataset.py +0 -257
pages/4_🔍 Review Generated Data.py +0 -48
pipeline.py +0 -208
requirements.txt +0 -4

domain.py DELETED Viewed

@@ -1,89 +0,0 @@
-import json
-from typing import Any, Dict, List
-from distilabel.steps.tasks.typing import ChatType
-from distilabel.steps.tasks.text_generation import TextGeneration
-from distilabel.steps import StepInput, StepOutput, Step
-from dotenv import load_dotenv
-from defaults import (
-    DEFAULT_DOMAIN,
-    DEFAULT_PERSPECTIVES,
-    DEFAULT_TOPICS,
-    DEFAULT_EXAMPLES,
-    DEFAULT_SYSTEM_PROMPT,
-    N_PERSPECTIVES,
-    N_TOPICS,
-    N_EXAMPLES,
-)
-# Run python-dotenv's loader before the module constants below are built.
-load_dotenv()
-# Application description used for SelfInstruct
-# NOTE(review): the prompt text contains typos ("than generates", "Your should").
-# It is a runtime string that is handed to the SelfInstruct step, so it is left
-# untouched here; fix it deliberately if the prompt wording is revised.
-APPLICATION_DESCRIPTION = f"""You are an AI assistant than generates queries around the domain of {DEFAULT_DOMAIN}.
-Your should not expect basic but profound questions from your users.
-The queries should reflect a diversity of vision and economic positions and political positions.
-The queries may know about different methods of {DEFAULT_DOMAIN}.
-The queries can be positioned politically, economically, socially, or practically.
-Also take into account the impact of diverse causes on diverse domains."""
-# Cap each default list at the corresponding N_* limit imported from defaults.
-TOPICS = DEFAULT_TOPICS[:N_TOPICS]
-PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
-EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]
-def create_examples_template(examples: List[Dict[str, str]]) -> str:
-    """Build the prompt template used by ``DomainExpert`` from example Q&A pairs.
-
-    The result keeps a single ``{instruction}`` placeholder that is filled in
-    later with ``str.format``. Curly braces inside the example text are escaped
-    so that user-written examples containing ``{`` or ``}`` are emitted
-    literally instead of being parsed as format fields.
-
-    Args:
-        examples: Dicts that each carry a ``"question"`` and an ``"answer"`` key.
-
-    Returns:
-        The template as one string. All questions are listed first, followed by
-        all answers.
-    """
-
-    def _escape(text: str) -> str:
-        # Double the braces so a later ``str.format`` call prints them as-is.
-        return str(text).replace("{", "{{").replace("}", "}}")
-
-    questions = " Examples of high quality questions:" + "".join(
-        f"\n- Question: {_escape(example['question'])}\n" for example in examples
-    )
-    answers = " Examples of high quality answers:" + "".join(
-        f"\n- Answer: {_escape(example['answer'])}\n" for example in examples
-    )
-    return (
-        "{instruction}\nThis is the the instruction.\n Examples: "
-        + questions
-        + answers
-    )
-def create_topics(topics: List[str], positions: List[str]) -> List[str]:
-    """Combine every topic with every position into a seed phrase.
-
-    The output is ordered topic-major: all positions for the first topic come
-    before any phrase for the second topic.
-    """
-    combined: List[str] = []
-    for subject in topics:
-        for viewpoint in positions:
-            combined.append(f"{subject} from a {viewpoint} perspective")
-    return combined
-class DomainExpert(TextGeneration):
-    """Text-generation task that answers an instruction as a domain expert.
-
-    The system prompt and the user template are class-level defaults that start
-    out as ``DEFAULT_SYSTEM_PROMPT`` and a bare instruction template.
-    """
-
-    # Content of the "system" chat message.
-    _system_prompt: str = DEFAULT_SYSTEM_PROMPT
-    # ``str.format`` template for the "user" chat message.
-    _template: str = """{instruction}\nThis is the the instruction.\n Examples: """
-
-    def format_input(self, input: Dict[str, Any]) -> "ChatType":
-        """Return the system message followed by the templated user message.
-
-        The user content is ``_template`` formatted with the keys of ``input``.
-        """
-        system_message = {"role": "system", "content": self._system_prompt}
-        user_message = {"role": "user", "content": self._template.format(**input)}
-        return [system_message, user_message]
-class CleanNumberedList(Step):
-    """Step that removes a leading list number from each question."""
-
-    def process(self, inputs: StepInput) -> StepOutput:
-        """Strip a leading "<digits>. " prefix from ``row["question"]`` in place.
-
-        The same ``inputs`` batch is yielded once every row has been updated.
-        """
-        import re
-
-        leading_number = re.compile(r"^\d+\.\s")
-        for row in inputs:
-            row["question"] = leading_number.sub("", row["question"])
-        yield inputs

infer.py DELETED Viewed

@@ -1,18 +0,0 @@
-import os
-import requests
-# Hosted Inference API endpoint used to draft example answers.
-API_URL = (
-    "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
-)
-def query(question, hub_token: str, timeout: float = 60.0):
-    """Send ``question`` to ``API_URL`` and return the generated text.
-
-    Args:
-        question: Prompt sent as the ``inputs`` field of the JSON payload.
-        hub_token: Token sent as a bearer ``Authorization`` header.
-        timeout: Seconds to wait for the HTTP request before giving up, so a
-            stalled endpoint cannot hang the calling page forever.
-
-    Returns:
-        The ``generated_text`` field of the first item in the JSON response.
-
-    Raises:
-        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
-        ValueError: If the response body does not have the expected shape.
-    """
-    response = requests.post(
-        API_URL,
-        headers={"Authorization": f"Bearer {hub_token}"},
-        json={"inputs": question},
-        timeout=timeout,
-    )
-    # Surface HTTP failures explicitly instead of failing on the payload shape.
-    response.raise_for_status()
-    result = response.json()
-    try:
-        return result[0]["generated_text"]
-    except (KeyError, IndexError, TypeError) as err:
-        raise ValueError(
-            f"Unexpected response from the Inference API: {result!r}"
-        ) from err

pages/2_👩🏼‍🔬 Describe Domain.py DELETED Viewed

@@ -1,281 +0,0 @@
-import json
-import streamlit as st
-from hub import push_dataset_to_hub
-from infer import query
-from defaults import (
-    DEFAULT_DOMAIN,
-    DEFAULT_PERSPECTIVES,
-    DEFAULT_TOPICS,
-    DEFAULT_EXAMPLES,
-    DEFAULT_SYSTEM_PROMPT,
-    N_PERSPECTIVES,
-    N_TOPICS,
-    SEED_DATA_PATH,
-    PIPELINE_PATH,
-    DATASET_REPO_ID,
-)
-from utils import project_sidebar
-# Set the page title and icon for this Streamlit page.
-st.set_page_config(
-    page_title="Domain Data Grower",
-    page_icon="🧑‍🌾",
-)
-# Sidebar helper imported from utils.
-project_sidebar()
-################################################################################
-# HEADER
-################################################################################
-st.header("🧑‍🌾 Domain Data Grower")
-st.divider()
-st.subheader(
-    "Step 2. Define the specific domain that you want to generate synthetic data for.",
-)
-st.write(
-    "Define the project details, including the project name, domain, and API credentials"
-)
-################################################################################
-# Domain Expert Section
-################################################################################
-# One tab per part of the seed data; the tab objects are unpacked in the same
-# order as the labels passed to st.tabs.
-(
-    tab_domain_expert,
-    tab_domain_perspectives,
-    tab_domain_topics,
-    tab_examples,
-    tab_raw_seed,
-) = st.tabs(
-    tabs=[
-        "👩🏼‍🔬 Domain Expert",
-        "🔍 Domain Perspectives",
-        "🕸️ Domain Topics",
-        "📚 Examples",
-        "🌱 Raw Seed Data",
-    ]
-)
-with tab_domain_expert:
-    st.text("Define the domain expertise that you want to train a language model")
-    st.info(
-        "A domain expert is a person who is an expert in a particular field or area. For example, a domain expert in farming would be someone who has extensive knowledge and experience in farming and agriculture."
-    )
-    # Both inputs are pre-filled with the defaults imported from defaults.
-    domain = st.text_input("Domain Name", DEFAULT_DOMAIN)
-    domain_expert_prompt = st.text_area(
-        label="Domain Expert Definition",
-        value=DEFAULT_SYSTEM_PROMPT,
-        height=200,
-    )
-################################################################################
-# Domain Perspectives
-################################################################################
-with tab_domain_perspectives:
-    st.text("Define the different perspectives from which the domain can be viewed")
-    st.info(
-        """
-    Perspectives are different viewpoints or angles from which a domain can be viewed.
-    For example, the domain of farming can be viewed from the perspective of a commercial
-    farmer or an independent family farmer."""
-    )
-    # The list of perspectives lives in the session state so that it survives
-    # reruns; the first default is used until something has been stored.
-    perspectives = st.session_state.get(
-        "perspectives",
-        [DEFAULT_PERSPECTIVES[0]],
-    )
-    perspectives_container = st.container()
-    # Render one input per stored perspective and read back the current text.
-    perspectives = [
-        perspectives_container.text_input(
-            f"Domain Perspective {i + 1}", value=perspective
-        )
-        for i, perspective in enumerate(perspectives)
-    ]
-    if st.button("Add Perspective", key="add_perspective"):
-        n = len(perspectives)
-        # Pre-fill the new field with the next default while one is available.
-        # The bound also respects the real list length so a too-large
-        # N_PERSPECTIVES cannot raise IndexError.
-        value = (
-            DEFAULT_PERSPECTIVES[n]
-            if n < min(N_PERSPECTIVES, len(DEFAULT_PERSPECTIVES))
-            else ""
-        )
-        # The computed default was previously dropped and "" was always used.
-        perspectives.append(
-            perspectives_container.text_input(
-                f"Domain Perspective {n + 1}", value=value
-            )
-        )
-    st.session_state["perspectives"] = perspectives
-################################################################################
-# Domain Topics
-################################################################################
-with tab_domain_topics:
-    st.text("Define the main themes or subjects that are relevant to the domain")
-    st.info(
-        """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
-    )
-    # The list of topics lives in the session state so that it survives reruns;
-    # the first default is used until something has been stored.
-    topics = st.session_state.get(
-        "topics",
-        [DEFAULT_TOPICS[0]],
-    )
-    topics_container = st.container()
-    # Render one input per stored topic and read back the current text.
-    topics = [
-        topics_container.text_input(f"Domain Topic {i + 1}", value=topic)
-        for i, topic in enumerate(topics)
-    ]
-    if st.button("Add Topic", key="add_topic"):
-        n = len(topics)
-        # Pre-fill the new field with the next default while one is available.
-        # The bound also respects the real list length so a too-large N_TOPICS
-        # cannot raise IndexError.
-        value = DEFAULT_TOPICS[n] if n < min(N_TOPICS, len(DEFAULT_TOPICS)) else ""
-        # Use the computed default (it was previously dropped) and the same
-        # "Domain Topic" label that the list above uses on the next rerun.
-        topics.append(topics_container.text_input(f"Domain Topic {n + 1}", value=value))
-    st.session_state["topics"] = topics
-################################################################################
-# Examples Section
-################################################################################
-with tab_examples:
-    st.text(
-        "Add high-quality questions and answers that can be used to generate synthetic data"
-    )
-    st.info(
-        """
-    Examples are high-quality questions and answers that can be used to generate
-    synthetic data for the domain. These examples will be used to train the language model
-    to generate questions and answers.
-    """
-    )
-    # Examples persist in the session state; start with one empty pair.
-    examples = st.session_state.get(
-        "examples",
-        [
-            {
-                "question": "",
-                "answer": "",
-            }
-        ],
-    )
-    for n, example in enumerate(examples, 1):
-        question = example["question"]
-        answer = example["answer"]
-        examples_container = st.container()
-        question_column, answer_column = examples_container.columns(2)
-        if st.button(f"Generate Answer {n}"):
-            # The token is written to the session state further down this
-            # script, so read it defensively rather than indexing the state.
-            inference_token = st.session_state.get("hub_token")
-            if not inference_token:
-                st.error("Please provide a Hub token to generate answers")
-            else:
-                try:
-                    answer = query(question, inference_token)
-                except Exception as err:
-                    # UI boundary: report the failure instead of crashing the page.
-                    st.error(f"Could not generate an answer: {err}")
-        with question_column:
-            question = st.text_area(f"Question {n}", value=question)
-        with answer_column:
-            answer = st.text_area(f"Answer {n}", value=answer)
-        # Store the (possibly edited) pair back at its position.
-        examples[n - 1] = {"question": question, "answer": answer}
-        st.session_state["examples"] = examples
-        st.divider()
-    if st.button("Add Example"):
-        examples.append({"question": "", "answer": ""})
-        st.session_state["examples"] = examples
-        # Rerun so the new empty pair is rendered by the loop above.
-        st.rerun()
-################################################################################
-# Save Domain Data
-################################################################################
-# Drop inputs that were left empty before the seed data is assembled.
-perspectives = [perspective for perspective in perspectives if perspective]
-topics = [topic for topic in topics if topic]
-domain_data = dict(
-    domain=domain,
-    perspectives=perspectives,
-    topics=topics,
-    examples=examples,
-    domain_expert_prompt=domain_expert_prompt,
-)
-# Serialize once; the same text is written to disk and shown in the raw-seed tab.
-seed_json = json.dumps(domain_data, indent=2)
-with open(SEED_DATA_PATH, "w") as seed_file:
-    seed_file.write(seed_json)
-with tab_raw_seed:
-    st.code(seed_json, language="json", line_numbers=True)
-################################################################################
-# Setup Dataset on the Hub
-################################################################################
-st.divider()
-# Defaults for the repo fields come from the two halves of DATASET_REPO_ID.
-hub_username = DATASET_REPO_ID.split("/")[0]
-project_name = DATASET_REPO_ID.split("/")[1]
-st.write("Define the dataset repo details on the Hub")
-st.session_state["project_name"] = st.text_input("Project Name", project_name)
-st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
-st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)
-if all(
-    (
-        st.session_state.get("project_name"),
-        st.session_state.get("hub_username"),
-        st.session_state.get("hub_token"),
-    )
-):
-    # Show what the user entered, not the defaults the fields started with.
-    st.success(
-        f"Using the dataset repo {st.session_state['hub_username']}/{st.session_state['project_name']} on the Hub"
-    )
-# `examples` replaces the undefined name `questions_answers`, which raised a
-# NameError as soon as the button was clicked.
-if st.button("🤗 Push Dataset Seed") and all(
-    (
-        domain,
-        domain_expert_prompt,
-        perspectives,
-        topics,
-        examples,
-    )
-):
-    if all(
-        (
-            st.session_state.get("project_name"),
-            st.session_state.get("hub_username"),
-            st.session_state.get("hub_token"),
-        )
-    ):
-        project_name = st.session_state["project_name"]
-        hub_username = st.session_state["hub_username"]
-        hub_token = st.session_state["hub_token"]
-    else:
-        st.error(
-            "Please create a dataset repo on the Hub before pushing the dataset seed"
-        )
-        # Halt this run; the push below needs all three repo values.
-        st.stop()
-    push_dataset_to_hub(
-        domain_seed_data_path=SEED_DATA_PATH,
-        project_name=project_name,
-        domain=domain,
-        hub_username=hub_username,
-        hub_token=hub_token,
-        pipeline_path=PIPELINE_PATH,
-    )
-    st.success(
-        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
-    )
-    st.write("You can now move on to runnning your distilabel pipeline.")
-    st.page_link(
-        page="pages/3_🌱 Generate Dataset.py",
-        label="Generate Dataset",
-        icon="🌱",
-    )
-else:
-    st.info(
-        "Please fill in all the required domain fields to push the dataset seed to the Hub"
-    )

pages/3_🌱 Generate Dataset.py DELETED Viewed

@@ -1,257 +0,0 @@
-import streamlit as st
-from hub import pull_seed_data_from_repo, push_pipeline_to_hub
-from defaults import (
-    DEFAULT_SYSTEM_PROMPT,
-    PIPELINE_PATH,
-    PROJECT_NAME,
-    ARGILLA_URL,
-    HUB_USERNAME,
-    CODELESS_DISTILABEL,
-)
-from utils import project_sidebar
-from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
-# Set the page title and icon for this Streamlit page.
-st.set_page_config(
-    page_title="Domain Data Grower",
-    page_icon="🧑‍🌾",
-)
-# Sidebar helper imported from utils.
-project_sidebar()
-################################################################################
-# HEADER
-################################################################################
-st.header("🧑‍🌾 Domain Data Grower")
-st.divider()
-st.subheader("Step 3. Run the pipeline to generate synthetic data")
-st.write("Define the project repos and models that the pipeline will use.")
-st.divider()
-###############################################################
-# CONFIGURATION
-###############################################################
-st.markdown("## Pipeline Configuration")
-st.markdown("#### 🤗 Hub details to pull the seed data")
-# Repo fields are pre-filled from defaults; the token has no default.
-hub_username = st.text_input("Hub Username", HUB_USERNAME)
-project_name = st.text_input("Project Name", PROJECT_NAME)
-repo_id = f"{hub_username}/{project_name}"
-hub_token = st.text_input("Hub Token", type="password")
-st.divider()
-st.markdown("#### 🤖 Inference configuration")
-st.write(
-    "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
-)
-# Collapsible list of suggested endpoint URLs; purely informational.
-with st.expander("🤗 Recommended Models"):
-    st.write("All inference endpoint compatible models can be found via the link below")
-    st.link_button(
-        "🤗 Inference compaptible models on the hub",
-        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
-    )
-    st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
-    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")
-    st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
-    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")
-    st.write("🍃Projects with even less resources could take advantage of Phi-2")
-    st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")
-    st.write("Note Hugggingface Pro gives access to more compute resources")
-    st.link_button(
-        "🤗 Huggingface Pro",
-        "https://huggingface.co/pricing",
-    )
-# Endpoint URL that is passed to the pipeline as `endpoint_base_url`.
-base_url = st.text_input(
-    label="Base URL for the Inference API",
-    value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
-)
-st.divider()
-st.markdown("#### 🔬 Argilla API details to push the generated dataset")
-argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
-argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
-# The Argilla dataset name defaults to the project name entered above.
-argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
-st.divider()
-###############################################################
-# LOCAL
-###############################################################
-st.markdown("## Run the pipeline")
-st.write(
-    "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
-)
-if CODELESS_DISTILABEL:
-    st.write(
-        """We recommend running the pipeline locally if you're planning on generating a large dataset. \
-            But running the pipeline on this space is a handy way to get started quickly. Your synthetic
-            samples will be pushed to Argilla and available for review.
-            """
-    )
-    st.write(
-        """If you're planning on running the pipeline on the space, be aware that it \
-            will take some time to complete and you will need to maintain a \
-            connection to the space."""
-    )
-if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
-    # Every configuration field must be filled in before anything is pulled.
-    if all(
-        [
-            argilla_api_key,
-            argilla_url,
-            base_url,
-            hub_username,
-            project_name,
-            hub_token,
-            argilla_dataset_name,
-        ]
-    ):
-        with st.spinner("Pulling seed data from the Hub..."):
-            try:
-                seed_data = pull_seed_data_from_repo(
-                    repo_id=f"{hub_username}/{project_name}",
-                    hub_token=hub_token,
-                )
-            except Exception:
-                st.error(
-                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
-                )
-                # Without seed data `seed_data` is unbound; halt this run instead
-                # of failing with a NameError on the lines below.
-                st.stop()
-            domain = seed_data["domain"]
-            perspectives = seed_data["perspectives"]
-            topics = seed_data["topics"]
-            examples = seed_data["examples"]
-            domain_expert_prompt = seed_data["domain_expert_prompt"]
-        with st.spinner("Serializing the pipeline configuration..."):
-            # Write the pipeline config to PIPELINE_PATH, then upload it.
-            serialize_pipeline(
-                argilla_api_key=argilla_api_key,
-                argilla_dataset_name=argilla_dataset_name,
-                argilla_api_url=argilla_url,
-                topics=topics,
-                perspectives=perspectives,
-                pipeline_config_path=PIPELINE_PATH,
-                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
-                hub_token=hub_token,
-                endpoint_base_url=base_url,
-                examples=examples,
-            )
-            push_pipeline_to_hub(
-                pipeline_path=PIPELINE_PATH,
-                hub_token=hub_token,
-                hub_username=hub_username,
-                project_name=project_name,
-            )
-        st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")
-        st.info(
-            "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
-        )
-        st.text(
-            "Execute the following command to generate a synthetic dataset from the seed data:"
-        )
-        command_to_run = create_pipelines_run_command(
-            hub_token=hub_token,
-            pipeline_config_path=PIPELINE_PATH,
-            argilla_dataset_name=argilla_dataset_name,
-            argilla_api_key=argilla_api_key,
-            argilla_api_url=argilla_url,
-        )
-        # The first element (this server's interpreter path) is replaced by a
-        # plain "python" so the command is usable on the user's machine.
-        st.code(
-            f"""
-            pip install git+https://github.com/argilla-io/distilabel.git
-            git clone https://huggingface.co/datasets/{hub_username}/{project_name}
-            cd {project_name}
-            pip install -r requirements.txt
-            {' '.join(["python"] + command_to_run[1:])}
-        """,
-            language="bash",
-        )
-        st.subheader(
-            "👩‍🚀 If you want to access the pipeline and manipulate the locally, you can do:"
-        )
-        st.code(
-            """
-            git clone https://github.com/huggingface/data-is-better-together
-            cd domain-specific-datasets
-            """
-        )
-    else:
-        st.error("Please fill all the required fields.")
-###############################################################
-# SPACE
-###############################################################
-# The in-space run is only offered when CODELESS_DISTILABEL is enabled.
-if CODELESS_DISTILABEL:
-    if st.button("🔥 Run pipeline right here, right now!"):
-        # Every configuration field must be filled in before anything is pulled.
-        if all(
-            [
-                argilla_api_key,
-                argilla_url,
-                base_url,
-                hub_username,
-                project_name,
-                hub_token,
-                argilla_dataset_name,
-            ]
-        ):
-            with st.spinner("Pulling seed data from the Hub..."):
-                try:
-                    seed_data = pull_seed_data_from_repo(
-                        repo_id=f"{hub_username}/{project_name}",
-                        hub_token=hub_token,
-                    )
-                except Exception:
-                    st.error(
-                        "Seed data not found. Please make sure you pushed the data seed in Step 2."
-                    )
-                    # Without seed data `seed_data` is unbound; halt this run
-                    # instead of failing with a NameError on the lines below.
-                    st.stop()
-                domain = seed_data["domain"]
-                perspectives = seed_data["perspectives"]
-                topics = seed_data["topics"]
-                examples = seed_data["examples"]
-                domain_expert_prompt = seed_data["domain_expert_prompt"]
-                serialize_pipeline(
-                    argilla_api_key=argilla_api_key,
-                    argilla_dataset_name=argilla_dataset_name,
-                    argilla_api_url=argilla_url,
-                    topics=topics,
-                    perspectives=perspectives,
-                    pipeline_config_path=PIPELINE_PATH,
-                    domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
-                    hub_token=hub_token,
-                    endpoint_base_url=base_url,
-                    examples=examples,
-                )
-            with st.spinner("Starting the pipeline..."):
-                # `logs` is iterated below, one text element per item.
-                logs = run_pipeline(
-                    pipeline_config_path=PIPELINE_PATH,
-                    argilla_api_key=argilla_api_key,
-                    argilla_api_url=argilla_url,
-                    hub_token=hub_token,
-                    argilla_dataset_name=argilla_dataset_name,
-                )
-            st.success("Pipeline started successfully! 🚀")
-            with st.expander(label="View Logs", expanded=True):
-                for out in logs:
-                    st.text(out)
-        else:
-            st.error("Please fill all the required fields.")

pages/4_🔍 Review Generated Data.py DELETED Viewed

@@ -1,48 +0,0 @@
-import streamlit as st
-from defaults import PROJECT_NAME, ARGILLA_URL, DATASET_REPO_ID
-from utils import project_sidebar
-from hub import push_argilla_dataset_to_hub
-st.set_page_config(page_title="Domain Data Grower", page_icon="🧑‍🌾")
-project_sidebar()
-################################################################################
-# HEADER
-################################################################################
-st.header("🧑‍🌾 Domain Data Grower")
-st.divider()
-st.write(
-    """Once you have reviewed the synthetic data in Argilla, you can publish the
-    generated dataset to the Hub."""
-)
-################################################################################
-# Configuration
-################################################################################
-st.divider()
-st.write("🔬 Argilla API details to push the generated dataset")
-# Collect the four inputs as keyword arguments for the publish call; the dict
-# is built in the order the widgets should appear on the page.
-publish_settings = {
-    "url": st.text_input("Argilla API URL", ARGILLA_URL),
-    "api_key": st.text_input("Argilla API Key", "owner.apikey"),
-    "name": st.text_input("Argilla Dataset Name", PROJECT_NAME),
-    "repo_id": st.text_input("Dataset Repo ID", DATASET_REPO_ID),
-}
-st.divider()
-publish_clicked = st.button("🚀 Publish the generated dataset")
-if publish_clicked:
-    with st.spinner("Publishing the generated dataset..."):
-        push_argilla_dataset_to_hub(workspace="admin", **publish_settings)
-    st.success("The generated dataset has been published to the Hub.")

pipeline.py DELETED Viewed

@@ -1,208 +0,0 @@
-import subprocess
-import sys
-import time
-from typing import List
-from distilabel.steps.generators.data import LoadDataFromDicts
-from distilabel.steps.expand import ExpandColumns
-from distilabel.steps.keep import KeepColumns
-from distilabel.steps.tasks.self_instruct import SelfInstruct
-from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
-from distilabel.llms.huggingface import InferenceEndpointsLLM
-from distilabel.pipeline import Pipeline
-from distilabel.steps import TextGenerationToArgilla
-from dotenv import load_dotenv
-from domain import (
-    DomainExpert,
-    CleanNumberedList,
-    create_topics,
-    create_examples_template,
-    APPLICATION_DESCRIPTION,
-)
-load_dotenv()
-def define_pipeline(
-    argilla_api_key: str,
-    argilla_api_url: str,
-    argilla_dataset_name: str,
-    topics: List[str],
-    perspectives: List[str],
-    domain_expert_prompt: str,
-    examples: List[dict],
-    hub_token: str,
-    endpoint_base_url: str,
-):
-    """Define the pipeline for the specific domain.
-
-    Args:
-        argilla_api_key: API key handed to the Argilla export step.
-        argilla_api_url: API URL handed to the Argilla export step.
-        argilla_dataset_name: Dataset name handed to the Argilla export step.
-        topics: Topics that are combined with ``perspectives`` into seed terms.
-        perspectives: Perspectives that are combined with ``topics``.
-        domain_expert_prompt: System prompt assigned to the domain expert step.
-        examples: Example dicts used to build the domain expert template.
-        hub_token: API key for the inference endpoint LLM.
-        endpoint_base_url: Base URL for the inference endpoint LLM.
-
-    Returns:
-        The ``Pipeline`` object with all steps created and connected.
-    """
-    # Seed terms and the answer template are built by the helpers from domain.
-    terms = create_topics(topics, perspectives)
-    template = create_examples_template(examples)
-    # NOTE(review): the pipeline name is hard-coded to "farming" whatever the
-    # domain is - confirm whether that is intended.
-    with Pipeline("farming") as pipeline:
-        # One input row per seed term.
-        load_data = LoadDataFromDicts(
-            name="load_data",
-            data=[{"input": term} for term in terms],
-            batch_size=64,
-        )
-        # A single LLM object is shared by the three generation steps below.
-        llm = InferenceEndpointsLLM(
-            base_url=endpoint_base_url,
-            api_key=hub_token,
-        )
-        self_instruct = SelfInstruct(
-            name="self-instruct",
-            application_description=APPLICATION_DESCRIPTION,
-            num_instructions=5,
-            input_batch_size=8,
-            llm=llm,
-        )
-        # Reads the "question" column as its "instruction" input.
-        evol_instruction_complexity = EvolInstruct(
-            name="evol_instruction_complexity",
-            llm=llm,
-            num_evolutions=2,
-            store_evolutions=True,
-            input_batch_size=8,
-            include_original_instruction=True,
-            input_mappings={"instruction": "question"},
-        )
-        # Column mapping: "instructions" -> "question".
-        expand_instructions = ExpandColumns(
-            name="expand_columns", columns={"instructions": "question"}
-        )
-        cleaner = CleanNumberedList(name="clean_numbered_list")
-        # Column mapping: "evolved_instructions" -> "evolved_questions".
-        expand_evolutions = ExpandColumns(
-            name="expand_columns_evolved",
-            columns={"evolved_instructions": "evolved_questions"},
-        )
-        domain_expert = DomainExpert(
-            name="domain_expert",
-            llm=llm,
-            input_batch_size=8,
-            input_mappings={"instruction": "evolved_questions"},
-            output_mappings={"generation": "domain_expert_answer"},
-        )
-        # Override the class-level defaults with the project-specific prompt
-        # and the template built from the examples.
-        domain_expert._system_prompt = domain_expert_prompt
-        domain_expert._template = template
-        keep_columns = KeepColumns(
-            name="keep_columns",
-            columns=["model_name", "evolved_questions", "domain_expert_answer"],
-        )
-        to_argilla = TextGenerationToArgilla(
-            name="text_generation_to_argilla",
-            dataset_name=argilla_dataset_name,
-            dataset_workspace="admin",
-            api_url=argilla_api_url,
-            api_key=argilla_api_key,
-            input_mappings={
-                "instruction": "evolved_questions",
-                "generation": "domain_expert_answer",
-            },
-        )
-        # Wiring: load_data -> self_instruct -> expand_instructions -> cleaner
-        # -> evol_instruction_complexity -> expand_evolutions -> domain_expert
-        # -> keep_columns -> to_argilla.
-        load_data.connect(self_instruct)
-        self_instruct.connect(expand_instructions)
-        expand_instructions.connect(cleaner)
-        cleaner.connect(evol_instruction_complexity)
-        evol_instruction_complexity.connect(expand_evolutions)
-        expand_evolutions.connect(domain_expert)
-        domain_expert.connect(keep_columns)
-        keep_columns.connect(to_argilla)
-    return pipeline
-def serialize_pipeline(
-    argilla_api_key: str,
-    argilla_api_url: str,
-    argilla_dataset_name: str,
-    topics: List[str],
-    perspectives: List[str],
-    domain_expert_prompt: str,
-    hub_token: str,
-    endpoint_base_url: str,
-    pipeline_config_path: str = "pipeline.yaml",
-    examples: "List[dict] | None" = None,
-):
-    """Serialize the pipeline to a yaml file.
-
-    Builds the pipeline with ``define_pipeline`` and saves it to
-    ``pipeline_config_path``, overwriting an existing file.
-
-    Args:
-        argilla_api_key: Forwarded to ``define_pipeline``.
-        argilla_api_url: Forwarded to ``define_pipeline``.
-        argilla_dataset_name: Forwarded to ``define_pipeline``.
-        topics: Forwarded to ``define_pipeline``.
-        perspectives: Forwarded to ``define_pipeline``.
-        domain_expert_prompt: Forwarded to ``define_pipeline``.
-        hub_token: Forwarded to ``define_pipeline``.
-        endpoint_base_url: Forwarded to ``define_pipeline``.
-        pipeline_config_path: Destination path of the yaml file.
-        examples: Example dicts for the template. ``None`` (the default) means
-            no examples; a fresh list is created per call so that no mutable
-            default is shared between calls.
-    """
-    pipeline = define_pipeline(
-        argilla_api_key=argilla_api_key,
-        argilla_api_url=argilla_api_url,
-        argilla_dataset_name=argilla_dataset_name,
-        topics=topics,
-        perspectives=perspectives,
-        domain_expert_prompt=domain_expert_prompt,
-        hub_token=hub_token,
-        endpoint_base_url=endpoint_base_url,
-        examples=[] if examples is None else examples,
-    )
-    pipeline.save(path=pipeline_config_path, overwrite=True, format="yaml")
-def create_pipelines_run_command(
-    hub_token: str,
-    argilla_api_key: str,
-    argilla_api_url: str,
-    pipeline_config_path: str = "pipeline.yaml",
-    argilla_dataset_name: str = "domain_specific_datasets",
-):
-    """Return the argv list that runs the saved pipeline with distilabel.
-
-    The list starts with the current interpreter (``sys.executable``), passes
-    one ``--param`` pair per runtime override and ends with ``--ignore-cache``.
-    """
-    # Runtime overrides, in the order they appear on the command line.
-    overrides = [
-        f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
-        f"text_generation_to_argilla.api_key={argilla_api_key}",
-        f"text_generation_to_argilla.api_url={argilla_api_url}",
-        f"self-instruct.llm.api_key={hub_token}",
-        f"evol_instruction_complexity.llm.api_key={hub_token}",
-        f"domain_expert.llm.api_key={hub_token}",
-    ]
-    command_to_run = [sys.executable, "-m", "distilabel", "pipeline", "run"]
-    command_to_run += ["--config", pipeline_config_path]
-    for override in overrides:
-        command_to_run.extend(("--param", override))
-    command_to_run.append("--ignore-cache")
-    return command_to_run
-def run_pipeline(
-    hub_token: str,
-    argilla_api_key: str,
-    argilla_api_url: str,
-    pipeline_config_path: str = "pipeline.yaml",
-    argilla_dataset_name: str = "domain_specific_datasets",
-):
-    """Run the pipeline and yield the output as a generator of logs.
-
-    The command from ``create_pipelines_run_command`` is started as a child
-    process. Its stdout and stderr are merged and yielded line by line as
-    decoded text until the stream reaches end of file.
-
-    Args:
-        hub_token: Passed to the command and exported as ``HF_TOKEN``.
-        argilla_api_key: Passed to the command.
-        argilla_api_url: Passed to the command.
-        pipeline_config_path: Path of the serialized pipeline config.
-        argilla_dataset_name: Passed to the command.
-
-    Yields:
-        One decoded output line of the child process at a time.
-    """
-    import os
-
-    command_to_run = create_pipelines_run_command(
-        hub_token=hub_token,
-        pipeline_config_path=pipeline_config_path,
-        argilla_dataset_name=argilla_dataset_name,
-        argilla_api_key=argilla_api_key,
-        argilla_api_url=argilla_api_url,
-    )
-    # Run the script file
-    process = subprocess.Popen(
-        args=command_to_run,
-        stdout=subprocess.PIPE,
-        # Merge stderr into stdout: a separate stderr pipe that is never read
-        # can fill up and block the child process.
-        stderr=subprocess.STDOUT,
-        # Extend the parent environment instead of replacing it, so PATH, HOME
-        # and similar variables remain available to the child.
-        env={**os.environ, "HF_TOKEN": hub_token},
-    )
-    # readline() blocks until a line is available and returns b"" at EOF, so
-    # no polling delay is needed.
-    for line in iter(process.stdout.readline, b""):
-        yield line.decode("utf-8", errors="replace")
-    # Output is exhausted: release the pipe and reap the child process.
-    process.stdout.close()
-    process.wait()

requirements.txt CHANGED Viewed

@@ -1,8 +1,4 @@
 datasets
 python_dotenv
-sentence_transformers
 streamlit
 huggingface_hub
-mistralai
-argilla
-git+https://github.com/argilla-io/distilabel.git

 datasets
 python_dotenv
 streamlit
 huggingface_hub